From 793da934082ec2a6fc738bce751eb43998806403 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Tue, 5 Oct 2021 14:05:54 -0700 Subject: [PATCH 01/69] Update links in README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 50fcf66..be0801b 100644 --- a/README.md +++ b/README.md @@ -38,8 +38,8 @@ The repository has implementations for the following Bayesian layers: Please refer to [documentation](doc/bayesian_torch.layers.md#layers) of Bayesian layers for details. Other features include: -- [x] AvUC: Accuracy versus Uncertainty Calibration loss [[Krishnan et al. 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] -- [x] MOPED: specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2019](https://arxiv.org/abs/1906.05323)] +- [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan et al. 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] +- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/738df2ddfef8a1c9eaa0463053d926723c9bb9ec/bayesian_torch/utils/util.py#L72): specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2019](https://arxiv.org/abs/1906.05323)] - [ ] dnn_to_bnn: An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition by replacing neural network layers with corresponding Bayesian layers (`updating soon...`) From b9245101b53fbda0d6a968e35f8ee9fdc50bccd5 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Tue, 5 Oct 2021 07:25:33 -0700 Subject: [PATCH 02/69] update MOPED layer example utility function Signed-off-by: Ranganath Krishnan --- .../main_bayesian_flipout_imagenet.py | 2 +- bayesian_torch/utils/util.py | 62 +++++++++++-------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/bayesian_torch/examples/main_bayesian_flipout_imagenet.py b/bayesian_torch/examples/main_bayesian_flipout_imagenet.py index 067c896..212a1e9 100644 --- a/bayesian_torch/examples/main_bayesian_flipout_imagenet.py +++ b/bayesian_torch/examples/main_bayesian_flipout_imagenet.py @@ -212,7 +212,7 @@ def MOPED_layer(layer, det_layer, delta): print(str(layer)) layer.weight.data = det_layer.weight.data if layer.bias is not None: - layer.bias.data = det_layer.bias.data2 + layer.bias.data = det_layer.bias.data elif (str(layer) == 'LinearFlipout()' or str(layer) == 'LinearReparameterization()'): diff --git a/bayesian_torch/utils/util.py b/bayesian_torch/utils/util.py index 24f5fde..df51f41 100644 --- a/bayesian_torch/utils/util.py +++ b/bayesian_torch/utils/util.py @@ -55,9 +55,9 @@ def mutual_information(mc_preds): Compute the difference between the entropy of the mean of the predictive distribution and the mean of the entropy. 
""" - MI = entropy(np.mean(mc_preds, axis=0)) - np.mean(entropy(mc_preds), - axis=0) - return MI + mutual_info = entropy(np.mean(mc_preds, axis=0)) - np.mean(entropy(mc_preds), + axis=0) + return mutual_info def get_rho(sigma, delta): @@ -86,39 +86,51 @@ def MOPED(model, det_model, det_checkpoint, delta): for (idx, layer), (det_idx, det_layer) in zip(enumerate(model.modules()), enumerate(det_model.modules())): - if (str(layer) == 'Conv1dVariational()' - or str(layer) == 'Conv2dVariational()' - or str(layer) == 'Conv3dVariational()' - or str(layer) == 'ConvTranspose1dVariational()' - or str(layer) == 'ConvTranspose2dVariational()' - or str(layer) == 'ConvTranspose3dVariational()'): + if (str(layer) == 'Conv1dReparametrization()' + or str(layer) == 'Conv2dReparameterization()' + or str(layer) == 'Conv3dReparameterization()' + or str(layer) == 'ConvTranspose1dReparameterization()' + or str(layer) == 'ConvTranspose2dReparameterization()' + or str(layer) == 'ConvTranspose3dReparameterization()' + or str(layer) == 'Conv1dFlipout()' + or str(layer) == 'Conv2dFlipout()' + or str(layer) == 'Conv3dFlipout()' + or str(layer) == 'ConvTranspose1dFlipout()' + or str(layer) == 'ConvTranspose2dFlipout()' + or str(layer) == 'ConvTranspose3dFlipout()'): #set the priors - layer.prior_weight_mu.data = det_layer.weight - layer.prior_bias_mu.data = det_layer.bias + layer.prior_weight_mu = det_layer.weight.data + if layer.prior_bias_mu is not None: + layer.prior_bias_mu = det_layer.bias.data #initialize surrogate posteriors - layer.mu_kernel.data = det_layer.weight + layer.mu_kernel.data = det_layer.weight.data layer.rho_kernel.data = get_rho(det_layer.weight.data, delta) - layer.mu_bias.data = det_layer.bias - layer.rho_bias.data = get_rho(det_layer.bias.data, delta) - elif (str(layer) == 'LinearVariational()'): + if layer.mu_bias is not None: + layer.mu_bias.data = det_layer.bias.data + layer.rho_bias.data = get_rho(det_layer.bias.data, delta) + elif (str(layer) == 'LinearReparameterization()' + or str(layer) == 'LinearFlipout()'): #set the priors - layer.prior_weight_mu.data = det_layer.weight - layer.prior_bias_mu.data = det_layer.bias + layer.prior_weight_mu = det_layer.weight.data + if layer.prior_bias_mu is not None: + layer.prior_bias_mu.data = det_layer.bias #initialize the surrogate posteriors - layer.mu_weight.data = det_layer.weight + layer.mu_weight.data = det_layer.weight.data layer.rho_weight.data = get_rho(det_layer.weight.data, delta) - layer.mu_bias.data = det_layer.bias - layer.rho_bias.data = get_rho(det_layer.bias.data, delta) + if layer.mu_bias is not None: + layer.mu_bias.data = det_layer.bias.data + layer.rho_bias.data = get_rho(det_layer.bias.data, delta) elif str(layer).startswith('Batch'): #initialize parameters - layer.weight.data = det_layer.weight - layer.bias.data = det_layer.bias - layer.running_mean.data = det_layer.running_mean - layer.running_var.data = det_layer.running_var - layer.num_batches_tracked.data = det_layer.num_batches_tracked + layer.weight.data = det_layer.weight.data + if layer.bias is not None: + layer.bias.data = det_layer.bias + layer.running_mean.data = det_layer.running_mean.data + layer.running_var.data = det_layer.running_var.data + layer.num_batches_tracked.data = det_layer.num_batches_tracked.data model.state_dict() return model From 81648f90589b9be74623434674ef4c083e7422a9 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Tue, 5 Oct 2021 07:29:39 -0700 Subject: [PATCH 03/69] fix minor typo. 
Signed-off-by: Ranganath Krishnan --- bayesian_torch/utils/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bayesian_torch/utils/util.py b/bayesian_torch/utils/util.py index df51f41..7418679 100644 --- a/bayesian_torch/utils/util.py +++ b/bayesian_torch/utils/util.py @@ -86,7 +86,7 @@ def MOPED(model, det_model, det_checkpoint, delta): for (idx, layer), (det_idx, det_layer) in zip(enumerate(model.modules()), enumerate(det_model.modules())): - if (str(layer) == 'Conv1dReparametrization()' + if (str(layer) == 'Conv1dReparameterization()' or str(layer) == 'Conv2dReparameterization()' or str(layer) == 'Conv3dReparameterization()' or str(layer) == 'ConvTranspose1dReparameterization()' From a5065302947199152a453ed65e893625ba54d368 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Tue, 5 Oct 2021 14:32:18 -0700 Subject: [PATCH 04/69] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index be0801b..9d77441 100644 --- a/README.md +++ b/README.md @@ -38,8 +38,8 @@ The repository has implementations for the following Bayesian layers: Please refer to [documentation](doc/bayesian_torch.layers.md#layers) of Bayesian layers for details. Other features include: -- [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan et al. 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] -- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/738df2ddfef8a1c9eaa0463053d926723c9bb9ec/bayesian_torch/utils/util.py#L72): specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2019](https://arxiv.org/abs/1906.05323)] +- [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] +- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 
2019](https://arxiv.org/abs/1906.05323)] - [ ] dnn_to_bnn: An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition by replacing neural network layers with corresponding Bayesian layers (`updating soon...`) From ade5f9bf8f64b934df20eab9163422f8dc958da5 Mon Sep 17 00:00:00 2001 From: Pi Date: Fri, 26 Nov 2021 10:50:15 -0300 Subject: [PATCH 05/69] feat: add possibility to return no kl, save it as attribute --- .../variational_layers/conv_variational.py | 60 +++++++++++++++---- .../variational_layers/linear_variational.py | 10 +++- .../variational_layers/rnn_variational.py | 10 +++- 3 files changed, 64 insertions(+), 16 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 4311400..96b1db5 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -112,6 +112,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size)) self.rho_kernel = Parameter( @@ -160,7 +162,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -182,7 +184,11 @@ def forward(self, input): else: kl = kl_weight - return out, kl + self.kl = kl + + if return_kl: + return out, kl + return out class Conv2dReparameterization(BaseVariationalLayer_): @@ -239,6 +245,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size)) @@ -292,7 +300,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -313,8 +321,12 @@ def forward(self, input): kl = kl_weight + kl_bias else: kl = kl_weight + + self.kl = kl - return out, kl + if return_kl: + return out, kl + return out class Conv3dReparameterization(BaseVariationalLayer_): @@ -371,6 +383,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size, kernel_size)) @@ -424,7 +438,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -446,7 +460,11 @@ def forward(self, input): else: kl = kl_weight - return out, kl + self.kl = kl + + if return_kl: + return out, kl + return out class ConvTranspose1dReparameterization(BaseVariationalLayer_): @@ -504,6 +522,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( 
torch.Tensor(in_channels, out_channels // groups, kernel_size)) self.rho_kernel = Parameter( @@ -552,7 +572,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -575,7 +595,11 @@ def forward(self, input): else: kl = kl_weight - return out, kl + self.kl = kl + + if return_kl: + return out, kl + return out class ConvTranspose2dReparameterization(BaseVariationalLayer_): @@ -633,6 +657,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size, kernel_size)) @@ -686,7 +712,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -709,7 +735,11 @@ def forward(self, input): else: kl = kl_weight - return out, kl + self.kl = kl + + if return_kl: + return out, kl + return out class ConvTranspose3dReparameterization(BaseVariationalLayer_): @@ -768,6 +798,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_kernel = Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size, kernel_size, kernel_size)) @@ -821,7 +853,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) @@ -844,4 +876,8 @@ def forward(self, input): else: kl = kl_weight - return out, kl + self.kl = kl + + if return_kl: + return out, kl + return out diff --git a/bayesian_torch/layers/variational_layers/linear_variational.py b/bayesian_torch/layers/variational_layers/linear_variational.py index af113f5..bb3a296 100644 --- a/bayesian_torch/layers/variational_layers/linear_variational.py +++ b/bayesian_torch/layers/variational_layers/linear_variational.py @@ -83,6 +83,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = 0 + self.mu_weight = Parameter(torch.Tensor(out_features, in_features)) self.rho_weight = Parameter(torch.Tensor(out_features, in_features)) self.register_buffer('eps_weight', @@ -124,7 +126,7 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - def forward(self, input): + def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_weight)) weight = self.mu_weight + \ (sigma_weight * self.eps_weight.data.normal_()) @@ -143,5 +145,9 @@ def forward(self, input): kl = kl_weight + kl_bias else: kl = kl_weight + + self.kl = kl - return out, kl + if return_kl: + return out, kl + return out \ No newline at end of file diff --git a/bayesian_torch/layers/variational_layers/rnn_variational.py b/bayesian_torch/layers/variational_layers/rnn_variational.py index ab126ad..c36378c 100644 --- a/bayesian_torch/layers/variational_layers/rnn_variational.py +++ 
b/bayesian_torch/layers/variational_layers/rnn_variational.py @@ -77,6 +77,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + self.kl = kl + self.ih = LinearReparameterization( prior_mean=prior_mean, prior_variance=prior_variance, @@ -95,7 +97,7 @@ def __init__(self, out_features=out_features * 4, bias=bias) - def forward(self, X, hidden_states=None): + def forward(self, X, hidden_states=None, return_kl=True): batch_size, seq_size, _ = X.size() @@ -140,4 +142,8 @@ def forward(self, X, hidden_states=None): hidden_seq = hidden_seq.transpose(0, 1).contiguous() c_ts = c_ts.transpose(0, 1).contiguous() - return hidden_seq, (hidden_seq, c_ts), kl + self.kl = kl + + if return_kl: + return hidden_seq, (hidden_seq, c_ts), kl + return hidden_seq, (hidden_seq, c_ts) From 037006db1b0e7f35d13209517dd88fa711ec75d6 Mon Sep 17 00:00:00 2001 From: Pi Date: Fri, 26 Nov 2021 11:01:06 -0300 Subject: [PATCH 06/69] feat: add possibility to return no kl on flipout layers, save it as attribute --- .../layers/flipout_layers/conv_flipout.py | 54 ++++++++++++++----- .../layers/flipout_layers/linear_flipout.py | 10 +++- .../layers/flipout_layers/rnn_flipout.py | 9 +++- 3 files changed, 57 insertions(+), 16 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index cc3c26e..d1996f7 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -100,6 +100,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init self.bias = bias + self.kl = 0 + self.mu_kernel = nn.Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size)) self.rho_kernel = nn.Parameter( @@ -150,7 +152,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, return_kl=True): # linear outputs outputs = F.conv1d(x, @@ -191,8 +193,11 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs class Conv2dFlipout(BaseVariationalLayer_): @@ -244,6 +249,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init self.bias = bias + self.kl = 0 + self.mu_kernel = nn.Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size)) @@ -299,7 +306,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, return_kl=True): # linear outputs outputs = F.conv2d(x, @@ -340,8 +347,11 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs class Conv3dFlipout(BaseVariationalLayer_): @@ -388,6 +398,8 @@ def __init__(self, self.groups = groups self.bias = bias + self.kl = 0 + self.prior_mean = prior_mean self.prior_variance = prior_variance self.posterior_mu_init = posterior_mu_init @@ -448,7 +460,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, 
return_kl=True): # linear outputs outputs = F.conv3d(x, @@ -489,8 +501,11 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs class ConvTranspose1dFlipout(BaseVariationalLayer_): @@ -537,6 +552,8 @@ def __init__(self, self.groups = groups self.bias = bias + self.kl = 0 + self.prior_mean = prior_mean self.prior_variance = prior_variance self.posterior_mu_init = posterior_mu_init @@ -593,7 +610,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, return_kl=True): # linear outputs outputs = F.conv_transpose1d(x, @@ -635,8 +652,11 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs class ConvTranspose2dFlipout(BaseVariationalLayer_): @@ -683,6 +703,8 @@ def __init__(self, self.groups = groups self.bias = bias + self.kl = 0 + self.prior_mean = prior_mean self.prior_variance = prior_variance self.posterior_mu_init = posterior_mu_init @@ -743,7 +765,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, return_kl=True): # linear outputs outputs = F.conv_transpose2d(x, @@ -785,8 +807,11 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs class ConvTranspose3dFlipout(BaseVariationalLayer_): @@ -838,6 +863,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init self.bias = bias + self.kl = 0 + self.mu_kernel = nn.Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size, kernel_size, kernel_size)) @@ -893,7 +920,7 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) - def forward(self, x): + def forward(self, x, return_kl=True): # linear outputs outputs = F.conv_transpose3d(x, @@ -935,5 +962,8 @@ def forward(self, x): dilation=self.dilation, groups=self.groups) * sign_output + self.kl = kl # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs diff --git a/bayesian_torch/layers/flipout_layers/linear_flipout.py b/bayesian_torch/layers/flipout_layers/linear_flipout.py index d7d577f..2538f1d 100644 --- a/bayesian_torch/layers/flipout_layers/linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/linear_flipout.py @@ -90,6 +90,8 @@ def __init__(self, torch.Tensor(out_features, in_features), persistent=False) + self.kl = 0 + if bias: self.mu_bias = nn.Parameter(torch.Tensor(out_features)) self.rho_bias = nn.Parameter(torch.Tensor(out_features)) @@ -123,7 +125,7 @@ def init_parameters(self): self.mu_bias.data.normal_(mean=self.posterior_mu_init, std=0.1) self.rho_bias.data.normal_(mean=self.posterior_rho_init, std=0.1) - def forward(self, x): + def forward(self, x, return_kl=True): # sampling 
delta_W sigma_weight = torch.log1p(torch.exp(self.rho_weight)) delta_weight = (sigma_weight * self.eps_weight.data.normal_()) @@ -148,5 +150,9 @@ def forward(self, x): perturbed_outputs = F.linear(x * sign_input, delta_weight, bias) * sign_output + self.kl = kl + # returning outputs + perturbations - return outputs + perturbed_outputs, kl + if return_kl: + return outputs + perturbed_outputs, kl + return outputs + perturbed_outputs diff --git a/bayesian_torch/layers/flipout_layers/rnn_flipout.py b/bayesian_torch/layers/flipout_layers/rnn_flipout.py index 38c222a..317ebc4 100644 --- a/bayesian_torch/layers/flipout_layers/rnn_flipout.py +++ b/bayesian_torch/layers/flipout_layers/rnn_flipout.py @@ -76,6 +76,8 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, # variance of weight --> sigma = log (1 + exp(rho)) self.bias = bias + self.kl = 0 + self.ih = LinearFlipout(prior_mean=prior_mean, prior_variance=prior_variance, posterior_mu_init=posterior_mu_init, @@ -92,7 +94,7 @@ def __init__(self, out_features=out_features * 4, bias=bias) - def forward(self, X, hidden_states=None): + def forward(self, X, hidden_states=None, return_kl=True): batch_size, seq_size, _ = X.size() @@ -137,4 +139,7 @@ def forward(self, X, hidden_states=None): hidden_seq = hidden_seq.transpose(0, 1).contiguous() c_ts = c_ts.transpose(0, 1).contiguous() - return hidden_seq, (hidden_seq, c_ts), kl + self.kl = kl + if return_kl: + return hidden_seq, (hidden_seq, c_ts), kl + return hidden_seq, (hidden_seq, c_ts) From f892b955bffeb58d51064828b41f08a56d93f4c0 Mon Sep 17 00:00:00 2001 From: msubedar Date: Tue, 7 Dec 2021 23:00:49 -0800 Subject: [PATCH 07/69] updates to support dnn to bnn imodel auto conversion --- .../layers/base_variational_layer.py | 9 + .../layers/flipout_layers/conv_flipout.py | 74 ++++-- .../layers/flipout_layers/linear_flipout.py | 24 +- .../layers/flipout_layers/rnn_flipout.py | 8 + .../variational_layers/conv_variational.py | 230 ++++++++++++------ .../variational_layers/linear_variational.py | 54 ++-- .../variational_layers/rnn_variational.py | 14 +- 7 files changed, 281 insertions(+), 132 deletions(-) diff --git a/bayesian_torch/layers/base_variational_layer.py b/bayesian_torch/layers/base_variational_layer.py index 86b2505..4d63cc9 100644 --- a/bayesian_torch/layers/base_variational_layer.py +++ b/bayesian_torch/layers/base_variational_layer.py @@ -34,6 +34,15 @@ class BaseVariationalLayer_(nn.Module): def __init__(self): super().__init__() + self._dnn_to_bnn_flag = False + + @property + def dnn_to_bnn_flag(self): + return self._dnn_to_bnn_flag + + @dnn_to_bnn_flag.setter + def dnn_to_bnn_flag(self, value): + self._dnn_to_bnn_flag = value def kl_div(self, mu_q, sigma_q, mu_p, sigma_p): """ diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index d1996f7..5214a99 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -154,6 +154,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + # linear outputs outputs = F.conv1d(x, weight=self.mu_kernel, @@ -173,16 +176,18 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: 
sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = (sigma_bias * eps_bias) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # perturbed feedforward perturbed_outputs = F.conv1d(x * sign_input, @@ -308,6 +313,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + # linear outputs outputs = F.conv2d(x, weight=self.mu_kernel, @@ -327,16 +335,18 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = (sigma_bias * eps_bias) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # perturbed feedforward perturbed_outputs = F.conv2d(x * sign_input, @@ -347,7 +357,6 @@ def forward(self, x, return_kl=True): dilation=self.dilation, groups=self.groups) * sign_output - self.kl = kl # returning outputs + perturbations if return_kl: return outputs + perturbed_outputs, kl @@ -462,6 +471,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + # linear outputs outputs = F.conv3d(x, weight=self.mu_kernel, @@ -481,16 +493,18 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = (sigma_bias * eps_bias) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # perturbed feedforward perturbed_outputs = F.conv3d(x * sign_input, @@ -612,6 +626,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + # linear outputs outputs = F.conv_transpose1d(x, weight=self.mu_kernel, @@ -631,16 +648,18 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = (sigma_bias * eps_bias) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # perturbed feedforward perturbed_outputs = F.conv_transpose1d( @@ -767,6 +786,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = 
False + # linear outputs outputs = F.conv_transpose2d(x, bias=self.mu_bias, @@ -786,16 +808,18 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = (sigma_bias * eps_bias) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # perturbed feedforward perturbed_outputs = F.conv_transpose2d( @@ -922,6 +946,9 @@ def init_parameters(self): def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + # linear outputs outputs = F.conv_transpose3d(x, weight=self.mu_kernel, @@ -941,8 +968,9 @@ def forward(self, x, return_kl=True): delta_kernel = (sigma_weight * eps_kernel) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.bias: diff --git a/bayesian_torch/layers/flipout_layers/linear_flipout.py b/bayesian_torch/layers/flipout_layers/linear_flipout.py index 2538f1d..af34d5d 100644 --- a/bayesian_torch/layers/flipout_layers/linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/linear_flipout.py @@ -90,8 +90,6 @@ def __init__(self, torch.Tensor(out_features, in_features), persistent=False) - self.kl = 0 - if bias: self.mu_bias = nn.Parameter(torch.Tensor(out_features)) self.rho_bias = nn.Parameter(torch.Tensor(out_features)) @@ -125,21 +123,33 @@ def init_parameters(self): self.mu_bias.data.normal_(mean=self.posterior_mu_init, std=0.1) self.rho_bias.data.normal_(mean=self.posterior_rho_init, std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_weight)) + kl = self.kl_div(self.mu_weight, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.mu_bias is not None: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False # sampling delta_W sigma_weight = torch.log1p(torch.exp(self.rho_weight)) delta_weight = (sigma_weight * self.eps_weight.data.normal_()) # get kl divergence - kl = self.kl_div(self.mu_weight, sigma_weight, self.prior_weight_mu, - self.prior_weight_sigma) + if return_kl: + kl = self.kl_div(self.mu_weight, sigma_weight, self.prior_weight_mu, + self.prior_weight_sigma) bias = None if self.mu_bias is not None: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) bias = (sigma_bias * self.eps_bias.data.normal_()) - kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl = kl + self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) # linear outputs outputs = F.linear(x, self.mu_weight, self.mu_bias) @@ -150,8 +160,6 @@ def forward(self, x, return_kl=True): perturbed_outputs = F.linear(x * sign_input, delta_weight, bias) * sign_output - self.kl = kl - # returning outputs + perturbations if return_kl: return outputs + perturbed_outputs, kl diff --git 
a/bayesian_torch/layers/flipout_layers/rnn_flipout.py b/bayesian_torch/layers/flipout_layers/rnn_flipout.py index 317ebc4..5977740 100644 --- a/bayesian_torch/layers/flipout_layers/rnn_flipout.py +++ b/bayesian_torch/layers/flipout_layers/rnn_flipout.py @@ -94,8 +94,16 @@ def __init__(self, out_features=out_features * 4, bias=bias) + def kl_loss(self): + kl_i = self.ih.kl_loss() + kl_h = self.hh.kl_loss() + return kl_i + kl_h + def forward(self, X, hidden_states=None, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + batch_size, seq_size, _ = X.size() hidden_seq = [] diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 96b1db5..1d55363 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 Intel Labs +# Copyright (C) 2021 Intel Labs # # BSD-3-Clause License # @@ -112,8 +112,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size)) self.rho_kernel = Parameter( @@ -162,32 +160,53 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv1d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight return out, kl + return out @@ -245,8 +264,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size)) @@ -300,32 +317,45 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + 
sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight return out, kl + return out @@ -383,8 +413,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size, kernel_size)) @@ -438,32 +466,44 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv3d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight return out, kl + return out @@ -522,8 +562,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size)) self.rho_kernel = Parameter( @@ -572,33 +610,46 @@ def 
init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv_transpose1d(input, weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight + return out, kl + return out @@ -657,8 +708,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size, kernel_size)) @@ -712,33 +761,46 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv_transpose2d(input, weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight + return out, kl + return out @@ -798,8 +860,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - 
self.kl = 0 - self.mu_kernel = Parameter( torch.Tensor(in_channels, out_channels // groups, kernel_size, kernel_size, kernel_size)) @@ -853,31 +913,43 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + def forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() weight = self.mu_kernel + (sigma_weight * eps_kernel) - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.bias: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) eps_bias = self.eps_bias.data.normal_() bias = self.mu_bias + (sigma_bias * eps_bias) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.conv_transpose3d(input, weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight return out, kl + return out diff --git a/bayesian_torch/layers/variational_layers/linear_variational.py b/bayesian_torch/layers/variational_layers/linear_variational.py index bb3a296..7efb667 100644 --- a/bayesian_torch/layers/variational_layers/linear_variational.py +++ b/bayesian_torch/layers/variational_layers/linear_variational.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 Intel Labs +# Copyright (C) 2021 Intel Labs # # BSD-3-Clause License # @@ -83,8 +83,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = 0 - self.mu_weight = Parameter(torch.Tensor(out_features, in_features)) self.rho_weight = Parameter(torch.Tensor(out_features, in_features)) self.register_buffer('eps_weight', @@ -99,8 +97,14 @@ def __init__(self, if bias: self.mu_bias = Parameter(torch.Tensor(out_features)) self.rho_bias = Parameter(torch.Tensor(out_features)) - self.register_buffer('eps_bias', torch.Tensor(out_features), persistent=False) - self.register_buffer('prior_bias_mu', torch.Tensor(out_features), persistent=False) + self.register_buffer( + 'eps_bias', + torch.Tensor(out_features), + persistent=False) + self.register_buffer( + 'prior_bias_mu', + torch.Tensor(out_features), + persistent=False) self.register_buffer('prior_bias_sigma', torch.Tensor(out_features), persistent=False) @@ -126,28 +130,44 @@ def init_parameters(self): self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_weight)) + kl = self.kl_div( + self.mu_weight, + sigma_weight, + self.prior_weight_mu, + self.prior_weight_sigma) + if self.mu_bias is not None: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, + self.prior_bias_mu, self.prior_bias_sigma) + return kl + def 
forward(self, input, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False sigma_weight = torch.log1p(torch.exp(self.rho_weight)) weight = self.mu_weight + \ (sigma_weight * self.eps_weight.data.normal_()) - kl_weight = self.kl_div(self.mu_weight, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) + if return_kl: + kl_weight = self.kl_div(self.mu_weight, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) bias = None if self.mu_bias is not None: sigma_bias = torch.log1p(torch.exp(self.rho_bias)) bias = self.mu_bias + (sigma_bias * self.eps_bias.data.normal_()) - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) out = F.linear(input, weight, bias) - if self.mu_bias is not None: - kl = kl_weight + kl_bias - else: - kl = kl_weight - - self.kl = kl - if return_kl: + if self.mu_bias is not None: + kl = kl_weight + kl_bias + else: + kl = kl_weight + return out, kl - return out \ No newline at end of file + + return out diff --git a/bayesian_torch/layers/variational_layers/rnn_variational.py b/bayesian_torch/layers/variational_layers/rnn_variational.py index c36378c..39f4a2d 100644 --- a/bayesian_torch/layers/variational_layers/rnn_variational.py +++ b/bayesian_torch/layers/variational_layers/rnn_variational.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 Intel Labs +# Copyright (C) 2021 Intel Labs # # BSD-3-Clause License # @@ -77,8 +77,6 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias - self.kl = kl - self.ih = LinearReparameterization( prior_mean=prior_mean, prior_variance=prior_variance, @@ -97,8 +95,16 @@ def __init__(self, out_features=out_features * 4, bias=bias) + def kl_loss(self): + kl_i = self.ih.kl_loss() + kl_h = self.hh.kl_loss() + return kl_i + kl_h + def forward(self, X, hidden_states=None, return_kl=True): + if self.dnn_to_bnn_flag: + return_kl = False + batch_size, seq_size, _ = X.size() hidden_seq = [] @@ -142,8 +148,6 @@ def forward(self, X, hidden_states=None, return_kl=True): hidden_seq = hidden_seq.transpose(0, 1).contiguous() c_ts = c_ts.transpose(0, 1).contiguous() - self.kl = kl - if return_kl: return hidden_seq, (hidden_seq, c_ts), kl return hidden_seq, (hidden_seq, c_ts) From 161bfdfe16c97ee579c45c52ab7ceddc5fc5d0e3 Mon Sep 17 00:00:00 2001 From: msubedar Date: Tue, 7 Dec 2021 23:18:05 -0800 Subject: [PATCH 08/69] updates to support dnn to bnn imodel auto conversion --- .../examples/main_bayesian_cifar_dnn2bnn.py | 522 ++++++++++++++++++ bayesian_torch/models/dnn_to_bnn.py | 165 ++++++ 2 files changed, 687 insertions(+) create mode 100644 bayesian_torch/examples/main_bayesian_cifar_dnn2bnn.py create mode 100644 bayesian_torch/models/dnn_to_bnn.py diff --git a/bayesian_torch/examples/main_bayesian_cifar_dnn2bnn.py b/bayesian_torch/examples/main_bayesian_cifar_dnn2bnn.py new file mode 100644 index 0000000..8305844 --- /dev/null +++ b/bayesian_torch/examples/main_bayesian_cifar_dnn2bnn.py @@ -0,0 +1,522 @@ +import argparse +import os +import shutil +import time + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.optim +import torch.utils.data +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import bayesian_torch.models.deterministic.resnet as resnet +import numpy as np +from 
bayesian_torch.models.dnn_to_bnn import dnn_to_bnn, get_kl_loss + +model_names = sorted( + name + for name in resnet.__dict__ + if name.islower() and not name.startswith("__") and name.startswith("resnet") and callable(resnet.__dict__[name]) +) + +print(model_names) +len_trainset = 50000 +len_testset = 10000 + +parser = argparse.ArgumentParser(description="CIFAR10") +parser.add_argument( + "--arch", + "-a", + metavar="ARCH", + default="resnet20", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet20)", +) +parser.add_argument( + "-j", "--workers", default=8, type=int, metavar="N", help="number of data loading workers (default: 8)" +) +parser.add_argument("--epochs", default=200, type=int, metavar="N", help="number of total epochs to run") +parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="manual epoch number (useful on restarts)") +parser.add_argument("-b", "--batch-size", default=128, type=int, metavar="N", help="mini-batch size (default: 512)") +parser.add_argument("--lr", "--learning-rate", default=0.001, type=float, metavar="LR", help="initial learning rate") +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--weight-decay", "--wd", default=1e-4, type=float, metavar="W", help="weight decay (default: 5e-4)" +) +parser.add_argument("--print-freq", "-p", default=50, type=int, metavar="N", help="print frequency (default: 20)") +parser.add_argument("--resume", default="", type=str, metavar="PATH", help="path to latest checkpoint (default: none)") +parser.add_argument("-e", "--evaluate", dest="evaluate", action="store_true", help="evaluate model on validation set") +parser.add_argument("--pretrained", dest="pretrained", action="store_true", help="use pre-trained model") +parser.add_argument("--half", dest="half", action="store_true", help="use half-precision(16-bit) ") +parser.add_argument( + "--save-dir", + dest="save_dir", + help="The directory used to save the trained models", + default="./checkpoint/bayesian", + type=str, +) +parser.add_argument( + "--moped-init-model", + dest="moped_init_model", + help="DNN model to intialize MOPED method", + default="", + type=str, +) +parser.add_argument( + "--moped-delta-factor", + dest="moped_delta_factor", + help="MOPED delta scale factor", + default=0.2, + type=float, +) + +parser.add_argument( + "--bnn-rho-init", + dest="bnn_rho_init", + help="rho init for bnn layers", + default=-3.0, + type=float, +) + +parser.add_argument( + "--use-flipout-layers", + type=bool, + default=False, + metavar="use_flipout_layers", + help="Use Flipout layers for BNNs, default is Reparameterization layers", +) + +parser.add_argument( + "--save-every", + dest="save_every", + help="Saves checkpoints at every specified number of epochs", + type=int, + default=10, +) +parser.add_argument("--mode", type=str, required=True, help="train | test") + +parser.add_argument( + "--num_monte_carlo", + type=int, + default=20, + metavar="N", + help="number of Monte Carlo samples to be drawn during inference", +) +parser.add_argument("--num_mc", type=int, default=1, metavar="N", help="number of Monte Carlo runs during training") +parser.add_argument( + "--tensorboard", + type=bool, + default=True, + metavar="N", + help="use tensorboard for logging and visualization of training progress", +) +parser.add_argument( + "--log_dir", + type=str, + default="./logs/cifar/bayesian", + metavar="N", + help="use tensorboard for logging and visualization of 
training progress", +) + +best_prec1 = 0 + + +def main(): + global args, best_prec1 + args = parser.parse_args() + moped_enable = False + if len(args.moped_init_model) > 0: # use moped method if trained dnn model weights are provided + moped_enable = True + + const_bnn_prior_parameters = { + "prior_mu": 0.0, + "prior_sigma": 1.0, + "posterior_mu_init": 0.0, + "posterior_rho_init": args.bnn_rho_init, + "type": "Flipout" if args.use_flipout_layers else "Reparameterization", # Flipout or Reparameterization + "moped_enable": moped_enable, # initialize mu/sigma from the dnn weights + "moped_delta": args.moped_delta_factor, + } + + # Check the save_dir exists or not + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + model = torch.nn.DataParallel(resnet.__dict__[args.arch]()) + model.cuda() if torch.cuda.is_available() else model.cpu() + if moped_enable: + checkpoint = torch.load(args.moped_init_model) + if "state_dict" in checkpoint.keys(): + model.load_state_dict(checkpoint["state_dict"]) + else: + model.load_state_dict(checkpoint) + + dnn_to_bnn(model, const_bnn_prior_parameters) # only replaces linear and conv layers + if torch.cuda.is_available(): + model.cuda() + else: + model.cpu() + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume) + args.start_epoch = checkpoint["epoch"] + best_prec1 = checkpoint["best_prec1"] + model.load_state_dict(checkpoint) + print("=> loaded checkpoint '{}' (epoch {})".format(args.evaluate, checkpoint["epoch"])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + tb_writer = None + if args.tensorboard: + logger_dir = os.path.join(args.log_dir, "tb_logger") + if not os.path.exists(logger_dir): + os.makedirs(logger_dir) + tb_writer = SummaryWriter(logger_dir) + + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + + train_loader = torch.utils.data.DataLoader( + datasets.CIFAR10( + root="./data", + train=True, + transform=transforms.Compose( + [ + transforms.RandomHorizontalFlip(), + transforms.RandomCrop(32, 4), + transforms.ToTensor(), + normalize, + ] + ), + download=True, + ), + batch_size=args.batch_size, + shuffle=True, + num_workers=args.workers, + pin_memory=True, + ) + + val_loader = torch.utils.data.DataLoader( + datasets.CIFAR10( + root="./data", + train=False, + transform=transforms.Compose( + [ + transforms.ToTensor(), + normalize, + ] + ), + ), + batch_size=args.batch_size, + shuffle=False, + num_workers=args.workers, + pin_memory=True, + ) + + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + if torch.cuda.is_available(): + criterion = nn.CrossEntropyLoss().cuda() + else: + criterion = nn.CrossEntropyLoss().cpu() + + if args.half: + model.half() + criterion.half() + + if args.arch in ["resnet110"]: + for param_group in optimizer.param_groups: + param_group["lr"] = args.lr * 0.1 + + if args.evaluate: + validate(val_loader, model, criterion) + return + + if args.mode == "train": + + for epoch in range(args.start_epoch, args.epochs): + + lr = args.lr + if epoch >= 80 and epoch < 120: + lr = 0.1 * args.lr + elif epoch >= 120 and epoch < 160: + lr = 0.01 * args.lr + elif epoch >= 160 and epoch < 180: + lr = 0.001 * args.lr + elif epoch >= 180: + lr = 0.0005 * args.lr + + optimizer = torch.optim.Adam(model.parameters(), lr) + + # train for one epoch + print("current lr 
{:.5e}".format(optimizer.param_groups[0]["lr"])) + train(args, train_loader, model, criterion, optimizer, epoch, tb_writer) + + prec1 = validate(args, val_loader, model, criterion, epoch, tb_writer) + + is_best = prec1 > best_prec1 + best_prec1 = max(prec1, best_prec1) + + if is_best: + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model.state_dict(), + "best_prec1": best_prec1, + }, + is_best, + filename=os.path.join(args.save_dir, "bayesian_{}_cifar.pth".format(args.arch)), + ) + + elif args.mode == "test": + checkpoint_file = args.save_dir + "/bayesian_{}_cifar.pth".format(args.arch) + if torch.cuda.is_available(): + checkpoint = torch.load(checkpoint_file) + else: + checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) + model.load_state_dict(checkpoint["state_dict"]) + evaluate(args, model, val_loader) + + +def train(args, train_loader, model, criterion, optimizer, epoch, tb_writer=None): + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + + # switch to train mode + model.train() + + end = time.time() + for i, (input, target) in enumerate(train_loader): + + # measure data loading time + data_time.update(time.time() - end) + + if torch.cuda.is_available(): + target = target.cuda() + input_var = input.cuda() + target_var = target + else: + target = target.cpu() + input_var = input.cpu() + target_var = target + + if args.half: + input_var = input_var.half() + + # compute output + output_ = [] + kl_ = [] + for mc_run in range(args.num_mc): + output = model(input_var) + kl = get_kl_loss(model) + output_.append(output) + kl_.append(kl) + output = torch.mean(torch.stack(output_), dim=0) + kl = torch.mean(torch.stack(kl_), dim=0) + cross_entropy_loss = criterion(output, target_var) + scaled_kl = kl / args.batch_size + + # ELBO loss + loss = cross_entropy_loss + scaled_kl + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + output = output.float() + loss = loss.float() + # measure accuracy and record loss + prec1 = accuracy(output.data, target)[0] + losses.update(loss.item(), input.size(0)) + top1.update(prec1.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print( + "Epoch: [{0}][{1}/{2}]\t" + "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + "Data {data_time.val:.3f} ({data_time.avg:.3f})\t" + "Loss {loss.val:.4f} ({loss.avg:.4f})\t" + "Prec@1 {top1.val:.3f} ({top1.avg:.3f})".format( + epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1 + ) + ) + + if tb_writer is not None: + tb_writer.add_scalar("train/cross_entropy_loss", cross_entropy_loss.item(), epoch) + tb_writer.add_scalar("train/kl_div", scaled_kl.item(), epoch) + tb_writer.add_scalar("train/elbo_loss", loss.item(), epoch) + tb_writer.add_scalar("train/accuracy", prec1.item(), epoch) + tb_writer.flush() + + +def validate(args, val_loader, model, criterion, epoch, tb_writer=None): + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + + # switch to evaluate mode + model.eval() + + end = time.time() + with torch.no_grad(): + for i, (input, target) in enumerate(val_loader): + if torch.cuda.is_available(): + target = target.cuda() + input_var = input.cuda() + target_var = target.cuda() + else: + target = target.cpu() + input_var = input.cpu() + target_var = target.cpu() + + if args.half: + input_var = input_var.half() + + # 
compute output + output_ = [] + kl_ = [] + for mc_run in range(args.num_mc): + output = model(input_var) + kl = get_kl_loss(model) + output_.append(output) + kl_.append(kl) + output = torch.mean(torch.stack(output_), dim=0) + kl = torch.mean(torch.stack(kl_), dim=0) + cross_entropy_loss = criterion(output, target_var) + # scaled_kl = kl / len_trainset + scaled_kl = kl / args.batch_size + # scaled_kl = 0.2 * (kl / len_trainset) + + # ELBO loss + loss = cross_entropy_loss + scaled_kl + + output = output.float() + loss = loss.float() + + # measure accuracy and record loss + prec1 = accuracy(output.data, target)[0] + losses.update(loss.item(), input.size(0)) + top1.update(prec1.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print( + "Test: [{0}/{1}]\t" + "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + "Loss {loss.val:.4f} ({loss.avg:.4f})\t" + "Prec@1 {top1.val:.3f} ({top1.avg:.3f})".format( + i, len(val_loader), batch_time=batch_time, loss=losses, top1=top1 + ) + ) + + if tb_writer is not None: + tb_writer.add_scalar("val/cross_entropy_loss", cross_entropy_loss.item(), epoch) + tb_writer.add_scalar("val/kl_div", scaled_kl.item(), epoch) + tb_writer.add_scalar("val/elbo_loss", loss.item(), epoch) + tb_writer.add_scalar("val/accuracy", prec1.item(), epoch) + tb_writer.flush() + + print(" * Prec@1 {top1.avg:.3f}".format(top1=top1)) + + return top1.avg + + +def evaluate(args, model, val_loader): + pred_probs_mc = [] + test_loss = 0 + correct = 0 + output_list = [] + labels_list = [] + model.eval() + with torch.no_grad(): + begin = time.time() + for data, target in val_loader: + if torch.cuda.is_available(): + data, target = data.cuda(), target.cuda() + else: + data, target = data.cpu(), target.cpu() + output_mc = [] + for mc_run in range(args.num_monte_carlo): + output = model.forward(data) + output_mc.append(output) + output_ = torch.stack(output_mc) + output_list.append(output_) + labels_list.append(target) + end = time.time() + print("inference throughput: ", len_testset / (end - begin), " images/s") + + output = torch.stack(output_list) + output = output.permute(1, 0, 2, 3) + output = output.contiguous().view(args.num_monte_carlo, len_testset, -1) + output = torch.nn.functional.softmax(output, dim=2) + labels = torch.cat(labels_list) + pred_mean = output.mean(dim=0) + Y_pred = torch.argmax(pred_mean, axis=1) + print("Test accuracy:", (Y_pred.data.cpu().numpy() == labels.data.cpu().numpy()).mean() * 100) + np.save("./probs_cifar_mc.npy", output.data.cpu().numpy()) + np.save("./cifar_test_labels_mc.npy", labels.data.cpu().numpy()) + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + """ + Save the training model + """ + torch.save(state, filename) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / 
batch_size)) + return res + + +if __name__ == "__main__": + main() diff --git a/bayesian_torch/models/dnn_to_bnn.py b/bayesian_torch/models/dnn_to_bnn.py new file mode 100644 index 0000000..18b9b51 --- /dev/null +++ b/bayesian_torch/models/dnn_to_bnn.py @@ -0,0 +1,165 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Functions related to DNN to BNN model conversion. +# +# @authors: Mahesh Subedar +# +# =============================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import bayesian_torch.layers as bayesian_layers +from bayesian_torch.utils.util import get_rho + + +# -------------------------------------------------------------------------------- +# Parameters used to define BNN layyers. 
+# bnn_prior_parameters = { +# "prior_mu": 0.0, +# "prior_sigma": 1.0, +# "posterior_mu_init": 0.0, +# "posterior_rho_init": -4.0, +# "type": "Reparameterization", # Flipout or Reparameterization +# } + + +def bnn_linear_layer(params, d): + layer_type = d.__class__.__name__ + params["type"] + layer_fn = getattr(bayesian_layers, layer_type) # Get BNN layer + bnn_layer = layer_fn( + in_features=d.in_features, + out_features=d.out_features, + prior_mean=params["prior_mu"], + prior_variance=params["prior_sigma"], + posterior_mu_init=params["posterior_mu_init"], + posterior_rho_init=params["posterior_rho_init"], + bias=d.bias is not None, + ) + # if MOPED is enabled initialize mu and sigma + if params["moped_enable"]: + delta = params["moped_delta"] + bnn_layer.mu_weight.data.copy_(d.weight.data) + bnn_layer.rho_weight.data.copy_(get_rho(d.weight.data, delta)) + if bnn_layer.mu_bias is not None: + bnn_layer.mu_bias.data.copy_(d.bias.data) + bnn_layer.rho_bias.data.copy_(get_rho(d.bias.data, delta)) + bnn_layer.dnn_to_bnn_flag = True + return bnn_layer + + +def bnn_conv_layer(params, d): + layer_type = d.__class__.__name__ + params["type"] + layer_fn = getattr(bayesian_layers, layer_type) # Get BNN layer + bnn_layer = layer_fn( + in_channels=d.in_channels, + out_channels=d.out_channels, + kernel_size=d.kernel_size[0], + stride=d.stride, + padding=d.padding, + dilation=d.dilation, + groups=d.groups, + prior_mean=params["prior_mu"], + prior_variance=params["prior_sigma"], + posterior_mu_init=params["posterior_mu_init"], + posterior_rho_init=params["posterior_rho_init"], + bias=d.bias is not None, + ) + + # if MOPED is enabled, initialize mu and sigma + if params["moped_enable"]: + delta = params["moped_delta"] + bnn_layer.mu_kernel.data.copy_(d.weight.data) + bnn_layer.rho_kernel.data.copy_(get_rho(d.weight.data, delta)) + if bnn_layer.mu_bias is not None: + bnn_layer.mu_bias.data.copy_(d.bias.data) + bnn_layer.rho_bias.data.copy_(get_rho(d.bias.data, delta)) + bnn_layer.dnn_to_bnn_flag = True + return bnn_layer + + +def bnn_lstm_layer(params, d): + layer_type = d.__class__.__name__ + params["type"] + layer_fn = getattr(bayesian_layers, layer_type) # Get BNN layer + bnn_layer = layer_fn( + in_features=d.input_size, + out_features=d.hidden_size, + prior_mean=params["prior_mu"], + prior_variance=params["prior_sigma"], + posterior_mu_init=params["posterior_mu_init"], + posterior_rho_init=params["posterior_rho_init"], + bias=d.bias is not None, + ) + # if MOPED is enabled initialize mu and sigma + if params["moped_enable"]: + print("WARNING: MOPED method is not supported for LSTM layers!!!") + bnn_layer.dnn_to_bnn_flag = True + return bnn_layer + + +# replaces linear and conv layers +# bnn_prior_parameters - check the template at the top. 
+def dnn_to_bnn(m, bnn_prior_parameters): + for name, value in list(m._modules.items()): + if m._modules[name]._modules: + dnn_to_bnn(m._modules[name], bnn_prior_parameters) + elif "Conv" in m._modules[name].__class__.__name__: + setattr( + m, + name, + bnn_conv_layer( + bnn_prior_parameters, + m._modules[name])) + elif "Linear" in m._modules[name].__class__.__name__: + setattr( + m, + name, + bnn_linear_layer( + bnn_prior_parameters, + m._modules[name])) + elif "LSTM" in m._modules[name].__class__.__name__: + setattr( + m, + name, + bnn_lstm_layer( + bnn_prior_parameters, + m._modules[name])) + else: + pass + return + + +def get_kl_loss(m): + kl_loss = None + for layer in m.modules(): + if hasattr(layer, "kl_loss"): + if kl_loss is None: + kl_loss = layer.kl_loss() + else: + kl_loss += layer.kl_loss() + return kl_loss From b2f81a3fa8df54973144615bc61af1472e08cff7 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 13:06:16 -0800 Subject: [PATCH 09/69] Update README.md --- README.md | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 9d77441..12c9bc4 100644 --- a/README.md +++ b/README.md @@ -27,21 +27,20 @@ The repository has implementations for the following Bayesian layers: LinearRadial Conv1dRadial, Conv2dRadial, Conv3dRadial, ConvTranspose1dRadial, ConvTranspose2dRadial, ConvTranspose3dRadial LSTMRadial ---> - [ ] **Variational layers with Gaussian mixture model (GMM) posteriors using reparameterized Monte Carlo estimators** (in `pre-alpha`) LinearMixture Conv1dMixture, Conv2dMixture, Conv3dMixture, ConvTranspose1dMixture, ConvTranspose2dMixture, ConvTranspose3dMixture LSTMMixture +--> Please refer to [documentation](doc/bayesian_torch.layers.md#layers) of Bayesian layers for details. Other features include: - [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] - [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2019](https://arxiv.org/abs/1906.05323)] -- [ ] dnn_to_bnn: An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition by replacing neural network layers with corresponding Bayesian layers (`updating soon...`) - +- [x] [dnn_to_bnn](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. 
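As a rough illustration of what the conversion does, the sketch below converts a small deterministic model in place and collects the KL term with `get_kl_loss`; the toy `nn.Sequential` model, input shapes and parameter values are only illustrative. The replaced layers pick up names such as `Conv2dReparameterization` and `LinearReparameterization`.
```
import torch
import torch.nn as nn
from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn, get_kl_loss

# Illustrative deterministic model; any architecture built from Conv/Linear/LSTM layers works.
dnn = nn.Sequential(
    nn.Conv2d(3, 8, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(8 * 32 * 32, 10),
)

bnn_prior_parameters = {
    "prior_mu": 0.0,
    "prior_sigma": 1.0,
    "posterior_mu_init": 0.0,
    "posterior_rho_init": -3.0,
    "type": "Reparameterization",  # or "Flipout"
    "moped_enable": False,         # set True only when the dnn weights are pretrained (MOPED init)
    "moped_delta": 0.2,
}

dnn_to_bnn(dnn, bnn_prior_parameters)    # replaces layers in place, returns None
print([type(m).__name__ for m in dnn])   # Conv2dReparameterization, ReLU, Flatten, LinearReparameterization

logits = dnn(torch.randn(4, 3, 32, 32))  # converted layers return only the output
kl = get_kl_loss(dnn)                    # sum of per-layer KL terms for the ELBO objective
```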
## Installation @@ -119,19 +118,44 @@ sh scripts/test_deterministic_cifar.sh If you use this code, please cite as: ```sh @misc{krishnan2020bayesiantorch, - author = {Ranganath Krishnan and Piero Esposito}, + author = {Ranganath Krishnan and Piero Esposito and Mahesh Subedar}, title = {Bayesian-Torch: Bayesian neural network layers for uncertainty estimation}, year = {2020}, publisher = {GitHub}, howpublished = {\url{https://github.com/IntelLabs/bayesian-torch}} } ``` - -Cite the weight sampling methods as well: [Blundell et al. 2015](https://arxiv.org/abs/1505.05424); [Wen et al. 2018](https://arxiv.org/abs/1803.04386) +Accuracy versus Uncertainty Calibration (AvUC) loss +```sh +@inproceedings{NEURIPS2020_d3d94468, + title = {Improving model calibration with accuracy versus uncertainty optimization}, + author = {Krishnan, Ranganath and Tickoo, Omesh}, + booktitle = {Advances in Neural Information Processing Systems}, + volume = {33}, + pages = {18237--18248}, + year = {2020}, + url = {https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf} + +} +``` +MOdel Priors with Empirical Bayes using DNN (MOPED) +```sh +@inproceedings{krishnan2020specifying, + title={Specifying weight priors in bayesian deep neural networks with empirical bayes}, + author={Krishnan, Ranganath and Subedar, Mahesh and Tickoo, Omesh}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={34}, + number={04}, + pages={4477--4484}, + year={2020}, + url = {https://ojs.aaai.org/index.php/AAAI/article/view/5875} +} +``` **Contributors** - Ranganath Krishnan - Piero Esposito +- Mahesh Subedar This code is intended for researchers and developers, enables to quantify principled uncertainty estimates from deep neural network predictions using stochastic variational inference in Bayesian neural networks. Feedbacks, issues and contributions are welcome. Email to for any questions. From 42724f5ad76c50a9a749167d077b180bd5a18009 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 13:10:27 -0800 Subject: [PATCH 10/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 12c9bc4..1495f01 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Please refer to [documentation](doc/bayesian_torch.layers.md#layers) of Bayesian Other features include: - [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] -- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2019](https://arxiv.org/abs/1906.05323)] +- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): Specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2020](https://ojs.aaai.org/index.php/AAAI/article/view/5875)] - [x] [dnn_to_bnn](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. 
This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. ## Installation From d8b9940dd411e6c6651cc1dfd1d2daa21f64778d Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 06:21:34 -0800 Subject: [PATCH 11/69] update the posterior variational param init value Signed-off-by: Ranganath Krishnan --- bayesian_torch/models/bayesian/resnet_variational.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bayesian_torch/models/bayesian/resnet_variational.py b/bayesian_torch/models/bayesian/resnet_variational.py index 49d3086..74e1d16 100644 --- a/bayesian_torch/models/bayesian/resnet_variational.py +++ b/bayesian_torch/models/bayesian/resnet_variational.py @@ -20,7 +20,7 @@ prior_mu = 0.0 prior_sigma = 1.0 posterior_mu_init = 0.0 -posterior_rho_init = -2.0 +posterior_rho_init = -3.0 def _weights_init(m): From 8d4e1366bb7d73dbd98c723c9162ccada26fd6b2 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 06:26:08 -0800 Subject: [PATCH 12/69] remove duplicate kl_loss definition in Conv1dReparameterization layer Signed-off-by: Ranganath Krishnan --- .../layers/variational_layers/conv_variational.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 1d55363..7855ad8 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -169,15 +169,6 @@ def kl_loss(self): return kl - def kl_loss(self): - sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) - if self.bias: - sigma_bias = torch.log1p(torch.exp(self.rho_bias)) - kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) - - return kl - def forward(self, input, return_kl=True): if self.dnn_to_bnn_flag: return_kl = False From bc6681bbed3f7d4a963536250abf505c77f2bcbe Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 06:50:57 -0800 Subject: [PATCH 13/69] include kl_loss() function in Convolutional flipout layers, to compute kl when 'return_kl' flag is set to False. Fix for issue#12. 
Signed-off-by: Ranganath Krishnan --- .../layers/flipout_layers/conv_flipout.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index 5214a99..ce13897 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -152,6 +152,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: @@ -311,6 +319,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: @@ -469,6 +485,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: @@ -624,6 +648,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: @@ -784,6 +816,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: @@ -944,6 +984,14 @@ def init_parameters(self): self.prior_bias_mu.data.fill_(self.prior_mean) self.prior_bias_sigma.data.fill_(self.prior_variance) + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = 
torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + return kl + def forward(self, x, return_kl=True): if self.dnn_to_bnn_flag: From 06922f86c5672b178c84ede714e461cbc13ad439 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 16 Dec 2021 13:53:28 -0800 Subject: [PATCH 14/69] Update release version with dnn_to_bnn() feature --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 533df3a..0ff1bc1 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name = "bayesian-torch", packages = find_packages(), - version = "0.1", + version = "0.2", description = "Bayesian layers and utilities to perform stochastic variational inference in PyTorch", author = "ranganath.krishnan@intel.com", url = "https://github.com/IntelLabs/bayesian-torch", From fa51c94f7de025ef2291f85fb4ba81a6fcbb5831 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Thu, 13 Jan 2022 00:16:17 -0800 Subject: [PATCH 15/69] Update README.md update usage instructions in README file --- README.md | 60 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 1495f01..5f33aff 100644 --- a/README.md +++ b/README.md @@ -43,9 +43,13 @@ Other features include: - [x] [dnn_to_bnn](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. ## Installation - - -**Install from source:** + +**To install latest development version from source:** ```sh git clone https://github.com/IntelLabs/bayesian-torch cd bayesian-torch @@ -61,15 +65,52 @@ Dependencies: - pip install tensorboard - pip install scikit-learn -## Example usage -We have provided [example model implementations](bayesian_torch/models/bayesian) using the Bayesian layers. +## Usage +There are two ways to build Bayesian deep neural networks using Bayesian-Torch: +1. Convert an existing deterministic deep neural network (dnn) model to Bayesian deep neural network (bnn) model with dnn_to_bnn() +2. Define your custom model using the Bayesian layers ([Flipout](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers) or [Reparameterization](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)) -We also provide [example usages](bayesian_torch/examples) and [scripts](bayesian_torch/scripts) to train/evaluate the models. The instructions for CIFAR10 examples is provided below, similar scripts for ImageNet and MNIST are available. 
+(1) For instance to build Bayesian-ResNet18 from torchvision deterministic ResNet18 model: +``` +import torchvision +from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn + +const_bnn_prior_parameters = { + "prior_mu": 0.0, + "prior_sigma": 1.0, + "posterior_mu_init": 0.0, + "posterior_rho_init": -3.0, + "type": "Reparameterization", # Flipout or Reparameterization + "moped_enable": False, # True to initialize mu/sigma from the pretrained dnn weights + "moped_delta": 0.2, +} + +model = torchvision.models.resnet18() +dnn_to_bnn(model, const_bnn_prior_parameters) +``` +To use MOPED method, setting the prior and initializing variational parameters from a pretrained determined model (helps training convergence of larger models): +``` +const_bnn_prior_parameters = { + "prior_mu": 0.0, + "prior_sigma": 1.0, + "posterior_mu_init": 0.0, + "posterior_rho_init": -3.0, + "type": "Reparameterization", # Flipout or Reparameterization + "moped_enable": True, # True to initialize mu/sigma from the pretrained dnn weights + "moped_delta": 0.2, +} + +model = torchvision.models.resnet18(pretrained=True) +dnn_to_bnn(model, const_bnn_prior_parameters) +``` +(2) For building custom models, we have provided [example model implementations](bayesian_torch/models/bayesian) using the Bayesian layers. +## Example usage (training and evaluation of models) + +We have provided [example usages](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/examples) and [scripts](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/scripts) to train/evaluate the models. The instructions for CIFAR10 examples is provided below, similar scripts for ImageNet and MNIST are available. ``` cd bayesian_torch ``` - ### Training To train Bayesian ResNet on CIFAR10, run this command: @@ -152,11 +193,6 @@ MOdel Priors with Empirical Bayes using DNN (MOPED) } ``` -**Contributors** -- Ranganath Krishnan -- Piero Esposito -- Mahesh Subedar - This code is intended for researchers and developers, enables to quantify principled uncertainty estimates from deep neural network predictions using stochastic variational inference in Bayesian neural networks. Feedbacks, issues and contributions are welcome. Email to for any questions. 
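As an aside on the `moped_delta` factor used in the snippets above: MOPED seeds the scale of the variational posterior from the pretrained weights. Below is a minimal sketch of that initialization, assuming (per the MOPED paper) a posterior scale of `delta * |w|` mapped into the rho parameterization `sigma = log(1 + exp(rho))` used by the layers; the helper name here is hypothetical, and in the repository this role is played by the `get_rho` utility.
```
import torch

def moped_rho_init(weight, delta):
    # Sketch only: sigma = delta * |w|, inverted through the softplus that the
    # Bayesian layers use to recover sigma from rho (sigma = log(1 + exp(rho))).
    sigma = delta * weight.abs()
    return torch.log(torch.expm1(sigma) + 1e-20)  # epsilon guards log(0) for zero weights

w = torch.randn(64, 128) * 0.05        # stand-in for pretrained weights
rho = moped_rho_init(w, delta=0.5)
# softplus(rho) recovers delta * |w| (up to the epsilon)
print(torch.allclose(torch.nn.functional.softplus(rho), 0.5 * w.abs(), atol=1e-4))
```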
From c3ca3f81af08833b7049ba6ffb82699d273bba7e Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Mon, 17 Jan 2022 15:20:35 -0800 Subject: [PATCH 16/69] Update requirements.txt --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index f67a034..2452240 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -torch -torchvision -tensorboard -scikit-learn \ No newline at end of file +torch>=1.7.0 +torchvision>=0.8.1 +tensorboard>=1.15.0 +scikit-learn>=0.20.3 From de85018aee4927d4324e10f0c1b53095f85dbec8 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Mon, 17 Jan 2022 16:56:43 -0800 Subject: [PATCH 17/69] Include training, testing and uncertainty quantification snippet in README.md --- README.md | 55 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 5f33aff..2965ee4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Bayesian-Torch: Bayesian neural network layers for uncertainty estimation -**[Get started](#Installation)** | **[Example usage](#example-usage)** | **[Documentation](doc/bayesian_torch.layers.md)** | **[License](LICENSE)** | **[Citing](#citing)** +**[Get started](#installation)** | **[Example usage](#example-usage-training-and-evaluation-of-models)** | **[Documentation](doc/bayesian_torch.layers.md)** | **[License](LICENSE)** | **[Citing](#citing)** ### Bayesian layers and utilities to perform stochastic variational inference in PyTorch @@ -38,9 +38,9 @@ The repository has implementations for the following Bayesian layers: Please refer to [documentation](doc/bayesian_torch.layers.md#layers) of Bayesian layers for details. Other features include: +- [x] [dnn_to_bnn()](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. +- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): Specifying weight priors and variational posteriors in Bayesian neural networks with Empirical Bayes [[Krishnan et al. 2020](https://ojs.aaai.org/index.php/AAAI/article/view/5875)] - [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] -- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): Specifying weight priors and variational posteriors with Empirical Bayes [[Krishnan et al. 2020](https://ojs.aaai.org/index.php/AAAI/article/view/5875)] -- [x] [dnn_to_bnn](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. 
This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. ## Installation ## Usage There are two ways to build Bayesian deep neural networks using Bayesian-Torch: 1. Convert an existing deterministic deep neural network (dnn) model to Bayesian deep neural network (bnn) model with dnn_to_bnn() 2. Define your custom model using the Bayesian layers ([Flipout](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers) or [Reparameterization](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)) -(1) For instance to build Bayesian-ResNet18 from torchvision deterministic ResNet18 model: +(1) For instance, building Bayesian-ResNet18 from torchvision deterministic ResNet18 model is as simple as: ``` +import torch import torchvision -from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn +from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn, get_kl_loss const_bnn_prior_parameters = { "prior_mu": 0.0, @@ -82,13 +84,13 @@ const_bnn_prior_parameters = { "posterior_rho_init": -3.0, "type": "Reparameterization", # Flipout or Reparameterization "moped_enable": False, # True to initialize mu/sigma from the pretrained dnn weights - "moped_delta": 0.2, + "moped_delta": 0.5, } model = torchvision.models.resnet18() dnn_to_bnn(model, const_bnn_prior_parameters) ``` -To use MOPED method, setting the prior and initializing variational parameters from a pretrained determined model (helps training convergence of larger models): +To use MOPED method, setting the prior and initializing variational parameters from a pretrained deterministic model (helps training convergence of larger models): ``` const_bnn_prior_parameters = { "prior_mu": 0.0, @@ -97,12 +99,47 @@ const_bnn_prior_parameters = { "posterior_rho_init": -3.0, "type": "Reparameterization", # Flipout or Reparameterization "moped_enable": True, # True to initialize mu/sigma from the pretrained dnn weights - "moped_delta": 0.2, + "moped_delta": 0.5, } model = torchvision.models.resnet18(pretrained=True) dnn_to_bnn(model, const_bnn_prior_parameters) ``` +Training snippet: +``` +criterion = torch.nn.CrossEntropyLoss() +optimizer = torch.optim.Adam(model.parameters(), args.learning_rate) + +output = model(x_train) +kl = get_kl_loss(model) +ce_loss = criterion(output, y_train) +loss = ce_loss + kl / args.batch_size + +loss.backward() +optimizer.step() +``` +Testing snippet: +``` +model.eval() +with torch.no_grad(): + output_mc = [] + for mc_run in range(args.num_monte_carlo): + logits = model(x_test) + probs = torch.nn.functional.softmax(logits, dim=-1) + output_mc.append(probs) + output = torch.stack(output_mc) + pred_mean = output.mean(dim=0) + y_pred = torch.argmax(pred_mean, axis=-1) + test_acc = (y_pred.data.cpu().numpy() == y_test.data.cpu().numpy()).mean() +``` +Uncertainty Quantification: +``` +from utils.util import predictive_entropy, mutual_information + +predictive_uncertainty = predictive_entropy(output.data.cpu().numpy()) +model_uncertainty = mutual_information(output.data.cpu().numpy()) +``` + (2) For building custom models, we have provided [example model implementations](bayesian_torch/models/bayesian) using the Bayesian layers. 
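For a sense of what option (2) looks like, below is a minimal custom-model sketch built directly from the Bayesian layers; the layer sizes are arbitrary, and it assumes that each Bayesian layer's forward call returns `(output, kl)` when `return_kl` is left at its default.
```
import torch
import torch.nn as nn
import torch.nn.functional as F
from bayesian_torch.layers import LinearReparameterization

class BayesianMLP(nn.Module):
    """Illustrative two-layer Bayesian MLP assembled from Bayesian layers."""
    def __init__(self, in_features=784, hidden=128, num_classes=10):
        super().__init__()
        self.fc1 = LinearReparameterization(
            in_features, hidden,
            prior_mean=0.0, prior_variance=1.0,
            posterior_mu_init=0.0, posterior_rho_init=-3.0)
        self.fc2 = LinearReparameterization(
            hidden, num_classes,
            prior_mean=0.0, prior_variance=1.0,
            posterior_mu_init=0.0, posterior_rho_init=-3.0)

    def forward(self, x):
        x, kl1 = self.fc1(x)      # weights are sampled on every call
        x = F.relu(x)
        x, kl2 = self.fc2(x)
        return x, kl1 + kl2       # accumulate the per-layer KL terms

model = BayesianMLP()
logits, kl = model(torch.randn(32, 784))
loss = F.cross_entropy(logits, torch.randint(0, 10, (32,))) + kl / 32  # ELBO-style loss
```
Accumulating the per-layer KL terms by hand in `forward` is what `get_kl_loss()` automates for models converted with `dnn_to_bnn()`.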
## Example usage (training and evaluation of models) From 57ac5df8a93edef21e08c2a338e83f5d122faafc Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Mon, 17 Jan 2022 17:24:16 -0800 Subject: [PATCH 18/69] update version in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0ff1bc1..16cdc9f 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name = "bayesian-torch", packages = find_packages(), - version = "0.2", + version = "0.2.0", description = "Bayesian layers and utilities to perform stochastic variational inference in PyTorch", author = "ranganath.krishnan@intel.com", url = "https://github.com/IntelLabs/bayesian-torch", From e38d6961c5e4249eda0d5573cc43eca601973f72 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 26 Jan 2022 15:36:18 -0800 Subject: [PATCH 19/69] Update bayesian_torch.layers.md --- doc/bayesian_torch.layers.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/bayesian_torch.layers.md b/doc/bayesian_torch.layers.md index d03c3cf..80995a1 100644 --- a/doc/bayesian_torch.layers.md +++ b/doc/bayesian_torch.layers.md @@ -3,8 +3,10 @@ A set of Bayesian neural network layers to perform stochastic variational infere - Variational layers with reparameterized Monte Carlo estimators [[Blundell et al. 2015](https://arxiv.org/abs/1505.05424)] - Variational layers with Flipout Monte Carlo estimators [[Wen et al. 2018](https://arxiv.org/abs/1803.04386)] + # Layers @@ -29,7 +31,7 @@ A set of Bayesian neural network layers to perform stochastic variational infere * [ConvTranspose3dFlipout](#class-convtranspose3dflipout) * [LSTMFlipout](#class-lstmflipout) - + @@ -66,6 +68,7 @@ Calculates the Kullback-Leibler divergence from distribution normal Q (parametri ##### Returns torch.Tensor of shape 0 + ## class LinearReparameterization ### bayesian_torch.layers.LinearReparameterization(in_features, out_features, prior_mean, prior_variance, posterior_mu_init, posterior_rho_init, bias=True) @@ -539,6 +543,7 @@ Samples the weights with Flipout and performs LSTM feedforward operation. 
--- + From 7d343e5b2071243e38ff894f9f2e5ce0d79cc629 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 26 Jan 2022 15:38:07 -0800 Subject: [PATCH 20/69] Update links in README.md --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2965ee4..0765468 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Bayesian-Torch: Bayesian neural network layers for uncertainty estimation -**[Get started](#installation)** | **[Example usage](#example-usage-training-and-evaluation-of-models)** | **[Documentation](doc/bayesian_torch.layers.md)** | **[License](LICENSE)** | **[Citing](#citing)** +**[Get started](https://github.com/IntelLabs/bayesian-torch#installation)** | **[Example usage](https://github.com/IntelLabs/bayesian-torch#usage)** | **[Documentation](https://github.com/IntelLabs/bayesian-torch/blob/main/doc/bayesian_torch.layers.md)** | **[License](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE)** | **[Citing](https://github.com/IntelLabs/bayesian-torch#citing)** ### Bayesian layers and utilities to perform stochastic variational inference in PyTorch @@ -8,14 +8,14 @@ Bayesian-Torch is designed to be flexible and seamless in extending a determinis The repository has implementations for the following Bayesian layers: -- [x] **[Variational layers with reparameterized Monte Carlo estimators](bayesian_torch/layers/variational_layers)** [[Blundell et al. 2015](https://arxiv.org/abs/1505.05424)] +- [x] **[Variational layers with reparameterized Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)** [[Blundell et al. 2015](https://arxiv.org/abs/1505.05424)] LinearVariational Conv1dVariational, Conv2dVariational, Conv3dVariational, ConvTranspose1dVariational, ConvTranspose2dVariational, ConvTranspose3dVariational LSTMVariational -- [x] **[Variational layers with Flipout Monte Carlo estimators](bayesian_torch/layers/flipout_layers)** [[Wen et al. 2018](https://arxiv.org/abs/1803.04386)] +- [x] **[Variational layers with Flipout Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers)** [[Wen et al. 2018](https://arxiv.org/abs/1803.04386)] LinearFlipout Conv1dFlipout, Conv2dFlipout, Conv3dFlipout, ConvTranspose1dFlipout, ConvTranspose2dFlipout, ConvTranspose3dFlipout @@ -35,7 +35,9 @@ The repository has implementations for the following Bayesian layers: LSTMMixture --> + Other features include: - [x] [dnn_to_bnn()](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. @@ -140,7 +142,7 @@ predictive_uncertainty = predictive_entropy(output.data.cpu().numpy()) model_uncertainty = mutual_information(output.data.cpu().numpy()) ``` -(2) For building custom models, we have provided [example model implementations](bayesian_torch/models/bayesian) using the Bayesian layers. 
+(2) For building custom models, we have provided [example model implementations](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/models/bayesian) using the Bayesian layers. ## Example usage (training and evaluation of models) From 83fe7174eae3ae065af7ae847270364982b16dd9 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 26 Jan 2022 15:55:56 -0800 Subject: [PATCH 21/69] Update setup.py --- setup.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 16cdc9f..2ba22c1 100644 --- a/setup.py +++ b/setup.py @@ -12,8 +12,9 @@ name = "bayesian-torch", packages = find_packages(), version = "0.2.0", - description = "Bayesian layers and utilities to perform stochastic variational inference in PyTorch", - author = "ranganath.krishnan@intel.com", + description = "Bayesian-Torch: Bayesian neural network layers for uncertainty estimation", + author = "Intel Labs", + author_email = "ranganath.krishnan@intel.com", url = "https://github.com/IntelLabs/bayesian-torch", long_description = long_desc, long_description_content_type = "text/markdown", @@ -22,6 +23,11 @@ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "Intended Audience :: Science/Research", - "Programming Language :: Python :: 3.7" + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: " + "Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: " + "Python Modules", ] ) From 7c6df36ced650f7b1dbdd01ec7e28353704a7315 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 26 Jan 2022 16:34:48 -0800 Subject: [PATCH 22/69] Update README.md --- README.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0765468..28c6f3e 100644 --- a/README.md +++ b/README.md @@ -197,12 +197,14 @@ sh scripts/test_deterministic_cifar.sh If you use this code, please cite as: ```sh -@misc{krishnan2020bayesiantorch, - author = {Ranganath Krishnan and Piero Esposito and Mahesh Subedar}, - title = {Bayesian-Torch: Bayesian neural network layers for uncertainty estimation}, - year = {2020}, - publisher = {GitHub}, - howpublished = {\url{https://github.com/IntelLabs/bayesian-torch}} +@software{krishnan2022bayesiantorch, + author = {Ranganath Krishnan and Pi Esposito and Mahesh Subedar}, + title = {Bayesian-Torch: Bayesian neural network layers for uncertainty estimation}, + month = jan, + year = 2022, + doi = {10.5281/zenodo.5908307}, + url = {https://doi.org/10.5281/zenodo.5908307} + howpublished = {\url{https://github.com/IntelLabs/bayesian-torch}} } ``` Accuracy versus Uncertainty Calibration (AvUC) loss From 3acddc9c258ed68e67e9861d53b15b17d5a689d2 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Fri, 28 Jan 2022 07:10:23 -0800 Subject: [PATCH 23/69] include assets folder Signed-off-by: Ranganath Krishnan --- assets/bayesian-torch.png | Bin 0 -> 31054 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 assets/bayesian-torch.png diff --git a/assets/bayesian-torch.png b/assets/bayesian-torch.png new file mode 100644 index 0000000000000000000000000000000000000000..95f640371fcf9c07db165b85ae2170af734a9752 GIT binary patch literal 31054 zcmeFZWmMbG);1gpg$hzAlwzewa4YWa?otA@MS@cjq&O8yp=b#14y8bg2Q8(=U5lnb zf&6Z;h*S3ly|DP+2$eKIflma`ttg{?p3_WdGJWc zoozNJJPBp;I3)!I#H>!;+zC@RZ#X@~Xs|9l_2jyxr_yk(w#hG{U%Qo2XFr&~&nH29 
zn5&?nXqliR7rI$xLN_jV4<44H6nv%>|Kszi2uK7D@GiJST^MTI)}4*%y$msr`GY0{ z4IFQpSa+CaU;jvQ5;fWu3@f+-T00+-LA+N-!8N!tSbfYLe4X zR)RExA=HGP#@D#86DDL-zN0Q?@1!82Ih}seD0Z|!RTFZ2g%F=^=QEW~EmrgRXe@Um zswms9LnS4($ZNLuL=qo+7sPDX-E7+|6R9f*horn2J!3^Ey88LnFUn5e9z?NYot{8v zSR{I$thp8TSC6n>Tm><2pFd#oL{hf>?PNbL3|$lMn6N7tdQ#>Vb{4z%(=5+2|2Gp@ zeao1d5_0iE5nQ?A;;x8_Nao3SMaPg7;>VnShYD0{r!Jz8%^K~`MkCA_JJdi$sIMDs zH@T1?n*~#)ZwnbyOibDcKjxoIM_-zzvcU+;>FShXh%2fw&NGJmc)u(vqG;GA}rfh3Zr#F)mD zmDK&th8w3fbxi@B6c)QGsg|T;b}>x>&M`vE?|J1hO}z3fjLV($?vSw7_Of4&e8S3( zZ6`WcpkqKLUstuk?3%zVN8e>Mj2`B?4iC%8OKOgPm#l--JoOKxcczSn9iOA_>Icuz zkDjF}~5`jm3teN}sofxudaT)ZxWK?^_7W}+Q%<~$yJc?!2Beqz2jS=$nUhL0mv=&%a zx9z0(NFHnp?z%*b@=*k`6C=|V7ocU)-Ym7RY$PW|T8~f6508AdZ5@SH`7uW+i;9U8 z@DY(|q;yK(1g)C$v~)M;7x0$NXL64!Q-1G$ed`2SKtv8Hgzp;oWN(ee@e+R48-2c_ zsb9~Y-1WA}6~x@xogvU{8r8Lg`NN-)vm#1N#B1dyu&ixM_$Z3rq6W68%%!13!cHb% zo2)v)>$1m(@8c$^&Sf>v40V`D63#26=y~_WX<6TU4^cVBaCptG!rL8x1TTMqt7Ei@ z$(L~UL`MC=8s?JbE}goRdsLIYp?FDP+omaf&!|99KO3{wrx1v}8waNU*|x96{xRJk zj-Cxo+vi?MGl*f%t6qNlg9A|ZZ}tyr5wX!$aQPlFQ#Q2gZyf#po#m1z)aEPz{=x~W zS@7rEQQ@Oo;C!~Xwk}Y-Q^#`C_2Oh9bHbOgZC?q2@p0HODt_9wf0q@LsUB$=1p<6_>n`tSO%9TmFYZ=Ekwl=zmScFTup(o02 zq;vpTH^R;t2WIrQ4_0T<*VX>)QFnZ2P%TrXt~PDptEA)CvA6$Xqqk3!iD%wT*wRO^qJj+stXsCLZ7;fluul$EN@{Y+ z@2>=Kd2z%7U!7i(a}9nRa6%&iJ$s>=mcadIKjFK`=9V-hN+u;^Ho9-f`pLu_KDSGv z+*ZALmQzdL1|udLM8@BU`Iz1>$&b~V0-{)Iol#Bd+bjZa^MOs;FYw~!J1*PAAM7JY zQ{Q;ox~C%iEsaXL4k^)Cu0XcBfcSY6Cun+dT-C>oz&nEjnI#+k*fgCS z1X0g<+|6KlLw(Vit8ZL)03TfpW@&wq)>kv1bl9VZE6xQAKAdYRXM4Y|S<##ru^l(1 zm&+1QR9@SX?-e+&G!dJ78ey#8oHF}Vf2j3dg*^Aya%}wp%f|W-rzO8y=cd@j(Wyty zWo8{7Q@lmI-+Aw4O|MA-pzdo7u!ucT}9~2k|v$ER`s0jbE;6*KyFv6BfVg zy?GPke*I}JtmGuB-Bo73Vp!><0_>`Qd(sPGi_Bz3WHQJ5WV@t-o@hv_nl1OW&vHG- zoYByV=hJq)}9*VPc=r8fo8H_fYOk!QsJHT_2LPH||6NGga24mE|AFL(Vxi zkrs8OdEn@bccFb@yVoERJw}c8pb=Pu4R=Ku_qr0SJEdB&=?0WWaTA%X$XTsuv6X9* zzQe^MW)vWh8V$O`t?i{$kjv>^Xjx>e%{rOD>DokEeZ*P%1l$&47clAGy;%RGD-x-` z(cU&5X^c92_vm?I0eD%m^9R!R%^Dao%}=GsmtxA6|19_} zw&&ox=XR&k{NJ4&7qn|gE7(N%L_S|B`%6EiTiYgSVX{McAkr*O`~{U%DcMTiZY>~A z#;J|9Usl=$Y-uE?@~JGhX!4)D())$f|J-<9ISQ5BOHx+O!YvTaXc*--%9c5ltG7&_ z*pcsFndJ<>_bUpFI1Q0o{^T%-99n%U5NGyV1Owmu673%dCiy-Bl-fZUjbiKl_Qpkn zG}N17hgEZ%u(h+z4B)XJ9m~%p#QkRg?ZGnv!mhv=5aU}&0`D~-(^|S(7QC~Ee^dq@ zT-Es2O#}K3343$_1VZW^cWZQLNDBaL%cWH79w*yRPgI`>1v3I81zHxt{GtU2^z0Gm z-g5wlH+?TQQ1P+Sf3AUKbpezVa1nk?9@N2TjnDov+NZ;^^8%>W^`Dm2D+B@lx`U;t zc*Gk(77Kje-xfd$C{=<$e43g}8+Q$$0KNFXt^XY80t)EsJTGCg_VgnCuD8gGcY^;` zsuMZaH3$8!5MkutmO_93^iRJ4(sPgu=uZ(4FXQM#8MD$m7a)BqbN}g>CHHqA(5G7{ zR-ogr|LJ)3>-iK4C38jI=)p4qOiTz`D=J##pBsz(t@$N8Le?7vAkgv4K<@9qjT}I; z0Uft@!(z*=R&J>L^MBbOz=#HcLd&sD+zQ12p>yLb{_o2B=K~_=f*POz<+TNXIfHn> z`_KQGAAf!=1d^@jn%PIm&prompJQ6O4=Q5*ts{4n1VFKY`q$m_bYY(2Zw&sK9Eb{_ zSAhboM}hiLuvGy0@_#(io9D&lK(^LCvzJs1viowlb&>y;4`gONe@*&_hs)S4LSRWW z{|x1?3ktH;tK0ND^aqj|^Pa1!XS##`US|1wOTEeD|Lb9f!p>{K_2lMM-jnED;yOLx z$~4vapKcHSj{iNwQ5`@vBi{R;%g$^2{iji#=j#IUb2MY$Fm|^7&(9hP5KKU_kdB0APJjhZ1W2(B9RIGq z(7#^?*;>($E&svN{cp9rP!b?b4-yFv!}4?IHM7SsvJKg&{d<-5+|C8{d>`_(xf~6R z%I7ls=Z2tAp}!r2&Wo9;Zg9sis#@s%GeTK)Ae#iry7wx`pPSprxuA4chU4Fgn^ZvA zIbir%_f@}B Date: Fri, 28 Jan 2022 14:52:39 -0800 Subject: [PATCH 24/69] Update README.md --- README.md | 78 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 28c6f3e..cb788ba 100644 --- a/README.md +++ b/README.md @@ -1,52 +1,55 @@ -# Bayesian-Torch: Bayesian neural network layers for uncertainty estimation -**[Get started](https://github.com/IntelLabs/bayesian-torch#installation)** | **[Example 
usage](https://github.com/IntelLabs/bayesian-torch#usage)** | **[Documentation](https://github.com/IntelLabs/bayesian-torch/blob/main/doc/bayesian_torch.layers.md)** | **[License](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE)** | **[Citing](https://github.com/IntelLabs/bayesian-torch#citing)** +
-### Bayesian layers and utilities to perform stochastic variational inference in PyTorch + +

+A library for Bayesian neural network layers and uncertainty estimation in Deep Learning +

-Bayesian-Torch is a library of neural network layers and utilities extending the core of PyTorch to enable the user to perform stochastic variational inference in Bayesian deep neural networks. -Bayesian-Torch is designed to be flexible and seamless in extending a deterministic deep neural network architecture to corresponding Bayesian form by simply replacing the deterministic layers with Bayesian layers. +[![python](https://img.shields.io/badge/python-3.7%2B-blue)]() +[![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)]() +[![version](https://img.shields.io/badge/release-0.2.0-green)]() +[![license](https://img.shields.io/badge/license-BSD%203--Clause-darkblue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE) +

+ Get Started | + Example usage | + Documentation | + Citing +

+
-The repository has implementations for the following Bayesian layers: -- [x] **[Variational layers with reparameterized Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)** [[Blundell et al. 2015](https://arxiv.org/abs/1505.05424)] +___ + +Bayesian-Torch is a library of neural network layers and utilities extending the core of PyTorch to enable Bayesian inference in deep learning models to quantify principled uncertainty estimates in model predictions. + +## Overview +Bayesian-Torch is designed to be flexible and enables seamless extension of deterministic deep neural network model to corresponding Bayesian form by simply replacing the deterministic layers with Bayesian layers. It enables user to perform stochastic variational inference in deep neural networks. + +**Bayesian layers:** + +* **[Variational layers with reparameterized Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)** [[Blundell et al. 2015](https://arxiv.org/abs/1505.05424)] - LinearVariational - Conv1dVariational, Conv2dVariational, Conv3dVariational, ConvTranspose1dVariational, ConvTranspose2dVariational, ConvTranspose3dVariational - LSTMVariational + LinearReparameterization + Conv1dReparameterization, Conv2dReparameterization, Conv3dReparameterization, ConvTranspose1dReparameterization, ConvTranspose2dReparameterization, ConvTranspose3dReparameterization + LSTMReparameterization -- [x] **[Variational layers with Flipout Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers)** [[Wen et al. 2018](https://arxiv.org/abs/1803.04386)] +* **[Variational layers with Flipout Monte Carlo estimators](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers)** [[Wen et al. 2018](https://arxiv.org/abs/1803.04386)] LinearFlipout Conv1dFlipout, Conv2dFlipout, Conv3dFlipout, ConvTranspose1dFlipout, ConvTranspose2dFlipout, ConvTranspose3dFlipout LSTMFlipout - - - - -Other features include: -- [x] [dnn_to_bnn()](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. -- [x] [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): Specifying weight priors and variational posteriors in Bayesian neural networks with Empirical Bayes [[Krishnan et al. 2020](https://ojs.aaai.org/index.php/AAAI/article/view/5875)] -- [x] [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] +**Key features:** +* [dnn_to_bnn()](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/models/dnn_to_bnn.py#L127): An API to convert deterministic deep neural network (dnn) model of any architecture to Bayesian deep neural network (bnn) model, simplifying the model definition i.e. 
drop-in replacements of Convolutional, Linear and LSTM layers to corresponding Bayesian layers. This will enable seamless conversion of existing topology of larger models to Bayesian deep neural network models for extending towards uncertainty-aware applications. +* [MOPED](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/util.py#L72): Specifying weight priors and variational posteriors in Bayesian neural networks with Empirical Bayes [[Krishnan et al. 2020](https://ojs.aaai.org/index.php/AAAI/article/view/5875)] +* [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] -## Installation +## Installing Bayesian-Torch + ## Usage There are two ways to build Bayesian deep neural networks using Bayesian-Torch: -1. Convert an existing deterministic deep neural network (dnn) model to Bayesian deep neural network (bnn) model with dnn_to_bnn() -2. Define your custom model using the Bayesian layers ([Flipout](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers) or [Reparameterization](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers)) +1. Convert an existing deterministic deep neural network (dnn) model to Bayesian deep neural network (bnn) model with dnn_to_bnn() API +2. Define your custom model using the Bayesian layers ([Reparameterization](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/variational_layers) or [Flipout](https://github.com/IntelLabs/bayesian-torch/tree/main/bayesian_torch/layers/flipout_layers)) (1) For instance, building Bayesian-ResNet18 from torchvision deterministic ResNet18 model is as simple as: ``` @@ -92,7 +96,7 @@ const_bnn_prior_parameters = { model = torchvision.models.resnet18() dnn_to_bnn(model, const_bnn_prior_parameters) ``` -To use MOPED method, setting the prior and initializing variational parameters from a pretrained deterministic model (helps training convergence of larger models): +To use MOPED method i.e. setting the prior and initializing variational parameters from a pretrained deterministic model (helps training convergence of larger models): ``` const_bnn_prior_parameters = { "prior_mu": 0.0, @@ -234,7 +238,7 @@ MOdel Priors with Empirical Bayes using DNN (MOPED) } ``` -This code is intended for researchers and developers, enables to quantify principled uncertainty estimates from deep neural network predictions using stochastic variational inference in Bayesian neural networks. +This library and code is intended for researchers and developers, enables to quantify principled uncertainty estimates from deep learning model predictions using stochastic variational inference in Bayesian neural networks. Feedbacks, issues and contributions are welcome. Email to for any questions. 
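For reference, a minimal sketch of the two usage modes described in the README hunks of this patch. Only `dnn_to_bnn(model, const_bnn_prior_parameters)`, the `"prior_mu"` key, `torchvision.models.resnet18()`, the module path `bayesian_torch/models/dnn_to_bnn.py`, and the layer names come from the patch text; the remaining dictionary keys, the `get_kl_loss()` helper, the import paths, and the `(output, kl)` return signature of the layers are assumptions about the library API rather than text from this patch series.

```python
# Hypothetical sketch of approach (1): converting a deterministic torchvision
# ResNet18 to a Bayesian model with dnn_to_bnn(). Keys other than "prior_mu"
# and the get_kl_loss() helper are assumed, not quoted from the patch.
import torch
import torchvision
from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn, get_kl_loss

const_bnn_prior_parameters = {
    "prior_mu": 0.0,               # mean of the Gaussian weight prior (shown in the patch)
    "prior_sigma": 1.0,            # std of the weight prior (assumed)
    "posterior_mu_init": 0.0,      # init of the variational posterior mean (assumed)
    "posterior_rho_init": -3.0,    # init of rho, with sigma = log(1 + exp(rho)) (assumed)
    "type": "Reparameterization",  # or "Flipout" (assumed key name and values)
    "moped_enable": False,         # True: MOPED init from a pretrained model (assumed)
    "moped_delta": 0.5,            # MOPED scaling of the pretrained weights (assumed)
}

model = torchvision.models.resnet18()           # deterministic DNN
dnn_to_bnn(model, const_bnn_prior_parameters)   # replace layers in place with Bayesian layers

x = torch.randn(2, 3, 224, 224)
logits = model(x)                               # one stochastic forward pass
kl = get_kl_loss(model)                         # accumulated KL divergence (assumed helper)
loss = torch.nn.functional.cross_entropy(logits, torch.tensor([0, 1])) + kl / 2.0  # KL scaled by batch size
loss.backward()
```

To use the MOPED option described above, the deterministic model would first be loaded with pretrained weights before calling `dnn_to_bnn()` with the MOPED flags enabled; the flag names shown here are assumptions.

A sketch of approach (2), defining a custom model directly from the Bayesian layers listed in this patch; the constructor arguments mirror `torch.nn.Linear`, and the import path and the `(output, kl)` return value are assumptions:

```python
import torch
import torch.nn as nn
from bayesian_torch.layers import LinearReparameterization  # assumed import path

class SmallBNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = LinearReparameterization(in_features=784, out_features=128)
        self.fc2 = LinearReparameterization(in_features=128, out_features=10)

    def forward(self, x):
        x, kl1 = self.fc1(x)        # each Bayesian layer returns (output, kl) -- assumed
        x = torch.relu(x)
        x, kl2 = self.fc2(x)
        return x, kl1 + kl2

out, kl = SmallBNN()(torch.randn(4, 784))
```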
From a081d57c5c30de738c9c2dd411690f1218adb5f7 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Fri, 28 Jan 2022 14:58:11 -0800 Subject: [PATCH 25/69] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index cb788ba..75e9363 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,9 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep Learning -[![python](https://img.shields.io/badge/python-3.7%2B-blue)]() -[![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)]() -[![version](https://img.shields.io/badge/release-0.2.0-green)]() +[![python](https://img.shields.io/badge/python-3.7%2B-blue)](https://github.com/IntelLabs/bayesian-torch) +[![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)](https://github.com/IntelLabs/bayesian-torch) +[![version](https://img.shields.io/badge/release-0.2.0-green)](https://github.com/IntelLabs/bayesian-torch/releases) [![license](https://img.shields.io/badge/license-BSD%203--Clause-darkblue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE)

Get Started | From f0207c0273cea2402def384c859a1cbbaf016525 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Fri, 28 Jan 2022 15:02:13 -0800 Subject: [PATCH 26/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 75e9363..77f65bf 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep [![version](https://img.shields.io/badge/release-0.2.0-green)](https://github.com/IntelLabs/bayesian-torch/releases) [![license](https://img.shields.io/badge/license-BSD%203--Clause-darkblue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE)

- Get Started | + Get Started | Example usage | Documentation | Citing From 3793ce523b261740c635e7e0597854eb8ffe1fe5 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Fri, 28 Jan 2022 15:13:58 -0800 Subject: [PATCH 27/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 77f65bf..075c8f9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
- +

A library for Bayesian neural network layers and uncertainty estimation in Deep Learning

From b71fc1790a3fe58080c73b2145dafce2748aa3d3 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Fri, 28 Jan 2022 15:14:29 -0800 Subject: [PATCH 28/69] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2ba22c1..ae08428 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ name = "bayesian-torch", packages = find_packages(), version = "0.2.0", - description = "Bayesian-Torch: Bayesian neural network layers for uncertainty estimation", + description = "A library for Bayesian neural network layers and uncertainty estimation in Deep Learning", author = "Intel Labs", author_email = "ranganath.krishnan@intel.com", url = "https://github.com/IntelLabs/bayesian-torch", From 5019bf9e8f4f5f501ca8857e22b50a734487dce6 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Mon, 31 Jan 2022 10:20:59 -0800 Subject: [PATCH 29/69] release to PyPI, update install instruction through "pip" command --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 075c8f9..df8d4c7 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,12 @@ Bayesian-Torch is designed to be flexible and enables seamless extension of dete * [AvUC](https://github.com/IntelLabs/bayesian-torch/blob/main/bayesian_torch/utils/avuc_loss.py): Accuracy versus Uncertainty Calibration loss [[Krishnan and Tickoo 2020](https://proceedings.neurips.cc/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf)] ## Installing Bayesian-Torch - + **To install latest development version from source:** ```sh git clone https://github.com/IntelLabs/bayesian-torch From 7e8d246c2f2c2fd719eb7753345708e9b7cb84d8 Mon Sep 17 00:00:00 2001 From: Michael Beale Date: Mon, 31 Jan 2022 10:28:35 -0800 Subject: [PATCH 30/69] Switched to permanent URL for the top image. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index df8d4c7..999023c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
- +

A library for Bayesian neural network layers and uncertainty estimation in Deep Learning

From bf8c3e37f273a705955d091653360fa6526553d4 Mon Sep 17 00:00:00 2001 From: Michael Beale Date: Mon, 31 Jan 2022 11:15:29 -0800 Subject: [PATCH 31/69] changing to raw.githubusercontent.com Url for top image. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 999023c..65bfe1b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
- +

A library for Bayesian neural network layers and uncertainty estimation in Deep Learning

From 3ca0190a6659a1760e30773bdec8ae3c32ec9df3 Mon Sep 17 00:00:00 2001 From: Michael Beale Date: Mon, 31 Jan 2022 11:15:54 -0800 Subject: [PATCH 32/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 65bfe1b..0ed9e67 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
- +

A library for Bayesian neural network layers and uncertainty estimation in Deep Learning

From a8cb7ebd13f55621e929180d484038c60a3a1738 Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Mon, 31 Jan 2022 04:33:22 -0800 Subject: [PATCH 33/69] update links and release number for PyPI documentation Signed-off-by: Ranganath Krishnan --- README.md | 6 ++---- setup.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0ed9e67..833acc5 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep [![python](https://img.shields.io/badge/python-3.7%2B-blue)](https://github.com/IntelLabs/bayesian-torch) [![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)](https://github.com/IntelLabs/bayesian-torch) -[![version](https://img.shields.io/badge/release-0.2.0-green)](https://github.com/IntelLabs/bayesian-torch/releases) -[![license](https://img.shields.io/badge/license-BSD%203--Clause-darkblue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE) +[![version](https://img.shields.io/badge/release-0.2.1-green)](https://github.com/IntelLabs/bayesian-torch/releases) +[![license](https://img.shields.io/badge/license-BSD%203--Clause-blue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE)

Get Started | Example usage | @@ -240,5 +240,3 @@ MOdel Priors with Empirical Bayes using DNN (MOPED) This library and code is intended for researchers and developers, enables to quantify principled uncertainty estimates from deep learning model predictions using stochastic variational inference in Bayesian neural networks. Feedbacks, issues and contributions are welcome. Email to for any questions. - - diff --git a/setup.py b/setup.py index ae08428..5a02fb8 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name = "bayesian-torch", packages = find_packages(), - version = "0.2.0", + version = "0.2.1", description = "A library for Bayesian neural network layers and uncertainty estimation in Deep Learning", author = "Intel Labs", author_email = "ranganath.krishnan@intel.com", From a5750c7c5bba7e5dd1c11464d4e96ead38cae2ee Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 9 Feb 2022 12:56:15 -0800 Subject: [PATCH 34/69] Update README.md add downloads statistics badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 833acc5..6e428bd 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep [![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)](https://github.com/IntelLabs/bayesian-torch) [![version](https://img.shields.io/badge/release-0.2.1-green)](https://github.com/IntelLabs/bayesian-torch/releases) [![license](https://img.shields.io/badge/license-BSD%203--Clause-blue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE) +[![Downloads](https://pepy.tech/badge/bayesian-torch/month)](https://pepy.tech/project/bayesian-torch)

Get Started | Example usage | From 789657dd753657d707f23289609f2c523eff99ff Mon Sep 17 00:00:00 2001 From: Ranganath Krishnan Date: Wed, 2 Mar 2022 18:58:48 -0800 Subject: [PATCH 35/69] update download count badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6e428bd..e56245a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep [![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)](https://github.com/IntelLabs/bayesian-torch) [![version](https://img.shields.io/badge/release-0.2.1-green)](https://github.com/IntelLabs/bayesian-torch/releases) [![license](https://img.shields.io/badge/license-BSD%203--Clause-blue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE) -[![Downloads](https://pepy.tech/badge/bayesian-torch/month)](https://pepy.tech/project/bayesian-torch) +[![Downloads](https://static.pepy.tech/personalized-badge/bayesian-torch?period=total&units=international_system&left_color=grey&right_color=darkblue&left_text=downloads)](https://pepy.tech/project/bayesian-torch)

Get Started | Example usage | From 5802ef9a2730d5b3d9f93081d7f1e2267dd8bab8 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 13 Nov 2022 21:22:51 -0500 Subject: [PATCH 36/69] implement quantized convolution variational layers --- .../quantize_conv_variational.py | 994 ++++++++++++++++++ 1 file changed, 994 insertions(+) create mode 100644 bayesian_torch/layers/variational_layers/quantize_conv_variational.py diff --git a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py new file mode 100644 index 0000000..c7eafc0 --- /dev/null +++ b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py @@ -0,0 +1,994 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# @authors: Jun-Liang Lin +# +# ====================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter +from ..base_variational_layer import BaseVariationalLayer_ +from .conv_variational import * +import math + +__all__ = [ + 'QuantizedConv1dReparameterization', + 'QuantizedConv2dReparameterization', + 'QuantizedConv3dReparameterization', + 'QuantizedConvTranspose1dReparameterization', + 'QuantizedConvTranspose2dReparameterization', + 'QuantizedConvTranspose3dReparameterization', +] + + +class QuantizedConv1dReparameterization(Conv1dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(QuantizedConv1dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
+ ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv1d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv1d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + + return out, 0 # disable kl divergence computing + + +class QuantizedConv2dReparameterization(Conv2dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + + """ + + super(QuantizedConv2dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
+ ## TODO: rename variables + if self.bias: + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + + return out, 0 # disable kl divergence computing + + +class QuantizedConv3dReparameterization(Conv3dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(QuantizedConv3dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
+ ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv3d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv3d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + + return out, 0 # disable kl divergence computing + +class QuantizedConvTranspose1dReparameterization(ConvTranspose1dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose1dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
+ ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose1d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose1d(input, self._packed_params, scale=0.1, zero_point=128) + + + return out, 0 # disable kl divergence computing + +class QuantizedConvTranspose2dReparameterization(ConvTranspose2dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose2dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
+ ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose2d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
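+ # (The quantized mul/add above already formed the int8 weight sample mu + sigma * eps;
+ # only the bias handled below stays in fp32, as noted.)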
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose2d(input, self._packed_params, scale=0.1, zero_point=128) + + + return out, 0 # disable kl divergence computing + +class QuantizedConvTranspose3dReparameterization(ConvTranspose3dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose3dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + # symmetric quantization + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) + scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([0.1]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. 
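+ ## Note: the quantized parameters are stored as frozen CPU tensors (requires_grad=False);
+ ## PyTorch's eager-mode quantized kernels generally run on CPU backends (e.g. fbgemm/qnnpack).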
+ ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, mode=2): + + if mode==1: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose3d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
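+ # (On the eps quantization above: eps ~ N(0, 1), so about 99.7% of samples fall in [-3, 3];
+ # mapping that width-6 range onto 255 int8 levels is what gives the 6/255 scale.)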
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose3d(input, self._packed_params, scale=0.1, zero_point=128) + + + return out, 0 # disable kl divergence computing \ No newline at end of file From d910ae8bf1b7a603c89991699f4c6909d86d1c5a Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Tue, 29 Nov 2022 21:12:28 -0500 Subject: [PATCH 37/69] replace hardcoded variables with function parameters and add comments --- .../quantize_conv_variational.py | 540 +++++++++++++++--- 1 file changed, 468 insertions(+), 72 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py index c7eafc0..59470df 100644 --- a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py @@ -94,19 +94,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -165,9 +199,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. 
return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. - if mode==1: # Deprecated. Use this method for reducing model size only. + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -182,7 +248,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. @@ -198,10 +264,10 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format out = torch.nn.quantized.functional.conv1d(input, weight, bias, self.stride, self.padding, - self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 return out, 0 # disable kl divergence computing @@ -250,19 +316,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -321,9 +421,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. - if mode==1: # Deprecated. Use this method for reducing model size only. + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -338,7 +470,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. 
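+ # Scale bookkeeping: the quantized multiply uses the product of the operand scales as its output scale,
+ # and the following add reuses the larger operand scale, so the sampled weight keeps the coarser of the
+ # two quantization grids.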
@@ -354,10 +486,10 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format out = torch.nn.quantized.functional.conv2d(input, weight, bias, self.stride, self.padding, - self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 return out, 0 # disable kl divergence computing @@ -405,19 +537,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -476,9 +642,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. - if mode==1: # Deprecated. Use this method for reducing model size only. + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -493,7 +691,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. @@ -509,10 +707,10 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format out = torch.nn.quantized.functional.conv3d(input, weight, bias, self.stride, self.padding, - self.dilation, self.groups, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 return out, 0 # disable kl divergence computing @@ -559,19 +757,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -630,9 +862,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. - if mode==1: # Deprecated. Use this method for reducing model size only. + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -648,7 +912,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. 
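+ # Note that eps_kernel.data.normal_() above re-samples the noise in place on every call, so each
+ # forward pass draws a fresh Monte Carlo weight even though mu and sigma are frozen after quantize().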
@@ -664,13 +928,13 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - out = torch.ops.quantized.conv_transpose1d(input, self._packed_params, scale=0.1, zero_point=128) + out = torch.ops.quantized.conv_transpose1d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) return out, 0 # disable kl divergence computing @@ -718,19 +982,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -789,9 +1087,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. - if mode==1: # Deprecated. Use this method for reducing model size only. + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -807,7 +1137,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. @@ -823,13 +1153,13 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - out = torch.ops.quantized.conv_transpose2d(input, self._packed_params, scale=0.1, zero_point=128) + out = torch.ops.quantized.conv_transpose2d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) return out, 0 # disable kl divergence computing @@ -877,19 +1207,53 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): - # symmetric quantization + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # scale = torch.zeros(1).to(x.device) # initialize zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization - xmax = torch.clamp(x.abs().max(), -100, 100) # determine and restrict the maximum value (select 100 empirically) - scale = xmax*2/255 # original range divided by target range (int8, -128 to 127) - + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range return scale, zero_point - def get_quantized_tensor(self, x): + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) if scale == 0: - scale = torch.tensor([0.1]) # avoid zero scale + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x @@ -948,9 +1312,41 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, mode=2): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. - if mode==1: # Deprecated. Use this method for reducing model size only. + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -966,7 +1362,7 @@ def forward(self, input, mode=2): self.dilation, self.groups) else: - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), 6/255, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. 
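+ # The default_scale=0.1 and default_zero_point=128 used for the activations below are the fixed
+ # defaults found by grid search (see the docstring); they are not calibrated per input.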
@@ -982,13 +1378,13 @@ def forward(self, input, mode=2): bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) if input.dtype!=torch.quint8: # check if input has been quantized - input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(weight, bias, self.stride, self.padding, self.output_padding, self.dilation, self.groups) - out = torch.ops.quantized.conv_transpose3d(input, self._packed_params, scale=0.1, zero_point=128) + out = torch.ops.quantized.conv_transpose3d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) return out, 0 # disable kl divergence computing \ No newline at end of file From 878c3f2200e1a51e2ee9cb1f404c12d02ad760b2 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 4 Dec 2022 22:54:37 -0500 Subject: [PATCH 38/69] implement quantized linear variational layer --- bayesian_torch/layers/variational_layers/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bayesian_torch/layers/variational_layers/__init__.py b/bayesian_torch/layers/variational_layers/__init__.py index 1c083e3..fa39917 100644 --- a/bayesian_torch/layers/variational_layers/__init__.py +++ b/bayesian_torch/layers/variational_layers/__init__.py @@ -1,3 +1,6 @@ from .linear_variational import * from .conv_variational import * from .rnn_variational import * +# from .quantize_linear_variational import * +from .quantize_conv_variational import * +# from .quantize_rnn_variational import * \ No newline at end of file From 9c1493a363a0c46db6c556c7c683f984a04ec12f Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 4 Dec 2022 22:56:13 -0500 Subject: [PATCH 39/69] update init file --- bayesian_torch/layers/variational_layers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bayesian_torch/layers/variational_layers/__init__.py b/bayesian_torch/layers/variational_layers/__init__.py index fa39917..6fae454 100644 --- a/bayesian_torch/layers/variational_layers/__init__.py +++ b/bayesian_torch/layers/variational_layers/__init__.py @@ -1,6 +1,6 @@ from .linear_variational import * from .conv_variational import * from .rnn_variational import * -# from .quantize_linear_variational import * +from .quantize_linear_variational import * from .quantize_conv_variational import * # from .quantize_rnn_variational import * \ No newline at end of file From 7a89b6deb8987790145cf5cc7cdfe29914743344 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 4 Dec 2022 22:59:18 -0500 Subject: [PATCH 40/69] quantized linear variational layer --- .../quantize_linear_variational.py | 199 ++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 bayesian_torch/layers/variational_layers/quantize_linear_variational.py diff --git a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py new file mode 100644 index 0000000..a1ce3fd --- /dev/null +++ b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py @@ -0,0 +1,199 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ====================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Module, Parameter +from ..base_variational_layer import BaseVariationalLayer_ +import math +from .linear_variational import LinearReparameterization + + + +class QuantizedLinearReparameterization(LinearReparameterization): + def __init__(self, + in_features, + out_features): + """ + + """ + super(QuantizedLinearReparameterization, self).__init__( + in_features, + out_features) + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_weight), requires_grad=False) + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_weight))), requires_grad=False) + delattr(self, "mu_weight") + delattr(self, "rho_weight") + + self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + delattr(self, "mu_bias") + delattr(self, "rho_bias") + + def dequantize(self): # Deprecated + self.mu_weight = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
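+ # fp32 fallback: dequantize the stored int8 parameters once, then sample in fp32.
+ # This path only reduces the saved model size; it does not use the int8 kernels.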
+ if not self.is_dequant: + self.dequantize() + self.is_dequant = True + weight = self.mu_weight + (self.sigma_weight * self.eps_weight.data.normal_()) + bias = None + if self.sigma_bias is not None: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.linear(input, weight, bias) + + else: + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + if self.quantized_sigma_bias is not None: + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + if input.dtype!=torch.quint8: + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) + + out = torch.nn.quantized.functional.linear(input, weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + out = out.dequantize() + + return out, 0 # kl=0 From 3ea51a585d49b6e057f817a85629d141c51f52a1 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 4 Dec 2022 23:40:35 -0500 Subject: [PATCH 41/69] quantized conv flipout layers --- .../layers/flipout_layers/__init__.py | 3 + .../flipout_layers/quantized_conv_flipout.py | 661 ++++++++++++++++++ 2 files changed, 664 insertions(+) create mode 100644 bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py diff --git a/bayesian_torch/layers/flipout_layers/__init__.py b/bayesian_torch/layers/flipout_layers/__init__.py index 3aeb698..fda0925 100644 --- a/bayesian_torch/layers/flipout_layers/__init__.py +++ b/bayesian_torch/layers/flipout_layers/__init__.py @@ -1,3 +1,6 @@ from .conv_flipout import * from .linear_flipout import * from .rnn_flipout import * +# from .quantized_linear_flipout import * +from .quantized_conv_flipout import * +# from .quantize_rnn_flipout import * \ No newline at end of file diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py new file mode 100644 index 0000000..45eacfe --- /dev/null +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -0,0 +1,661 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# +# Convolutional layers with flipout Monte Carlo weight estimator to perform +# variational inference in Bayesian neural networks. Variational layers +# enables Monte Carlo approximation of the distribution over the kernel +# +# +# ====================================================================================== +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter +from ..base_variational_layer import BaseVariationalLayer_ +from .conv_flipout import * + +from torch.distributions.normal import Normal +from torch.distributions.uniform import Uniform + +__all__ = [ + 'QuantizedConv1dFlipout', + 'QuantizedConv2dFlipout', + 'QuantizedConv3dFlipout', + # 'QuantizedConvTranspose1dFlipout', + # 'QuantizedConvTranspose2dFlipout', + # 'QuantizedConvTranspose3dFlipout', +] + + +class QuantizedConv1dFlipout(Conv1dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConv1dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
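+
+ Note
+ ----------
+ A descriptive note on the estimator: the output is conv(x, mu) plus a flipout
+ perturbation conv(x * sign_input, sigma * eps) * sign_output, where sign_input and
+ sign_output are random +/-1 tensors, so weight noise is decorrelated across the
+ examples in a batch.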
+ + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv1d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv1d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 + + +class QuantizedConv2dFlipout(Conv2dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): # be aware of bias + """ + + """ + super(QuantizedConv2dFlipout, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. 
+ + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv2d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv2d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 + + +class QuantizedConv3dFlipout(Conv3dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConv3dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. 
+ + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv3d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv3d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 From 5b696ae7c36ca51842f91a331baca63c870c8a4f Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 5 Dec 2022 14:01:19 -0500 Subject: [PATCH 42/69] quantized linear flipout layer --- .../quantized_linear_flipout.py | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py diff --git a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py new file mode 100644 index 0000000..1449428 --- /dev/null +++ b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py @@ -0,0 +1,136 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# +# Linear Flipout Layers with flipout weight estimator to perform +# variational inference in Bayesian neural networks. Variational layers +# enables Monte Carlo approximation of the distribution over the weights +# +# @authors: Ranganath Krishnan, Piero Esposito +# +# ====================================================================================== +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Module, Parameter +from torch.distributions.normal import Normal +from torch.distributions.uniform import Uniform + +from .linear_flipout import LinearFlipout + +__all__ = ["QuantizedLinearFlipout"] + +class QuantizedLinearFlipout(LinearFlipout): + def __init__(self, + in_features, + out_features): + + super(QuantizedLinearFlipout, self).__init__( + in_features, + out_features) + + self.is_dequant = False + + def get_scale_and_zero_point(self, x): + + # symmetry + scale = torch.zeros(1).to(x.device) + zero_point = torch.zeros(1).to(x.device) + xmax = torch.clamp(x.abs().max(), -100, 100) + scale = xmax*2/255 + + return scale, zero_point + + def get_quantized_tensor(self, x): + scale, zero_point = self.get_scale_and_zero_point(x) + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + # int8_x = dequantized_x*scale.to(torch.int8) + + return dequantized_x + + + def quantize(self): + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_weight), requires_grad=False) + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_weight))), requires_grad=False) + delattr(self, "mu_weight") + delattr(self, "rho_weight") + + self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + delattr(self, "mu_bias") + delattr(self, "rho_bias") + + def dequantize(self): + self.mu_weight = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + return + + def forward(self, x): + + bias = None + if self.quantized_mu_bias is not None: + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + bias = self.mu_bias + + outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = 
torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, 1, 128, torch.quint8) # scale? + sign_output = torch.quantize_per_tensor(sign_output, 1, 128, torch.quint8) # scale? + + # getting perturbation weights + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), 6/255, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) + delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) + + bias = None + if self.quantized_sigma_bias is not None: + eps_bias = self.eps_bias.data.normal_() + bias = (self.sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, 0.1, 128) + + perturbed_outputs = torch.nn.quantized.functional.linear(x, + weight=delta_weight, bias=bias, scale=0.1, zero_point=128) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, 0.1, 128) + out = torch.ops.quantized.add(outputs, perturbed_outputs, 0.1, 128) + out = out.dequantize() + + return out, 0 From 33986eb8c5b9c8303ac6f52e26040905ee421ab5 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 5 Dec 2022 14:02:34 -0500 Subject: [PATCH 43/69] template for quantized transposed conv1d flipout layer --- .../flipout_layers/quantized_conv_flipout.py | 201 ++++++++++++++++++ 1 file changed, 201 insertions(+) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 45eacfe..972dad7 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -659,3 +659,204 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) return out, 0 + +class QuantizedConvTranspose1dFlipout(ConvTranspose1dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose1dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. 
+ + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
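+
+        Notes
+        ----------
+        The quantized ops in the body below implement the flipout estimator;
+        ignoring quantization error and the bias terms, the computation
+        reduces to (sketch):
+
+            delta_kernel = sigma_kernel * eps_kernel        # eps ~ N(0, 1)
+            out = conv_transpose1d(x, mu_kernel) \
+                  + sign_out * conv_transpose1d(x * sign_in, delta_kernel)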
+ + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + + outputs = torch.nn.quantized.functional.conv1d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv1d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 \ No newline at end of file From 3e39d223972713d5f5b743afa4a32400b8ef55fd Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 7 Dec 2022 10:32:34 -0500 Subject: [PATCH 44/69] quantized flipout layers --- .../flipout_layers/quantized_conv_flipout.py | 417 +++++++++++++++++- .../quantized_linear_flipout.py | 102 ++++- 2 files changed, 494 insertions(+), 25 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 972dad7..2414cd6 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -691,6 +691,9 @@ def __init__(self, self.is_dequant = False + if not hasattr(self, "output_padding"): + self.output_padding = 0 + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -828,12 +831,210 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 self.padding, self.output_padding, self.dilation, self.groups) - out = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + outputs = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + 
sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) - outputs = torch.nn.quantized.functional.conv1d(x, self.quantized_mu_weight, bias, self.stride, self.padding, - self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 + +class QuantizedConvTranspose2dFlipout(ConvTranspose2dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose2dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + if not hasattr(self, "output_padding"): + self.output_padding = 0 + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
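+
+        Notes
+        ----------
+        When this layer was created with conv+BN fusion, ``quantize()`` folds
+        the BatchNorm statistics into the variational parameters before they
+        are quantized; roughly (sketch of the folding above):
+
+            bn_coef     = bn_weight / sqrt(bn_running_var + bn_eps)
+            mu_fused    = mu_kernel * bn_coef                 # broadcast per output channel
+            sigma_fused = log1p(exp(rho_kernel)) * bn_coef    # softplus(rho) * bn_coef
+            bias_fused  = (mu_bias - bn_running_mean) * bn_coef + bn_bias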
+ + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + outputs = torch.ops.quantized.conv_transpose2d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + # sampling perturbation signs sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() @@ -853,9 +1054,213 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 # perturbed feedforward x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) - perturbed_outputs = torch.nn.quantized.functional.conv1d(x, - weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, - dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose2d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + return out, 0 + +class QuantizedConvTranspose3dFlipout(ConvTranspose3dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose3dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + if not hasattr(self, "output_padding"): + self.output_padding = 0 + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
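+
+        Notes
+        ----------
+        The weight scales come from the symmetric scheme in
+        ``get_scale_and_zero_point``. As a worked illustration (the value 2.0
+        is assumed, not taken from a real checkpoint):
+
+            xmax       = min(abs(W).max(), 100)   # e.g. 2.0
+            scale      = xmax * 2 / 255           # ~= 0.0157
+            zero_point = 0                        # symmetric quantization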
+ + + """ + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + outputs = torch.ops.quantized.conv_transpose3d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose3d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) diff --git a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py index 1449428..9673242 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py @@ -30,7 +30,7 @@ # variational inference in Bayesian neural networks. Variational layers # enables Monte Carlo approximation of the distribution over the weights # -# @authors: Ranganath Krishnan, Piero Esposito +# @authors: Jun-Liang Lin # # ====================================================================================== import torch @@ -55,25 +55,59 @@ def __init__(self, self.is_dequant = False - def get_scale_and_zero_point(self, x): + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point - # symmetry - scale = torch.zeros(1).to(x.device) - zero_point = torch.zeros(1).to(x.device) - xmax = torch.clamp(x.abs().max(), -100, 100) - scale = xmax*2/255 + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. - return scale, zero_point - def get_quantized_tensor(self, x): + Returns + ---------- + quantized_x: tensors + + + """ scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) return quantized_x def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() - # int8_x = dequantized_x*scale.to(torch.int8) return dequantized_x @@ -97,7 +131,37 @@ def dequantize(self): self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) return - def forward(self, x): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. Already dequantized. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ bias = None if self.quantized_mu_bias is not None: @@ -106,16 +170,16 @@ def forward(self, x): self.is_dequant = True bias = self.mu_bias - outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=0.1, zero_point=128) # input: quint8, weight: qint8, bias: fp32 + outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 # sampling perturbation signs sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() - sign_input = torch.quantize_per_tensor(sign_input, 1, 128, torch.quint8) # scale? - sign_output = torch.quantize_per_tensor(sign_output, 1, 128, torch.quint8) # scale? 
+ sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) # getting perturbation weights - eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), 6/255, 0, torch.qint8) + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) @@ -125,12 +189,12 @@ def forward(self, x): bias = (self.sigma_bias * eps_bias) # perturbed feedforward - x = torch.ops.quantized.mul(x, sign_input, 0.1, 128) + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) perturbed_outputs = torch.nn.quantized.functional.linear(x, - weight=delta_weight, bias=bias, scale=0.1, zero_point=128) - perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, 0.1, 128) - out = torch.ops.quantized.add(outputs, perturbed_outputs, 0.1, 128) + weight=delta_weight, bias=bias, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) out = out.dequantize() return out, 0 From 61c34079b9725542a6f85b274714b8aca76396ca Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 7 Dec 2022 10:35:50 -0500 Subject: [PATCH 45/69] update init file --- bayesian_torch/layers/flipout_layers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bayesian_torch/layers/flipout_layers/__init__.py b/bayesian_torch/layers/flipout_layers/__init__.py index fda0925..b1b18c4 100644 --- a/bayesian_torch/layers/flipout_layers/__init__.py +++ b/bayesian_torch/layers/flipout_layers/__init__.py @@ -1,6 +1,6 @@ from .conv_flipout import * from .linear_flipout import * from .rnn_flipout import * -# from .quantized_linear_flipout import * +from .quantized_linear_flipout import * from .quantized_conv_flipout import * # from .quantize_rnn_flipout import * \ No newline at end of file From f647bcf8c7ceb52f3e8b9bd128df0edb8f520717 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 7 Dec 2022 10:44:54 -0500 Subject: [PATCH 46/69] update name list --- .../layers/flipout_layers/quantized_conv_flipout.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 2414cd6..8cde630 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -46,9 +46,9 @@ 'QuantizedConv1dFlipout', 'QuantizedConv2dFlipout', 'QuantizedConv3dFlipout', - # 'QuantizedConvTranspose1dFlipout', - # 'QuantizedConvTranspose2dFlipout', - # 'QuantizedConvTranspose3dFlipout', + 'QuantizedConvTranspose1dFlipout', + 'QuantizedConvTranspose2dFlipout', + 'QuantizedConvTranspose3dFlipout', ] From 9524bc0af37d8fa4e42c38b97c77288caaf917dd Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 7 Dec 2022 10:59:01 -0500 Subject: [PATCH 47/69] bnn to qbnn conversion --- bayesian_torch/models/bnn_to_qbnn.py | 228 +++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 bayesian_torch/models/bnn_to_qbnn.py diff --git 
a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py new file mode 100644 index 0000000..6732a65 --- /dev/null +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -0,0 +1,228 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Functions related to BNN to QBNN model conversion. +# +# @authors: Jun-Liang Lin +# +# =============================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import bayesian_torch.layers as bayesian_layers +import torch +import torch.nn as nn +from torch.nn import Identity +from torch.nn.quantized import BatchNorm2d as QBatchNorm2d +from torch.nn import Module, Parameter + + +def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + +def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + +def qbnn_linear_layer(d): + layer_type = "Quantized" + d.__class__.__name__ + layer_fn = getattr(bayesian_layers, layer_type) # Get QBNN layer + qbnn_layer = layer_fn( + in_features=d.in_features, + out_features=d.out_features, + ) + qbnn_layer.__dict__.update(d.__dict__) + qbnn_layer.quantize() + return qbnn_layer + +def qbnn_conv_layer(d): + layer_type = "Quantized" + d.__class__.__name__ + layer_fn = getattr(bayesian_layers, layer_type) # Get QBNN layer + qbnn_layer = layer_fn( + in_channels=d.in_channels, + out_channels=d.out_channels, + kernel_size=d.kernel_size, + stride=d.stride, + padding=d.padding, + dilation=d.dilation, + groups=d.groups, + ) + qbnn_layer.__dict__.update(d.__dict__) + qbnn_layer.quantize() + return qbnn_layer + +def qbnn_lstm_layer(d): + layer_type = "Quantized" + d.__class__.__name__ + layer_fn = getattr(bayesian_layers, layer_type) # Get QBNN layer + qbnn_layer = layer_fn( + in_features=d.input_size, + out_features=d.hidden_size, + ) + qbnn_layer.__dict__.update(d.__dict__) + qbnn_layer.quantize() + return qbnn_layer + +def qbnn_batchnorm2d_layer(d): + layer_fn = QBatchNorm2d # Get QBNN layer + qbnn_layer = layer_fn( + num_features=d.num_features + ) + qbnn_layer.__dict__.update(d.__dict__) + # qbnn_layer.weight = Parameter(get_quantized_tensor(d.weight), requires_grad=False) + # qbnn_layer.bias = Parameter(get_quantized_tensor(d.bias), requires_grad=False) + # qbnn_layer.running_mean = Parameter(get_quantized_tensor(d.running_mean), requires_grad=False) + # qbnn_layer.running_var = Parameter(get_quantized_tensor(d.running_var), requires_grad=False) + qbnn_layer.scale = Parameter(torch.tensor([0.1]), requires_grad=False) + qbnn_layer.zero_point = Parameter(torch.tensor([128]), requires_grad=False) + return qbnn_layer + + +# batch norm folding +def batch_norm_folding(conv, bn): + layer_type = "Quantized" + conv.__class__.__name__ + layer_fn = getattr(bayesian_layers, layer_type) # Get QBNN layer + qbnn_layer = layer_fn( + in_channels=conv.in_channels, + out_channels=conv.out_channels, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + dilation=conv.dilation, + groups=conv.groups, + ) + qbnn_layer.__dict__.update(conv.__dict__) + qbnn_layer.bn_weight = bn.weight + qbnn_layer.bn_bias = bn.bias + qbnn_layer.bn_running_mean = bn.running_mean + qbnn_layer.bn_running_var = bn.running_var + qbnn_layer.bn_eps = bn.eps + qbnn_layer.quantize() + return qbnn_layer + +# replaces linear and conv layers +def bnn_to_qbnn(m, fuse_conv_bn=False): + for name, value in list(m._modules.items()): + if m._modules[name]._modules: + bnn_to_qbnn(m._modules[name], fuse_conv_bn=fuse_conv_bn) + elif "Linear" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_linear_layer(m._modules[name])) + elif "LSTM" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_lstm_layer(m._modules[name])) + else: + if fuse_conv_bn: + if 'conv1' in m._modules.keys() and 'bn1' in m._modules.keys(): + if 'Identity' not in m._modules['bn1'].__class__.__name__: + setattr(m, 'conv1', batch_norm_folding(m._modules['conv1'], m._modules['bn1'])) + setattr(m, 'bn1', Identity()) + if 'conv2' in m._modules.keys() and 'bn2' in m._modules.keys(): + if 'Identity' not in 
m._modules['bn2'].__class__.__name__: + setattr(m, 'conv2', batch_norm_folding(m._modules['conv2'], m._modules['bn2'])) + setattr(m, 'bn2', Identity()) + if 'conv3' in m._modules.keys() and 'bn3' in m._modules.keys(): + if 'Identity' not in m._modules['bn3'].__class__.__name__: + setattr(m, 'conv3', batch_norm_folding(m._modules['conv3'], m._modules['bn3'])) + setattr(m, 'bn3', Identity()) + if 'downsample' in m._modules.keys(): + if m._modules['downsample'].__class__.__name__=='Sequential' and len(m._modules['downsample'])==2: + if 'Identity' not in m._modules['downsample'][1].__class__.__name__: + m._modules['downsample'][0]=batch_norm_folding(m._modules['downsample'][0], m._modules['downsample'][1]) + m._modules['downsample'][1]=Identity() + else: + if "Conv" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_conv_layer(m._modules[name])) + + elif "Batch" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_batchnorm2d_layer(m._modules[name])) + + return + +if __name__ == "__main__": + class FusionTest(nn.Module): + def __init__(self): + super(FusionTest, self).__init__() + self.conv1 = bayesian_layers.Conv2dReparameterization(1,3,2,bias=False) + self.bn1 = nn.BatchNorm2d(3) + def forward(self, x): + x = self.conv1(x)[0] + x = self.bn1(x) + return x + m = FusionTest() + m.conv1.rho_kernel = Parameter(torch.zeros(m.conv1.rho_kernel.shape)-100) + m.eval() + print(m) + input = torch.randn(1,1,3,3) + print(m(input)) + bnn_to_qbnn(m) + print(m) + if input.dtype!=torch.quint8: + input = torch.quantize_per_tensor(input, 0.1, 128, torch.quint8) + print(m(input)) \ No newline at end of file From 3930c61944c7d83cb2987ee5699e9b5439cb6a0f Mon Sep 17 00:00:00 2001 From: Jun-Liang Lin <82939287+junliang-lin@users.noreply.github.com> Date: Mon, 6 Feb 2023 12:52:39 -0500 Subject: [PATCH 48/69] Merge remote-tracking branch (#5) * fix minor typo. Signed-off-by: Ranganath Krishnan * Update links in README.md * update MOPED layer example utility function Signed-off-by: Ranganath Krishnan * Update README.md * feat: add possibility to return no kl, save it as attribute * feat: add possibility to return no kl on flipout layers, save it as attribute * updates to support dnn to bnn imodel auto conversion * updates to support dnn to bnn imodel auto conversion * remove duplicate kl_loss definition in Conv1dReparameterization layer Signed-off-by: Ranganath Krishnan * include kl_loss() function in Convolutional flipout layers, to compute kl when 'return_kl' flag is set to False. Fix for issue#12. Signed-off-by: Ranganath Krishnan * Update README.md * Update README.md * update the posterior variational param init value Signed-off-by: Ranganath Krishnan * Update release version with dnn_to_bnn() feature * Update README.md update usage instructions in README file * Update requirements.txt * Include training, testing and uncertainty quantification snippet in README.md * update version in setup.py * Update bayesian_torch.layers.md * Update links in README.md * Update setup.py * Update README.md * include assets folder Signed-off-by: Ranganath Krishnan * Update README.md * Update README.md * Update README.md * Update README.md * Update setup.py * release to PyPI, update install instruction through "pip" command * Switched to permanent URL for the top image. * changing to raw.githubusercontent.com Url for top image. 
* Update README.md * update links and release number for PyPI documentation Signed-off-by: Ranganath Krishnan * Update README.md add downloads statistics badge * update download count badge * Added support for arbitrary kernel sizes for Bayesian Conv layers * update version number Signed-off-by: Ranganath Krishnan * Update README.md * Add support for output padding in flipout layers --------- Signed-off-by: Ranganath Krishnan Co-authored-by: Ranganath Krishnan Co-authored-by: Pi Co-authored-by: msubedar Co-authored-by: Michael Beale --- README.md | 2 +- .../layers/base_variational_layer.py | 6 + .../layers/flipout_layers/conv_flipout.py | 104 ++++++++++-------- .../variational_layers/conv_variational.py | 90 +++++++-------- bayesian_torch/models/dnn_to_bnn.py | 2 +- setup.py | 2 +- 6 files changed, 116 insertions(+), 90 deletions(-) diff --git a/README.md b/README.md index e56245a..36ebf19 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ A library for Bayesian neural network layers and uncertainty estimation in Deep [![python](https://img.shields.io/badge/python-3.7%2B-blue)](https://github.com/IntelLabs/bayesian-torch) [![pytorch](https://img.shields.io/badge/pytorch-1.7.0%2B-orange)](https://github.com/IntelLabs/bayesian-torch) -[![version](https://img.shields.io/badge/release-0.2.1-green)](https://github.com/IntelLabs/bayesian-torch/releases) +[![version](https://img.shields.io/badge/release-0.3.0-green)](https://github.com/IntelLabs/bayesian-torch/releases) [![license](https://img.shields.io/badge/license-BSD%203--Clause-blue)](https://github.com/IntelLabs/bayesian-torch/blob/main/LICENSE) [![Downloads](https://static.pepy.tech/personalized-badge/bayesian-torch?period=total&units=international_system&left_color=grey&right_color=darkblue&left_text=downloads)](https://pepy.tech/project/bayesian-torch)

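Note on the symmetric quantization used in bnn_to_qbnn.py above: get_scale_and_zero_point() clamps max|x| to an upper bound (100 by default), sets scale = 2 * max|x| / target_range, and keeps the zero point at 0, while get_quantized_tensor() falls back to a default scale of 0.1 whenever the computed scale is zero. A minimal standalone sketch of the same arithmetic follows; symmetric_quantize is a hypothetical name for illustration and is not part of the patch.

import torch

def symmetric_quantize(x, upper_bound=100, target_range=255, default_scale=0.1):
    # scale maps the clamped range [-xmax, xmax] onto target_range int8 steps
    xmax = torch.clamp(x.abs().max(), 0, upper_bound)
    scale = (xmax * 2 / target_range).item()
    if scale == 0:
        scale = default_scale  # avoid a zero scale, as in get_quantized_tensor()
    zero_point = 0  # symmetric quantization keeps the zero point at zero
    return torch.quantize_per_tensor(x, scale, zero_point, torch.qint8)

qx = symmetric_quantize(torch.randn(3, 3))
print(qx.int_repr())    # stored int8 values
print(qx.dequantize())  # approximate reconstruction of the original tensor

This qint8 / zero-point-0 scheme is applied to weights and weight perturbations, whereas activations in the quantized layers above are quantized to quint8 with a default scale of 0.1 and a default zero point of 128.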
diff --git a/bayesian_torch/layers/base_variational_layer.py b/bayesian_torch/layers/base_variational_layer.py index 4d63cc9..8263e82 100644 --- a/bayesian_torch/layers/base_variational_layer.py +++ b/bayesian_torch/layers/base_variational_layer.py @@ -29,7 +29,13 @@ import torch import torch.nn as nn import torch.distributions as distributions +from itertools import repeat +import collections +def get_kernel_size(x, n): + if isinstance(x, collections.abc.Iterable): + return tuple(x) + return tuple(repeat(x, n)) class BaseVariationalLayer_(nn.Module): def __init__(self): diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index ce13897..3cd81d1 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -36,7 +36,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from ..base_variational_layer import BaseVariationalLayer_ +from ..base_variational_layer import BaseVariationalLayer_, get_kernel_size from torch.distributions.normal import Normal from torch.distributions.uniform import Uniform @@ -263,28 +263,32 @@ def __init__(self, self.bias = bias self.kl = 0 +<<<<<<< HEAD +======= + kernel_size = get_kernel_size(kernel_size, 2) +>>>>>>> upstream/main self.mu_kernel = nn.Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) self.rho_kernel = nn.Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) self.register_buffer( 'eps_kernel', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) if self.bias: @@ -430,27 +434,29 @@ def __init__(self, self.posterior_mu_init = posterior_mu_init self.posterior_rho_init = posterior_rho_init + kernel_size = get_kernel_size(kernel_size, 3) + self.mu_kernel = nn.Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.rho_kernel = nn.Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.register_buffer( 'eps_kernel', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), 
persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) if self.bias: @@ -555,6 +561,7 @@ def __init__(self, padding=0, dilation=1, groups=1, + output_padding=0, prior_mean=0, prior_variance=1, posterior_mu_init=0, @@ -586,6 +593,7 @@ def __init__(self, self.kernel_size = kernel_size self.stride = stride self.padding = padding + self.output_padding = output_padding self.dilation = dilation self.groups = groups self.bias = bias @@ -667,6 +675,7 @@ def forward(self, x, return_kl=True): bias=self.mu_bias, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) @@ -700,6 +709,7 @@ def forward(self, x, return_kl=True): bias=bias, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) * sign_output @@ -717,6 +727,7 @@ def __init__(self, kernel_size, stride=1, padding=0, + output_padding=0, dilation=1, groups=1, prior_mean=0, @@ -750,6 +761,7 @@ def __init__(self, self.kernel_size = kernel_size self.stride = stride self.padding = padding + self.output_padding = output_padding self.dilation = dilation self.groups = groups self.bias = bias @@ -760,28 +772,28 @@ def __init__(self, self.prior_variance = prior_variance self.posterior_mu_init = posterior_mu_init self.posterior_rho_init = posterior_rho_init - + kernel_size = get_kernel_size(kernel_size, 2) self.mu_kernel = nn.Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1])) self.rho_kernel = nn.Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1])) self.register_buffer( 'eps_kernel', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) if self.bias: @@ -835,6 +847,7 @@ def forward(self, x, return_kl=True): weight=self.mu_kernel, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) @@ -868,6 +881,7 @@ def forward(self, x, return_kl=True): weight=delta_kernel, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) * sign_output @@ -885,6 +899,7 @@ def __init__(self, kernel_size, stride=1, padding=0, + output_padding=0, dilation=1, groups=1, prior_mean=0, @@ -918,6 +933,7 @@ def __init__(self, self.kernel_size = kernel_size self.stride = stride self.padding = padding + self.output_padding = output_padding self.dilation = dilation self.groups = groups @@ -928,28 +944,28 @@ def __init__(self, self.bias = bias self.kl = 0 - + kernel_size = 
get_kernel_size(kernel_size, 3) self.mu_kernel = nn.Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.rho_kernel = nn.Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.register_buffer( 'eps_kernel', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) if self.bias: @@ -1003,6 +1019,7 @@ def forward(self, x, return_kl=True): bias=self.mu_bias, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) @@ -1035,6 +1052,7 @@ def forward(self, x, return_kl=True): bias=bias, stride=self.stride, padding=self.padding, + output_padding=self.output_padding, dilation=self.dilation, groups=self.groups) * sign_output diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 7855ad8..0d2ebfd 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -46,7 +46,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.nn import Parameter -from ..base_variational_layer import BaseVariationalLayer_ +from ..base_variational_layer import BaseVariationalLayer_, get_kernel_size import math __all__ = [ @@ -255,26 +255,28 @@ def __init__(self, self.posterior_rho_init = posterior_rho_init, self.bias = bias + kernel_size = get_kernel_size(kernel_size, 2) + self.mu_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) self.rho_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) self.register_buffer( 'eps_kernel', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) if self.bias: @@ -403,27 +405,27 @@ def __init__(self, # variance of weight --> 
sigma = log (1 + exp(rho)) self.posterior_rho_init = posterior_rho_init, self.bias = bias - + kernel_size = get_kernel_size(kernel_size, 3) self.mu_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.rho_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.register_buffer( 'eps_kernel', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) if self.bias: @@ -698,27 +700,27 @@ def __init__(self, # variance of weight --> sigma = log (1 + exp(rho)) self.posterior_rho_init = posterior_rho_init, self.bias = bias - + kernel_size = get_kernel_size(kernel_size, 2) self.mu_kernel = Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1])) self.rho_kernel = Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1])) self.register_buffer( 'eps_kernel', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1]), persistent=False) if self.bias: @@ -850,27 +852,27 @@ def __init__(self, # variance of weight --> sigma = log (1 + exp(rho)) self.posterior_rho_init = posterior_rho_init, self.bias = bias - + kernel_size = get_kernel_size(kernel_size, 3) self.mu_kernel = Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.rho_kernel = Parameter( - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size)) + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2])) self.register_buffer( 'eps_kernel', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), 
persistent=False) self.register_buffer( 'prior_weight_mu', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) self.register_buffer( 'prior_weight_sigma', - torch.Tensor(in_channels, out_channels // groups, kernel_size, - kernel_size, kernel_size), + torch.Tensor(in_channels, out_channels // groups, kernel_size[0], + kernel_size[1], kernel_size[2]), persistent=False) if self.bias: diff --git a/bayesian_torch/models/dnn_to_bnn.py b/bayesian_torch/models/dnn_to_bnn.py index 18b9b51..92e18b4 100644 --- a/bayesian_torch/models/dnn_to_bnn.py +++ b/bayesian_torch/models/dnn_to_bnn.py @@ -79,7 +79,7 @@ def bnn_conv_layer(params, d): bnn_layer = layer_fn( in_channels=d.in_channels, out_channels=d.out_channels, - kernel_size=d.kernel_size[0], + kernel_size=d.kernel_size, stride=d.stride, padding=d.padding, dilation=d.dilation, diff --git a/setup.py b/setup.py index 5a02fb8..6629022 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name = "bayesian-torch", packages = find_packages(), - version = "0.2.1", + version = "0.3.0", description = "A library for Bayesian neural network layers and uncertainty estimation in Deep Learning", author = "Intel Labs", author_email = "ranganath.krishnan@intel.com", From f3f32e88f3b0b1b162a6a9649cf573c5ee5a4546 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 6 Feb 2023 19:50:34 -0500 Subject: [PATCH 49/69] add kl flag for BNN to QBNN conversion --- .../flipout_layers/quantized_conv_flipout.py | 60 ++++++++++++++---- .../quantized_linear_flipout.py | 10 ++- .../quantize_conv_variational.py | 62 +++++++++++++++---- .../quantize_linear_variational.py | 9 ++- bayesian_torch/models/bnn_to_qbnn.py | 8 +++ 5 files changed, 121 insertions(+), 28 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 8cde630..cf771c7 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -177,7 +177,7 @@ def quantize(self): delattr(self, "bn_running_var") delattr(self, "bn_eps") - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -209,6 +209,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -244,7 +247,10 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 + if return_kl: + return out, 0 + + return out class QuantizedConv2dFlipout(Conv2dFlipout): @@ -384,7 +390,7 @@ def dequantize(self): return - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -416,6 +422,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ 
+ if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -451,7 +460,10 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 + if return_kl: + return out, 0 + + return out class QuantizedConv3dFlipout(Conv3dFlipout): @@ -591,7 +603,7 @@ def dequantize(self): return - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -623,6 +635,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -658,7 +673,10 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 + if return_kl: + return out, 0 + + return out class QuantizedConvTranspose1dFlipout(ConvTranspose1dFlipout): def __init__(self, @@ -788,7 +806,7 @@ def quantize(self): delattr(self, "bn_running_var") delattr(self, "bn_eps") - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -820,6 +838,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -860,7 +881,10 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 + if return_kl: + return out, 0 + + return out class QuantizedConvTranspose2dFlipout(ConvTranspose2dFlipout): def __init__(self, @@ -990,7 +1014,7 @@ def quantize(self): delattr(self, "bn_running_var") delattr(self, "bn_eps") - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -1022,6 +1046,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -1062,7 +1089,10 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 + if return_kl: + return out, 0 + + return out class 
QuantizedConvTranspose3dFlipout(ConvTranspose3dFlipout): def __init__(self, @@ -1192,7 +1222,7 @@ def quantize(self): delattr(self, "bn_running_var") delattr(self, "bn_eps") - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -1224,6 +1254,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + if x.dtype!=torch.quint8: x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) @@ -1264,4 +1297,7 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - return out, 0 \ No newline at end of file + if return_kl: + return out, 0 + + return out \ No newline at end of file diff --git a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py index 9673242..289da98 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py @@ -131,7 +131,7 @@ def dequantize(self): self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) return - def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -163,6 +163,9 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 """ + if self.dnn_to_bnn_flag: + return_kl = False + bias = None if self.quantized_mu_bias is not None: if not self.is_dequant: @@ -197,4 +200,7 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) out = out.dequantize() - return out, 0 + if return_kl: + return out, 0 + + return out diff --git a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py index 59470df..a8b25dc 100644 --- a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py @@ -199,7 +199,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -233,6 +233,10 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() @@ -269,7 +273,11 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.nn.quantized.functional.conv1d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 - return out, 0 # disable kl divergence computing + if return_kl: + return out, 0 # disable kl divergence computing + + return out + class QuantizedConv2dReparameterization(Conv2dReparameterization): @@ -421,7 +429,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -455,6 +463,9 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() @@ -491,7 +502,10 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.nn.quantized.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 - return out, 0 # disable kl divergence computing + if return_kl: + return out, 0 # disable kl divergence computing + + return out class QuantizedConv3dReparameterization(Conv3dReparameterization): @@ -642,7 +656,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -676,6 +690,9 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() @@ -712,7 +729,10 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.nn.quantized.functional.conv3d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 - return out, 0 # disable kl divergence computing + if return_kl: + return out, 0 # disable kl divergence computing + + return out class QuantizedConvTranspose1dReparameterization(ConvTranspose1dReparameterization): def __init__(self, @@ -862,7 +882,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -896,6 +916,9 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() @@ -937,7 +960,10 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.ops.quantized.conv_transpose1d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) - return out, 0 # disable kl divergence computing + if return_kl: + return out, 0 # disable kl divergence computing + + return out class QuantizedConvTranspose2dReparameterization(ConvTranspose2dReparameterization): def __init__(self, @@ -1087,7 +1113,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -1121,6 +1147,9 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() @@ -1162,7 +1191,10 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.ops.quantized.conv_transpose2d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) - return out, 0 # disable kl divergence computing + if return_kl: + return out, 0 # disable kl divergence computing + + return out class QuantizedConvTranspose3dReparameterization(ConvTranspose3dReparameterization): def __init__(self, @@ -1312,7 +1344,7 @@ def dequantize(self): # Deprecated. Only for forward mode #1. return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -1346,6 +1378,9 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() @@ -1387,4 +1422,7 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.ops.quantized.conv_transpose3d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) - return out, 0 # disable kl divergence computing \ No newline at end of file + if return_kl: + return out, 0 # disable kl divergence computing + + return out \ No newline at end of file diff --git a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py index a1ce3fd..e666f9b 100644 --- a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py @@ -130,7 +130,7 @@ def dequantize(self): # Deprecated self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): """ Forward pass Parameters @@ -165,6 +165,8 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s """ + if self.dnn_to_bnn_flag: + return_kl = False if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: @@ -196,4 +198,7 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s out = torch.nn.quantized.functional.linear(input, weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 out = out.dequantize() - return out, 0 # kl=0 + if return_kl: + return out, 0 # disable kl divergence computing + + return out diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index 6732a65..d689465 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -102,6 +102,8 @@ def qbnn_linear_layer(d): ) qbnn_layer.__dict__.update(d.__dict__) qbnn_layer.quantize() + if d.dnn_to_bnn_flag: + qbnn_layer.dnn_to_bnn_flag = True return qbnn_layer def qbnn_conv_layer(d): @@ -118,6 +120,8 @@ def qbnn_conv_layer(d): ) qbnn_layer.__dict__.update(d.__dict__) qbnn_layer.quantize() + if d.dnn_to_bnn_flag: + qbnn_layer.dnn_to_bnn_flag = True return qbnn_layer def qbnn_lstm_layer(d): @@ -129,6 +133,8 @@ def qbnn_lstm_layer(d): ) qbnn_layer.__dict__.update(d.__dict__) qbnn_layer.quantize() + if d.dnn_to_bnn_flag: + qbnn_layer.dnn_to_bnn_flag = True return qbnn_layer def qbnn_batchnorm2d_layer(d): @@ -166,6 +172,8 @@ def batch_norm_folding(conv, bn): qbnn_layer.bn_running_var = bn.running_var qbnn_layer.bn_eps = bn.eps qbnn_layer.quantize() + if conv.dnn_to_bnn_flag: + qbnn_layer.dnn_to_bnn_flag = True return qbnn_layer # replaces linear and conv layers From b0a99d218e72518ff7694fcf82d4524196fd5804 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Thu, 9 Feb 2023 19:25:35 -0500 Subject: [PATCH 50/69] resolve merge conflicts --- bayesian_torch/layers/flipout_layers/conv_flipout.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index 3cd81d1..c92d24b 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -263,11 +263,7 @@ def __init__(self, 
self.bias = bias self.kl = 0 -<<<<<<< HEAD - -======= kernel_size = get_kernel_size(kernel_size, 2) ->>>>>>> upstream/main self.mu_kernel = nn.Parameter( torch.Tensor(out_channels, in_channels // groups, kernel_size[0], kernel_size[1])) From 5c691d6e7c1e7acd0197a1541108799350428d3c Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Feb 2023 02:04:06 -0500 Subject: [PATCH 51/69] quantized flipout models --- .../main_bayesian_imagenet_bnn2qbnn.py | 293 ++++++++++ .../main_bayesian_imagenet_dnn2bnn.py | 551 ++++++++++++++++++ .../quantized_resnet_flipout_large.py | 282 +++++++++ .../quantized_resnet_variational_large.py | 282 +++++++++ 4 files changed, 1408 insertions(+) create mode 100644 bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py create mode 100644 bayesian_torch/examples/main_bayesian_imagenet_dnn2bnn.py create mode 100644 bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py create mode 100644 bayesian_torch/models/bayesian/quantized_resnet_variational_large.py diff --git a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py new file mode 100644 index 0000000..687c1d0 --- /dev/null +++ b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py @@ -0,0 +1,293 @@ +import argparse +import os +import shutil +import time + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.optim +import torch.utils.data +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import bayesian_torch.models.bayesian.resnet_variational_large as resnet +import numpy as np +from bayesian_torch.models.bnn_to_qbnn import bnn_to_qbnn +# import bayesian_torch.models.bayesian.quantized_resnet_variational_large as qresnet +import bayesian_torch.models.bayesian.quantized_resnet_flipout_large as qresnet + +torch.cuda.is_available = lambda : False +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" +torch.backends.quantized.engine='onednn' +model_names = sorted( + name + for name in resnet.__dict__ + if name.islower() and not name.startswith("__") and name.startswith("resnet") and callable(resnet.__dict__[name]) +) + +print(model_names) +best_acc1 = 0 +len_trainset = 1281167 +len_valset = 50000 + + +parser = argparse.ArgumentParser(description="ImageNet") +parser.add_argument('data', + metavar='DIR', + default='data/imagenet', + help='path to dataset') +parser.add_argument( + "--arch", + "-a", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", "--workers", default=8, type=int, metavar="N", help="number of data loading workers (default: 8)" +) +parser.add_argument("--epochs", default=200, type=int, metavar="N", help="number of total epochs to run") +parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="manual epoch number (useful on restarts)") +parser.add_argument("-b", "--batch-size", default=1000, type=int, metavar="N", help="mini-batch size (default: 512)") +parser.add_argument('--val_batch_size', default=1000, type=int) +parser.add_argument("--lr", "--learning-rate", default=0.001, type=float, metavar="LR", help="initial learning rate") +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--weight-decay", "--wd", default=1e-4, type=float, metavar="W", help="weight decay 
(default: 5e-4)" +) +parser.add_argument("--print-freq", "-p", default=50, type=int, metavar="N", help="print frequency (default: 20)") +parser.add_argument("--resume", default="", type=str, metavar="PATH", help="path to latest checkpoint (default: none)") +parser.add_argument("-e", "--evaluate", dest="evaluate", action="store_true", help="evaluate model on validation set") +parser.add_argument("--pretrained", dest="pretrained", action="store_true", help="use pre-trained model") +parser.add_argument("--half", dest="half", action="store_true", help="use half-precision(16-bit) ") +parser.add_argument( + "--save-dir", + dest="save_dir", + help="The directory used to save the trained models", + default="./checkpoint/bayesian", + type=str, +) +parser.add_argument( + "--moped-init-model", + dest="moped_init_model", + help="DNN model to intialize MOPED method", + default="", + type=str, +) +parser.add_argument( + "--moped-delta-factor", + dest="moped_delta_factor", + help="MOPED delta scale factor", + default=0.2, + type=float, +) + +parser.add_argument( + "--bnn-rho-init", + dest="bnn_rho_init", + help="rho init for bnn layers", + default=-3.0, + type=float, +) + +parser.add_argument( + "--use-flipout-layers", + type=bool, + default=False, + metavar="use_flipout_layers", + help="Use Flipout layers for BNNs, default is Reparameterization layers", +) + +parser.add_argument( + "--save-every", + dest="save_every", + help="Saves checkpoints at every specified number of epochs", + type=int, + default=10, +) +parser.add_argument("--mode", type=str, required=True, help="train | test") + +parser.add_argument( + "--num_monte_carlo", + type=int, + default=20, + metavar="N", + help="number of Monte Carlo samples to be drawn during inference", +) +parser.add_argument("--num_mc", type=int, default=1, metavar="N", help="number of Monte Carlo runs during training") +parser.add_argument( + "--tensorboard", + type=bool, + default=True, + metavar="N", + help="use tensorboard for logging and visualization of training progress", +) +parser.add_argument( + "--log_dir", + type=str, + default="./logs/cifar/bayesian", + metavar="N", + help="use tensorboard for logging and visualization of training progress", +) + +def evaluate(args, model, val_loader): + pred_probs_mc = [] + test_loss = 0 + correct = 0 + output_list = [] + labels_list = [] + model.eval() + with torch.no_grad(): + begin = time.time() + i=0 + for data, target in val_loader: + if torch.cuda.is_available(): + data, target = data.cuda(), target.cuda() + else: + data, target = data.cpu(), target.cpu() + output_mc = [] + for mc_run in range(args.num_monte_carlo): + output, _ = model.forward(data) + output_mc.append(output) + output_ = torch.stack(output_mc) + output_list.append(output_) + labels_list.append(target) + i+=1 + end = time.time() + print("inference throughput: ", i*args.val_batch_size / (end - begin), " images/s") + + output = torch.cat(output_list, 1) + output = torch.nn.functional.softmax(output, dim=2) + labels = torch.cat(labels_list) + pred_mean = output.mean(dim=0) + Y_pred = torch.argmax(pred_mean, axis=1) + print("Test accuracy:", (Y_pred.data.cpu().numpy() == labels.data.cpu().numpy()).mean() * 100) + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + """ + Save the training model + """ + torch.save(state, filename) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + 
self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + +best_prec1 = 0 + +def main(): + global args, best_prec1 + args = parser.parse_args() + moped_enable = False + if len(args.moped_init_model) > 0: # use moped method if trained dnn model weights are provided + moped_enable = True + + # Check the save_dir exists or not + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + model = torch.nn.DataParallel(resnet.__dict__[args.arch]()) + if moped_enable: + checkpoint = torch.load(args.moped_init_model) + if "state_dict" in checkpoint.keys(): + model.load_state_dict(checkpoint["state_dict"]) + else: + model.load_state_dict(checkpoint) + + tb_writer = None + + valdir = os.path.join(args.data, 'Imagenet_2012Val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + val_loader = torch.utils.data.DataLoader(val_dataset, + batch_size=args.val_batch_size, + shuffle=False, + num_workers=args.workers, + pin_memory=True) + + print('len valset: ', len(val_dataset)) + + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + if args.mode == "test": + checkpoint_file = args.save_dir + "/bayesian_{}_imagenet.pth".format(args.arch) + + checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) + model.load_state_dict(checkpoint["state_dict"]) + model.module = model.module.cpu() + + bnn_to_qbnn(model, fuse_conv_bn=False) # only replaces linear and conv layers + + model = model.cpu() + + # save weights + # save_checkpoint( + # { + # 'epoch': None, + # 'state_dict': model.state_dict(), + # 'best_prec1': None, + # }, + # True, + # filename=os.path.join( + # args.save_dir, + # 'quantized_bayesian_q{}_imagenet.pth'.format(args.arch))) + + qmodel = torch.nn.DataParallel(qresnet.__dict__['q'+args.arch](bias=False)) # set bias=True to make qconv has bias + qmodel.module.quant_then_dequant(qmodel, fuse_conv_bn=False) + + # load weights + # checkpoint_file = args.save_dir + "/quantized_bayesian_q{}_imagenet.pth".format(args.arch) + # checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) + # qmodel.load_state_dict(checkpoint["state_dict"]) + + qmodel.load_state_dict(model.state_dict()) + evaluate(args, qmodel, val_loader) + +if __name__ == "__main__": + main() diff --git a/bayesian_torch/examples/main_bayesian_imagenet_dnn2bnn.py b/bayesian_torch/examples/main_bayesian_imagenet_dnn2bnn.py new file mode 100644 index 0000000..28e03ae --- /dev/null +++ b/bayesian_torch/examples/main_bayesian_imagenet_dnn2bnn.py @@ -0,0 +1,551 @@ +import argparse +import os +import shutil +import time + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.optim +import torch.utils.data +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import 
torchvision.datasets as datasets + +import bayesian_torch.models.deterministic.resnet_large as resnet +import numpy as np +from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn, get_kl_loss + +model_names = sorted( + name + for name in resnet.__dict__ + if name.islower() and not name.startswith("__") and name.startswith("resnet") and callable(resnet.__dict__[name]) +) + +print(model_names) +best_acc1 = 0 +len_trainset = 1281167 +len_valset = 50000 + + +parser = argparse.ArgumentParser(description="ImageNet") +parser.add_argument('data', + metavar='DIR', + default='data/imagenet', + help='path to dataset') +parser.add_argument( + "--arch", + "-a", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", "--workers", default=8, type=int, metavar="N", help="number of data loading workers (default: 8)" +) +parser.add_argument("--epochs", default=90, type=int, metavar="N", help="number of total epochs to run") +parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="manual epoch number (useful on restarts)") +parser.add_argument("-b", "--batch-size", default=128, type=int, metavar="N", help="mini-batch size (default: 128)") +parser.add_argument('--val_batch_size', default=1000, type=int) +parser.add_argument("--lr", "--learning-rate", default=0.001, type=float, metavar="LR", help="initial learning rate") +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--weight-decay", "--wd", default=1e-4, type=float, metavar="W", help="weight decay (default: 5e-4)" +) +parser.add_argument("--print-freq", "-p", default=50, type=int, metavar="N", help="print frequency (default: 20)") +parser.add_argument("--resume", default="", type=str, metavar="PATH", help="path to latest checkpoint (default: none)") +parser.add_argument("-e", "--evaluate", dest="evaluate", action="store_true", help="evaluate model on validation set") +parser.add_argument("--pretrained", dest="pretrained", action="store_true", help="use pre-trained model") +parser.add_argument("--half", dest="half", action="store_true", help="use half-precision(16-bit) ") +parser.add_argument( + "--save-dir", + dest="save_dir", + help="The directory used to save the trained models", + default="./checkpoint/bayesian", + type=str, +) +parser.add_argument( + "--moped-init-model", + dest="moped_init_model", + help="DNN model to intialize MOPED method", + default="", + type=str, +) +parser.add_argument( + "--moped-delta-factor", + dest="moped_delta_factor", + help="MOPED delta scale factor", + default=0.001, + type=float, +) + +parser.add_argument( + "--bnn-rho-init", + dest="bnn_rho_init", + help="rho init for bnn layers", + default=-10.0, + type=float, +) + +parser.add_argument( + "--use-flipout-layers", + type=bool, + default=False, + metavar="use_flipout_layers", + help="Use Flipout layers for BNNs, default is Reparameterization layers", +) + +parser.add_argument( + "--save-every", + dest="save_every", + help="Saves checkpoints at every specified number of epochs", + type=int, + default=10, +) +parser.add_argument("--mode", type=str, required=True, help="train | test") + +parser.add_argument( + "--num_monte_carlo", + type=int, + default=20, + metavar="N", + help="number of Monte Carlo samples to be drawn during inference", +) +parser.add_argument("--num_mc", type=int, default=1, metavar="N", help="number of Monte Carlo runs during training") 
+parser.add_argument( + "--tensorboard", + type=bool, + default=True, + metavar="N", + help="use tensorboard for logging and visualization of training progress", +) +parser.add_argument( + "--log_dir", + type=str, + default="./logs/imagenet/bayesian", + metavar="N", + help="use tensorboard for logging and visualization of training progress", +) + +best_prec1 = 0 + + +def main(): + global args, best_prec1 + args = parser.parse_args() + moped_enable = False + if len(args.moped_init_model) > 0: # use moped method if trained dnn model weights are provided + moped_enable = True + + const_bnn_prior_parameters = { + "prior_mu": 0.0, + "prior_sigma": 1.0, + "posterior_mu_init": 0.0, + "posterior_rho_init": args.bnn_rho_init, + "type": "Flipout" if args.use_flipout_layers else "Reparameterization", # Flipout or Reparameterization + "moped_enable": moped_enable, # initialize mu/sigma from the dnn weights + "moped_delta": args.moped_delta_factor, + } + + # Check the save_dir exists or not + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + model = torch.nn.DataParallel(resnet.__dict__[args.arch](pretrained=True)) + model.cuda() if torch.cuda.is_available() else model.cpu() + if moped_enable: + checkpoint = torch.load(args.moped_init_model) + if "state_dict" in checkpoint.keys(): + model.load_state_dict(checkpoint["state_dict"]) + else: + model.load_state_dict(checkpoint) + + const_bnn_prior_parameters["moped_enable"]=True + dnn_to_bnn(model, const_bnn_prior_parameters) # only replaces linear and conv layers + + save_checkpoint( + { + "epoch": 0, + "state_dict": model.state_dict(), + "best_prec1": best_prec1, + }, + False, + filename=os.path.join(args.save_dir, "bayesian_{}_imagenet.pth".format(args.arch)), + ) + + if torch.cuda.is_available(): + model.cuda() + else: + model.cpu() + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume) + args.start_epoch = checkpoint["epoch"] + best_prec1 = checkpoint["best_prec1"] + model.load_state_dict(checkpoint) + print("=> loaded checkpoint '{}' (epoch {})".format(args.evaluate, checkpoint["epoch"])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + tb_writer = None + if args.tensorboard: + logger_dir = os.path.join(args.log_dir, "tb_logger") + if not os.path.exists(logger_dir): + os.makedirs(logger_dir) + tb_writer = SummaryWriter(logger_dir) + + valdir = os.path.join(args.data, 'val') #Imagenet_2012Val + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + # train_loader = torch.utils.data.DataLoader( + # datasets.CIFAR10( + # root="./data", + # train=True, + # transform=transforms.Compose( + # [ + # transforms.RandomHorizontalFlip(), + # transforms.RandomCrop(32, 4), + # transforms.ToTensor(), + # normalize, + # ] + # ), + # download=True, + # ), + # batch_size=args.batch_size, + # shuffle=True, + # num_workers=args.workers, + # pin_memory=True, + # ) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + val_loader = torch.utils.data.DataLoader(val_dataset, + batch_size=args.val_batch_size, + shuffle=False, + num_workers=args.workers, + pin_memory=True) + + print('len valset: ', len(val_dataset)) + + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + if 
torch.cuda.is_available(): + criterion = nn.CrossEntropyLoss().cuda() + else: + criterion = nn.CrossEntropyLoss().cpu() + + if args.half: + model.half() + criterion.half() + + if args.arch in ["resnet110"]: + for param_group in optimizer.param_groups: + param_group["lr"] = args.lr * 0.1 + + if args.evaluate: + validate(val_loader, model, criterion) + return + + if args.mode == "train": + pass + + for epoch in range(args.start_epoch, args.epochs): + + lr = args.lr + if epoch >= 80 and epoch < 120: + lr = 0.1 * args.lr + elif epoch >= 120 and epoch < 160: + lr = 0.01 * args.lr + elif epoch >= 160 and epoch < 180: + lr = 0.001 * args.lr + elif epoch >= 180: + lr = 0.0005 * args.lr + + optimizer = torch.optim.Adam(model.parameters(), lr) + + # train for one epoch + print("current lr {:.5e}".format(optimizer.param_groups[0]["lr"])) + train(args, train_loader, model, criterion, optimizer, epoch, tb_writer) + + prec1 = validate(args, val_loader, model, criterion, epoch, tb_writer) + + is_best = prec1 > best_prec1 + best_prec1 = max(prec1, best_prec1) + + if is_best: + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model.state_dict(), + "best_prec1": best_prec1, + }, + is_best, + filename=os.path.join(args.save_dir, "bayesian_{}_imagenet.pth".format(args.arch)), + ) + + elif args.mode == "test": + checkpoint_file = args.save_dir + "/bayesian_{}_imagenet.pth".format(args.arch) + if torch.cuda.is_available(): + checkpoint = torch.load(checkpoint_file) + else: + checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) + model.load_state_dict(checkpoint["state_dict"]) + evaluate(args, model, val_loader) + + +def train(args, train_loader, model, criterion, optimizer, epoch, tb_writer=None): + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + + # switch to train mode + model.train() + + end = time.time() + for i, (input, target) in enumerate(train_loader): + + # measure data loading time + data_time.update(time.time() - end) + + if torch.cuda.is_available(): + target = target.cuda() + input_var = input.cuda() + target_var = target + else: + target = target.cpu() + input_var = input.cpu() + target_var = target + + if args.half: + input_var = input_var.half() + + # compute output + output_ = [] + kl_ = [] + for mc_run in range(args.num_mc): + output = model(input_var) + kl = get_kl_loss(model) + output_.append(output) + kl_.append(kl) + output = torch.mean(torch.stack(output_), dim=0) + kl = torch.mean(torch.stack(kl_), dim=0) + cross_entropy_loss = criterion(output, target_var) + scaled_kl = kl / args.batch_size + + # ELBO loss + loss = cross_entropy_loss + scaled_kl + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + output = output.float() + loss = loss.float() + # measure accuracy and record loss + prec1 = accuracy(output.data, target)[0] + losses.update(loss.item(), input.size(0)) + top1.update(prec1.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print( + "Epoch: [{0}][{1}/{2}]\t" + "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + "Data {data_time.val:.3f} ({data_time.avg:.3f})\t" + "Loss {loss.val:.4f} ({loss.avg:.4f})\t" + "Prec@1 {top1.val:.3f} ({top1.avg:.3f})".format( + epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1 + ) + ) + + if tb_writer is not None: + tb_writer.add_scalar("train/cross_entropy_loss", 
cross_entropy_loss.item(), epoch) + tb_writer.add_scalar("train/kl_div", scaled_kl.item(), epoch) + tb_writer.add_scalar("train/elbo_loss", loss.item(), epoch) + tb_writer.add_scalar("train/accuracy", prec1.item(), epoch) + tb_writer.flush() + + +def validate(args, val_loader, model, criterion, epoch, tb_writer=None): + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + + # switch to evaluate mode + model.eval() + + end = time.time() + with torch.no_grad(): + for i, (input, target) in enumerate(val_loader): + if torch.cuda.is_available(): + target = target.cuda() + input_var = input.cuda() + target_var = target.cuda() + else: + target = target.cpu() + input_var = input.cpu() + target_var = target.cpu() + + if args.half: + input_var = input_var.half() + + # compute output + output_ = [] + kl_ = [] + for mc_run in range(args.num_mc): + output = model(input_var) + kl = get_kl_loss(model) + output_.append(output) + kl_.append(kl) + output = torch.mean(torch.stack(output_), dim=0) + kl = torch.mean(torch.stack(kl_), dim=0) + cross_entropy_loss = criterion(output, target_var) + # scaled_kl = kl / len_trainset + scaled_kl = kl / args.batch_size + # scaled_kl = 0.2 * (kl / len_trainset) + + # ELBO loss + loss = cross_entropy_loss + scaled_kl + + output = output.float() + loss = loss.float() + + # measure accuracy and record loss + prec1 = accuracy(output.data, target)[0] + losses.update(loss.item(), input.size(0)) + top1.update(prec1.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print( + "Test: [{0}/{1}]\t" + "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + "Loss {loss.val:.4f} ({loss.avg:.4f})\t" + "Prec@1 {top1.val:.3f} ({top1.avg:.3f})".format( + i, len(val_loader), batch_time=batch_time, loss=losses, top1=top1 + ) + ) + + if tb_writer is not None: + tb_writer.add_scalar("val/cross_entropy_loss", cross_entropy_loss.item(), epoch) + tb_writer.add_scalar("val/kl_div", scaled_kl.item(), epoch) + tb_writer.add_scalar("val/elbo_loss", loss.item(), epoch) + tb_writer.add_scalar("val/accuracy", prec1.item(), epoch) + tb_writer.flush() + + print(" * Prec@1 {top1.avg:.3f}".format(top1=top1)) + + return top1.avg + + +def evaluate(args, model, val_loader): + pred_probs_mc = [] + test_loss = 0 + correct = 0 + output_list = [] + labels_list = [] + model.eval() + with torch.no_grad(): + begin = time.time() + i=0 + for data, target in val_loader: + if torch.cuda.is_available(): + data, target = data.cuda(), target.cuda() + else: + data, target = data.cpu(), target.cpu() + output_mc = [] + for mc_run in range(args.num_monte_carlo): + output = model.forward(data) + output_mc.append(output) + output_ = torch.stack(output_mc) + output_list.append(output_) + labels_list.append(target) + i+=1 + # if i==10: + # break + end = time.time() + print("inference throughput: ", 50000 / (end - begin), " images/s") + + # output = torch.stack(output_list) + # output = output.permute(1, 0, 2, 3) + # output = output.contiguous().view(args.num_monte_carlo, len_valset, -1) + output = torch.cat(output_list, 1) + output = torch.nn.functional.softmax(output, dim=2) + labels = torch.cat(labels_list) + pred_mean = output.mean(dim=0) + Y_pred = torch.argmax(pred_mean, axis=1) + + np.save("./probs_cifar_mc.npy", output.data.cpu().numpy()) + np.save("./cifar_test_labels_mc.npy", labels.data.cpu().numpy()) + print(Y_pred.shape, labels.shape) + print(Y_pred[:100], labels[:100]) + print("Test accuracy:", 
(Y_pred.data.cpu().numpy() == labels.data.cpu().numpy()).mean() * 100) + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + """ + Save the training model + """ + torch.save(state, filename) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == "__main__": + main() diff --git a/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py b/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py new file mode 100644 index 0000000..8b18234 --- /dev/null +++ b/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py @@ -0,0 +1,282 @@ +''' +Bayesian ResNet for CIFAR10. + +ResNet architecture ref: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385 +''' + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from bayesian_torch.layers import QuantizedConv2dFlipout +from bayesian_torch.layers import QuantizedLinearFlipout +from torch.nn.quantized import BatchNorm2d as QuantizedBatchNorm2d +from torch.nn import Identity + +__all__ = [ + 'QResNet', 'qresnet18', 'qresnet34', 'qresnet50', 'qresnet101', 'qresnet152' +] + +def _weights_init(m): + classname = m.__class__.__name__ + if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight) + + +class LambdaLayer(nn.Module): + def __init__(self, lambd): + super(LambdaLayer, self).__init__() + self.lambd = lambd + + def forward(self, x): + return self.lambd(x) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1, option='A', bias=False): + super(BasicBlock, self).__init__() + self.conv1 = QuantizedConv2dFlipout( + in_channels=in_planes, + out_channels=planes, + kernel_size=3, + stride=stride, + padding=1, + bias=bias) + self.bn1 = QuantizedBatchNorm2d(planes) + self.conv2 = QuantizedConv2dFlipout( + in_channels=planes, + out_channels=planes, + kernel_size=3, + stride=1, + padding=1, + bias=bias) + self.bn2 = QuantizedBatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != planes: + if option == 'A': + """ + For CIFAR10 ResNet paper uses option A. 
+ """ + self.shortcut = LambdaLayer(lambda x: F.pad( + x[:, :, ::2, ::2], + (0, 0, 0, 0, planes // 4, planes // 4), "constant", 0)) + elif option == 'B': + self.shortcut = nn.Sequential( + QuantizedConv2dFlipout( + in_channels=in_planes, + out_channels=self.expansion * planes, + kernel_size=1, + stride=stride, + bias=bias), QuantizedBatchNorm2d(self.expansion * planes)) + + def forward(self, x): + out, _ = self.conv1(x) + out = self.bn1(out) + out = F.relu(out) + out, _ = self.conv2(out) + out = self.bn2(out) + sh = self.shortcut(x.contiguous()).contiguous() + new_scale = max(out.q_scale(), sh.q_scale()) + out = torch.ops.quantized.add(out, sh, new_scale, 0) + # out += self.shortcut(x) + out = F.relu(out) + return out, 0 # kl=0 + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, bias=False): + super(Bottleneck, self).__init__() + self.conv1 = QuantizedConv2dFlipout( + in_channels=inplanes, + out_channels=planes, + kernel_size=1, + bias=bias) + self.bn1 =QuantizedBatchNorm2d(planes) + self.conv2 = QuantizedConv2dFlipout( + in_channels=planes, + out_channels=planes, + kernel_size=3, + stride=stride, + padding=1, + bias=bias) + self.bn2 = QuantizedBatchNorm2d(planes) + self.conv3 = QuantizedConv2dFlipout( + in_channels=planes, + out_channels=planes * 4, + kernel_size=1, + bias=bias) + self.bn3 = QuantizedBatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + kl_sum = 0 + out, kl = self.conv1(x) + kl_sum += kl + out = self.bn1(out) + out = self.relu(out) + + out, kl = self.conv2(out) + kl_sum += kl + out = self.bn2(out) + out = self.relu(out) + + out, kl = self.conv3(out) + kl_sum += kl + out = self.bn3(out) + + if self.downsample is not None: + residual, kl = self.downsample(x) + kl_sum += kl + + # out += residual + new_scale = max(out.q_scale(), residual.q_scale()) + out = torch.ops.quantized.add(out, residual, new_scale, 0) + out = self.relu(out) + + return out, kl_sum + +class QResNet(nn.Module): + def __init__(self, block, layers, num_classes=1000, bias=False): + super(QResNet, self).__init__() + self.inplanes = 64 + self.conv1 = QuantizedConv2dFlipout( + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3, + bias=bias) + self.bn1 = QuantizedBatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0], bias=bias) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, bias=bias) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, bias=bias) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, bias=bias) + self.avgpool = nn.AvgPool2d(7, stride=1) + self.fc = QuantizedLinearFlipout( + in_features=512 * block.expansion, + out_features=num_classes, + ) + + self.apply(_weights_init) + + def _make_layer(self, block, planes, blocks, stride=1, bias=False): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + QuantizedConv2dFlipout(in_channels=self.inplanes, + out_channels=planes * block.expansion, + kernel_size=1, + stride=stride, + bias=bias), + QuantizedBatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, bias=bias)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, 
bias=bias)) + + return nn.Sequential(*layers) + + def quant_then_dequant(self, m, fuse_conv_bn=False): ## quantize only; need to rename this function + for name, value in list(m._modules.items()): + if m._modules[name]._modules: + self.quant_then_dequant(m._modules[name], fuse_conv_bn=fuse_conv_bn) + + if "QuantizedConv" in m._modules[name].__class__.__name__: + m._modules[name].quantize() + m._modules[name].quantized_sigma_bias = None ### work around + m._modules[name].dnn_to_bnn_flag = True ## since we don't compute kl in quantized models, this flag will be removed after refactoring + + if "QuantizedLinear" in m._modules[name].__class__.__name__: + m._modules[name].quantize() + m._modules[name].dnn_to_bnn_flag = True ## since we don't compute kl in quantized models, this flag will be removed after refactoring + + if fuse_conv_bn and "BatchNorm2d" in m._modules[name].__class__.__name__: # quite confusing, should be quantizedbatchnorm2d + setattr(m, name, Identity()) + + def forward(self, x): + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + for layer in self.layer1: + x = layer(x) + + for layer in self.layer2: + x = layer(x) + + for layer in self.layer3: + x = layer(x) + + for layer in self.layer4: + x = layer(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + + return x + + +def qresnet18(pretrained=False, **kwargs): + model = QResNet(BasicBlock, [2, 2, 2, 2], **kwargs) + return model + + +def qresnet34(pretrained=False, **kwargs): + model = QResNet(BasicBlock, [3, 4, 6, 3], **kwargs) + return model + + +def qresnet50(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + return model + + +def qresnet101(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 4, 23, 3], **kwargs) + return model + + +def qresnet152(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 8, 36, 3], **kwargs) + return model + + + +def test(net): + import numpy as np + total_params = 0 + + for x in filter(lambda p: p.requires_grad, net.parameters()): + total_params += np.prod(x.data.numpy().shape) + print("Total number of params", total_params) + print( + "Total layers", + len( + list( + filter(lambda p: p.requires_grad and len(p.data.size()) > 1, + net.parameters())))) + + +if __name__ == "__main__": + for net_name in __all__: + if net_name.startswith('qresnet'): + print(net_name) + test(globals()[net_name]()) + print() diff --git a/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py b/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py new file mode 100644 index 0000000..6f3077e --- /dev/null +++ b/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py @@ -0,0 +1,282 @@ +''' +Bayesian ResNet for CIFAR10. + +ResNet architecture ref: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. 
arXiv:1512.03385 +''' + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from bayesian_torch.layers import QuantizedConv2dReparameterization +from bayesian_torch.layers import QuantizedLinearReparameterization +from torch.nn.quantized import BatchNorm2d as QuantizedBatchNorm2d +from torch.nn import Identity + +__all__ = [ + 'QResNet', 'qresnet18', 'qresnet34', 'qresnet50', 'qresnet101', 'qresnet152' +] + +def _weights_init(m): + classname = m.__class__.__name__ + if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight) + + +class LambdaLayer(nn.Module): + def __init__(self, lambd): + super(LambdaLayer, self).__init__() + self.lambd = lambd + + def forward(self, x): + return self.lambd(x) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1, option='A', bias=False): + super(BasicBlock, self).__init__() + self.conv1 = QuantizedConv2dReparameterization( + in_channels=in_planes, + out_channels=planes, + kernel_size=3, + stride=stride, + padding=1, + bias=bias) + self.bn1 = QuantizedBatchNorm2d(planes) + self.conv2 = QuantizedConv2dReparameterization( + in_channels=planes, + out_channels=planes, + kernel_size=3, + stride=1, + padding=1, + bias=bias) + self.bn2 = QuantizedBatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != planes: + if option == 'A': + """ + For CIFAR10 ResNet paper uses option A. + """ + self.shortcut = LambdaLayer(lambda x: F.pad( + x[:, :, ::2, ::2], + (0, 0, 0, 0, planes // 4, planes // 4), "constant", 0)) + elif option == 'B': + self.shortcut = nn.Sequential( + QuantizedConv2dReparameterization( + in_channels=in_planes, + out_channels=self.expansion * planes, + kernel_size=1, + stride=stride, + bias=bias), QuantizedBatchNorm2d(self.expansion * planes)) + + def forward(self, x): + out, _ = self.conv1(x) + out = self.bn1(out) + out = F.relu(out) + out, _ = self.conv2(out) + out = self.bn2(out) + sh = self.shortcut(x.contiguous()).contiguous() + new_scale = max(out.q_scale(), sh.q_scale()) + out = torch.ops.quantized.add(out, sh, new_scale, 0) + # out += self.shortcut(x) + out = F.relu(out) + return out, 0 # kl=0 + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, bias=False): + super(Bottleneck, self).__init__() + self.conv1 = QuantizedConv2dReparameterization( + in_channels=inplanes, + out_channels=planes, + kernel_size=1, + bias=bias) + self.bn1 =QuantizedBatchNorm2d(planes) + self.conv2 = QuantizedConv2dReparameterization( + in_channels=planes, + out_channels=planes, + kernel_size=3, + stride=stride, + padding=1, + bias=bias) + self.bn2 = QuantizedBatchNorm2d(planes) + self.conv3 = QuantizedConv2dReparameterization( + in_channels=planes, + out_channels=planes * 4, + kernel_size=1, + bias=bias) + self.bn3 = QuantizedBatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + kl_sum = 0 + out, kl = self.conv1(x) + kl_sum += kl + out = self.bn1(out) + out = self.relu(out) + + out, kl = self.conv2(out) + kl_sum += kl + out = self.bn2(out) + out = self.relu(out) + + out, kl = self.conv3(out) + kl_sum += kl + out = self.bn3(out) + + if self.downsample is not None: + residual, kl = self.downsample(x) + kl_sum += kl + + # out += residual + new_scale = max(out.q_scale(), residual.q_scale()) + out = torch.ops.quantized.add(out, residual, new_scale, 0) + out 
= self.relu(out) + + return out, kl_sum + +class QResNet(nn.Module): + def __init__(self, block, layers, num_classes=1000, bias=False): + super(QResNet, self).__init__() + self.inplanes = 64 + self.conv1 = QuantizedConv2dReparameterization( + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3, + bias=bias) + self.bn1 = QuantizedBatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0], bias=bias) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, bias=bias) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, bias=bias) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, bias=bias) + self.avgpool = nn.AvgPool2d(7, stride=1) + self.fc = QuantizedLinearReparameterization( + in_features=512 * block.expansion, + out_features=num_classes, + ) + + self.apply(_weights_init) + + def _make_layer(self, block, planes, blocks, stride=1, bias=False): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + QuantizedConv2dReparameterization(in_channels=self.inplanes, + out_channels=planes * block.expansion, + kernel_size=1, + stride=stride, + bias=bias), + QuantizedBatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, bias=bias)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, bias=bias)) + + return nn.Sequential(*layers) + + def quant_then_dequant(self, m, fuse_conv_bn=False): ## quantize only; need to rename this function + for name, value in list(m._modules.items()): + if m._modules[name]._modules: + self.quant_then_dequant(m._modules[name], fuse_conv_bn=fuse_conv_bn) + + if "QuantizedConv" in m._modules[name].__class__.__name__: + m._modules[name].quantize() + m._modules[name].quantized_sigma_bias = None ### work around + m._modules[name].dnn_to_bnn_flag = True ## since we don't compute kl in quantized models, this flag will be removed after refactoring + + if "QuantizedLinear" in m._modules[name].__class__.__name__: + m._modules[name].quantize() + m._modules[name].dnn_to_bnn_flag = True ## since we don't compute kl in quantized models, this flag will be removed after refactoring + + if fuse_conv_bn and "BatchNorm2d" in m._modules[name].__class__.__name__: # quite confusing, should be quantizedbatchnorm2d + setattr(m, name, Identity()) + + def forward(self, x): + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + for layer in self.layer1: + x = layer(x) + + for layer in self.layer2: + x = layer(x) + + for layer in self.layer3: + x = layer(x) + + for layer in self.layer4: + x = layer(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + + return x + + +def qresnet18(pretrained=False, **kwargs): + model = QResNet(BasicBlock, [2, 2, 2, 2], **kwargs) + return model + + +def qresnet34(pretrained=False, **kwargs): + model = QResNet(BasicBlock, [3, 4, 6, 3], **kwargs) + return model + + +def qresnet50(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + return model + + +def qresnet101(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 4, 23, 3], **kwargs) + return model + + +def qresnet152(pretrained=False, **kwargs): + model = QResNet(Bottleneck, [3, 8, 36, 3], **kwargs) + return model + + + +def test(net): + import numpy as np + 
total_params = 0 + + for x in filter(lambda p: p.requires_grad, net.parameters()): + total_params += np.prod(x.data.numpy().shape) + print("Total number of params", total_params) + print( + "Total layers", + len( + list( + filter(lambda p: p.requires_grad and len(p.data.size()) > 1, + net.parameters())))) + + +if __name__ == "__main__": + for net_name in __all__: + if net_name.startswith('qresnet'): + print(net_name) + test(globals()[net_name]()) + print() From 51385b39c23ae92815a8845d4cf95243f8c99e52 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Feb 2023 02:48:32 -0500 Subject: [PATCH 52/69] remove kl computations --- .../main_bayesian_imagenet_bnn2qbnn.py | 2 +- .../quantized_resnet_flipout_large.py | 21 +++++++------------ .../quantized_resnet_variational_large.py | 21 +++++++------------ 3 files changed, 17 insertions(+), 27 deletions(-) diff --git a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py index 687c1d0..1577651 100644 --- a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py +++ b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py @@ -151,7 +151,7 @@ def evaluate(args, model, val_loader): data, target = data.cpu(), target.cpu() output_mc = [] for mc_run in range(args.num_monte_carlo): - output, _ = model.forward(data) + output = model.forward(data) output_mc.append(output) output_ = torch.stack(output_mc) output_list.append(output_) diff --git a/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py b/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py index 8b18234..61c0dd0 100644 --- a/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py +++ b/bayesian_torch/models/bayesian/quantized_resnet_flipout_large.py @@ -75,17 +75,17 @@ def __init__(self, in_planes, planes, stride=1, option='A', bias=False): bias=bias), QuantizedBatchNorm2d(self.expansion * planes)) def forward(self, x): - out, _ = self.conv1(x) + out = self.conv1(x) out = self.bn1(out) out = F.relu(out) - out, _ = self.conv2(out) + out = self.conv2(out) out = self.bn2(out) sh = self.shortcut(x.contiguous()).contiguous() new_scale = max(out.q_scale(), sh.q_scale()) out = torch.ops.quantized.add(out, sh, new_scale, 0) # out += self.shortcut(x) out = F.relu(out) - return out, 0 # kl=0 + return out class Bottleneck(nn.Module): expansion = 4 @@ -118,31 +118,26 @@ def __init__(self, inplanes, planes, stride=1, downsample=None, bias=False): def forward(self, x): residual = x - kl_sum = 0 - out, kl = self.conv1(x) - kl_sum += kl + out = self.conv1(x) out = self.bn1(out) out = self.relu(out) - out, kl = self.conv2(out) - kl_sum += kl + out = self.conv2(out) out = self.bn2(out) out = self.relu(out) - out, kl = self.conv3(out) - kl_sum += kl + out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: - residual, kl = self.downsample(x) - kl_sum += kl + residual = self.downsample(x) # out += residual new_scale = max(out.q_scale(), residual.q_scale()) out = torch.ops.quantized.add(out, residual, new_scale, 0) out = self.relu(out) - return out, kl_sum + return out class QResNet(nn.Module): def __init__(self, block, layers, num_classes=1000, bias=False): diff --git a/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py b/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py index 6f3077e..6d0a57e 100644 --- a/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py +++ b/bayesian_torch/models/bayesian/quantized_resnet_variational_large.py @@ 
-75,17 +75,17 @@ def __init__(self, in_planes, planes, stride=1, option='A', bias=False): bias=bias), QuantizedBatchNorm2d(self.expansion * planes)) def forward(self, x): - out, _ = self.conv1(x) + out = self.conv1(x) out = self.bn1(out) out = F.relu(out) - out, _ = self.conv2(out) + out = self.conv2(out) out = self.bn2(out) sh = self.shortcut(x.contiguous()).contiguous() new_scale = max(out.q_scale(), sh.q_scale()) out = torch.ops.quantized.add(out, sh, new_scale, 0) # out += self.shortcut(x) out = F.relu(out) - return out, 0 # kl=0 + return out class Bottleneck(nn.Module): expansion = 4 @@ -118,31 +118,26 @@ def __init__(self, inplanes, planes, stride=1, downsample=None, bias=False): def forward(self, x): residual = x - kl_sum = 0 - out, kl = self.conv1(x) - kl_sum += kl + out = self.conv1(x) out = self.bn1(out) out = self.relu(out) - out, kl = self.conv2(out) - kl_sum += kl + out = self.conv2(out) out = self.bn2(out) out = self.relu(out) - out, kl = self.conv3(out) - kl_sum += kl + out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: - residual, kl = self.downsample(x) - kl_sum += kl + residual = self.downsample(x) # out += residual new_scale = max(out.q_scale(), residual.q_scale()) out = torch.ops.quantized.add(out, residual, new_scale, 0) out = self.relu(out) - return out, kl_sum + return out class QResNet(nn.Module): def __init__(self, block, layers, num_classes=1000, bias=False): From 525c4625c3cf5e971d8ae15eaf3624ba97904aaa Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Feb 2023 19:42:53 -0500 Subject: [PATCH 53/69] quantization directory structure --- bayesian_torch/ao/__init__.py | 0 bayesian_torch/ao/nn/__init__.py | 0 bayesian_torch/ao/nn/quantized/__init__.py | 0 .../modules/quantize_conv_variational.py | 1428 +++++++++++++++++ .../modules/quantize_linear_variational.py | 204 +++ .../modules/quantized_conv_flipout.py | 1303 +++++++++++++++ .../modules/quantized_linear_flipout.py | 206 +++ bayesian_torch/ao/quantization/__init__.py | 2 + bayesian_torch/ao/quantization/quantize.py | 9 + 9 files changed, 3152 insertions(+) create mode 100644 bayesian_torch/ao/__init__.py create mode 100644 bayesian_torch/ao/nn/__init__.py create mode 100644 bayesian_torch/ao/nn/quantized/__init__.py create mode 100644 bayesian_torch/ao/nn/quantized/modules/quantize_conv_variational.py create mode 100644 bayesian_torch/ao/nn/quantized/modules/quantize_linear_variational.py create mode 100644 bayesian_torch/ao/nn/quantized/modules/quantized_conv_flipout.py create mode 100644 bayesian_torch/ao/nn/quantized/modules/quantized_linear_flipout.py create mode 100644 bayesian_torch/ao/quantization/__init__.py create mode 100644 bayesian_torch/ao/quantization/quantize.py diff --git a/bayesian_torch/ao/__init__.py b/bayesian_torch/ao/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bayesian_torch/ao/nn/__init__.py b/bayesian_torch/ao/nn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bayesian_torch/ao/nn/quantized/__init__.py b/bayesian_torch/ao/nn/quantized/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bayesian_torch/ao/nn/quantized/modules/quantize_conv_variational.py b/bayesian_torch/ao/nn/quantized/modules/quantize_conv_variational.py new file mode 100644 index 0000000..a8b25dc --- /dev/null +++ b/bayesian_torch/ao/nn/quantized/modules/quantize_conv_variational.py @@ -0,0 +1,1428 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or 
without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# @authors: Jun-Liang Lin +# +# ====================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter +from ..base_variational_layer import BaseVariationalLayer_ +from .conv_variational import * +import math + +__all__ = [ + 'QuantizedConv1dReparameterization', + 'QuantizedConv2dReparameterization', + 'QuantizedConv3dReparameterization', + 'QuantizedConvTranspose1dReparameterization', + 'QuantizedConvTranspose2dReparameterization', + 'QuantizedConvTranspose3dReparameterization', +] + + +class QuantizedConv1dReparameterization(Conv1dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(QuantizedConv1dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. 
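        # Restores fp32 tensors -- mu_kernel / sigma_weight and, when a bias is present,
        # mu_bias / sigma_bias -- from their quantized counterparts, so the deprecated
        # enable_int8_compute=False path in forward() can sample weights as mu + sigma * eps
        # with ordinary floating-point ops.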
+ self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv1d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
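            # The bias is deliberately kept in fp32: the torch.nn.quantized functional conv ops
            # take a floating-point bias tensor, so only the weight statistics above are held as
            # int8 quantized tensors.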
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv1d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + if return_kl: + return out, 0 # disable kl divergence computing + + return out + + + +class QuantizedConv2dReparameterization(Conv2dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + + """ + + super(QuantizedConv2dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + if return_kl: + return out, 0 # disable kl divergence computing + + return out + + +class QuantizedConv3dReparameterization(Conv3dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(QuantizedConv3dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. 
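        # Same fp32-restore helper as in the 1-D and 2-D layers above; it exists only so the
        # deprecated enable_int8_compute=False branch of forward() can run with F.conv3d.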
+ self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv3d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
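            # If the bias came from conv+bn fusion (quantized_sigma_bias is None) it is used
            # deterministically; otherwise it is sampled below as mu_bias + sigma_bias * eps.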
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv3d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + if return_kl: + return out, 0 # disable kl divergence computing + + return out + +class QuantizedConvTranspose1dReparameterization(ConvTranspose1dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose1dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose1d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose1d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + + if return_kl: + return out, 0 # disable kl divergence computing + + return out + +class QuantizedConvTranspose2dReparameterization(ConvTranspose2dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose2dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. 
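+        # Rebuilds fp32 mu/sigma tensors from the stored int8 parameters so that
+        # the fp32 sampling path (forward with enable_int8_compute=False) can run
+        # without int8 kernels.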
+ self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose2d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! 
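+            # Bias stays in fp32. quantized_sigma_bias is None when the bias comes
+            # from conv+bn fusion (see quantize()), in which case the deterministic
+            # fused bias is used as-is; otherwise the bias is sampled as usual.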
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose2d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + + if return_kl: + return out, 0 # disable kl divergence computing + + return out + +class QuantizedConvTranspose3dReparameterization(ConvTranspose3dReparameterization): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + + super(ConvTranspose3dReparameterization, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias + ) + + ## redundant ## + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + ## redundant ## + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): # Deprecated. Only for forward mode #1. + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + + weight = self.mu_kernel + (self.sigma_weight * self.eps_kernel.data.normal_()) + bias = None + + if self.bias: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.conv_transpose3d(input, weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + else: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) # Calculate the new scale after multiplying two quantized tensors. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) # Calculate the new scale after adding two quantized tensors. + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + out = torch.ops.quantized.conv_transpose3d(input, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + + if return_kl: + return out, 0 # disable kl divergence computing + + return out \ No newline at end of file diff --git a/bayesian_torch/ao/nn/quantized/modules/quantize_linear_variational.py b/bayesian_torch/ao/nn/quantized/modules/quantize_linear_variational.py new file mode 100644 index 0000000..e666f9b --- /dev/null +++ b/bayesian_torch/ao/nn/quantized/modules/quantize_linear_variational.py @@ -0,0 +1,204 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ====================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Module, Parameter +from ..base_variational_layer import BaseVariationalLayer_ +import math +from .linear_variational import LinearReparameterization + + + +class QuantizedLinearReparameterization(LinearReparameterization): + def __init__(self, + in_features, + out_features): + """ + + """ + super(QuantizedLinearReparameterization, self).__init__( + in_features, + out_features) + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_weight), requires_grad=False) + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_weight))), requires_grad=False) + delattr(self, "mu_weight") + delattr(self, "rho_weight") + + self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + delattr(self, "mu_bias") + delattr(self, "rho_bias") + + def dequantize(self): # Deprecated + self.mu_weight = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + return + + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + input: tensors + Input tensor. + + enable_int8_compute: bool, optional + Whether to enable int8 computation. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + if self.dnn_to_bnn_flag: + return_kl = False + + if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
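+            # fp32 fallback: dequantize the stored int8 parameters once, then apply
+            # the standard reparameterization sampling (mu + sigma * eps) in float.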
+ if not self.is_dequant: + self.dequantize() + self.is_dequant = True + weight = self.mu_weight + (self.sigma_weight * self.eps_weight.data.normal_()) + bias = None + if self.sigma_bias is not None: + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + + out = F.linear(input, weight, bias) + + else: + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) + new_scale = max(new_scale, self.quantized_mu_weight.q_scale()) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, new_scale, 0) + bias = None + + if self.quantized_sigma_bias is not None: + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + bias = self.mu_bias + (self.sigma_bias * self.eps_bias.data.normal_()) + if input.dtype!=torch.quint8: + input = torch.quantize_per_tensor(input, default_scale, default_zero_point, torch.quint8) + + out = torch.nn.quantized.functional.linear(input, weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + out = out.dequantize() + + if return_kl: + return out, 0 # disable kl divergence computing + + return out diff --git a/bayesian_torch/ao/nn/quantized/modules/quantized_conv_flipout.py b/bayesian_torch/ao/nn/quantized/modules/quantized_conv_flipout.py new file mode 100644 index 0000000..cf771c7 --- /dev/null +++ b/bayesian_torch/ao/nn/quantized/modules/quantized_conv_flipout.py @@ -0,0 +1,1303 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# +# Convolutional layers with flipout Monte Carlo weight estimator to perform +# variational inference in Bayesian neural networks. 
+# Variational layers enable Monte Carlo approximation of the distribution over the kernel
+#
+#
+# ======================================================================================
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Parameter
+from ..base_variational_layer import BaseVariationalLayer_
+from .conv_flipout import *
+
+from torch.distributions.normal import Normal
+from torch.distributions.uniform import Uniform
+
+__all__ = [
+    'QuantizedConv1dFlipout',
+    'QuantizedConv2dFlipout',
+    'QuantizedConv3dFlipout',
+    'QuantizedConvTranspose1dFlipout',
+    'QuantizedConvTranspose2dFlipout',
+    'QuantizedConvTranspose3dFlipout',
+]
+
+
+class QuantizedConv1dFlipout(Conv1dFlipout):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=False):
+        """
+        """
+        super(QuantizedConv1dFlipout, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias=bias)
+
+        # for conv bn fusion
+        self.bn_weight = None
+        self.bn_bias = None
+        self.bn_running_mean = None
+        self.bn_running_var = None
+        self.bn_eps = None
+
+        self.is_dequant = False
+
+    def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255):
+        """ An implementation for symmetric quantization
+
+        Parameters
+        ----------
+        x: tensor
+           Input tensor.
+        upper_bound: int, optional
+           Restrict the maximum value of the original tensor (select 100 empirically).
+        target_range: int, optional
+           The range of target data type (255 for int8)
+
+        Returns
+        ----------
+        scale: float
+
+        zero_point: int
+
+        """
+        scale = torch.zeros(1).to(x.device) # initialize
+        zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization
+        xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative)
+        scale = xmax*2/target_range # original range divided by target range
+        return scale, zero_point
+
+    def get_quantized_tensor(self, x, default_scale=0.1):
+        """ Quantize tensors
+
+        Parameters
+        ----------
+        x: tensors
+           Input tensor.
+
+        default_scale: float, optional
+           Default scale for the case that the computed scale is zero.
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
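+
+        Example
+        ----------
+        A minimal usage sketch (illustrative only; assumes `qconv` was obtained by
+        running quantize() on a trained Conv1dFlipout layer):
+
+        >>> x = torch.randn(2, 3, 32)      # fp32 input, quantized internally
+        >>> out, kl = qconv(x)             # out is a quint8 tensor, kl == 0
+        >>> out_fp32 = out.dequantize()    # convert back to fp32 when needed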
+ + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv1d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv1d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out + + +class QuantizedConv2dFlipout(Conv2dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): # be aware of bias + """ + + """ + super(QuantizedConv2dFlipout, self).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. 
+ + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv2d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv2d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out + + +class QuantizedConv3dFlipout(Conv3dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConv3dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + # + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. 
+ + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def dequantize(self): + self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + if self.bias: + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + + return + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. 
+ + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + outputs = torch.nn.quantized.functional.conv3d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv3d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out + +class QuantizedConvTranspose1dFlipout(ConvTranspose1dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose1dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + if not hasattr(self, "output_padding"): + self.output_padding = 0 + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. 
+ + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + outputs = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose1d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out + +class QuantizedConvTranspose2dFlipout(ConvTranspose2dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose2dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + if not hasattr(self, "output_padding"): + self.output_padding = 0 + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
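For orientation, these quantized forwards all share the same Flipout structure: a mean path through mu plus a sign-modulated path through sigma*eps. A plain float conv2d sketch of the linearity they rely on (no quantization, arbitrary shapes chosen for illustration):

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8)
mu = torch.randn(6, 3, 3, 3)
sigma, eps = torch.rand(6, 3, 3, 3) * 0.1, torch.randn(6, 3, 3, 3)

s_in = torch.empty_like(x).uniform_(-1, 1).sign()
s_out = torch.empty(F.conv2d(x, mu).shape).uniform_(-1, 1).sign()

# Flipout forward: mean path plus sign-modulated perturbation path
out = F.conv2d(x, mu) + s_out * F.conv2d(x * s_in, sigma * eps)

# because conv is linear in the weight, dropping the signs collapses this to an
# ordinary reparameterized forward with the sampled weight mu + sigma * eps
ref = F.conv2d(x, mu + sigma * eps)
print(torch.allclose(ref, F.conv2d(x, mu) + F.conv2d(x, sigma * eps), atol=1e-5))  # True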
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. 
+ + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + outputs = torch.ops.quantized.conv_transpose2d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose2d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out + +class QuantizedConvTranspose3dFlipout(ConvTranspose3dFlipout): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False): + """ + """ + super(QuantizedConvTranspose3dFlipout).__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + # for conv bn fusion + self.bn_weight = None + self.bn_bias = None + self.bn_running_mean = None + self.bn_running_var = None + self.bn_eps = None + + self.is_dequant = False + + if not hasattr(self, "output_padding"): + self.output_padding = 0 + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). 
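A side note on the sign sampling used in these forwards: uniform_(-1, 1).sign() draws (almost surely) +/-1 values, and storing them as quint8 with the default scale 0.1 and zero_point 128 keeps both signs exactly representable:

import torch

s = torch.zeros(10000).uniform_(-1, 1).sign()
print(s.unique(), s.mean())                  # tensor([-1., 1.]) and a mean close to 0

sq = torch.quantize_per_tensor(s, 0.1, 128, torch.quint8)
print(sq.int_repr().unique())                # tensor([118, 138], dtype=torch.uint8)
print(sq.dequantize().unique())              # tensor([-1., 1.]) recovered exactly, since 1 is a multiple of 0.1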
+ target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. + + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + if self.bn_weight is None: # has batchnorm layer, no bn fusion + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))), requires_grad=False).cpu() + else: # fuse conv and bn + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_kernel*(bn_coef.view(-1,1,1,1).expand(self.mu_kernel.shape))), requires_grad=False).cpu() + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_kernel))*(bn_coef.view(-1,1,1,1).expand(self.rho_kernel.shape))), requires_grad=False).cpu() + delattr(self, "mu_kernel") + delattr(self, "rho_kernel") + + + ## DO NOT QUANTIZE BIAS!!!! Bias should be in fp32 format. + ## Variable names may be confusing. We don't quantize them. + ## TODO: rename variables + if self.bias: # if has bias + if self.bn_weight is None: # if no bn fusion + self.quantized_mu_bias = Parameter(self.mu_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False).cpu() + else: # if apply bn fusion + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps) + self.quantized_mu_bias = Parameter((self.mu_bias-self.bn_running_mean)*bn_coef+self.bn_bias, requires_grad=False).cpu() + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias))*bn_coef, requires_grad=False).cpu() + delattr(self, "mu_bias") + delattr(self, "rho_bias") + else: + if self.bn_weight is not None: # if no bias but apply bn fusion + self.bias = True + bn_coef = self.bn_weight/torch.sqrt(self.bn_running_var+self.bn_eps)*(-self.bn_running_mean)+self.bn_bias + self.quantized_mu_bias = Parameter(bn_coef, requires_grad=False).cpu() + self.quantized_sigma_bias = None + + delattr(self, "bn_weight") + delattr(self, "bn_bias") + delattr(self, "bn_running_mean") + delattr(self, "bn_running_var") + delattr(self, "bn_eps") + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. 
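On the new_scale used for the sigma*eps product in these forwards: with zero_point 0 on both operands, the integer product is exact under an output scale equal to the product of the input scales, as long as it fits in int8. A tiny sketch, assuming the quantized mul kernel accepts qint8 operands as the code above does:

import torch

a = torch.quantize_per_tensor(torch.tensor([0.5, -0.3]), 0.1, 0, torch.qint8)    # int repr  5, -3
b = torch.quantize_per_tensor(torch.tensor([0.2,  0.4]), 0.05, 0, torch.qint8)   # int repr  4,  8
out_scale = a.q_scale() * b.q_scale()                                            # 0.005
c = torch.ops.quantized.mul(a, b, out_scale, 0)
print(c.dequantize())   # tensor([ 0.1000, -0.1200]), the exact elementwise products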
+ + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. + + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + bias = None + if self.bias: + bias = self.quantized_mu_bias + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(self.quantized_mu_weight, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + + outputs = torch.ops.quantized.conv_transpose3d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(delta_kernel, bias, self.stride, + self.padding, self.output_padding, + self.dilation, self.groups) + perturbed_outputs = torch.ops.quantized.conv_transpose3d(x, self._packed_params, scale=default_scale, zero_point=default_zero_point) + + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + + if return_kl: + return out, 0 + + return out \ No newline at end of file diff --git a/bayesian_torch/ao/nn/quantized/modules/quantized_linear_flipout.py b/bayesian_torch/ao/nn/quantized/modules/quantized_linear_flipout.py new file mode 100644 index 0000000..289da98 --- /dev/null +++ b/bayesian_torch/ao/nn/quantized/modules/quantized_linear_flipout.py @@ -0,0 +1,206 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. 
Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# +# Linear Flipout Layers with flipout weight estimator to perform +# variational inference in Bayesian neural networks. Variational layers +# enables Monte Carlo approximation of the distribution over the weights +# +# @authors: Jun-Liang Lin +# +# ====================================================================================== +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Module, Parameter +from torch.distributions.normal import Normal +from torch.distributions.uniform import Uniform + +from .linear_flipout import LinearFlipout + +__all__ = ["QuantizedLinearFlipout"] + +class QuantizedLinearFlipout(LinearFlipout): + def __init__(self, + in_features, + out_features): + + super(QuantizedLinearFlipout, self).__init__( + in_features, + out_features) + + self.is_dequant = False + + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): + """ An implementation for symmetric quantization + + Parameters + ---------- + x: tensor + Input tensor. + upper_bound: int, optional + Restrict the maximum value of the original tensor (select 100 empirically). + target_range: int, optional + The range of target data type (255 for int8) + + Returns + ---------- + scale: float + + zero_point: int + + """ + scale = torch.zeros(1).to(x.device) # initialize + zero_point = torch.zeros(1).to(x.device) # zero point is zero since we only consider symmetric quantization + xmax = torch.clamp(x.abs().max(), 0, upper_bound) # determine and restrict the maximum value (minimum value should be 0 since the absolute value is always non-negative) + scale = xmax*2/target_range # original range divided by target range + return scale, zero_point + + def get_quantized_tensor(self, x, default_scale=0.1): + """ Quantize tensors + + Parameters + ---------- + x: tensors + Input tensor. + + default_scale: float, optional + Default scale for the case that the computed scale is zero. 
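Before the rest of this class, a minimal sketch of the quint8-activation / qint8-weight convention its forward builds on, with standalone tensors rather than the layer's own parameters (assumes a build where the fbgemm or qnnpack quantized backend is available):

import torch

x = torch.randn(2, 8)
w = torch.randn(4, 8)

xq = torch.quantize_per_tensor(x, 0.1, 128, torch.quint8)   # activations: quint8, zero_point 128
wq = torch.quantize_per_tensor(w, 0.05, 0, torch.qint8)     # weights: qint8, symmetric

yq = torch.nn.quantized.functional.linear(xq, wq, bias=None, scale=0.1, zero_point=128)
print(yq.dtype, yq.dequantize().shape)                      # torch.quint8 torch.Size([2, 4])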
+ + + Returns + ---------- + quantized_x: tensors + + + """ + scale, zero_point = self.get_scale_and_zero_point(x) + if scale == 0: + scale = torch.tensor([default_scale]) # avoid zero scale + quantized_x = torch.quantize_per_tensor(x, scale, zero_point, torch.qint8) + + return quantized_x + + def get_dequantized_tensor(self, x): + + dequantized_x = x.dequantize() + + return dequantized_x + + + def quantize(self): + self.quantized_mu_weight = Parameter(self.get_quantized_tensor(self.mu_weight), requires_grad=False) + self.quantized_sigma_weight = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_weight))), requires_grad=False) + delattr(self, "mu_weight") + delattr(self, "rho_weight") + + self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + delattr(self, "mu_bias") + delattr(self, "rho_bias") + + def dequantize(self): + self.mu_weight = self.get_dequantized_tensor(self.quantized_mu_weight) + self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) + + self.mu_bias = self.get_dequantized_tensor(self.quantized_mu_bias) + self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) + return + + def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + """ Forward pass + + Parameters + ---------- + x: tensors + Input tensor. + + normal_scale: float, optional + Scale for quantized tensor sampled from normal distribution. + since 99.7% values will lie within 3 standard deviations, the original range is set as 6. + + default_scale: float, optional + Default scale for quantized input tensor and quantized output tensor. + Set to 0.1 by grid search. + + default_zero_point: int, optional + Default zero point for quantized input tensor and quantized output tensor. + Set to 128 for quint8 tensor. + + + + Returns + ---------- + out: tensors + Output tensor. Already dequantized. + KL: float + set to 0 since we diable KL divergence computation in quantized layers. 
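A quick numeric check of the normal_scale = 6/255 choice documented here: that scale gives qint8 a span of roughly +/-3, so about 99.7% of standard-normal draws survive the round trip and the remainder are clipped.

import torch

eps = torch.randn(100000)
q = torch.quantize_per_tensor(eps, 6 / 255, 0, torch.qint8)

print(127 * 6 / 255)                                          # ~2.99, about three standard deviations
print((eps.abs() > 3).float().mean())                         # ~0.003, the fraction that gets clipped
print((q.dequantize() - eps).abs()[eps.abs() < 2.9].max())    # within half a step (~0.012) for in-range values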
+ + + """ + + if self.dnn_to_bnn_flag: + return_kl = False + + bias = None + if self.quantized_mu_bias is not None: + if not self.is_dequant: + self.dequantize() + self.is_dequant = True + bias = self.mu_bias + + outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) + delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) + + bias = None + if self.quantized_sigma_bias is not None: + eps_bias = self.eps_bias.data.normal_() + bias = (self.sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.linear(x, + weight=delta_weight, bias=bias, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + out = out.dequantize() + + if return_kl: + return out, 0 + + return out diff --git a/bayesian_torch/ao/quantization/__init__.py b/bayesian_torch/ao/quantization/__init__.py new file mode 100644 index 0000000..5d672c7 --- /dev/null +++ b/bayesian_torch/ao/quantization/__init__.py @@ -0,0 +1,2 @@ +## bayesian_torch.quantization.prepare +## bayesian_torch.quantization.convert \ No newline at end of file diff --git a/bayesian_torch/ao/quantization/quantize.py b/bayesian_torch/ao/quantization/quantize.py new file mode 100644 index 0000000..fc7975a --- /dev/null +++ b/bayesian_torch/ao/quantization/quantize.py @@ -0,0 +1,9 @@ +""" +define prepare and convert function +""" + +def prepare(): + return + +def convert(): + return \ No newline at end of file From 3360bcf06a9cb46ea9e610521fbecae3bd4d5252 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Feb 2023 22:40:12 -0500 Subject: [PATCH 54/69] example for prepare function --- .../variational_layers/conv_variational2.py | 245 ++++++++++++++++++ 1 file changed, 245 insertions(+) create mode 100644 bayesian_torch/layers/variational_layers/conv_variational2.py diff --git a/bayesian_torch/layers/variational_layers/conv_variational2.py b/bayesian_torch/layers/variational_layers/conv_variational2.py new file mode 100644 index 0000000..8ec18d3 --- /dev/null +++ b/bayesian_torch/layers/variational_layers/conv_variational2.py @@ -0,0 +1,245 @@ +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# +# Convolutional Layers with reparameterization estimator to perform variational +# inference in Bayesian neural networks. Reparameterization layers +# enables Monte Carlo approximation of the distribution over 'kernel' and 'bias'. +# +# Kullback-Leibler divergence between the surrogate posterior and prior is computed +# and returned along with the tensors of outputs after convolution operation, which is +# required to compute Evidence Lower Bound (ELBO). +# +# @authors: Ranganath Krishnan +# +# ====================================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter +from bayesian_torch.layers.base_variational_layer import BaseVariationalLayer_, get_kernel_size +import math +from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver +from torch.quantization.qconfig import QConfig + +__all__ = [ + # 'Conv1dReparameterization', + # 'Conv2dReparameterization', + # 'Conv3dReparameterization', + # 'ConvTranspose1dReparameterization', + # 'ConvTranspose2dReparameterization', + # 'ConvTranspose3dReparameterization', +] + + + + +class Conv2dReparameterization(BaseVariationalLayer_): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + prior_mean=0, + prior_variance=1, + posterior_mu_init=0, + posterior_rho_init=-3.0, + bias=True): + """ + Implements Conv2d layer with reparameterization trick. + + Inherits from bayesian_torch.layers.BaseVariationalLayer_ + + Parameters: + in_channels: int -> number of channels in the input image, + out_channels: int -> number of channels produced by the convolution, + kernel_size: int -> size of the convolving kernel, + stride: int -> stride of the convolution. Default: 1, + padding: int -> zero-padding added to both sides of the input. Default: 0, + dilation: int -> spacing between kernel elements. 
Default: 1, + groups: int -> number of blocked connections from input channels to output channels, + prior_mean: float -> mean of the prior arbitrary distribution to be used on the complexity cost, + prior_variance: float -> variance of the prior arbitrary distribution to be used on the complexity cost, + posterior_mu_init: float -> init trainable mu parameter representing mean of the approximate posterior, + posterior_rho_init: float -> init trainable rho parameter representing the sigma of the approximate posterior through softplus function, + bias: bool -> if set to False, the layer will not learn an additive bias. Default: True, + """ + + super(Conv2dReparameterization, self).__init__() + if in_channels % groups != 0: + raise ValueError('invalid in_channels size') + if out_channels % groups != 0: + raise ValueError('invalid in_channels size') + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.prior_mean = prior_mean + self.prior_variance = prior_variance + self.posterior_mu_init = posterior_mu_init, # mean of weight + # variance of weight --> sigma = log (1 + exp(rho)) + self.posterior_rho_init = posterior_rho_init, + self.bias = bias + + kernel_size = get_kernel_size(kernel_size, 2) + + self.mu_kernel = Parameter( + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) + self.rho_kernel = Parameter( + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1])) + self.register_buffer( + 'eps_kernel', + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), + persistent=False) + self.register_buffer( + 'prior_weight_mu', + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), + persistent=False) + self.register_buffer( + 'prior_weight_sigma', + torch.Tensor(out_channels, in_channels // groups, kernel_size[0], + kernel_size[1]), + persistent=False) + + if self.bias: + self.mu_bias = Parameter(torch.Tensor(out_channels)) + self.rho_bias = Parameter(torch.Tensor(out_channels)) + self.register_buffer('eps_bias', torch.Tensor(out_channels), persistent=False) + self.register_buffer('prior_bias_mu', torch.Tensor(out_channels), persistent=False) + self.register_buffer('prior_bias_sigma', + torch.Tensor(out_channels), + persistent=False) + else: + self.register_parameter('mu_bias', None) + self.register_parameter('rho_bias', None) + self.register_buffer('eps_bias', None, persistent=False) + self.register_buffer('prior_bias_mu', None, persistent=False) + self.register_buffer('prior_bias_sigma', None, persistent=False) + + self.init_parameters() + + def prepare(self): + myconfig = QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8), + weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8)) + self.quant = nn.ModuleList([torch.quantization.QuantStub(myconfig) for _ in range(10)]) + self.dequant = torch.quantization.DeQuantStub() + + def init_parameters(self): + self.prior_weight_mu.fill_(self.prior_mean) + self.prior_weight_sigma.fill_(self.prior_variance) + + self.mu_kernel.data.normal_(mean=self.posterior_mu_init[0], std=0.1) + self.rho_kernel.data.normal_(mean=self.posterior_rho_init[0], std=0.1) + if self.bias: + self.prior_bias_mu.fill_(self.prior_mean) + self.prior_bias_sigma.fill_(self.prior_variance) + + self.mu_bias.data.normal_(mean=self.posterior_mu_init[0], std=0.1) + 
self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], + std=0.1) + + def kl_loss(self): + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) + + return kl + + def forward(self, input, return_kl=True): + + input = self.quant[0](input) ### + + if self.dnn_to_bnn_flag: + return_kl = False + + sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) + eps_kernel = self.eps_kernel.data.normal_() + + sigma_weight = self.quant[1](sigma_weight) #### + eps_kernel = self.quant[2](eps_kernel) #### + mu_kernel = self.quant[3](self.mu_kernel) #### + + tmp_result = sigma_weight * eps_kernel + tmp_result = self.quant[4](tmp_result) #### + + weight = mu_kernel + tmp_result + + weight = self.quant[5](weight) #### + + if return_kl: + kl_weight = self.kl_div(self.mu_kernel, sigma_weight, + self.prior_weight_mu, self.prior_weight_sigma) + bias = None + + if self.bias: + sigma_bias = torch.log1p(torch.exp(self.rho_bias)) + eps_bias = self.eps_bias.data.normal_() + bias = self.mu_bias + (sigma_bias * eps_bias) + if return_kl: + kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, + self.prior_bias_sigma) + + out = F.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + + out = self.quant[6](out) #### + + if return_kl: + if self.bias: + kl = kl_weight + kl_bias + else: + kl = kl_weight + return out, kl + + return out + +if __name__=="__main__": + m = Conv2dReparameterization(3,3,3) + m.eval() + m.qconfig = torch.quantization.get_default_qconfig("fbgemm") + mp = torch.quantization.prepare(m) + input = torch.randn(3,3,4,4) + mp(input) + mq = torch.quantization.convert(mp) From 51bab43b90757f6404b148d41037feb580d47700 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 19 Feb 2023 20:36:21 -0500 Subject: [PATCH 55/69] quantization prepare function --- .../variational_layers/conv_variational.py | 35 ++- .../variational_layers/conv_variational2.py | 245 ------------------ 2 files changed, 34 insertions(+), 246 deletions(-) delete mode 100644 bayesian_torch/layers/variational_layers/conv_variational2.py diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 0d2ebfd..0fd065f 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -295,6 +295,15 @@ def __init__(self, self.register_buffer('prior_bias_sigma', None, persistent=False) self.init_parameters() + self.quant_prepare=False + + def prepare(self): + self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) + self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) + self.dequant = torch.quantization.DeQuantStub() + self.quant_prepare=True def init_parameters(self): self.prior_weight_mu.fill_(self.prior_mean) @@ -325,7 +334,8 @@ def forward(self, input, return_kl=True): sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() - weight = self.mu_kernel + (sigma_weight * eps_kernel) + tmp_result = sigma_weight * eps_kernel + weight = mu_kernel + tmp_result 
if return_kl: kl_weight = self.kl_div(self.mu_kernel, sigma_weight, @@ -342,6 +352,20 @@ def forward(self, input, return_kl=True): out = F.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) + + if self.quant_prepare: + # quint8 quantstub + input = self.quint_quant[0](input) # input + out = self.quint_quant[1](out) # output + + # qint8 quantstub + sigma_weight = self.qint_quant[0](sigma_weight) # weight + mu_kernel = self.qint_quant[1](self.mu_kernel) # weight + eps_kernel = self.qint_quant[2](eps_kernel) # random variable + tmp_result =self.qint_quant[3](tmp_result) # multiply activation + weight = self.qint_quant[4](weight) # add activatation + + if return_kl: if self.bias: kl = kl_weight + kl_bias @@ -946,3 +970,12 @@ def forward(self, input, return_kl=True): return out, kl return out + +if __name__=="__main__": + m = Conv2dReparameterization(3,3,3) + m.eval() + m.qconfig = torch.quantization.get_default_qconfig("fbgemm") + mp = torch.quantization.prepare(m) + input = torch.randn(3,3,4,4) + mp(input) + mq = torch.quantization.convert(mp) \ No newline at end of file diff --git a/bayesian_torch/layers/variational_layers/conv_variational2.py b/bayesian_torch/layers/variational_layers/conv_variational2.py deleted file mode 100644 index 8ec18d3..0000000 --- a/bayesian_torch/layers/variational_layers/conv_variational2.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (C) 2021 Intel Labs -# -# BSD-3-Clause License -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS -# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT -# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE -# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# -# Convolutional Layers with reparameterization estimator to perform variational -# inference in Bayesian neural networks. Reparameterization layers -# enables Monte Carlo approximation of the distribution over 'kernel' and 'bias'. -# -# Kullback-Leibler divergence between the surrogate posterior and prior is computed -# and returned along with the tensors of outputs after convolution operation, which is -# required to compute Evidence Lower Bound (ELBO). 
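The QuantStub modules registered in prepare() above only hold observers until conversion; a small standalone sketch of what one such observer does during calibration, using the same torch.quantization API:

import torch
from torch.quantization.observer import HistogramObserver

obs = HistogramObserver(dtype=torch.quint8)    # same observer type the activation qconfig uses
for _ in range(4):                             # calibration: feed representative activations
    obs(torch.randn(32, 16))
scale, zero_point = obs.calculate_qparams()
print(scale.item(), zero_point.item())         # the qparams a later convert step can bake in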
-# -# @authors: Ranganath Krishnan -# -# ====================================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn import Parameter -from bayesian_torch.layers.base_variational_layer import BaseVariationalLayer_, get_kernel_size -import math -from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver -from torch.quantization.qconfig import QConfig - -__all__ = [ - # 'Conv1dReparameterization', - # 'Conv2dReparameterization', - # 'Conv3dReparameterization', - # 'ConvTranspose1dReparameterization', - # 'ConvTranspose2dReparameterization', - # 'ConvTranspose3dReparameterization', -] - - - - -class Conv2dReparameterization(BaseVariationalLayer_): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - prior_mean=0, - prior_variance=1, - posterior_mu_init=0, - posterior_rho_init=-3.0, - bias=True): - """ - Implements Conv2d layer with reparameterization trick. - - Inherits from bayesian_torch.layers.BaseVariationalLayer_ - - Parameters: - in_channels: int -> number of channels in the input image, - out_channels: int -> number of channels produced by the convolution, - kernel_size: int -> size of the convolving kernel, - stride: int -> stride of the convolution. Default: 1, - padding: int -> zero-padding added to both sides of the input. Default: 0, - dilation: int -> spacing between kernel elements. Default: 1, - groups: int -> number of blocked connections from input channels to output channels, - prior_mean: float -> mean of the prior arbitrary distribution to be used on the complexity cost, - prior_variance: float -> variance of the prior arbitrary distribution to be used on the complexity cost, - posterior_mu_init: float -> init trainable mu parameter representing mean of the approximate posterior, - posterior_rho_init: float -> init trainable rho parameter representing the sigma of the approximate posterior through softplus function, - bias: bool -> if set to False, the layer will not learn an additive bias. 
Default: True, - """ - - super(Conv2dReparameterization, self).__init__() - if in_channels % groups != 0: - raise ValueError('invalid in_channels size') - if out_channels % groups != 0: - raise ValueError('invalid in_channels size') - - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - self.dilation = dilation - self.groups = groups - self.prior_mean = prior_mean - self.prior_variance = prior_variance - self.posterior_mu_init = posterior_mu_init, # mean of weight - # variance of weight --> sigma = log (1 + exp(rho)) - self.posterior_rho_init = posterior_rho_init, - self.bias = bias - - kernel_size = get_kernel_size(kernel_size, 2) - - self.mu_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size[0], - kernel_size[1])) - self.rho_kernel = Parameter( - torch.Tensor(out_channels, in_channels // groups, kernel_size[0], - kernel_size[1])) - self.register_buffer( - 'eps_kernel', - torch.Tensor(out_channels, in_channels // groups, kernel_size[0], - kernel_size[1]), - persistent=False) - self.register_buffer( - 'prior_weight_mu', - torch.Tensor(out_channels, in_channels // groups, kernel_size[0], - kernel_size[1]), - persistent=False) - self.register_buffer( - 'prior_weight_sigma', - torch.Tensor(out_channels, in_channels // groups, kernel_size[0], - kernel_size[1]), - persistent=False) - - if self.bias: - self.mu_bias = Parameter(torch.Tensor(out_channels)) - self.rho_bias = Parameter(torch.Tensor(out_channels)) - self.register_buffer('eps_bias', torch.Tensor(out_channels), persistent=False) - self.register_buffer('prior_bias_mu', torch.Tensor(out_channels), persistent=False) - self.register_buffer('prior_bias_sigma', - torch.Tensor(out_channels), - persistent=False) - else: - self.register_parameter('mu_bias', None) - self.register_parameter('rho_bias', None) - self.register_buffer('eps_bias', None, persistent=False) - self.register_buffer('prior_bias_mu', None, persistent=False) - self.register_buffer('prior_bias_sigma', None, persistent=False) - - self.init_parameters() - - def prepare(self): - myconfig = QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8), - weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8)) - self.quant = nn.ModuleList([torch.quantization.QuantStub(myconfig) for _ in range(10)]) - self.dequant = torch.quantization.DeQuantStub() - - def init_parameters(self): - self.prior_weight_mu.fill_(self.prior_mean) - self.prior_weight_sigma.fill_(self.prior_variance) - - self.mu_kernel.data.normal_(mean=self.posterior_mu_init[0], std=0.1) - self.rho_kernel.data.normal_(mean=self.posterior_rho_init[0], std=0.1) - if self.bias: - self.prior_bias_mu.fill_(self.prior_mean) - self.prior_bias_sigma.fill_(self.prior_variance) - - self.mu_bias.data.normal_(mean=self.posterior_mu_init[0], std=0.1) - self.rho_bias.data.normal_(mean=self.posterior_rho_init[0], - std=0.1) - - def kl_loss(self): - sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) - kl = self.kl_div(self.mu_kernel, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) - if self.bias: - sigma_bias = torch.log1p(torch.exp(self.rho_bias)) - kl += self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, self.prior_bias_sigma) - - return kl - - def forward(self, input, return_kl=True): - - input = self.quant[0](input) ### - - if self.dnn_to_bnn_flag: - return_kl = False - - sigma_weight = torch.log1p(torch.exp(self.rho_kernel)) - eps_kernel = self.eps_kernel.data.normal_() - 
- sigma_weight = self.quant[1](sigma_weight) #### - eps_kernel = self.quant[2](eps_kernel) #### - mu_kernel = self.quant[3](self.mu_kernel) #### - - tmp_result = sigma_weight * eps_kernel - tmp_result = self.quant[4](tmp_result) #### - - weight = mu_kernel + tmp_result - - weight = self.quant[5](weight) #### - - if return_kl: - kl_weight = self.kl_div(self.mu_kernel, sigma_weight, - self.prior_weight_mu, self.prior_weight_sigma) - bias = None - - if self.bias: - sigma_bias = torch.log1p(torch.exp(self.rho_bias)) - eps_bias = self.eps_bias.data.normal_() - bias = self.mu_bias + (sigma_bias * eps_bias) - if return_kl: - kl_bias = self.kl_div(self.mu_bias, sigma_bias, self.prior_bias_mu, - self.prior_bias_sigma) - - out = F.conv2d(input, weight, bias, self.stride, self.padding, - self.dilation, self.groups) - - out = self.quant[6](out) #### - - if return_kl: - if self.bias: - kl = kl_weight + kl_bias - else: - kl = kl_weight - return out, kl - - return out - -if __name__=="__main__": - m = Conv2dReparameterization(3,3,3) - m.eval() - m.qconfig = torch.quantization.get_default_qconfig("fbgemm") - mp = torch.quantization.prepare(m) - input = torch.randn(3,3,4,4) - mp(input) - mq = torch.quantization.convert(mp) From df094087e4c05cb1d411bae669947d6685c21650 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 19 Feb 2023 20:43:07 -0500 Subject: [PATCH 56/69] import quantization module --- bayesian_torch/layers/variational_layers/conv_variational.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index 0fd065f..a9e33ba 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -48,6 +48,8 @@ from torch.nn import Parameter from ..base_variational_layer import BaseVariationalLayer_, get_kernel_size import math +from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver +from torch.quantization.qconfig import QConfig __all__ = [ 'Conv1dReparameterization', From b4ce3f5e3d042432a63a0c17d32453165370b839 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 19 Feb 2023 23:08:39 -0500 Subject: [PATCH 57/69] finish quantization function --- .../layers/variational_layers/conv_variational.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index a9e33ba..bb7d1a7 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -301,9 +301,9 @@ def __init__(self, def prepare(self): self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) + QConfig(weight=HistogramObserver.with_args(dtype=torch.qint8), activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) + QConfig(weight=HistogramObserver.with_args(dtype=torch.quint8), activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) self.dequant = torch.quantization.DeQuantStub() self.quant_prepare=True @@ -337,7 +337,7 @@ def forward(self, input, return_kl=True): sigma_weight = 
torch.log1p(torch.exp(self.rho_kernel)) eps_kernel = self.eps_kernel.data.normal_() tmp_result = sigma_weight * eps_kernel - weight = mu_kernel + tmp_result + weight = self.mu_kernel + tmp_result if return_kl: kl_weight = self.kl_div(self.mu_kernel, sigma_weight, @@ -976,6 +976,7 @@ def forward(self, input, return_kl=True): if __name__=="__main__": m = Conv2dReparameterization(3,3,3) m.eval() + m.prepare() m.qconfig = torch.quantization.get_default_qconfig("fbgemm") mp = torch.quantization.prepare(m) input = torch.randn(3,3,4,4) From 9b5a9dca7a62091ec586784b409b3990cf01142c Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 1 Mar 2023 23:07:58 -0500 Subject: [PATCH 58/69] quantization module prototype --- bayesian_torch/__init__.py | 1 + bayesian_torch/ao/quantization/__init__.py | 3 +- bayesian_torch/ao/quantization/quantize.py | 168 +++++++++++++++++- bayesian_torch/examples/quantization_test.py | 34 ++++ .../quantize_conv_variational.py | 70 +++++++- .../bayesian/resnet_variational_large.py | 4 +- bayesian_torch/quantization/__init__.py | 3 + bayesian_torch/quantization/quantize.py | 2 + 8 files changed, 272 insertions(+), 13 deletions(-) create mode 100644 bayesian_torch/examples/quantization_test.py create mode 100644 bayesian_torch/quantization/__init__.py create mode 100644 bayesian_torch/quantization/quantize.py diff --git a/bayesian_torch/__init__.py b/bayesian_torch/__init__.py index e69de29..da64647 100644 --- a/bayesian_torch/__init__.py +++ b/bayesian_torch/__init__.py @@ -0,0 +1 @@ +from bayesian_torch import quantization as quantization \ No newline at end of file diff --git a/bayesian_torch/ao/quantization/__init__.py b/bayesian_torch/ao/quantization/__init__.py index 5d672c7..dab2378 100644 --- a/bayesian_torch/ao/quantization/__init__.py +++ b/bayesian_torch/ao/quantization/__init__.py @@ -1,2 +1,3 @@ ## bayesian_torch.quantization.prepare -## bayesian_torch.quantization.convert \ No newline at end of file +## bayesian_torch.quantization.convert +from .quantize import * \ No newline at end of file diff --git a/bayesian_torch/ao/quantization/quantize.py b/bayesian_torch/ao/quantization/quantize.py index fc7975a..06fa99f 100644 --- a/bayesian_torch/ao/quantization/quantize.py +++ b/bayesian_torch/ao/quantization/quantize.py @@ -1,9 +1,163 @@ -""" -define prepare and convert function -""" +# Copyright (C) 2021 Intel Labs +# +# BSD-3-Clause License +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
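Worth noting on the QConfig change in this patch: QConfig is a namedtuple pairing an activation observer factory with a weight observer factory, so both fields are normally supplied, as the updated prepare() now does. For example:

import torch
from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver
from torch.quantization.qconfig import QConfig

qconfig = QConfig(
    activation=HistogramObserver.with_args(dtype=torch.quint8),
    weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8,
                                              qscheme=torch.per_channel_symmetric),
)
stub = torch.quantization.QuantStub(qconfig)   # a stub carrying this observer pair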
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +# OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Define prepare and convert function +# -def prepare(): - return +import torch +import torch.nn as nn +from bayesian_torch.models.bayesian.resnet_variational_large import ( + BasicBlock, + Bottleneck, + ResNet, +) +from typing import Any, List, Optional, Type, Union +from torch import Tensor +from bayesian_torch.models.bnn_to_qbnn import bnn_to_qbnn +# import copy -def convert(): - return \ No newline at end of file +__all__ = [ + "prepare", + "convert", +] + +class QuantizableBasicBlock(BasicBlock): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.add_relu = torch.nn.quantized.FloatFunctional() + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = self.add_relu.add_relu(out, identity) + + return out + + +class QuantizableBottleneck(Bottleneck): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.skip_add_relu = nn.quantized.FloatFunctional() + self.relu1 = nn.ReLU(inplace=False) + self.relu2 = nn.ReLU(inplace=False) + + def forward(self, x: Tensor) -> Tensor: + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu1(out) + out = self.conv2(out) + out = self.bn2(out) + out = self.relu2(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + out = self.skip_add_relu.add_relu(out, identity) + + return out + + +class QuantizableResNet(ResNet): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + self.quant = torch.ao.quantization.QuantStub() + self.dequant = torch.ao.quantization.DeQuantStub() + + def forward(self, x: Tensor) -> Tensor: + x = self.quant(x) + + x= self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + for layer in self.layer1: + x=layer(x) + + for layer in self.layer2: + x = layer(x) + + for layer in self.layer3: + x = layer(x) + + for layer in self.layer4: + x = layer(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + + + # x = self.dequant(x) + return x + + + +def enable_prepare(m): + for name, value in list(m._modules.items()): + if m._modules[name]._modules: + enable_prepare(m._modules[name]) + elif "Reparameterization" in m._modules[name].__class__.__name__ or "Flipout" in m._modules[name].__class__.__name__: + prepare = getattr(m._modules[name], "prepare", None) + if callable(prepare): + m._modules[name].prepare() + m._modules[name].dnn_to_bnn_flag=True + + +def prepare(model): + """ + 1. construct quantizable model + 2. traverse the model to enable the prepare function in each layer + 3. 
run torch.quantize.prepare() + """ + qmodel = QuantizableResNet(QuantizableBottleneck, [3, 4, 6, 3]) + qmodel.load_state_dict(model.state_dict()) + qmodel.eval() + enable_prepare(qmodel) + qmodel.qconfig = torch.quantization.get_default_qconfig("fbgemm") + qmodel = torch.quantization.prepare(qmodel) + + return qmodel + +def convert(model): + qmodel = torch.quantization.convert(model) # torch layers + bnn_to_qbnn(qmodel) # bayesian layers + return qmodel \ No newline at end of file diff --git a/bayesian_torch/examples/quantization_test.py b/bayesian_torch/examples/quantization_test.py new file mode 100644 index 0000000..bc18c25 --- /dev/null +++ b/bayesian_torch/examples/quantization_test.py @@ -0,0 +1,34 @@ +# import torch +# import bayesian_torch +# from bayesian_torch.ao.quantization import prepare, convert +# import bayesian_torch.models.bayesian.resnet_variational_large as resnet +# from bayesian_torch.models.bnn_to_qbnn import bnn_to_qbnn + +# model = resnet.__dict__['resnet50']() + +# input = torch.randn(1,3,224,224) +# mp = prepare(model) +# mp(input) # haven't replaced the batchnorm layer +# qmodel = torch.quantization.convert(mp) +# bnn_to_qbnn(qmodel) + + +import torch +import bayesian_torch +import bayesian_torch.models.bayesian.resnet_variational_large as resnet + +m = resnet.__dict__['resnet50']() +# alternative way to construct a bnn model +# from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn +# m = torchvision.models.resnet50(weights="IMAGENET1K_V1") +# dnn_to_bnn(m) + + + +mp = bayesian_torch.quantization.prepare(m) +input = torch.randn(1,3,224,224) +mp(input) # calibration +mq = bayesian_torch.quantization.convert(mp) + + + diff --git a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py index a8b25dc..31ed9e7 100644 --- a/bayesian_torch/layers/variational_layers/quantize_conv_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_conv_variational.py @@ -93,6 +93,7 @@ def __init__(self, self.bn_eps = None self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -237,7 +238,26 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s if self.dnn_to_bnn_flag: return_kl = False - if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if self.quant_dict is not None: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point']) + bias = None + + ## DO NOT QUANTIZE BIAS!!! 
+ if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, self.quant_dict[3]['scale'], self.quant_dict[3]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv1d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=self.quant_dict[4]['scale'], zero_point=self.quant_dict[4]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + + elif not enable_int8_compute: # Deprecated. Use this method for reducing model size only. if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -323,6 +343,7 @@ def __init__(self, self.bn_eps = None self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -419,6 +440,10 @@ def quantize(self): delattr(self, "bn_running_var") delattr(self, "bn_eps") + delattr(self, "qint_quant") + delattr(self, "quint_quant") + delattr(self, "dequant") + def dequantize(self): # Deprecated. Only for forward mode #1. self.mu_kernel = self.get_dequantized_tensor(self.quantized_mu_weight) self.sigma_weight = self.get_dequantized_tensor(self.quantized_sigma_weight) @@ -466,7 +491,26 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s if self.dnn_to_bnn_flag: return_kl = False - if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if self.quant_dict is not None: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point']) + bias = None + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, self.quant_dict[3]['scale'], self.quant_dict[3]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv2d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=self.quant_dict[4]['scale'], zero_point=self.quant_dict[4]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + + elif not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() self.is_dequant = True @@ -550,6 +594,7 @@ def __init__(self, self.bn_eps = None self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -693,7 +738,26 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s if self.dnn_to_bnn_flag: return_kl = False - if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if self.quant_dict is not None: + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point']) + bias = None + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, self.quant_dict[3]['scale'], self.quant_dict[3]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.conv3d(input, weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=self.quant_dict[4]['scale'], zero_point=self.quant_dict[4]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + + elif not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() self.is_dequant = True diff --git a/bayesian_torch/models/bayesian/resnet_variational_large.py b/bayesian_torch/models/bayesian/resnet_variational_large.py index bc641d6..6fdf561 100644 --- a/bayesian_torch/models/bayesian/resnet_variational_large.py +++ b/bayesian_torch/models/bayesian/resnet_variational_large.py @@ -14,7 +14,7 @@ from bayesian_torch.layers import BatchNorm2dLayer __all__ = [ - 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152' + 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'BasicBlock', 'Bottleneck' ] prior_mu = 0.0 @@ -200,7 +200,7 @@ def _make_layer(self, block, planes, blocks, stride=1): posterior_mu_init=posterior_mu_init, posterior_rho_init=posterior_rho_init, bias=False), - BatchNorm2dLayer(planes * block.expansion), + nn.BatchNorm2d(planes * block.expansion), ) layers = [] diff --git a/bayesian_torch/quantization/__init__.py b/bayesian_torch/quantization/__init__.py new file mode 100644 index 0000000..91a6e8b --- /dev/null +++ b/bayesian_torch/quantization/__init__.py @@ -0,0 +1,3 @@ +from .quantize import * + +# __all__ = ['prepare', 'convert'] \ No newline at end of file diff --git a/bayesian_torch/quantization/quantize.py b/bayesian_torch/quantization/quantize.py new file mode 100644 index 0000000..967f79a --- /dev/null +++ b/bayesian_torch/quantization/quantize.py @@ -0,0 +1,2 @@ +from bayesian_torch.ao.quantization.quantize import prepare +from bayesian_torch.ao.quantization.quantize import convert \ No newline at end of file From 9b0118f11f72e214a66f5b8b721dbce3b1eb9f2a Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Fri, 3 Mar 2023 09:30:46 -0500 Subject: [PATCH 59/69] bnn to qbnn --- bayesian_torch/models/bnn_to_qbnn.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index d689465..37441b5 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -119,6 +119,15 @@ def qbnn_conv_layer(d): groups=d.groups, ) qbnn_layer.__dict__.update(d.__dict__) + + if d.quant_prepare: + qbnn_layer.quant_dict = [] + for qstub in d.qint_quant: + qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict = qbnn_layer.quant_dict[2:] + for qstub in d.quint_quant: + qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quantize() if d.dnn_to_bnn_flag: qbnn_layer.dnn_to_bnn_flag = True @@ -180,7 +189,10 @@ def batch_norm_folding(conv, bn): def bnn_to_qbnn(m, fuse_conv_bn=False): for name, value in list(m._modules.items()): if m._modules[name]._modules: - bnn_to_qbnn(m._modules[name], fuse_conv_bn=fuse_conv_bn) + if "Conv" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_conv_layer(m._modules[name])) + else: + bnn_to_qbnn(m._modules[name], fuse_conv_bn=fuse_conv_bn) elif "Linear" in m._modules[name].__class__.__name__: setattr(m, name, qbnn_linear_layer(m._modules[name])) elif "LSTM" in m._modules[name].__class__.__name__: From b780aad54cb3c6baba568b53c794c38844933455 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 6 Mar 2023 02:11:38 -0500 Subject: [PATCH 60/69] qbnn example --- .../main_bayesian_imagenet_bnn2qbnn.py | 23 ++++++++++----- .../variational_layers/linear_variational.py | 29 +++++++++++++++++-- .../quantize_linear_variational.py | 22 +++++++++++++- bayesian_torch/models/bnn_to_qbnn.py | 9 ++++++ 4 
files changed, 72 insertions(+), 11 deletions(-) diff --git a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py index 1577651..2de3604 100644 --- a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py +++ b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py @@ -16,8 +16,8 @@ import bayesian_torch.models.bayesian.resnet_variational_large as resnet import numpy as np from bayesian_torch.models.bnn_to_qbnn import bnn_to_qbnn -# import bayesian_torch.models.bayesian.quantized_resnet_variational_large as qresnet -import bayesian_torch.models.bayesian.quantized_resnet_flipout_large as qresnet +import bayesian_torch.models.bayesian.quantized_resnet_variational_large as qresnet +# import bayesian_torch.models.bayesian.quantized_resnet_flipout_large as qresnet torch.cuda.is_available = lambda : False os.environ["CUDA_VISIBLE_DEVICES"] = "-1" @@ -262,9 +262,16 @@ def main(): model.load_state_dict(checkpoint["state_dict"]) model.module = model.module.cpu() - bnn_to_qbnn(model, fuse_conv_bn=False) # only replaces linear and conv layers + mp = bayesian_torch.quantization.prepare(model) + evaluate(args, mp, val_loader) # calibration + qmodel = bayesian_torch.quantization.convert(mp) + evaluate(args, qmodel, val_loader) + + - model = model.cpu() + # bnn_to_qbnn(model, fuse_conv_bn=False) # only replaces linear and conv layers + + # model = model.cpu() # save weights # save_checkpoint( @@ -278,16 +285,16 @@ def main(): # args.save_dir, # 'quantized_bayesian_q{}_imagenet.pth'.format(args.arch))) - qmodel = torch.nn.DataParallel(qresnet.__dict__['q'+args.arch](bias=False)) # set bias=True to make qconv has bias - qmodel.module.quant_then_dequant(qmodel, fuse_conv_bn=False) + # qmodel = torch.nn.DataParallel(qresnet.__dict__['q'+args.arch](bias=False)) # set bias=True to make qconv has bias + # qmodel.module.quant_then_dequant(qmodel, fuse_conv_bn=False) # load weights # checkpoint_file = args.save_dir + "/quantized_bayesian_q{}_imagenet.pth".format(args.arch) # checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) # qmodel.load_state_dict(checkpoint["state_dict"]) - qmodel.load_state_dict(model.state_dict()) - evaluate(args, qmodel, val_loader) + # qmodel.load_state_dict(model.state_dict()) + # evaluate(args, qmodel, val_loader) if __name__ == "__main__": main() diff --git a/bayesian_torch/layers/variational_layers/linear_variational.py b/bayesian_torch/layers/variational_layers/linear_variational.py index 7efb667..d69bfff 100644 --- a/bayesian_torch/layers/variational_layers/linear_variational.py +++ b/bayesian_torch/layers/variational_layers/linear_variational.py @@ -116,6 +116,15 @@ def __init__(self, self.register_buffer('eps_bias', None, persistent=False) self.init_parameters() + self.quant_prepare=False + + def prepare(self): + self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=HistogramObserver.with_args(dtype=torch.qint8), activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) + self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=HistogramObserver.with_args(dtype=torch.quint8), activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) + self.dequant = torch.quantization.DeQuantStub() + self.quant_prepare=True def init_parameters(self): self.prior_weight_mu.fill_(self.prior_mean) @@ -147,8 +156,10 @@ def forward(self, input, return_kl=True): if self.dnn_to_bnn_flag: return_kl = False 
sigma_weight = torch.log1p(torch.exp(self.rho_weight)) - weight = self.mu_weight + \ - (sigma_weight * self.eps_weight.data.normal_()) + eps_weight = self.eps_weight.data.normal_() + tmp_result = sigma_weight * eps_kernel + weight = self.mu_weight + tmp_result + if return_kl: kl_weight = self.kl_div(self.mu_weight, sigma_weight, self.prior_weight_mu, self.prior_weight_sigma) @@ -162,6 +173,20 @@ def forward(self, input, return_kl=True): self.prior_bias_sigma) out = F.linear(input, weight, bias) + + if self.quant_prepare: + # quint8 quantstub + input = self.quint_quant[0](input) # input + out = self.quint_quant[1](out) # output + + # qint8 quantstub + sigma_weight = self.qint_quant[0](sigma_weight) # weight + mu_weight = self.qint_quant[1](self.mu_weight) # weight + eps_weight = self.qint_quant[2](eps_weight) # random variable + tmp_result =self.qint_quant[3](tmp_result) # multiply activation + weight = self.qint_quant[4](weight) # add activatation + + if return_kl: if self.mu_bias is not None: kl = kl_weight + kl_bias diff --git a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py index e666f9b..d2e48bc 100644 --- a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py @@ -53,6 +53,7 @@ def __init__(self, out_features) self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -168,7 +169,26 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s if self.dnn_to_bnn_flag: return_kl = False - if not enable_int8_compute: # Deprecated. Use this method for reducing model size only. + if self.quant_dict is not None: + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point']) + bias = None + + ## DO NOT QUANTIZE BIAS!!! + if self.bias: + if self.quantized_sigma_bias is None: # the case that bias comes from bn fusion + bias = self.quantized_mu_bias + else: # original case + bias = self.quantized_mu_bias + (self.quantized_sigma_bias * self.eps_bias.data.normal_()) + + if input.dtype!=torch.quint8: # check if input has been quantized + input = torch.quantize_per_tensor(input, self.quant_dict[3]['scale'], self.quant_dict[3]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + out = torch.nn.quantized.functional.linear(input, weight, bias, scale=self.quant_dict[4]['scale'], zero_point=self.quant_dict[4]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + out = out.dequantize() + + elif not enable_int8_compute: # Deprecated. Use this method for reducing model size only. 
if not self.is_dequant: self.dequantize() self.is_dequant = True diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index 37441b5..7660aa0 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -101,6 +101,15 @@ def qbnn_linear_layer(d): out_features=d.out_features, ) qbnn_layer.__dict__.update(d.__dict__) + + if d.quant_prepare: + qbnn_layer.quant_dict = [] + for qstub in d.qint_quant: + qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict = qbnn_layer.quant_dict[2:] + for qstub in d.quint_quant: + qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quantize() if d.dnn_to_bnn_flag: qbnn_layer.dnn_to_bnn_flag = True From 87488e2887be351060b46e8048e7521994a0e41d Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 6 Mar 2023 02:54:34 -0500 Subject: [PATCH 61/69] qbnn performance test --- .../layers/variational_layers/conv_variational.py | 6 +++--- .../layers/variational_layers/linear_variational.py | 8 +++++--- .../variational_layers/quantize_linear_variational.py | 6 +++--- bayesian_torch/models/bnn_to_qbnn.py | 2 ++ 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/conv_variational.py b/bayesian_torch/layers/variational_layers/conv_variational.py index bb7d1a7..0d85f09 100644 --- a/bayesian_torch/layers/variational_layers/conv_variational.py +++ b/bayesian_torch/layers/variational_layers/conv_variational.py @@ -48,7 +48,7 @@ from torch.nn import Parameter from ..base_variational_layer import BaseVariationalLayer_, get_kernel_size import math -from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver +from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver, MinMaxObserver from torch.quantization.qconfig import QConfig __all__ = [ @@ -301,9 +301,9 @@ def __init__(self, def prepare(self): self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(weight=HistogramObserver.with_args(dtype=torch.qint8), activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) + QConfig(weight=MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric), activation=MinMaxObserver.with_args(dtype=torch.qint8,qscheme=torch.per_tensor_symmetric))) for _ in range(5)]) self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(weight=HistogramObserver.with_args(dtype=torch.quint8), activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) + QConfig(weight=MinMaxObserver.with_args(dtype=torch.quint8), activation=MinMaxObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) self.dequant = torch.quantization.DeQuantStub() self.quant_prepare=True diff --git a/bayesian_torch/layers/variational_layers/linear_variational.py b/bayesian_torch/layers/variational_layers/linear_variational.py index d69bfff..8bdf644 100644 --- a/bayesian_torch/layers/variational_layers/linear_variational.py +++ b/bayesian_torch/layers/variational_layers/linear_variational.py @@ -47,6 +47,8 @@ from torch.nn import Module, Parameter from ..base_variational_layer import BaseVariationalLayer_ import math +from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver, MinMaxObserver +from torch.quantization.qconfig import QConfig class LinearReparameterization(BaseVariationalLayer_): @@ -120,9 +122,9 @@ def __init__(self, 
def prepare(self): self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(weight=HistogramObserver.with_args(dtype=torch.qint8), activation=HistogramObserver.with_args(dtype=torch.qint8))) for _ in range(5)]) + QConfig(weight=MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric), activation=MinMaxObserver.with_args(dtype=torch.qint8,qscheme=torch.per_tensor_symmetric))) for _ in range(5)]) self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( - QConfig(weight=HistogramObserver.with_args(dtype=torch.quint8), activation=HistogramObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) + QConfig(weight=MinMaxObserver.with_args(dtype=torch.quint8), activation=MinMaxObserver.with_args(dtype=torch.quint8))) for _ in range(2)]) self.dequant = torch.quantization.DeQuantStub() self.quant_prepare=True @@ -157,7 +159,7 @@ def forward(self, input, return_kl=True): return_kl = False sigma_weight = torch.log1p(torch.exp(self.rho_weight)) eps_weight = self.eps_weight.data.normal_() - tmp_result = sigma_weight * eps_kernel + tmp_result = sigma_weight * eps_weight weight = self.mu_weight + tmp_result if return_kl: diff --git a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py index d2e48bc..34a970f 100644 --- a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py @@ -118,8 +118,8 @@ def quantize(self): delattr(self, "mu_weight") delattr(self, "rho_weight") - self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) - self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + self.quantized_mu_bias = self.mu_bias#Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = torch.log1p(torch.exp(self.rho_bias))#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) delattr(self, "mu_bias") delattr(self, "rho_bias") @@ -171,7 +171,7 @@ def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_s if self.quant_dict is not None: eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) # Quantize a tensor from normal distribution. 99.7% values will lie within 3 standard deviations, so the original range is set as 6. 
- weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) weight = torch.ops.quantized.add(weight, self.quantized_mu_weight, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point']) bias = None diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index 7660aa0..d201e75 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -200,6 +200,8 @@ def bnn_to_qbnn(m, fuse_conv_bn=False): if m._modules[name]._modules: if "Conv" in m._modules[name].__class__.__name__: setattr(m, name, qbnn_conv_layer(m._modules[name])) + elif "Linear" in m._modules[name].__class__.__name__: + setattr(m, name, qbnn_linear_layer(m._modules[name])) else: bnn_to_qbnn(m._modules[name], fuse_conv_bn=fuse_conv_bn) elif "Linear" in m._modules[name].__class__.__name__: From 1e8bd696aaefff4281a56d987b944a0f89e16626 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Tue, 7 Mar 2023 18:32:22 -0500 Subject: [PATCH 62/69] fix accuracy drop --- bayesian_torch/models/bnn_to_qbnn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index d201e75..09dcaed 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -165,8 +165,8 @@ def qbnn_batchnorm2d_layer(d): # qbnn_layer.bias = Parameter(get_quantized_tensor(d.bias), requires_grad=False) # qbnn_layer.running_mean = Parameter(get_quantized_tensor(d.running_mean), requires_grad=False) # qbnn_layer.running_var = Parameter(get_quantized_tensor(d.running_var), requires_grad=False) - qbnn_layer.scale = Parameter(torch.tensor([0.1]), requires_grad=False) - qbnn_layer.zero_point = Parameter(torch.tensor([128]), requires_grad=False) + # qbnn_layer.scale = Parameter(torch.tensor([0.1]), requires_grad=False) + # qbnn_layer.zero_point = Parameter(torch.tensor([128]), requires_grad=False) return qbnn_layer From b3d998094238164405c2ba0a73860b88eadf1b18 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Mar 2023 20:22:45 -0400 Subject: [PATCH 63/69] support load and store quantized models --- .../main_bayesian_imagenet_bnn2qbnn.py | 83 ++++++++++++------- bayesian_torch/models/bnn_to_qbnn.py | 12 +-- 2 files changed, 57 insertions(+), 38 deletions(-) diff --git a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py index 2de3604..73dea9b 100644 --- a/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py +++ b/bayesian_torch/examples/main_bayesian_imagenet_bnn2qbnn.py @@ -10,14 +10,17 @@ import torch.optim import torch.utils.data from torch.utils.tensorboard import SummaryWriter +import torchvision import torchvision.transforms as transforms import torchvision.datasets as datasets +import bayesian_torch import bayesian_torch.models.bayesian.resnet_variational_large as resnet import numpy as np from bayesian_torch.models.bnn_to_qbnn import bnn_to_qbnn -import bayesian_torch.models.bayesian.quantized_resnet_variational_large as qresnet -# import bayesian_torch.models.bayesian.quantized_resnet_flipout_large as qresnet +from bayesian_torch.models.dnn_to_bnn import dnn_to_bnn +# import bayesian_torch.models.bayesian.quantized_resnet_variational_large as qresnet +import 
bayesian_torch.models.bayesian.quantized_resnet_flipout_large as qresnet torch.cuda.is_available = lambda : False os.environ["CUDA_VISIBLE_DEVICES"] = "-1" @@ -68,7 +71,7 @@ "--save-dir", dest="save_dir", help="The directory used to save the trained models", - default="./checkpoint/bayesian", + default="../../bayesian-torch-20221214/bayesian_torch/checkpoint/bayesian", type=str, ) parser.add_argument( @@ -134,7 +137,7 @@ help="use tensorboard for logging and visualization of training progress", ) -def evaluate(args, model, val_loader): +def evaluate(args, model, val_loader, calibration=False): pred_probs_mc = [] test_loss = 0 correct = 0 @@ -159,6 +162,9 @@ def evaluate(args, model, val_loader): i+=1 end = time.time() print("inference throughput: ", i*args.val_batch_size / (end - begin), " images/s") + # break + if calibration and i==3: + break output = torch.cat(output_list, 1) output = torch.nn.functional.softmax(output, dim=2) @@ -232,7 +238,7 @@ def main(): tb_writer = None - valdir = os.path.join(args.data, 'Imagenet_2012Val') + valdir = os.path.join(args.data, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) val_dataset = datasets.ImageFolder( @@ -256,6 +262,20 @@ def main(): os.makedirs(args.save_dir) if args.mode == "test": + const_bnn_prior_parameters = { + "prior_mu": 0.0, + "prior_sigma": 1.0, + "posterior_mu_init": 0.0, + "posterior_rho_init": args.bnn_rho_init, + "type": "Flipout" if args.use_flipout_layers else "Reparameterization", # Flipout or Reparameterization + "moped_enable": moped_enable, # initialize mu/sigma from the dnn weights + "moped_delta": args.moped_delta_factor, + } + quantizable_model = torchvision.models.quantization.resnet50() + dnn_to_bnn(quantizable_model, const_bnn_prior_parameters) + model = torch.nn.DataParallel(quantizable_model) + + checkpoint_file = args.save_dir + "/bayesian_{}_imagenet.pth".format(args.arch) checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) @@ -263,38 +283,37 @@ def main(): model.module = model.module.cpu() mp = bayesian_torch.quantization.prepare(model) - evaluate(args, mp, val_loader) # calibration + evaluate(args, mp, val_loader, calibration=True) # calibration qmodel = bayesian_torch.quantization.convert(mp) evaluate(args, qmodel, val_loader) + # save weights + save_checkpoint( + { + 'epoch': None, + 'state_dict': qmodel.state_dict(), + 'best_prec1': None, + }, + True, + filename=os.path.join( + args.save_dir, + 'quantized_bayesian_{}_imagenetv2.pth'.format(args.arch))) + + # reconstruct (no calibration) + quantizable_model = torchvision.models.quantization.resnet50() + dnn_to_bnn(quantizable_model, const_bnn_prior_parameters) + model = torch.nn.DataParallel(quantizable_model) + mp = bayesian_torch.quantization.prepare(model) + qmodel1 = bayesian_torch.quantization.convert(mp) + # load + checkpoint_file = args.save_dir + "/quantized_bayesian_{}_imagenetv2.pth".format(args.arch) + checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) + qmodel1.load_state_dict(checkpoint["state_dict"]) + evaluate(args, qmodel1, val_loader) - # bnn_to_qbnn(model, fuse_conv_bn=False) # only replaces linear and conv layers - - # model = model.cpu() - # save weights - # save_checkpoint( - # { - # 'epoch': None, - # 'state_dict': model.state_dict(), - # 'best_prec1': None, - # }, - # True, - # filename=os.path.join( - # args.save_dir, - # 'quantized_bayesian_q{}_imagenet.pth'.format(args.arch))) - - # qmodel = 
torch.nn.DataParallel(qresnet.__dict__['q'+args.arch](bias=False)) # set bias=True to make qconv has bias - # qmodel.module.quant_then_dequant(qmodel, fuse_conv_bn=False) - - # load weights - # checkpoint_file = args.save_dir + "/quantized_bayesian_q{}_imagenet.pth".format(args.arch) - # checkpoint = torch.load(checkpoint_file, map_location=torch.device("cpu")) - # qmodel.load_state_dict(checkpoint["state_dict"]) - - # qmodel.load_state_dict(model.state_dict()) - # evaluate(args, qmodel, val_loader) + return mp, qmodel, qmodel1 if __name__ == "__main__": - main() + mp, qmodel, qmodel1 = main() diff --git a/bayesian_torch/models/bnn_to_qbnn.py b/bayesian_torch/models/bnn_to_qbnn.py index 09dcaed..85953cf 100644 --- a/bayesian_torch/models/bnn_to_qbnn.py +++ b/bayesian_torch/models/bnn_to_qbnn.py @@ -103,12 +103,12 @@ def qbnn_linear_layer(d): qbnn_layer.__dict__.update(d.__dict__) if d.quant_prepare: - qbnn_layer.quant_dict = [] + qbnn_layer.quant_dict = nn.ModuleList() for qstub in d.qint_quant: - qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict.append(nn.ParameterDict({'scale': torch.nn.Parameter(qstub.scale.float()), 'zero_point': torch.nn.Parameter(qstub.zero_point.float())})) qbnn_layer.quant_dict = qbnn_layer.quant_dict[2:] for qstub in d.quint_quant: - qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict.append(nn.ParameterDict({'scale': torch.nn.Parameter(qstub.scale.float()), 'zero_point': torch.nn.Parameter(qstub.zero_point.float())})) qbnn_layer.quantize() if d.dnn_to_bnn_flag: @@ -130,12 +130,12 @@ def qbnn_conv_layer(d): qbnn_layer.__dict__.update(d.__dict__) if d.quant_prepare: - qbnn_layer.quant_dict = [] + qbnn_layer.quant_dict = nn.ModuleList() for qstub in d.qint_quant: - qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict.append(nn.ParameterDict({'scale': torch.nn.Parameter(qstub.scale.float()), 'zero_point': torch.nn.Parameter(qstub.zero_point.float())})) qbnn_layer.quant_dict = qbnn_layer.quant_dict[2:] for qstub in d.quint_quant: - qbnn_layer.quant_dict.append({'scale':qstub.scale.item(), 'zero_point':qstub.zero_point.item()}) + qbnn_layer.quant_dict.append(nn.ParameterDict({'scale': torch.nn.Parameter(qstub.scale.float()), 'zero_point': torch.nn.Parameter(qstub.zero_point.float())})) qbnn_layer.quantize() if d.dnn_to_bnn_flag: From ccc52ee3cf9740c6b3dce0f859dd4533ed3093d6 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Mon, 13 Mar 2023 22:21:43 -0400 Subject: [PATCH 64/69] calibration support for quantized flipout layers --- .../layers/flipout_layers/conv_flipout.py | 48 +++++++++- .../layers/flipout_layers/linear_flipout.py | 45 ++++++++-- .../flipout_layers/quantized_conv_flipout.py | 90 ++++++++++++------- .../quantized_linear_flipout.py | 89 ++++++++++++------ 4 files changed, 202 insertions(+), 70 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index c92d24b..1bf0405 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -37,6 +37,8 @@ import torch.nn as nn import torch.nn.functional as F from ..base_variational_layer import BaseVariationalLayer_, get_kernel_size +from torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver, MinMaxObserver +from torch.quantization.qconfig 
import QConfig from torch.distributions.normal import Normal from torch.distributions.uniform import Uniform @@ -136,6 +138,15 @@ def __init__(self, self.register_buffer('prior_bias_sigma', None, persistent=False) self.init_parameters() + self.quant_prepare=False + + def prepare(self): + self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric), activation=MinMaxObserver.with_args(dtype=torch.qint8,qscheme=torch.per_tensor_symmetric))) for _ in range(4)]) + self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.quint8), activation=MinMaxObserver.with_args(dtype=torch.quint8))) for _ in range(8)]) + self.dequant = torch.quantization.DeQuantStub() + self.quant_prepare=True def init_parameters(self): # prior values @@ -303,6 +314,15 @@ def __init__(self, self.register_buffer('prior_bias_sigma', None, persistent=False) self.init_parameters() + self.quant_prepare=False + + def prepare(self): + self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric), activation=MinMaxObserver.with_args(dtype=torch.qint8,qscheme=torch.per_tensor_symmetric))) for _ in range(4)]) + self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.quint8), activation=MinMaxObserver.with_args(dtype=torch.quint8))) for _ in range(8)]) + self.dequant = torch.quantization.DeQuantStub() + self.quant_prepare=True def init_parameters(self): # prior values @@ -365,18 +385,38 @@ def forward(self, x, return_kl=True): self.prior_bias_sigma) # perturbed feedforward - perturbed_outputs = F.conv2d(x * sign_input, + x_tmp = x * sign_input + perturbed_outputs_tmp = F.conv2d(x * sign_input, weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, dilation=self.dilation, - groups=self.groups) * sign_output + groups=self.groups) + perturbed_outputs = perturbed_outputs_tmp * sign_output + out = outputs + perturbed_outputs + + if self.quant_prepare: + # quint8 quantstub + input = self.quint_quant[0](input) # input + outputs = self.quint_quant[1](outputs) # output + sign_input = self.quint_quant[2](sign_input) + sign_output = self.quint_quant[3](sign_output) + x_tmp = self.quint_quant[4](x_tmp) + perturbed_outputs_tmp = self.quint_quant[5](perturbed_outputs_tmp) # output + perturbed_outputs = self.quint_quant[6](perturbed_outputs) # output + out = self.quint_quant[7](out) # output + + # qint8 quantstub + sigma_weight = self.qint_quant[0](sigma_weight) # weight + mu_kernel = self.qint_quant[1](self.mu_kernel) # weight + eps_kernel = self.qint_quant[2](eps_kernel) # random variable + delta_kernel =self.qint_quant[3](delta_kernel) # multiply activation # returning outputs + perturbations if return_kl: - return outputs + perturbed_outputs, kl - return outputs + perturbed_outputs + return out, kl + return out class Conv3dFlipout(BaseVariationalLayer_): diff --git a/bayesian_torch/layers/flipout_layers/linear_flipout.py b/bayesian_torch/layers/flipout_layers/linear_flipout.py index af34d5d..3555290 100644 --- a/bayesian_torch/layers/flipout_layers/linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/linear_flipout.py @@ -40,6 +40,8 @@ from torch.distributions.normal import Normal from torch.distributions.uniform import Uniform from ..base_variational_layer import BaseVariationalLayer_ +from 
torch.quantization.observer import HistogramObserver, PerChannelMinMaxObserver, MinMaxObserver +from torch.quantization.qconfig import QConfig __all__ = ["LinearFlipout"] @@ -107,6 +109,15 @@ def __init__(self, self.register_buffer('eps_bias', None, persistent=False) self.init_parameters() + self.quant_prepare=False + + def prepare(self): + self.qint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric), activation=MinMaxObserver.with_args(dtype=torch.qint8,qscheme=torch.per_tensor_symmetric))) for _ in range(4)]) + self.quint_quant = nn.ModuleList([torch.quantization.QuantStub( + QConfig(weight=MinMaxObserver.with_args(dtype=torch.quint8), activation=MinMaxObserver.with_args(dtype=torch.quint8))) for _ in range(8)]) + self.dequant = torch.quantization.DeQuantStub() + self.quant_prepare=True def init_parameters(self): # init prior mu @@ -136,7 +147,9 @@ def forward(self, x, return_kl=True): return_kl = False # sampling delta_W sigma_weight = torch.log1p(torch.exp(self.rho_weight)) - delta_weight = (sigma_weight * self.eps_weight.data.normal_()) + eps_weight = self.eps_weight.data.normal_() + delta_weight = sigma_weight * eps_weight + # delta_weight = (sigma_weight * self.eps_weight.data.normal_()) # get kl divergence if return_kl: @@ -153,14 +166,32 @@ def forward(self, x, return_kl=True): # linear outputs outputs = F.linear(x, self.mu_weight, self.mu_bias) - sign_input = x.clone().uniform_(-1, 1).sign() sign_output = outputs.clone().uniform_(-1, 1).sign() - - perturbed_outputs = F.linear(x * sign_input, delta_weight, - bias) * sign_output + x_tmp = x * sign_input + perturbed_outputs_tmp = F.linear(x_tmp, delta_weight, bias) + perturbed_outputs = perturbed_outputs_tmp * sign_output + out = outputs + perturbed_outputs + + if self.quant_prepare: + # quint8 quantstub + input = self.quint_quant[0](input) # input + outputs = self.quint_quant[1](outputs) # output + sign_input = self.quint_quant[2](sign_input) + sign_output = self.quint_quant[3](sign_output) + x_tmp = self.quint_quant[4](x_tmp) + perturbed_outputs_tmp = self.quint_quant[5](perturbed_outputs_tmp) # output + perturbed_outputs = self.quint_quant[6](perturbed_outputs) # output + out = self.quint_quant[7](out) # output + + # qint8 quantstub + sigma_weight = self.qint_quant[0](sigma_weight) # weight + mu_weight = self.qint_quant[1](self.mu_weight) # weight + eps_weight = self.qint_quant[2](eps_weight) # random variable + delta_weight =self.qint_quant[3](delta_weight) # multiply activation + # returning outputs + perturbations if return_kl: - return outputs + perturbed_outputs, kl - return outputs + perturbed_outputs + return out, kl + return out diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index cf771c7..55acd67 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -284,6 +284,7 @@ def __init__(self, self.bn_eps = None self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -425,40 +426,67 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 if self.dnn_to_bnn_flag: return_kl = False - if x.dtype!=torch.quint8: - x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) - - bias = 
None - if self.bias: - bias = self.quantized_mu_bias - - outputs = torch.nn.quantized.functional.conv2d(x, self.quantized_mu_weight, bias, self.stride, self.padding, - self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 - - # sampling perturbation signs - sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() - sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() - sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) - sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) - - # getting perturbation weights - eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) - new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) - delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) - bias = None if self.bias: - eps_bias = self.eps_bias.data.normal_() - bias = (self.quantized_sigma_bias * eps_bias) + bias = self.quantized_mu_bias # TODO: check correctness + + if self.quant_dict is not None: + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + + if x.dtype!=torch.quint8: # check if input has been quantized + x = torch.quantize_per_tensor(x, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + outputs = torch.nn.quantized.functional.conv2d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=self.quant_dict[3]['scale'], zero_point=self.quant_dict[3]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, self.quant_dict[4]['scale'], self.quant_dict[4]['zero_point'], torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, self.quant_dict[5]['scale'], self.quant_dict[5]['zero_point'], torch.quint8) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, self.quant_dict[6]['scale'], self.quant_dict[6]['zero_point']) + perturbed_outputs = torch.nn.quantized.functional.conv2d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=self.quant_dict[7]['scale'], zero_point=self.quant_dict[7]['zero_point']) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, self.quant_dict[8]['scale'], self.quant_dict[8]['zero_point']) + out = torch.ops.quantized.add(outputs, perturbed_outputs, self.quant_dict[9]['scale'], self.quant_dict[9]['zero_point']) + out = out.dequantize() - # perturbed feedforward - x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) - - perturbed_outputs = torch.nn.quantized.functional.conv2d(x, - weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, - dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) - perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, 
default_scale, default_zero_point) - out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + else: + if x.dtype!=torch.quint8: + x = torch.quantize_per_tensor(x, default_scale, default_zero_point, torch.quint8) + + outputs = torch.nn.quantized.functional.conv2d(x, self.quantized_mu_weight, bias, self.stride, self.padding, + self.dilation, self.groups, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_kernel = torch.quantize_per_tensor(self.eps_kernel.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_kernel.q_scale()) + delta_kernel = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_kernel, new_scale, 0) + + bias = None + if self.bias: + eps_bias = self.eps_bias.data.normal_() + bias = (self.quantized_sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.conv2d(x, + weight=delta_kernel, bias=bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) if return_kl: return out, 0 diff --git a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py index 289da98..388817d 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py @@ -54,6 +54,7 @@ def __init__(self, out_features) self.is_dequant = False + self.quant_dict = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -118,8 +119,8 @@ def quantize(self): delattr(self, "mu_weight") delattr(self, "rho_weight") - self.quantized_mu_bias = Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) - self.quantized_sigma_bias = Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + self.quantized_mu_bias = self.mu_bias#Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) + self.quantized_sigma_bias = torch.log1p(torch.exp(self.rho_bias))#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) delattr(self, "mu_bias") delattr(self, "rho_bias") @@ -173,32 +174,64 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 self.is_dequant = True bias = self.mu_bias - outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 - - # sampling perturbation signs - sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() - sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() - sign_input 
= torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, torch.quint8) - sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) - - # getting perturbation weights - eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) - new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) - delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) - - bias = None - if self.quantized_sigma_bias is not None: - eps_bias = self.eps_bias.data.normal_() - bias = (self.sigma_bias * eps_bias) - - # perturbed feedforward - x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) - - perturbed_outputs = torch.nn.quantized.functional.linear(x, - weight=delta_weight, bias=bias, scale=default_scale, zero_point=default_zero_point) - perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) - out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) - out = out.dequantize() + if self.quant_dict is not None: + + # getting perturbation weights + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), self.quant_dict[0]['scale'], self.quant_dict[0]['zero_point'], torch.qint8) + delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, self.quant_dict[1]['scale'], self.quant_dict[1]['zero_point']) + + bias = None + if self.quantized_sigma_bias is not None: + eps_bias = self.eps_bias.data.normal_() + bias = (self.sigma_bias * eps_bias) + + if x.dtype!=torch.quint8: # check if input has been quantized + x = torch.quantize_per_tensor(x, self.quant_dict[2]['scale'], self.quant_dict[2]['zero_point'], torch.quint8) # scale=0.1 by grid search; zero_point=128 for uint8 format + + outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=self.quant_dict[3]['scale'], zero_point=self.quant_dict[3]['zero_point']) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, self.quant_dict[4]['scale'], self.quant_dict[4]['zero_point'], torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, self.quant_dict[5]['scale'], self.quant_dict[5]['zero_point'], torch.quint8) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, self.quant_dict[6]['scale'], self.quant_dict[6]['zero_point']) + perturbed_outputs = torch.nn.quantized.functional.linear(x, + weight=delta_weight, bias=bias, scale=self.quant_dict[7]['scale'], zero_point=self.quant_dict[7]['zero_point']) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, self.quant_dict[8]['scale'], self.quant_dict[8]['zero_point']) + out = torch.ops.quantized.add(outputs, perturbed_outputs, self.quant_dict[9]['scale'], self.quant_dict[9]['zero_point']) + out = out.dequantize() + + else: + + outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=default_scale, zero_point=default_zero_point) # input: quint8, weight: qint8, bias: fp32 + + # sampling perturbation signs + sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + sign_input = torch.quantize_per_tensor(sign_input, default_scale, default_zero_point, 
torch.quint8) + sign_output = torch.quantize_per_tensor(sign_output, default_scale, default_zero_point, torch.quint8) + + # getting perturbation weights + eps_weight = torch.quantize_per_tensor(self.eps_weight.data.normal_(), normal_scale, 0, torch.qint8) + new_scale = (self.quantized_sigma_weight.q_scale())*(eps_weight.q_scale()) + delta_weight = torch.ops.quantized.mul(self.quantized_sigma_weight, eps_weight, new_scale, 0) + + bias = None + if self.quantized_sigma_bias is not None: + eps_bias = self.eps_bias.data.normal_() + bias = (self.sigma_bias * eps_bias) + + # perturbed feedforward + x = torch.ops.quantized.mul(x, sign_input, default_scale, default_zero_point) + + perturbed_outputs = torch.nn.quantized.functional.linear(x, + weight=delta_weight, bias=bias, scale=default_scale, zero_point=default_zero_point) + perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, default_scale, default_zero_point) + out = torch.ops.quantized.add(outputs, perturbed_outputs, default_scale, default_zero_point) + out = out.dequantize() if return_kl: return out, 0 From 17480e67f3af59703356aada07fe126900c35543 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Sun, 19 Mar 2023 18:46:02 -0400 Subject: [PATCH 65/69] fix qconv2d flipout layers --- bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 55acd67..18fd2ce 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -454,7 +454,7 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 dilation=self.dilation, groups=self.groups, scale=self.quant_dict[7]['scale'], zero_point=self.quant_dict[7]['zero_point']) perturbed_outputs = torch.ops.quantized.mul(perturbed_outputs, sign_output, self.quant_dict[8]['scale'], self.quant_dict[8]['zero_point']) out = torch.ops.quantized.add(outputs, perturbed_outputs, self.quant_dict[9]['scale'], self.quant_dict[9]['zero_point']) - out = out.dequantize() + # out = out.dequantize() else: if x.dtype!=torch.quint8: From 18b296fa981fde4ca0cf9d510d97a0e08e927219 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 29 Mar 2023 14:27:44 -0400 Subject: [PATCH 66/69] modify the bias to a torch.Parameter to allow for JIT tracing --- .../layers/variational_layers/quantize_linear_variational.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py index 34a970f..a12a569 100644 --- a/bayesian_torch/layers/variational_layers/quantize_linear_variational.py +++ b/bayesian_torch/layers/variational_layers/quantize_linear_variational.py @@ -119,7 +119,7 @@ def quantize(self): delattr(self, "rho_weight") self.quantized_mu_bias = self.mu_bias#Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) - self.quantized_sigma_bias = torch.log1p(torch.exp(self.rho_bias))#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False)#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) delattr(self, "mu_bias") delattr(self, "rho_bias") @@ -131,7 +131,7 @@ 
def dequantize(self): # Deprecated self.sigma_bias = self.get_dequantized_tensor(self.quantized_sigma_bias) return - def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.1, default_zero_point=128, return_kl=True): + def forward(self, input, enable_int8_compute=True, normal_scale=6/255, default_scale=0.2, default_zero_point=128, return_kl=True): """ Forward pass Parameters From 69dc4db534cc81ad031a2c4263a3733b5b546dc7 Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 29 Mar 2023 14:32:23 -0400 Subject: [PATCH 67/69] pre-sampling for flipout layers --- .../flipout_layers/quantized_conv_flipout.py | 24 ++++++++++++++-- .../quantized_linear_flipout.py | 28 +++++++++++++++++-- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py index 18fd2ce..4be011a 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/quantized_conv_flipout.py @@ -38,6 +38,7 @@ from torch.nn import Parameter from ..base_variational_layer import BaseVariationalLayer_ from .conv_flipout import * +import random from torch.distributions.normal import Normal from torch.distributions.uniform import Uniform @@ -285,6 +286,9 @@ def __init__(self, self.is_dequant = False self.quant_dict = None + self.presampled_input_perturb = None + self.presampled_output_perturb = None + def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -442,8 +446,24 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 self.dilation, self.groups, scale=self.quant_dict[3]['scale'], zero_point=self.quant_dict[3]['zero_point']) # input: quint8, weight: qint8, bias: fp32 # sampling perturbation signs - sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() - sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + input_tsize = torch.prod(torch.tensor(x.shape))*1 + output_tsize = torch.prod(torch.tensor(outputs.shape))*1 + + if self.presampled_input_perturb is None: + self.presampled_input_perturb = torch.randint(0, 1, (input_tsize + torch.prod(torch.tensor(x.shape)),)).float() + self.presampled_input_perturb[self.presampled_input_perturb==0] = -1 + + if self.presampled_output_perturb is None: + self.presampled_output_perturb = torch.randint(0, 1, (output_tsize + torch.prod(torch.tensor(outputs.shape)),)).float() + self.presampled_output_perturb[self.presampled_output_perturb==0] = -1 + + st = random.randint(0, input_tsize) + sign_input = self.presampled_input_perturb[st:st+torch.prod(torch.tensor(x.shape))].reshape(x.shape) + + st = random.randint(0, output_tsize) + sign_output = self.presampled_output_perturb[st:st+torch.prod(torch.tensor(outputs.shape))].reshape(outputs.shape) + # sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + # sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() sign_input = torch.quantize_per_tensor(sign_input, self.quant_dict[4]['scale'], self.quant_dict[4]['zero_point'], torch.quint8) sign_output = torch.quantize_per_tensor(sign_output, self.quant_dict[5]['scale'], self.quant_dict[5]['zero_point'], torch.quint8) diff --git a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py index 388817d..3cce873 100644 --- a/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py +++ 
b/bayesian_torch/layers/flipout_layers/quantized_linear_flipout.py @@ -39,6 +39,7 @@ from torch.nn import Module, Parameter from torch.distributions.normal import Normal from torch.distributions.uniform import Uniform +import random from .linear_flipout import LinearFlipout @@ -55,6 +56,8 @@ def __init__(self, self.is_dequant = False self.quant_dict = None + self.presampled_input_perturb = None + self.presampled_output_perturb = None def get_scale_and_zero_point(self, x, upper_bound=100, target_range=255): """ An implementation for symmetric quantization @@ -120,7 +123,7 @@ def quantize(self): delattr(self, "rho_weight") self.quantized_mu_bias = self.mu_bias#Parameter(self.get_quantized_tensor(self.mu_bias), requires_grad=False) - self.quantized_sigma_bias = torch.log1p(torch.exp(self.rho_bias))#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) + self.quantized_sigma_bias = Parameter(torch.log1p(torch.exp(self.rho_bias)), requires_grad=False)#Parameter(self.get_quantized_tensor(torch.log1p(torch.exp(self.rho_bias))), requires_grad=False) delattr(self, "mu_bias") delattr(self, "rho_bias") @@ -191,8 +194,27 @@ def forward(self, x, normal_scale=6/255, default_scale=0.1, default_zero_point=1 outputs = torch.nn.quantized.functional.linear(x, self.quantized_mu_weight, bias, scale=self.quant_dict[3]['scale'], zero_point=self.quant_dict[3]['zero_point']) # input: quint8, weight: qint8, bias: fp32 # sampling perturbation signs - sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() - sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() + # sampling perturbation signs + input_tsize = torch.prod(torch.tensor(x.shape))*1 + output_tsize = torch.prod(torch.tensor(outputs.shape))*1 + + if self.presampled_input_perturb is None: + self.presampled_input_perturb = torch.randint(0, 1, (input_tsize + torch.prod(torch.tensor(x.shape)),)).float() + self.presampled_input_perturb[self.presampled_input_perturb==0] = -1 + + if self.presampled_output_perturb is None: + self.presampled_output_perturb = torch.randint(0, 1, (output_tsize + torch.prod(torch.tensor(outputs.shape)),)).float() + self.presampled_output_perturb[self.presampled_output_perturb==0] = -1 + + st = random.randint(0, input_tsize) + sign_input = self.presampled_input_perturb[st:st+torch.prod(torch.tensor(x.shape))].reshape(x.shape) + + st = random.randint(0, output_tsize) + sign_output = self.presampled_output_perturb[st:st+torch.prod(torch.tensor(outputs.shape))].reshape(outputs.shape) + + + # sign_input = torch.zeros(x.shape).uniform_(-1, 1).sign() + # sign_output = torch.zeros(outputs.shape).uniform_(-1, 1).sign() sign_input = torch.quantize_per_tensor(sign_input, self.quant_dict[4]['scale'], self.quant_dict[4]['zero_point'], torch.quint8) sign_output = torch.quantize_per_tensor(sign_output, self.quant_dict[5]['scale'], self.quant_dict[5]['zero_point'], torch.quint8) From c3e47ed1c9005776580678e3d47f3bed4495431d Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 19 Apr 2023 12:24:50 -0400 Subject: [PATCH 68/69] fix input --- bayesian_torch/layers/flipout_layers/conv_flipout.py | 2 +- bayesian_torch/layers/flipout_layers/linear_flipout.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bayesian_torch/layers/flipout_layers/conv_flipout.py b/bayesian_torch/layers/flipout_layers/conv_flipout.py index 1bf0405..6463028 100644 --- a/bayesian_torch/layers/flipout_layers/conv_flipout.py +++ b/bayesian_torch/layers/flipout_layers/conv_flipout.py @@ -398,7 +398,7 @@ def 
forward(self, x, return_kl=True): if self.quant_prepare: # quint8 quantstub - input = self.quint_quant[0](input) # input + x = self.quint_quant[0](x) # input outputs = self.quint_quant[1](outputs) # output sign_input = self.quint_quant[2](sign_input) sign_output = self.quint_quant[3](sign_output) diff --git a/bayesian_torch/layers/flipout_layers/linear_flipout.py b/bayesian_torch/layers/flipout_layers/linear_flipout.py index 3555290..a3de14e 100644 --- a/bayesian_torch/layers/flipout_layers/linear_flipout.py +++ b/bayesian_torch/layers/flipout_layers/linear_flipout.py @@ -175,7 +175,7 @@ def forward(self, x, return_kl=True): if self.quant_prepare: # quint8 quantstub - input = self.quint_quant[0](input) # input + x = self.quint_quant[0](x) # input outputs = self.quint_quant[1](outputs) # output sign_input = self.quint_quant[2](sign_input) sign_output = self.quint_quant[3](sign_output) From 86adb6d6fa7dced490f9a24b95e54ebc1c43ea0a Mon Sep 17 00:00:00 2001 From: junliang-lin Date: Wed, 19 Apr 2023 12:25:31 -0400 Subject: [PATCH 69/69] fix batchnorm --- bayesian_torch/layers/batchnorm.py | 23 ++++++++++++------- .../bayesian/resnet_variational_large.py | 2 +- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/bayesian_torch/layers/batchnorm.py b/bayesian_torch/layers/batchnorm.py index 145997c..25ab8f3 100644 --- a/bayesian_torch/layers/batchnorm.py +++ b/bayesian_torch/layers/batchnorm.py @@ -54,7 +54,6 @@ def _check_input_dim(self, input): input.dim())) def forward(self, input): - self._check_input_dim(input[0]) exponential_average_factor = 0.0 if self.training and self.track_running_stats: self.num_batches_tracked += 1 @@ -63,13 +62,21 @@ def forward(self, input): else: # use exponential moving average exponential_average_factor = self.momentum - out = F.batch_norm(input[0], self.running_mean, self.running_var, - self.weight, self.bias, self.training - or not self.track_running_stats, - exponential_average_factor, self.eps) - kl = 0 - return out, kl - + if len(input) == 2: + self._check_input_dim(input[0]) + out = F.batch_norm(input[0], self.running_mean, self.running_var, + self.weight, self.bias, self.training + or not self.track_running_stats, + exponential_average_factor, self.eps) + kl = 0 + return out, kl + else: + out = F.batch_norm(input, self.running_mean, self.running_var, + self.weight, self.bias, self.training + or not self.track_running_stats, + exponential_average_factor, self.eps) + return out + class BatchNorm1dLayer(nn.Module): def __init__(self, diff --git a/bayesian_torch/models/bayesian/resnet_variational_large.py b/bayesian_torch/models/bayesian/resnet_variational_large.py index 6fdf561..e5fb9fd 100644 --- a/bayesian_torch/models/bayesian/resnet_variational_large.py +++ b/bayesian_torch/models/bayesian/resnet_variational_large.py @@ -200,7 +200,7 @@ def _make_layer(self, block, planes, blocks, stride=1): posterior_mu_init=posterior_mu_init, posterior_rho_init=posterior_rho_init, bias=False), - nn.BatchNorm2d(planes * block.expansion), + BatchNorm2dLayer(planes * block.expansion), ) layers = []
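
Taken together, the quantized flipout forward passes patched above implement the standard flipout estimator with int8 kernels: torch.ops.quantized.mul for the sign flips, the torch.nn.quantized.functional ops for the mean and perturbation paths, and torch.ops.quantized.add to combine them, each carrying its own output scale and zero point from quant_dict. For reference, the same computation in floating point is the short sketch below; mu_weight, sigma_weight and mu_bias are placeholder arguments rather than the exact attributes of the quantized layers, and the bias perturbation (sigma_bias * eps_bias) is omitted for brevity.

    import torch
    import torch.nn.functional as F

    def flipout_linear_fp32(x, mu_weight, sigma_weight, mu_bias=None):
        # deterministic mean path: y = x @ mu_weight.T + mu_bias
        outputs = F.linear(x, mu_weight, mu_bias)
        # per-example random sign flips on the input and the output
        sign_input = torch.empty_like(x).uniform_(-1, 1).sign()
        sign_output = torch.empty_like(outputs).uniform_(-1, 1).sign()
        # sampled weight perturbation: delta_W = sigma * eps, eps ~ N(0, 1)
        delta_weight = sigma_weight * torch.randn_like(sigma_weight)
        # flipout: push the sign-flipped input through delta_W, then flip the result
        perturbed_outputs = F.linear(x * sign_input, delta_weight) * sign_output
        return outputs + perturbed_outputs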
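
PATCH 66/69 stores the derived bias sigma as a torch.nn.Parameter with requires_grad=False instead of a plain tensor attribute, so that it is registered on the module and therefore visible to torch.jit.trace and to state_dict. Reduced to its essentials, the pattern looks like the sketch below; freeze_bias_sigma and the attribute names are illustrative, not an API of the repository.

    import torch
    from torch.nn import Parameter

    @torch.no_grad()
    def freeze_bias_sigma(layer):
        # softplus reparameterization used by the layers: sigma = log(1 + exp(rho))
        sigma_bias = torch.log1p(torch.exp(layer.rho_bias))
        # a non-trainable Parameter becomes part of the module's registered state,
        # so tracing and checkpointing see a fixed tensor rather than a loose attribute
        layer.quantized_sigma_bias = Parameter(sigma_bias, requires_grad=False)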
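
PATCH 67/69 replaces the per-forward torch.zeros(...).uniform_(-1, 1).sign() calls with sign pools that are sampled once and then sliced at a random offset on every call, trading a little memory for lower sampling overhead. One detail to keep in mind when reusing this trick: the upper bound of torch.randint is exclusive, so torch.randint(0, 1, ...) yields only zeros and, after the == 0 remap, every entry of the pool becomes -1; drawing from torch.randint(0, 2, ...) is what produces a genuine mix of +1 and -1 signs. A self-contained sketch of the idea, with illustrative names (SignPool, draw) rather than the attributes used in the layers:

    import random
    import torch

    class SignPool:
        """Pre-sample a pool of +/-1 signs and slice a random window per forward."""

        def __init__(self, numel, pool_factor=2):
            pool_size = int(numel) * pool_factor
            # randint's upper bound is exclusive: (0, 2) gives a mix of 0s and 1s
            signs = torch.randint(0, 2, (pool_size,)).float()
            signs[signs == 0] = -1.0
            self.pool = signs

        def draw(self, shape):
            n = 1
            for d in shape:
                n *= int(d)
            start = random.randint(0, self.pool.numel() - n)
            return self.pool[start:start + n].reshape(shape)

Inside a forward pass, sign_input = pool_in.draw(x.shape) and sign_output = pool_out.draw(outputs.shape) then stand in for the two uniform_().sign() tensors.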
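
PATCH 69/69 makes the batch-norm wrapper accept either a plain tensor or the (activation, kl) pair returned by the Bayesian layers, which is what the downsample Sequential in resnet_variational_large.py feeds it since the Bayesian conv layer ahead of it returns an (output, kl) tuple, and it swaps nn.BatchNorm2d for BatchNorm2dLayer there. Note that len(input) == 2 is also true for a plain tensor whose batch dimension happens to be 2, so an isinstance check is a slightly more robust way to make the same dispatch; the sketch below uses that check (TupleAwareBatchNorm2d is an illustrative name, not the class shipped by the repository).

    import torch
    import torch.nn as nn

    class TupleAwareBatchNorm2d(nn.BatchNorm2d):
        """Batch norm that passes a KL term through when fed (activation, kl) pairs."""

        def forward(self, input):
            if isinstance(input, (tuple, list)):
                x, _kl = input
                # normalize the activation and forward a zero KL contribution
                return super().forward(x), 0
            return super().forward(input)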