diff --git a/get_free_port.py b/get_free_port.py
new file mode 100644
index 0000000..693bc31
--- /dev/null
+++ b/get_free_port.py
@@ -0,0 +1,21 @@
+import socket
+import random
+
+
+def next_free_port(port=1994, max_port=65535):
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    while port <= max_port:
+        try:
+            sock.bind(('', port))
+            sock.close()
+            return port
+        except OSError:
+            port += 1
+    raise IOError('no free ports')
+
+
+if __name__ == '__main__':
+    start_port = random.choice(list(range(1994, 2994)))
+    port = next_free_port(port=start_port)
+    print(port)
+    exit(port)
diff --git a/methods/segmentation_module.py b/methods/segmentation_module.py
index cd2118b..753a6dd 100644
--- a/methods/segmentation_module.py
+++ b/methods/segmentation_module.py
@@ -22,8 +22,6 @@ def make_model(opts, cls=None, head_channels=None):
         norm = partial(ABR, activation="leaky_relu", activation_param=.01)
     elif opts.norm_act == 'iabr':
         norm = partial(InPlaceABR, activation="leaky_relu", activation_param=.01)
-    elif opts.norm_act == 'iabrr':
-        norm = partial(InPlaceABR_R, activation="leaky_relu", activation_param=.01)
     elif opts.norm_act == 'ain':
         norm = partial(AIN, activation="leaky_relu", activation_param=.01)
     elif opts.norm_act == 'rabn':
@@ -114,7 +112,7 @@ def fix_bn(self):
                 m.weight.requires_grad = False
                 m.bias.requires_grad = False

-    def bn_set_momentum(self, momentum=0.1):
+    def bn_set_momentum(self, momentum=0.0):
         for m in self.modules():
             if isinstance(m, nn.BatchNorm2d) or isinstance(m, ABN) or isinstance(m, AIN) or isinstance(m, ABR):
                 m.momentum = momentum
diff --git a/methods/trainer.py b/methods/trainer.py
index a65f779..38d0c81 100644
--- a/methods/trainer.py
+++ b/methods/trainer.py
@@ -8,7 +8,7 @@
 from modules.classifier import IncrementalClassifier, CosineClassifier, SPNetClassifier
 from .utils import get_scheduler, MeanReduction

-CLIP = 100
+CLIP = 10


 class Trainer:
diff --git a/modules/custom_bn.py b/modules/custom_bn.py
index 78eb9b5..f8c4df2 100644
--- a/modules/custom_bn.py
+++ b/modules/custom_bn.py
@@ -124,7 +124,7 @@ class ABR(nn.Module):
     activation_param : float
         Negative slope for the `leaky_relu` activation.
     """
-    def __init__(self, num_features, eps=1e-9, momentum=0.0, affine=True, activation="leaky_relu",
+    def __init__(self, num_features, eps=1e-5, momentum=0.0, affine=True, activation="leaky_relu",
                  activation_param=0.01, group=distributed.group.WORLD, renorm=True):
         super(ABR, self).__init__()
         self.num_features = num_features
@@ -161,12 +161,12 @@ def forward(self, x):
             bias = self.bias
         else:
             with torch.no_grad():
-                running_std = self.running_var.pow(0.5) + self.eps
+                running_std = (self.running_var + self.eps).pow(0.5)
                 xt = x.transpose(1, 0).reshape(x.shape[1], -1)
-                r = xt.std(dim=1) / running_std
+                r = (xt.var(dim=1) + self.eps).pow(0.5) / running_std
                 d = (xt.mean(dim=1) - self.running_mean) / running_std
                 weight = self.weight * r
-                bias = self.bias + self.weight*d
+                bias = self.bias + self.weight * d

         x = functional.batch_norm(x, self.running_mean, self.running_var, weight, bias,
                                   self.training, self.momentum, self.eps)
@@ -200,7 +200,7 @@ def extra_repr(self):


 class InPlaceABR(ABR):
-    def __init__(self, num_features, eps=1e-8, momentum=0.0, affine=True, activation="leaky_relu",
+    def __init__(self, num_features, eps=1e-5, momentum=0.0, affine=True, activation="leaky_relu",
                  activation_param=0.01):
         super().__init__(num_features, eps, momentum, affine, activation, activation_param)

@@ -210,10 +210,10 @@ def forward(self, x):
             bias = self.bias
         else:
             with torch.no_grad():
-                mean, var, count = _backend.statistics(x)
                 running_std = (self.running_var + self.eps).pow(0.5)
-                r = (var + self.eps).pow(0.5) / running_std
-                d = (mean - self.running_mean) / running_std
+                xt = x.transpose(1, 0).reshape(x.shape[1], -1)
+                r = (xt.var(dim=1) + self.eps).pow(0.5) / running_std
+                d = (xt.mean(dim=1) - self.running_mean) / running_std
                 weight = self.weight * r
                 bias = self.bias + self.weight * d

diff --git a/utils/loss.py b/utils/loss.py
index 5a89dea..b017822 100644
--- a/utils/loss.py
+++ b/utils/loss.py
@@ -107,7 +107,7 @@ def forward(self, inputs, targets):
         labels = torch.softmax(targets / self.alpha, dim=1)

         if not self.kl:
-            loss = -(outputs * labels).mean(dim=1)
+            loss = -(outputs * labels).mean(dim=1) * (self.alpha ** 2)
         else:
             loss = F.kl_div(outputs, labels, reduction='none') * (self.alpha ** 2)
             loss = loss.sum(dim=1)