diff --git a/README.md b/README.md
index 4d9e7a3..5e103f6 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,9 @@ Fast implementation of BERT inference directly on NVIDIA (CUDA, CUBLAS) and Inte
 [![Build Status](https://travis-ci.org/zhihu/cuBERT.svg?branch=master)](https://travis-ci.org/zhihu/cuBERT)
 
 Highly customized and optimized BERT inference directly on NVIDIA (CUDA,
-CUBLAS) or Intel MKL, without tensorflow and its framework overhead.
+CUBLAS) or Intel MKL, *without* tensorflow and its framework overhead.
 
-ONLY BERT (Transformer) is supported.
+**ONLY** BERT (Transformer) is supported.
 
 # Benchmark
 
@@ -23,17 +23,17 @@ ONLY BERT (Transformer) is supported.
 
 ### GPU (cuBERT)
 
-|batch size|128 (ms)|32 (ms)|
-|---       |---     |---    |
-|tensorflow|255.2   |70.0   |
-|cuBERT    |184.6   |54.5   |
+|batch size|128 (ms) |32 (ms) |
+|---       |---      |---     |
+|tensorflow|255.2    |70.0    |
+|cuBERT    |**184.6**|**54.5**|
 
 ### CPU (mklBERT)
 
-|batch size|128 (ms)|1 (ms)|
-|---       |---     |---   |
-|tensorflow|1504.0  |69.9  |
-|mklBERT   |984.9   |24.0  |
+|batch size|128 (ms) |1 (ms)  |
+|---       |---      |---     |
+|tensorflow|1504.0   |69.9    |
+|mklBERT   |**984.9**|**24.0**|
 
 Note: MKL should be run under `OMP_NUM_THREADS=?` to control its thread
 number. Other environment variables and their possible values includes:
@@ -84,7 +84,8 @@ Following outputs are supported:
 
 |cuBERT_OutputType      |python code                   |
 |---                    |---                           |
-|cuBERT_LOGITS          |[`model.get_pooled_output()` * output_weights + output_bias](https://github.com/google-research/bert/blob/d66a146741588fb208450bde15aa7db143baaa69/run_classifier.py#L607)|
+|cuBERT_LOGITS          |[`model.get_pooled_output() * output_weights + output_bias`](https://github.com/google-research/bert/blob/d66a146741588fb208450bde15aa7db143baaa69/run_classifier.py#L607)|
+|cuBERT_PROBS           |`probs = tf.nn.softmax(logits, axis=-1)`|
 |cuBERT_POOLED_OUTPUT   |`model.get_pooled_output()`   |
 |cuBERT_SEQUENCE_OUTPUT |`model.get_sequence_output()` |
 |cuBERT_EMBEDDING_OUTPUT|`model.get_embedding_output()`|
@@ -248,3 +249,4 @@ to achieve the best trade-off. Good luck!
 * fanliwen
 * wangruixin
 * fangkuan
+* sunxian
diff --git a/java/src/main/java/com/zhihu/cubert/OutputType.java b/java/src/main/java/com/zhihu/cubert/OutputType.java
index dd6605e..7bf7d93 100644
--- a/java/src/main/java/com/zhihu/cubert/OutputType.java
+++ b/java/src/main/java/com/zhihu/cubert/OutputType.java
@@ -5,4 +5,5 @@ public enum OutputType {
     POOLED_OUTPUT,
     SEQUENCE_OUTPUT,
     EMBEDDING_OUTPUT,
+    PROBS,
 }
diff --git a/python/_cuBERT.pxd b/python/_cuBERT.pxd
index 50b6424..26d3002 100644
--- a/python/_cuBERT.pxd
+++ b/python/_cuBERT.pxd
@@ -3,7 +3,7 @@ cdef extern from "../src/cuBERT.h":
         cuBERT_COMPUTE_FLOAT, cuBERT_COMPUTE_HALF
     
     cdef enum cuBERT_OutputType:
-        cuBERT_LOGITS, cuBERT_POOLED_OUTPUT, cuBERT_SEQUENCE_OUTPUT, cuBERT_EMBEDDING_OUTPUT
+        cuBERT_LOGITS, cuBERT_POOLED_OUTPUT, cuBERT_SEQUENCE_OUTPUT, cuBERT_EMBEDDING_OUTPUT, cuBERT_PROBS
 
     void cuBERT_initialize() except +;
     void cuBERT_finalize() except +;
diff --git a/python/cuBERT.pyx b/python/cuBERT.pyx
index 1f0d939..bc7daca 100644
--- a/python/cuBERT.pyx
+++ b/python/cuBERT.pyx
@@ -33,6 +33,7 @@ class OutputType:
     POOLED_OUTPUT = _cuBERT.cuBERT_OutputType.cuBERT_POOLED_OUTPUT
     SEQUENCE_OUTPUT = _cuBERT.cuBERT_OutputType.cuBERT_SEQUENCE_OUTPUT
     EMBEDDING_OUTPUT = _cuBERT.cuBERT_OutputType.cuBERT_EMBEDDING_OUTPUT
+    PROBS = _cuBERT.cuBERT_OutputType.cuBERT_PROBS
 
 cdef class Model:
     cdef void* _c_model
diff --git a/src/cuBERT.h b/src/cuBERT.h
index cf46871..0739393 100644
--- a/src/cuBERT.h
+++ b/src/cuBERT.h
@@ -19,6 +19,7 @@ enum cuBERT_OutputType {
     cuBERT_POOLED_OUTPUT = 1,
     cuBERT_SEQUENCE_OUTPUT = 2,
     cuBERT_EMBEDDING_OUTPUT = 3,
+    cuBERT_PROBS = 4,
 };
 
 void cuBERT_initialize();
diff --git a/src/cuBERT/Bert.cpp b/src/cuBERT/Bert.cpp
index fd9b6ec..c60070d 100644
--- a/src/cuBERT/Bert.cpp
+++ b/src/cuBERT/Bert.cpp
@@ -50,6 +50,7 @@ namespace cuBERT {
         this->_embedding_output = static_cast<T *>(cuBERT::malloc(sizeof(T) * max_batch_size * seq_length * hidden_size));
         this->_pooled_output = static_cast<T *>(cuBERT::malloc(sizeof(T) * max_batch_size * hidden_size));
         this->_logits = static_cast<T *>(cuBERT::malloc(sizeof(T) * max_batch_size * num_labels));
+        this->_probs = static_cast<T *>(cuBERT::malloc(sizeof(T) * max_batch_size * num_labels));
 
         this->input_ids_buf = static_cast<int *>(cuBERT::malloc(sizeof(int) * max_batch_size * seq_length));
         this->input_mask_buf = static_cast<int8_t *>(cuBERT::malloc(sizeof(int8_t) * max_batch_size * seq_length));
@@ -57,6 +58,9 @@ namespace cuBERT {
 
         // pre-compute buffers
         transformer->_pre_compute(max_batch_size);
+        if (additional_output_layer != nullptr) {
+            additional_output_layer->_pre_compute(max_batch_size, _logits);
+        }
         this->buffer_filled = true;
     }
 
@@ -66,6 +70,7 @@ namespace cuBERT {
         cuBERT::free(input_mask_buf);
         cuBERT::free(input_ids_buf);
 
+        cuBERT::free(_probs);
         cuBERT::free(_logits);
         cuBERT::free(_pooled_output);
         cuBERT::free(_embedding_output);
@@ -118,7 +123,7 @@ namespace cuBERT {
         bert_pooler->compute(batch_size, _sequence_output, _pooled_output);
 
         if (additional_output_layer != nullptr) {
-            additional_output_layer->_in_compute(batch_size, _pooled_output, _logits);
+            additional_output_layer->_in_compute(batch_size, _pooled_output, _logits, _probs);
         }
 
         // buffers should be re-computed in the next request
@@ -126,17 +131,25 @@ namespace cuBERT {
     }
 
     template <typename T>
-    void Bert<T>::logits(size_t batch_size, T *logits) {
+    void Bert<T>::logits(size_t batch_size, T *logits, T *probs) {
         if (additional_output_layer == nullptr) {
             std::cerr << "model does not have additional_output_layer, the output logits is wrong." << std::endl;
         }
 
         void *streamId = cuBERT::blas_get_stream(cublas);
-        cuBERT::memcpyAsync(logits, _logits, sizeof(T) * batch_size * num_labels, 2, streamId);
+        if (logits != nullptr) {
+            cuBERT::memcpyAsync(logits, _logits, sizeof(T) * batch_size * num_labels, 2, streamId);
+        }
+        if (probs != nullptr) {
+            cuBERT::memcpyAsync(probs, _probs, sizeof(T) * batch_size * num_labels, 2, streamId);
+        }
         cuBERT::cuda_stream_synchronize(streamId);
 
         if (!buffer_filled) {
             transformer->_pre_compute(batch_size);
+            if (additional_output_layer != nullptr) {
+                additional_output_layer->_pre_compute(batch_size, _logits);
+            }
             buffer_filled = true;
         }
     }
@@ -149,6 +162,9 @@ namespace cuBERT {
 
         if (!buffer_filled) {
             transformer->_pre_compute(batch_size);
+            if (additional_output_layer != nullptr) {
+                additional_output_layer->_pre_compute(batch_size, _logits);
+            }
             buffer_filled = true;
         }
     }
@@ -162,6 +178,9 @@ namespace cuBERT {
 
         if (!buffer_filled) {
             transformer->_pre_compute(batch_size);
+            if (additional_output_layer != nullptr) {
+                additional_output_layer->_pre_compute(batch_size, _logits);
+            }
             buffer_filled = true;
         }
     }
@@ -175,6 +194,9 @@ namespace cuBERT {
 
         if (!buffer_filled) {
             transformer->_pre_compute(batch_size);
+            if (additional_output_layer != nullptr) {
+                additional_output_layer->_pre_compute(batch_size, _logits);
+            }
             buffer_filled = true;
         }
     }
diff --git a/src/cuBERT/Bert.h b/src/cuBERT/Bert.h
index e14b0b5..765ab16 100644
--- a/src/cuBERT/Bert.h
+++ b/src/cuBERT/Bert.h
@@ -29,7 +29,7 @@ namespace cuBERT {
         void compute(size_t batch_size, int *input_ids, int8_t *input_mask, int8_t *segment_ids);
 
         // ouput methods, cpu/gpu outputs
-        void logits(size_t batch_size, T *logits);
+        void logits(size_t batch_size, T *logits, T *probs);
         void pooled_output(size_t batch_size, T *pooled_output);
         void sequence_output(size_t batch_size, T *sequence_output);
         void embedding_output(size_t batch_size, T *embedding_output);
@@ -58,6 +58,7 @@ namespace cuBERT {
         T *_sequence_output;
         T *_pooled_output;
         T *_logits;
+        T *_probs;
 
         // for pre-compute
         // FIXME: _sequence_output will be flushed
diff --git a/src/cuBERT/BertM.cpp b/src/cuBERT/BertM.cpp
index a6b935b..67978ff 100644
--- a/src/cuBERT/BertM.cpp
+++ b/src/cuBERT/BertM.cpp
@@ -77,7 +77,10 @@ namespace cuBERT {
         bert_instance->compute(batch_size, input_ids, input_mask, segment_ids);
         switch (output_type) {
             case cuBERT_LOGITS:
-                bert_instance->logits(batch_size, output);
+                bert_instance->logits(batch_size, output, nullptr);
+                break;
+            case cuBERT_PROBS:
+                bert_instance->logits(batch_size, nullptr, output);
                 break;
             case cuBERT_POOLED_OUTPUT:
                 bert_instance->pooled_output(batch_size, output);
diff --git a/src/cuBERT/op/Softmax.cpp b/src/cuBERT/op/Softmax.cpp
index 3c6a408..7805cf8 100644
--- a/src/cuBERT/op/Softmax.cpp
+++ b/src/cuBERT/op/Softmax.cpp
@@ -8,7 +8,8 @@ namespace cuBERT {
 
 #ifdef HAVE_MKL
     template<>
-    void softmax_<float>(float *inout,
+    void softmax_<float>(float *in,
+                         float *out,
                          const int batch_size,
                          const int channel,
                          float *sum_gpu,
@@ -17,12 +18,12 @@ namespace cuBERT {
         for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
             float sum = 0;
             for (int i = batch_idx * channel; i < (batch_idx + 1) * channel; ++i) {
-                inout[i] = expf(inout[i]);
-                sum += inout[i];
+                out[i] = expf(in[i]);
+                sum += out[i];
             }
 
             for (int i = batch_idx * channel; i < (batch_idx + 1) * channel; ++i) {
-                inout[i] = inout[i] / sum;
+                out[i] = out[i] / sum;
             }
         }
     }
@@ -40,8 +41,13 @@ namespace cuBERT {
     }
 
     template <typename T>
-    void Softmax<T>::compute_(size_t batch_size, T *inout_gpu, void* stream) {
-        softmax_<T>(inout_gpu, batch_size, channel, sum_gpu, stream);
+    void Softmax<T>::compute_(size_t batch_size, T *inout, void* stream) {
+        softmax_<T>(inout, inout, batch_size, channel, sum_gpu, stream);
+    }
+
+    template <typename T>
+    void Softmax<T>::compute_(size_t batch_size, T *in, T *out, void* stream) {
+        softmax_<T>(in, out, batch_size, channel, sum_gpu, stream);
     }
 
     template class Softmax<float>;
diff --git a/src/cuBERT/op/Softmax.cu b/src/cuBERT/op/Softmax.cu
index 713aebd..48afdf9 100644
--- a/src/cuBERT/op/Softmax.cu
+++ b/src/cuBERT/op/Softmax.cu
@@ -40,14 +40,18 @@ namespace cuBERT {
     }
 
     template <typename T>
-    __global__ void kernel_substract_(T *inout, const int batch_size, const int channel, T *max_in) {
+    __global__ void kernel_substract(const T *__restrict__ in, 
+                                     T *out, 
+                                     const int batch_size, 
+                                     const int channel, 
+                                     const T *__restrict__ max_in) {
         int idx = blockIdx.x * blockDim.x + threadIdx.x;
         if (idx >= batch_size * channel) {
             return;
         }
 
         int batch_idx = idx / channel;
-        inout[idx] = (float) __ldg(inout + idx) - (float) __ldg(max_in + batch_idx);
+        out[idx] = (float) __ldg(in + idx) - (float) __ldg(max_in + batch_idx);
     }
 
     template <typename T>
@@ -81,22 +85,22 @@ namespace cuBERT {
     }
 
     template <typename T>
-    __host__ void softmax_(T *inout, const int batch_size, const int channel, T *sum_gpu, void* stream) {
+    __host__ void softmax_(T *in, T *out, const int batch_size, const int channel, T *sum_gpu, void* stream) {
         const int all_blocks = (batch_size * channel + 127) / 128;
 
-        kernel_max_cub<T> <<<batch_size, 128, 0, (cudaStream_t) stream>>> (inout, batch_size, channel, sum_gpu);
-        kernel_substract_<T> <<<all_blocks, 128, 0, (cudaStream_t) stream>>> (inout, batch_size, channel, sum_gpu);
+        kernel_max_cub<T> <<<batch_size, 128, 0, (cudaStream_t) stream>>> (in, batch_size, channel, sum_gpu);
+        kernel_substract<T> <<<all_blocks, 128, 0, (cudaStream_t) stream>>> (in, out, batch_size, channel, sum_gpu);
 
-        thrust::device_ptr<T> dev_ptr(inout);
+        thrust::device_ptr<T> dev_ptr(out);
         thrust::transform(thrust::cuda::par.on((cudaStream_t) stream), dev_ptr, dev_ptr + batch_size * channel, dev_ptr, exp_functor<T>());
 
-        kernel_sum_cub<T> <<<batch_size, 128, 0, (cudaStream_t) stream>>> (inout, batch_size, channel, sum_gpu);
-        kernel_scale_<T> <<<all_blocks, 128, 0, (cudaStream_t) stream>>> (inout, batch_size, channel, sum_gpu);
+        kernel_sum_cub<T> <<<batch_size, 128, 0, (cudaStream_t) stream>>> (out, batch_size, channel, sum_gpu);
+        kernel_scale_<T> <<<all_blocks, 128, 0, (cudaStream_t) stream>>> (out, batch_size, channel, sum_gpu);
     }
 
     template
-    __host__ void softmax_<float>(float *inout, const int batch_size, const int channel, float *sum_gpu, void *stream);
+    __host__ void softmax_<float>(float *in, float *out, const int batch_size, const int channel, float *sum_gpu, void *stream);
 
     template
-    __host__ void softmax_<half>(half *inout, const int batch_size, const int channel, half *sum_gpu, void *stream);
+    __host__ void softmax_<half>(half *in, half *out, const int batch_size, const int channel, half *sum_gpu, void *stream);
 }
diff --git a/src/cuBERT/op/Softmax.h b/src/cuBERT/op/Softmax.h
index 8d9cfcb..fd44c40 100644
--- a/src/cuBERT/op/Softmax.h
+++ b/src/cuBERT/op/Softmax.h
@@ -6,7 +6,8 @@
 namespace cuBERT {
 
     template <typename T>
-    void softmax_(T *inout,
+    void softmax_(T *in,
+                  T *out,
                   const int batch_size,
                   const int channel,
                   T *sum_gpu,
@@ -19,7 +20,9 @@ namespace cuBERT {
 
         virtual ~Softmax();
 
-        void compute_(size_t batch_size, T *inout_gpu, void* stream);
+        void compute_(size_t batch_size, T *inout, void* stream);
+
+        void compute_(size_t batch_size, T *in, T *out, void* stream);
 
     private:
         size_t channel;
diff --git a/src/cuBERT/op_out/AdditionalOutputLayer.cpp b/src/cuBERT/op_out/AdditionalOutputLayer.cpp
index 3d4705d..a6731b7 100644
--- a/src/cuBERT/op_out/AdditionalOutputLayer.cpp
+++ b/src/cuBERT/op_out/AdditionalOutputLayer.cpp
@@ -13,8 +13,6 @@ namespace cuBERT {
         this->handle = handle;
         this->hidden_size = hidden_size;
         this->num_labels = num_labels;
-        
-        std::cerr << "hidden_size:" << this->hidden_size << " num_labels:" << this->num_labels << std::endl;
 
         this->output_weights = static_cast<T *>(cuBERT::malloc(sizeof(T) * hidden_size * this->num_labels));
         cuBERT::memcpy(this->output_weights, output_weights, sizeof(T) * hidden_size * this->num_labels, 1);
@@ -28,10 +26,13 @@ namespace cuBERT {
             this->output_bias = nullptr;
         }
 
+        this->softmax = new Softmax<T>(max_batch_size, num_labels);
     }
 
     template <typename T>
     ClassifierOutputLayer<T>::~ClassifierOutputLayer() {
+        delete softmax;
+
         if (output_bias != nullptr) {
             cuBERT::free(output_bias);
         }
@@ -47,7 +48,7 @@ namespace cuBERT {
     }
 
     template<typename T>
-    void ClassifierOutputLayer<T>::_in_compute(size_t batch_size, T *input, T *output) {
+    void ClassifierOutputLayer<T>::_in_compute(size_t batch_size, T *input, T *output_logits, T *output_probs) {
         float beta = output_bias == nullptr ? 0.f : 1.f;
         cuBERT::blas_gemm(handle, true, false,
                           num_labels, batch_size, hidden_size,
@@ -55,13 +56,17 @@ namespace cuBERT {
                           output_weights, hidden_size,
                           input, hidden_size,
                           beta,
-                          output, num_labels);
+                          output_logits, num_labels);
+        if (output_probs != nullptr) {
+            void* streamId = blas_get_stream(handle);
+            softmax->compute_(batch_size, output_logits, output_probs, streamId);
+        }
     }
 
     template <typename T>
-    void ClassifierOutputLayer<T>::compute(size_t batch_size, T *input, T *output) {
-        _pre_compute(batch_size, output);
-        _in_compute(batch_size, input, output);
+    void ClassifierOutputLayer<T>::compute(size_t batch_size, T *input, T *output_logits, T *output_probs) {
+        _pre_compute(batch_size, output_logits);
+        _in_compute(batch_size, input, output_logits, output_probs);
     }
 
     template class ClassifierOutputLayer<float>;
diff --git a/src/cuBERT/op_out/AdditionalOutputLayer.h b/src/cuBERT/op_out/AdditionalOutputLayer.h
index b77cc73..7f271e7 100644
--- a/src/cuBERT/op_out/AdditionalOutputLayer.h
+++ b/src/cuBERT/op_out/AdditionalOutputLayer.h
@@ -4,7 +4,6 @@
 
 #include <cstddef>
 
-
 #include "cuBERT/op/Softmax.h"
 
 namespace cuBERT {
@@ -21,6 +20,8 @@ namespace cuBERT {
      *
      * logits = tf.matmul(output_layer, output_weights, transpose_b=True)
      * logits = tf.nn.bias_add(logits, output_bias)
+     * 
+     * probabilities = tf.nn.softmax(logits, axis=-1)
      */
     template <typename T>
     class ClassifierOutputLayer {
@@ -36,9 +37,9 @@ namespace cuBERT {
 
         void _pre_compute(size_t batch_size, T *output);
 
-        void _in_compute(size_t batch_size, T *input, T *output);
+        void _in_compute(size_t batch_size, T *input, T *output_logits, T *output_probs);
 
-        void compute(size_t batch_size, T *in_gpu, T *out_gpu);
+        void compute(size_t batch_size, T *in_gpu, T *output_logits, T *output_probs);
 
     private:
         void* handle;
@@ -49,6 +50,8 @@ namespace cuBERT {
         // cpu/gpu buffer
         T *output_weights;
         T *output_bias;
+
+        Softmax<T> *softmax;
     };
 }
 
diff --git a/test/cuBERT/BertTest.cpp b/test/cuBERT/BertTest.cpp
index 2c6bb12..8f4805d 100644
--- a/test/cuBERT/BertTest.cpp
+++ b/test/cuBERT/BertTest.cpp
@@ -49,7 +49,7 @@ TEST_F(BertTest, compute) {
     EXPECT_FLOAT_EQ(embedding_output[49151], 0.33240327);
 
     float logits[2];
-    bert.logits(2, logits);
+    bert.logits(2, logits, nullptr);
     EXPECT_NEAR(logits[0], -2.9427543, 1e-5);
     EXPECT_NEAR(logits[1], -1.4876306, 1e-5);
 }
@@ -104,7 +104,7 @@ TEST_F(BertHalfTest, compute) {
     float logits[2];
     half logits_half[2];
 
-    bert.logits(2, logits_half);
+    bert.logits(2, logits_half, nullptr);
     half2float(logits_half, logits, 2);
 
     EXPECT_NEAR(logits[0], -2.9427543, 0.01);
diff --git a/test/cuBERT/op_out/AdditionalOutputLayerTest.cpp b/test/cuBERT/op_out/AdditionalOutputLayerTest.cpp
index 151a18e..f5ff92b 100644
--- a/test/cuBERT/op_out/AdditionalOutputLayerTest.cpp
+++ b/test/cuBERT/op_out/AdditionalOutputLayerTest.cpp
@@ -22,7 +22,7 @@ TEST_F(CommonTest, additional_output) {
 
     cuBERT::memcpy(in_gpu, in, sizeof(float) * 6, 1);
 
-    aol.compute(2, in_gpu, out_gpu);
+    aol.compute(2, in_gpu, out_gpu, nullptr);
 
     cuBERT::memcpy(out, out_gpu, sizeof(float) * 2, 2);
     cuBERT::free(in_gpu);