diff --git a/src/application/app_alphapose.cpp b/src/application/app_alphapose.cpp
index 590facf..a775c60 100644
--- a/src/application/app_alphapose.cpp
+++ b/src/application/app_alphapose.cpp
@@ -40,7 +40,7 @@ int app_alphapose(){
         return 0;
 
     string onnx_file = iLogger::format("%s.onnx", name);
-    string model_file = iLogger::format("%s.fp32.trtmodel", name);
+    string model_file = iLogger::format("%s.FP32.trtmodel", name);
     int test_batch_size = 16;  
     
     if(!iLogger::exists(model_file)){
diff --git a/src/application/app_arcface.cpp b/src/application/app_arcface.cpp
index 802667a..aeba6cf 100644
--- a/src/application/app_arcface.cpp
+++ b/src/application/app_arcface.cpp
@@ -32,7 +32,7 @@ static bool compile_models(){
             return false;
 
         string onnx_file = iLogger::format("%s.onnx", name);
-        string model_file = iLogger::format("%s.fp32.trtmodel", name);
+        string model_file = iLogger::format("%s.FP32.trtmodel", name);
         int test_batch_size = 1;
         
         if(not iLogger::exists(model_file)){
@@ -116,7 +116,7 @@ int app_arcface(){
 
     auto detector = Scrfd::create_infer("scrfd_2.5g_bnkps.640x480.FP32.trtmodel", 0, 0.6f);
     //auto detector = RetinaFace::create_infer("mb_retinaface.640x480.FP32.trtmodel", 0, 0.5f);
-    auto arcface  = Arcface::create_infer("arcface_iresnet50.fp32.trtmodel", 0);
+    auto arcface  = Arcface::create_infer("arcface_iresnet50.FP32.trtmodel", 0);
     auto library  = build_library(detector, arcface);
 
     auto files    = iLogger::find_files("face/recognize");
@@ -180,7 +180,7 @@ int app_arcface_video(){
 
     auto detector = Scrfd::create_infer("scrfd_2.5g_bnkps.640x480.FP32.trtmodel", 0, 0.6f);
     //auto detector = RetinaFace::create_infer("mb_retinaface.640x480.FP32.trtmodel", 0, 0.5f);
-    auto arcface  = Arcface::create_infer("arcface_iresnet50.fp32.trtmodel", 0);
+    auto arcface  = Arcface::create_infer("arcface_iresnet50.FP32.trtmodel", 0);
     auto library  = build_library(detector, arcface);
     //auto remote_show = create_zmq_remote_show();
     INFO("Use tools/show.py to remote show");
@@ -277,7 +277,7 @@ int app_arcface_tracker(){
 
     auto detector = Scrfd::create_infer("scrfd_2.5g_bnkps.640x480.FP32.trtmodel", 0, 0.6f);
     //auto detector = RetinaFace::create_infer("mb_retinaface.640x480.FP32.trtmodel", 0, 0.6f);
-    auto arcface  = Arcface::create_infer("arcface_iresnet50.fp32.trtmodel", 0);
+    auto arcface  = Arcface::create_infer("arcface_iresnet50.FP32.trtmodel", 0);
     //auto library  = build_library(detector, arcface);
 
     //tools/show.py connect to remote show
diff --git a/src/application/app_arcface/arcface.cpp b/src/application/app_arcface/arcface.cpp
index 1b9bda0..20c5f94 100644
--- a/src/application/app_arcface/arcface.cpp
+++ b/src/application/app_arcface/arcface.cpp
@@ -179,8 +179,8 @@ namespace Arcface{
             //checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
             // speed up
             memcpy(image_host, image.data, size_image);
+            memcpy(affine_matrix_host, job.additional.d2i,   sizeof(job.additional.d2i));
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
-            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i,   sizeof(job.additional.d2i), cudaMemcpyHostToHost,   stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
 
             CUDAKernel::warp_affine_bilinear_and_normalize(
diff --git a/src/application/app_fall_recognize.cpp b/src/application/app_fall_recognize.cpp
index fa33bb2..a16d0ce 100644
--- a/src/application/app_fall_recognize.cpp
+++ b/src/application/app_fall_recognize.cpp
@@ -26,7 +26,7 @@ static bool compile_models(){
             return false;
 
         string onnx_file = iLogger::format("%s.onnx", name);
-        string model_file = iLogger::format("%s.fp32.trtmodel", name);
+        string model_file = iLogger::format("%s.FP32.trtmodel", name);
         int test_batch_size = 1; 
         
         if(not iLogger::exists(model_file)){
@@ -46,13 +46,13 @@ static bool compile_models(){
 int app_fall_recognize(){
     cv::setNumThreads(0);
 
-    INFO("===================== test alphapose fp32 ==================================");
+    INFO("===================== test alphapose FP32 ==================================");
     if(!compile_models())
         return 0;
     
-    auto pose_model_file     = "sppe.fp32.trtmodel";
-    auto detector_model_file = "yolox_m.fp32.trtmodel";
-    auto gcn_model_file      = "fall_bp.fp32.trtmodel";
+    auto pose_model_file     = "sppe.FP32.trtmodel";
+    auto detector_model_file = "yolox_m.FP32.trtmodel";
+    auto gcn_model_file      = "fall_bp.FP32.trtmodel";
     
     auto pose_model     = AlphaPose::create_infer(pose_model_file, 0);
     auto detector_model = Yolo::create_infer(detector_model_file, Yolo::Type::X, 0, 0.4f);
diff --git a/src/application/app_high_performance/yolo_high_perf.cpp b/src/application/app_high_performance/yolo_high_perf.cpp
index 3e9a63d..cb6af3b 100644
--- a/src/application/app_high_performance/yolo_high_perf.cpp
+++ b/src/application/app_high_performance/yolo_high_perf.cpp
@@ -241,9 +241,11 @@ namespace YoloHighPerf{
             float* affine_matrix_host     = (float*)cpu_workspace;
             uint8_t* image_host           = size_matrix + cpu_workspace;
 
-            checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
+            //checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
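+            // speed up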
+            memcpy(image_host,   image.data, size_image);
+            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
-            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
 
             CUDAKernel::warp_affine_bilinear_and_normalize(
diff --git a/src/application/app_retinaface/retinaface.cpp b/src/application/app_retinaface/retinaface.cpp
index f28b4d1..8b65e01 100644
--- a/src/application/app_retinaface/retinaface.cpp
+++ b/src/application/app_retinaface/retinaface.cpp
@@ -249,8 +249,8 @@ namespace RetinaFace{
             // checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
             // speed up
             memcpy(image_host, image.data, size_image);
+            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
-            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
 
             CUDAKernel::warp_affine_bilinear_and_normalize(
diff --git a/src/application/app_scrfd.cpp b/src/application/app_scrfd.cpp
index 6cbd8c9..6b991c3 100644
--- a/src/application/app_scrfd.cpp
+++ b/src/application/app_scrfd.cpp
@@ -81,7 +81,7 @@ static void scrfd_performance(shared_ptr<Scrfd::Infer> infer){
 int app_scrfd(){
 
     TRT::set_device(0);
-    INFO("===================== test scrfd fp32 ==================================");
+    INFO("===================== test scrfd FP32 ==================================");
 
     string model_file;
     if(!compile_scrfd(640, 640, model_file))
diff --git a/src/application/app_scrfd/scrfd.cpp b/src/application/app_scrfd/scrfd.cpp
index 1fe124a..fa1a2aa 100644
--- a/src/application/app_scrfd/scrfd.cpp
+++ b/src/application/app_scrfd/scrfd.cpp
@@ -251,8 +251,8 @@ namespace Scrfd{
             //checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
             // speed up
             memcpy(image_host, image.data, size_image);
+            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
-            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
 
             CUDAKernel::warp_affine_bilinear_and_normalize(
diff --git a/src/application/app_yolo/yolo.cpp b/src/application/app_yolo/yolo.cpp
index 001634c..3db16f3 100644
--- a/src/application/app_yolo/yolo.cpp
+++ b/src/application/app_yolo/yolo.cpp
@@ -246,8 +246,8 @@ namespace Yolo{
             //checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
             // speed up
             memcpy(image_host, image.data, size_image);
+            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
-            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
 
             CUDAKernel::warp_affine_bilinear_and_normalize(
diff --git a/src/main.cpp b/src/main.cpp
index 5850e46..f6038fb 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -25,7 +25,6 @@ void test_all(){
     app_arcface_video();
     app_arcface_tracker();
     app_scrfd();
-    app_plugin();
     INFO("test done.");
 }
 
diff --git a/src/tensorRT/common/trt_tensor.cpp b/src/tensorRT/common/trt_tensor.cpp
index b80054f..e313652 100644
--- a/src/tensorRT/common/trt_tensor.cpp
+++ b/src/tensorRT/common/trt_tensor.cpp
@@ -263,7 +263,9 @@ namespace TRT{
 		if(head_ == DataHead::Device){
 			checkCudaRuntime(cudaMemcpyAsync((char*)data_->gpu() + offset_location, src, copyed_bytes, cudaMemcpyHostToDevice, stream_));
 		}else if(head_ == DataHead::Host){
-			checkCudaRuntime(cudaMemcpyAsync((char*)data_->cpu() + offset_location, src, copyed_bytes, cudaMemcpyHostToHost, stream_));
+			//checkCudaRuntime(cudaMemcpyAsync((char*)data_->cpu() + offset_location, src, copyed_bytes, cudaMemcpyHostToHost, stream_));
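+			// speed up: use a plain memcpy for the host-to-host path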
+			memcpy((char*)data_->cpu() + offset_location, src, copyed_bytes);
 		}else{
 			INFOE("Unsupport head type %d", head_);
 		}