diff --git a/src/application/app_alphapose.cpp b/src/application/app_alphapose.cpp
index 590facf..a775c60 100644
--- a/src/application/app_alphapose.cpp
+++ b/src/application/app_alphapose.cpp
@@ -40,7 +40,7 @@ int app_alphapose(){
         return 0;
 
     string onnx_file = iLogger::format("%s.onnx", name);
-    string model_file = iLogger::format("%s.fp32.trtmodel", name);
+    string model_file = iLogger::format("%s.FP32.trtmodel", name);
     int test_batch_size = 16;
 
     if(!iLogger::exists(model_file)){
diff --git a/src/application/app_arcface.cpp b/src/application/app_arcface.cpp
index 802667a..aeba6cf 100644
--- a/src/application/app_arcface.cpp
+++ b/src/application/app_arcface.cpp
@@ -32,7 +32,7 @@ static bool compile_models(){
         return false;
 
     string onnx_file = iLogger::format("%s.onnx", name);
-    string model_file = iLogger::format("%s.fp32.trtmodel", name);
+    string model_file = iLogger::format("%s.FP32.trtmodel", name);
     int test_batch_size = 1;
 
     if(not iLogger::exists(model_file)){
@@ -116,7 +116,7 @@ int app_arcface(){
 
     auto detector = Scrfd::create_infer("scrfd_2.5g_bnkps.640x480.FP32.trtmodel", 0, 0.6f);
     //auto detector = RetinaFace::create_infer("mb_retinaface.640x480.FP32.trtmodel", 0, 0.5f);
-    auto arcface = Arcface::create_infer("arcface_iresnet50.fp32.trtmodel", 0);
+    auto arcface = Arcface::create_infer("arcface_iresnet50.FP32.trtmodel", 0);
     auto library = build_library(detector, arcface);
     auto files = iLogger::find_files("face/recognize");
 
@@ -180,7 +180,7 @@ int app_arcface_video(){
 
     auto detector = Scrfd::create_infer("scrfd_2.5g_bnkps.640x480.FP32.trtmodel", 0, 0.6f);
     //auto detector = RetinaFace::create_infer("mb_retinaface.640x480.FP32.trtmodel", 0, 0.5f);
-    auto arcface = Arcface::create_infer("arcface_iresnet50.fp32.trtmodel", 0);
+    auto arcface = Arcface::create_infer("arcface_iresnet50.FP32.trtmodel", 0);
     auto library = build_library(detector, arcface);
     //auto remote_show = create_zmq_remote_show();
     INFO("Use tools/show.py to remote show");
@@ -277,7 +277,7 @@ int app_arcface_tracker(){
 
     auto detector = Scrfd::create_infer("scrfd_2.5g_bnkps.640x480.FP32.trtmodel", 0, 0.6f);
     //auto detector = RetinaFace::create_infer("mb_retinaface.640x480.FP32.trtmodel", 0, 0.6f);
-    auto arcface = Arcface::create_infer("arcface_iresnet50.fp32.trtmodel", 0);
+    auto arcface = Arcface::create_infer("arcface_iresnet50.FP32.trtmodel", 0);
     //auto library = build_library(detector, arcface);
 
     //tools/show.py connect to remote show
diff --git a/src/application/app_arcface/arcface.cpp b/src/application/app_arcface/arcface.cpp
index 1b9bda0..20c5f94 100644
--- a/src/application/app_arcface/arcface.cpp
+++ b/src/application/app_arcface/arcface.cpp
@@ -179,8 +179,8 @@ namespace Arcface{
             //checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
             // speed up
             memcpy(image_host, image.data, size_image);
+            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
-            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
 
             CUDAKernel::warp_affine_bilinear_and_normalize(
diff --git a/src/application/app_fall_recognize.cpp b/src/application/app_fall_recognize.cpp
index fa33bb2..a16d0ce 100644
--- a/src/application/app_fall_recognize.cpp
+++ b/src/application/app_fall_recognize.cpp
@@ -26,7 +26,7 @@ static bool compile_models(){
         return false;
 
     string onnx_file = iLogger::format("%s.onnx", name);
-    string model_file = iLogger::format("%s.fp32.trtmodel", name);
+    string model_file = iLogger::format("%s.FP32.trtmodel", name);
     int test_batch_size = 1;
 
     if(not iLogger::exists(model_file)){
@@ -46,13 +46,13 @@ int app_fall_recognize(){
     cv::setNumThreads(0);
 
-    INFO("===================== test alphapose fp32 ==================================");
+    INFO("===================== test alphapose FP32 ==================================");
 
     if(!compile_models())
         return 0;
 
-    auto pose_model_file = "sppe.fp32.trtmodel";
-    auto detector_model_file = "yolox_m.fp32.trtmodel";
-    auto gcn_model_file = "fall_bp.fp32.trtmodel";
+    auto pose_model_file = "sppe.FP32.trtmodel";
+    auto detector_model_file = "yolox_m.FP32.trtmodel";
+    auto gcn_model_file = "fall_bp.FP32.trtmodel";
 
     auto pose_model = AlphaPose::create_infer(pose_model_file, 0);
     auto detector_model = Yolo::create_infer(detector_model_file, Yolo::Type::X, 0, 0.4f);
diff --git a/src/application/app_high_performance/yolo_high_perf.cpp b/src/application/app_high_performance/yolo_high_perf.cpp
index 3e9a63d..cb6af3b 100644
--- a/src/application/app_high_performance/yolo_high_perf.cpp
+++ b/src/application/app_high_performance/yolo_high_perf.cpp
@@ -241,9 +241,10 @@ namespace YoloHighPerf{
             float* affine_matrix_host = (float*)cpu_workspace;
             uint8_t* image_host = size_matrix + cpu_workspace;
 
-            checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
+            //checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
+            memcpy(image_host, image.data, size_image);
+            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
-            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
 
             CUDAKernel::warp_affine_bilinear_and_normalize(
diff --git a/src/application/app_retinaface/retinaface.cpp b/src/application/app_retinaface/retinaface.cpp
index f28b4d1..8b65e01 100644
--- a/src/application/app_retinaface/retinaface.cpp
+++ b/src/application/app_retinaface/retinaface.cpp
@@ -249,8 +249,8 @@ namespace RetinaFace{
             // checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
             // speed up
             memcpy(image_host, image.data, size_image);
+            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
-            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
 
             CUDAKernel::warp_affine_bilinear_and_normalize(
diff --git a/src/application/app_scrfd.cpp b/src/application/app_scrfd.cpp
index 6cbd8c9..6b991c3 100644
--- a/src/application/app_scrfd.cpp
+++ b/src/application/app_scrfd.cpp
@@ -81,7 +81,7 @@ static void scrfd_performance(shared_ptr<Scrfd::Infer> infer){
 
 int app_scrfd(){
     TRT::set_device(0);
-    INFO("===================== test scrfd fp32 ==================================");
+    INFO("===================== test scrfd FP32 ==================================");
 
     string model_file;
     if(!compile_scrfd(640, 640, model_file))
diff --git a/src/application/app_scrfd/scrfd.cpp b/src/application/app_scrfd/scrfd.cpp
index 1fe124a..fa1a2aa 100644
--- a/src/application/app_scrfd/scrfd.cpp
+++ b/src/application/app_scrfd/scrfd.cpp
@@ -251,8 +251,8 @@ namespace Scrfd{
             //checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
             // speed up
             memcpy(image_host, image.data, size_image);
+            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
-            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
 
             CUDAKernel::warp_affine_bilinear_and_normalize(
diff --git a/src/application/app_yolo/yolo.cpp b/src/application/app_yolo/yolo.cpp
index 001634c..3db16f3 100644
--- a/src/application/app_yolo/yolo.cpp
+++ b/src/application/app_yolo/yolo.cpp
@@ -246,8 +246,8 @@ namespace Yolo{
             //checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
             // speed up
             memcpy(image_host, image.data, size_image);
+            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
             checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
-            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToHost, stream_));
             checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
 
             CUDAKernel::warp_affine_bilinear_and_normalize(
diff --git a/src/main.cpp b/src/main.cpp
index 5850e46..f6038fb 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -25,7 +25,6 @@ void test_all(){
     app_arcface_video();
     app_arcface_tracker();
     app_scrfd();
-    app_plugin();
     INFO("test done.");
 }
 
diff --git a/src/tensorRT/common/trt_tensor.cpp b/src/tensorRT/common/trt_tensor.cpp
index b80054f..e313652 100644
--- a/src/tensorRT/common/trt_tensor.cpp
+++ b/src/tensorRT/common/trt_tensor.cpp
@@ -263,7 +263,8 @@ namespace TRT{
         if(head_ == DataHead::Device){
            checkCudaRuntime(cudaMemcpyAsync((char*)data_->gpu() + offset_location, src, copyed_bytes, cudaMemcpyHostToDevice, stream_));
         }else if(head_ == DataHead::Host){
-            checkCudaRuntime(cudaMemcpyAsync((char*)data_->cpu() + offset_location, src, copyed_bytes, cudaMemcpyHostToHost, stream_));
+            //checkCudaRuntime(cudaMemcpyAsync((char*)data_->cpu() + offset_location, src, copyed_bytes, cudaMemcpyHostToHost, stream_));
+            memcpy((char*)data_->cpu() + offset_location, src, copyed_bytes);
         }else{
             INFOE("Unsupport head type %d", head_);
         }