From 139d701db77d66ade0bc3c1f9148f25df4563218 Mon Sep 17 00:00:00 2001 From: daquexian Date: Thu, 26 Nov 2020 12:10:09 +0800 Subject: [PATCH 1/6] add int8 args --- Classification/cnns/config.py | 16 +++++++++++++++- Classification/cnns/evaluate.sh | 10 ++++++---- Classification/cnns/job_function_util.py | 8 ++++++++ Classification/cnns/of_cnn_evaluate.py | 3 +++ Classification/cnns/of_cnn_train_val.py | 12 +++++++++--- Classification/cnns/train.sh | 10 +++++----- 6 files changed, 46 insertions(+), 13 deletions(-) diff --git a/Classification/cnns/config.py b/Classification/cnns/config.py index 215a385..b84611c 100755 --- a/Classification/cnns/config.py +++ b/Classification/cnns/config.py @@ -68,8 +68,22 @@ def str2bool(v): type=str2bool, nargs='?', const=True, - help='Whether to use use xla' + help='Whether to use xla' ) + parser.add_argument( + '--use_tensorrt', + type=str2bool, + nargs='?', + const=True, + help='Whether to use tensorrt' + ) + parser.add_argument( + '--use_int8', + type=str2bool, + nargs='?', + const=True, + help='Whether to use int8 calibration' + ) parser.add_argument( '--channel_last', type=str2bool, diff --git a/Classification/cnns/evaluate.sh b/Classification/cnns/evaluate.sh index b669f18..0d22159 100644 --- a/Classification/cnns/evaluate.sh +++ b/Classification/cnns/evaluate.sh @@ -7,13 +7,15 @@ DATA_ROOT=/dataset/ImageNet/ofrecord MODEL_LOAD_DIR="resnet_v15_of_best_model_val_top1_77318" python3 of_cnn_evaluate.py \ - --num_epochs=3 \ + --num_epochs=1 \ --num_val_examples=50000 \ --model_load_dir=$MODEL_LOAD_DIR \ --val_data_dir=$DATA_ROOT/validation \ --val_data_part_num=256 \ --num_nodes=1 \ --node_ips='127.0.0.1' \ - --gpu_num_per_node=4 \ - --val_batch_size_per_device=64 \ - --model="resnet50" + --gpu_num_per_node=1 \ + --val_batch_size_per_device=256 \ + --model="resnet50" \ + --use_tensorrt \ + --use_int8 diff --git a/Classification/cnns/job_function_util.py b/Classification/cnns/job_function_util.py index 88675c0..ec66a1a 100755 --- a/Classification/cnns/job_function_util.py +++ b/Classification/cnns/job_function_util.py @@ -26,6 +26,14 @@ def _default_config(args): if args.use_xla: config.use_xla_jit(True) config.enable_fuse_add_to_output(True) + if args.use_tensorrt: + config.use_tensorrt(True) + if args.use_int8: + config.tensorrt.use_int8() + elif args.use_int8: + raise Exception("You can set use_int8 only after use_tensorrt is True!") + # int8_calibration_path = "./int8_calibration" + # config.tensorrt.int8_calibration(int8_calibration_path) return config diff --git a/Classification/cnns/of_cnn_evaluate.py b/Classification/cnns/of_cnn_evaluate.py index 286f25b..a7766e6 100644 --- a/Classification/cnns/of_cnn_evaluate.py +++ b/Classification/cnns/of_cnn_evaluate.py @@ -79,6 +79,9 @@ def main(): for i in range(args.num_epochs): for j in range(num_val_steps): + if args.use_int8 and j ==10: + flow.tensorrt.cache_int8_calibration() + InferenceNet().async_get(metric.metric_cb(0, j)) diff --git a/Classification/cnns/of_cnn_train_val.py b/Classification/cnns/of_cnn_train_val.py index 31a924c..83dc95b 100755 --- a/Classification/cnns/of_cnn_train_val.py +++ b/Classification/cnns/of_cnn_train_val.py @@ -39,7 +39,6 @@ epoch_size = math.ceil(args.num_examples / train_batch_size) num_val_steps = int(args.num_val_examples / val_batch_size) - model_dict = { "resnet50": resnet_model.resnet50, "vgg": vgg_model.vgg16bn, @@ -126,12 +125,19 @@ def main(): batch_size=train_batch_size, loss_key='loss') for i in range(epoch_size): TrainNet().async_get(metric.metric_cb(epoch, i)) - + # flow.tensorrt.write_int8_calibration("./int8_calibration") # mkdir int8_calibration if args.val_data_dir: metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary, save_summary_steps=num_val_steps, batch_size=val_batch_size) - for i in range(num_val_steps): + for i in range(val_batch_size): + # if i<=10: + # InferenceNet().get() + # if i ==10: + # flow.tensorrt.cache_int8_calibration() + # else: + # InferenceNet().async_get(metric.metric_cb(epoch, i)) InferenceNet().async_get(metric.metric_cb(epoch, i)) + snapshot.save('epoch_{}'.format(epoch)) diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 6aa2b80..d63428e 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -12,11 +12,11 @@ echo NUM_EPOCH=$NUM_EPOCH if [ -n "$2" ]; then DATA_ROOT=$2 else - DATA_ROOT=/data/imagenet/ofrecord + DATA_ROOT=/dataset/ImageNet/ofrecord fi echo DATA_ROOT=$DATA_ROOT -LOG_FOLDER=../logs +LOG_FOLDER=./logs mkdir -p $LOG_FOLDER LOGFILE=$LOG_FOLDER/resnet_training.log @@ -26,13 +26,13 @@ python3 of_cnn_train_val.py \ --val_data_dir=$DATA_ROOT/validation \ --val_data_part_num=256 \ --num_nodes=1 \ - --gpu_num_per_node=8 \ + --gpu_num_per_node=2 \ --optimizer="sgd" \ --momentum=0.875 \ --label_smoothing=0.1 \ --learning_rate=1.024 \ - --loss_print_every_n_iter=100 \ - --batch_size_per_device=128 \ + --loss_print_every_n_iter=10 \ + --batch_size_per_device=64 \ --val_batch_size_per_device=50 \ --num_epoch=$NUM_EPOCH \ --model="resnet50" 2>&1 | tee ${LOGFILE} From de08dc6bfeeff5dd0e7084992a21f00d60ed165d Mon Sep 17 00:00:00 2001 From: daquexian Date: Fri, 8 Jan 2021 00:47:11 +0800 Subject: [PATCH 2/6] fix trt infer performance --- Classification/cnns/config.py | 15 ++++++++++---- Classification/cnns/evaluate.sh | 25 ++++++++++++++++++++---- Classification/cnns/job_function_util.py | 9 +++++---- Classification/cnns/of_cnn_evaluate.py | 24 +++++++++++++++++++---- Classification/cnns/util.py | 6 +++--- 5 files changed, 60 insertions(+), 19 deletions(-) diff --git a/Classification/cnns/config.py b/Classification/cnns/config.py index b84611c..defc046 100755 --- a/Classification/cnns/config.py +++ b/Classification/cnns/config.py @@ -74,16 +74,23 @@ def str2bool(v): '--use_tensorrt', type=str2bool, nargs='?', - const=True, + default=False, help='Whether to use tensorrt' ) parser.add_argument( - '--use_int8', + '--use_int8_online', type=str2bool, nargs='?', - const=True, - help='Whether to use int8 calibration' + default=False, + help='Whether to use online int8 calibration' ) + parser.add_argument( + '--use_int8_offline', + type=str2bool, + nargs='?', + default=False, help='Whether to use online int8 calibration' + ) + parser.add_argument( '--channel_last', type=str2bool, diff --git a/Classification/cnns/evaluate.sh b/Classification/cnns/evaluate.sh index 0d22159..db27303 100644 --- a/Classification/cnns/evaluate.sh +++ b/Classification/cnns/evaluate.sh @@ -6,7 +6,21 @@ DATA_ROOT=/dataset/ImageNet/ofrecord # Set up model path, e.g. : vgg16_of_best_model_val_top1_721 alexnet_of_best_model_val_top1_54762 MODEL_LOAD_DIR="resnet_v15_of_best_model_val_top1_77318" - python3 of_cnn_evaluate.py \ +#PYTHONPATH=/home/dev/files/repos/oneflow6/build-debug/python_scripts \ +#python3 of_cnn_evaluate.py \ +# --num_epochs=1 \ +# --num_val_examples=50000 \ +# --model_load_dir=$MODEL_LOAD_DIR \ +# --val_data_dir=$DATA_ROOT/validation \ +# --val_data_part_num=256 \ +# --num_nodes=1 \ +# --node_ips='127.0.0.1' \ +# --gpu_num_per_node=1 \ +# --val_batch_size_per_device=256 \ +# --model="resnet50" |& tee rn50-offline-int8.log + +PYTHONPATH=/home/dev/files/repos/oneflow6/build-release/python_scripts \ +python3 of_cnn_evaluate.py \ --num_epochs=1 \ --num_val_examples=50000 \ --model_load_dir=$MODEL_LOAD_DIR \ @@ -15,7 +29,10 @@ MODEL_LOAD_DIR="resnet_v15_of_best_model_val_top1_77318" --num_nodes=1 \ --node_ips='127.0.0.1' \ --gpu_num_per_node=1 \ - --val_batch_size_per_device=256 \ + --val_batch_size_per_device=320 \ --model="resnet50" \ - --use_tensorrt \ - --use_int8 + --use_tensorrt=True \ + --use_int8_online=True \ + --use_int8_offline=False \ + |& tee rn50-offline-int8.log +# --use_int8_offline |& tee rn50-offline-int8.log diff --git a/Classification/cnns/job_function_util.py b/Classification/cnns/job_function_util.py index ec66a1a..c0d68ac 100755 --- a/Classification/cnns/job_function_util.py +++ b/Classification/cnns/job_function_util.py @@ -28,12 +28,13 @@ def _default_config(args): config.enable_fuse_add_to_output(True) if args.use_tensorrt: config.use_tensorrt(True) - if args.use_int8: + if args.use_int8_online or args.use_int8_offline: config.tensorrt.use_int8() - elif args.use_int8: + elif args.use_int8_online: raise Exception("You can set use_int8 only after use_tensorrt is True!") - # int8_calibration_path = "./int8_calibration" - # config.tensorrt.int8_calibration(int8_calibration_path) + if args.use_int8_offline: + int8_calibration_path = "./int8_calibration" + config.tensorrt.int8_calibration(int8_calibration_path) return config diff --git a/Classification/cnns/of_cnn_evaluate.py b/Classification/cnns/of_cnn_evaluate.py index a7766e6..12fbe73 100644 --- a/Classification/cnns/of_cnn_evaluate.py +++ b/Classification/cnns/of_cnn_evaluate.py @@ -74,16 +74,32 @@ def main(): print("Restoring model from {}.".format(args.model_load_dir)) checkpoint = flow.train.CheckPoint() checkpoint.load(args.model_load_dir) + + if args.use_int8_online: + for j in range(10): + flow.tensorrt.cache_int8_calibration() + + warmup = 2 + for j in range(warmup): + InferenceNet().get() + metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary, save_summary_steps=num_val_steps, batch_size=val_batch_size) - for i in range(args.num_epochs): for j in range(num_val_steps): - if args.use_int8 and j ==10: - flow.tensorrt.cache_int8_calibration() - InferenceNet().async_get(metric.metric_cb(0, j)) + # for i in range(args.num_epochs): + # for j in range(num_val_steps): + # if i == 0 and j == 10: + # if args.use_int8_online: + # flow.tensorrt.cache_int8_calibration() + # if args.use_int8_offline: + # flow.tensorrt.write_int8_calibration("./int8_calibration") + + + # InferenceNet().async_get(metric.metric_cb(0, j)) + if __name__ == "__main__": main() diff --git a/Classification/cnns/util.py b/Classification/cnns/util.py index d153f04..83c6aad 100755 --- a/Classification/cnns/util.py +++ b/Classification/cnns/util.py @@ -172,9 +172,9 @@ def callback(outputs): self.summary.scalar(self.desc + "_top_{}".format(self.top_k), top_k_accuracy, epoch, step) - if self.save_summary: - if (step + 1) % self.save_summary_steps == 0: - self.summary.save() + # if self.save_summary: + # if (step + 1) % self.save_summary_steps == 0: + # self.summary.save() return callback From 167c89e3971ede5b9eaf016e0c5767a782f61c03 Mon Sep 17 00:00:00 2001 From: daquexian Date: Fri, 8 Jan 2021 11:25:49 +0800 Subject: [PATCH 3/6] optimize arg parser --- Classification/cnns/evaluate.sh | 14 -------------- Classification/cnns/job_function_util.py | 6 ++++-- Classification/cnns/of_cnn_evaluate.py | 12 ------------ 3 files changed, 4 insertions(+), 28 deletions(-) diff --git a/Classification/cnns/evaluate.sh b/Classification/cnns/evaluate.sh index db27303..e15428c 100644 --- a/Classification/cnns/evaluate.sh +++ b/Classification/cnns/evaluate.sh @@ -6,19 +6,6 @@ DATA_ROOT=/dataset/ImageNet/ofrecord # Set up model path, e.g. : vgg16_of_best_model_val_top1_721 alexnet_of_best_model_val_top1_54762 MODEL_LOAD_DIR="resnet_v15_of_best_model_val_top1_77318" -#PYTHONPATH=/home/dev/files/repos/oneflow6/build-debug/python_scripts \ -#python3 of_cnn_evaluate.py \ -# --num_epochs=1 \ -# --num_val_examples=50000 \ -# --model_load_dir=$MODEL_LOAD_DIR \ -# --val_data_dir=$DATA_ROOT/validation \ -# --val_data_part_num=256 \ -# --num_nodes=1 \ -# --node_ips='127.0.0.1' \ -# --gpu_num_per_node=1 \ -# --val_batch_size_per_device=256 \ -# --model="resnet50" |& tee rn50-offline-int8.log - PYTHONPATH=/home/dev/files/repos/oneflow6/build-release/python_scripts \ python3 of_cnn_evaluate.py \ --num_epochs=1 \ @@ -35,4 +22,3 @@ python3 of_cnn_evaluate.py \ --use_int8_online=True \ --use_int8_offline=False \ |& tee rn50-offline-int8.log -# --use_int8_offline |& tee rn50-offline-int8.log diff --git a/Classification/cnns/job_function_util.py b/Classification/cnns/job_function_util.py index c0d68ac..dcfb02c 100755 --- a/Classification/cnns/job_function_util.py +++ b/Classification/cnns/job_function_util.py @@ -30,11 +30,13 @@ def _default_config(args): config.use_tensorrt(True) if args.use_int8_online or args.use_int8_offline: config.tensorrt.use_int8() - elif args.use_int8_online: - raise Exception("You can set use_int8 only after use_tensorrt is True!") + elif args.use_int8_online or args.use_int8_offline: + raise Exception("You can set use_int8_online or use_int8_offline only after use_tensorrt is True!") if args.use_int8_offline: int8_calibration_path = "./int8_calibration" config.tensorrt.int8_calibration(int8_calibration_path) + if args.use_int8_offline and if use_int8_online: + raise ValueError("You cannot use use_int8_offline or use_int8_online at the same time!") return config diff --git a/Classification/cnns/of_cnn_evaluate.py b/Classification/cnns/of_cnn_evaluate.py index 12fbe73..b4984f7 100644 --- a/Classification/cnns/of_cnn_evaluate.py +++ b/Classification/cnns/of_cnn_evaluate.py @@ -89,17 +89,5 @@ def main(): for j in range(num_val_steps): InferenceNet().async_get(metric.metric_cb(0, j)) - # for i in range(args.num_epochs): - # for j in range(num_val_steps): - # if i == 0 and j == 10: - # if args.use_int8_online: - # flow.tensorrt.cache_int8_calibration() - # if args.use_int8_offline: - # flow.tensorrt.write_int8_calibration("./int8_calibration") - - - # InferenceNet().async_get(metric.metric_cb(0, j)) - - if __name__ == "__main__": main() From 9c29d13834f33c80741b4982a4d0c1b9a77734bd Mon Sep 17 00:00:00 2001 From: daquexian Date: Sat, 9 Jan 2021 13:49:20 +0800 Subject: [PATCH 4/6] fix int8 online infer --- Classification/cnns/evaluate.sh | 2 +- Classification/cnns/job_function_util.py | 2 +- Classification/cnns/of_cnn_evaluate.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Classification/cnns/evaluate.sh b/Classification/cnns/evaluate.sh index e15428c..d291377 100644 --- a/Classification/cnns/evaluate.sh +++ b/Classification/cnns/evaluate.sh @@ -16,7 +16,7 @@ python3 of_cnn_evaluate.py \ --num_nodes=1 \ --node_ips='127.0.0.1' \ --gpu_num_per_node=1 \ - --val_batch_size_per_device=320 \ + --val_batch_size_per_device=350 \ --model="resnet50" \ --use_tensorrt=True \ --use_int8_online=True \ diff --git a/Classification/cnns/job_function_util.py b/Classification/cnns/job_function_util.py index dcfb02c..bde64c9 100755 --- a/Classification/cnns/job_function_util.py +++ b/Classification/cnns/job_function_util.py @@ -35,7 +35,7 @@ def _default_config(args): if args.use_int8_offline: int8_calibration_path = "./int8_calibration" config.tensorrt.int8_calibration(int8_calibration_path) - if args.use_int8_offline and if use_int8_online: + if args.use_int8_offline and use_int8_online: raise ValueError("You cannot use use_int8_offline or use_int8_online at the same time!") return config diff --git a/Classification/cnns/of_cnn_evaluate.py b/Classification/cnns/of_cnn_evaluate.py index b4984f7..a8c1313 100644 --- a/Classification/cnns/of_cnn_evaluate.py +++ b/Classification/cnns/of_cnn_evaluate.py @@ -75,9 +75,11 @@ def main(): checkpoint = flow.train.CheckPoint() checkpoint.load(args.model_load_dir) + if args.use_int8_online: for j in range(10): - flow.tensorrt.cache_int8_calibration() + InferenceNet().get() + flow.tensorrt.cache_int8_calibration() warmup = 2 for j in range(warmup): From 7a5a3cbf50846765332d1aaa3b94e187832f8791 Mon Sep 17 00:00:00 2001 From: daquexian Date: Wed, 13 Jan 2021 17:08:22 +0800 Subject: [PATCH 5/6] fix online args error --- Classification/cnns/job_function_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Classification/cnns/job_function_util.py b/Classification/cnns/job_function_util.py index bde64c9..f218999 100755 --- a/Classification/cnns/job_function_util.py +++ b/Classification/cnns/job_function_util.py @@ -35,7 +35,7 @@ def _default_config(args): if args.use_int8_offline: int8_calibration_path = "./int8_calibration" config.tensorrt.int8_calibration(int8_calibration_path) - if args.use_int8_offline and use_int8_online: + if args.use_int8_offline and args.use_int8_online: raise ValueError("You cannot use use_int8_offline or use_int8_online at the same time!") return config From 06f336ed3923d212d15d75a2fbe0e74053c852c7 Mon Sep 17 00:00:00 2001 From: daquexian Date: Fri, 2 Apr 2021 14:57:30 +0800 Subject: [PATCH 6/6] update --- Classification/cnns/evaluate.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Classification/cnns/evaluate.sh b/Classification/cnns/evaluate.sh index d291377..0408b1c 100644 --- a/Classification/cnns/evaluate.sh +++ b/Classification/cnns/evaluate.sh @@ -16,9 +16,9 @@ python3 of_cnn_evaluate.py \ --num_nodes=1 \ --node_ips='127.0.0.1' \ --gpu_num_per_node=1 \ - --val_batch_size_per_device=350 \ + --val_batch_size_per_device=10 \ --model="resnet50" \ --use_tensorrt=True \ - --use_int8_online=True \ - --use_int8_offline=False \ + --use_int8_online=False \ + --use_int8_offline=True \ |& tee rn50-offline-int8.log