diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json new file mode 100644 index 000000000..e209e8054 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json new file mode 100644 index 000000000..937cb81f5 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "512": {"BLOCK_M": 4, "BLOCK_N": 64, "num_warps": 2}, "1024": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "2048": {"BLOCK_M": 32, "BLOCK_N": 64, "num_warps": 16}, "4096": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json new file mode 100644 index 000000000..2d34061d1 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8192": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json new file mode 100644 index 000000000..7df7062ab --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 2}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "512": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "1024": {"BLOCK_M": 16, "BLOCK_N": 128, "num_warps": 16}, "2048": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "4096": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json new file mode 100644 index 000000000..6a6371ae5 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json new file mode 100644 index 000000000..72e06487b --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "1024": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 8}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json new file mode 100644 index 000000000..6cbc29191 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json new file mode 100644 index 000000000..e63a93348 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 16}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json new file mode 100644 index 000000000..5badc266e --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 16}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json index 7ddfcb5b9..49d77008c 100644 --- a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json @@ -1 +1 @@ -{"1": {"BLOCK_M": 8, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}} \ No newline at end of file +{"1": {"BLOCK_M": 8, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 4, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json new file mode 100644 index 000000000..957fd19a5 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 4, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json index 8bea98945..88b5ca0c0 100644 --- a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json @@ -1 +1 @@ -{"1": {"BLOCK_M": 4, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 4, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}} \ No newline at end of file +{"1": {"BLOCK_M": 16, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 5}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 16, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json new file mode 100644 index 000000000..71f70226e --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json index 9a8d3a8ba..51613a7ec 100644 --- a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json @@ -1 +1 @@ -{"1": {"BLOCK_M": 16, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 3}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 2}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 3}} \ No newline at end of file +{"1": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json new file mode 100644 index 000000000..4f8c7f4b2 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 2}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 5}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json index 799566ec4..8ca8cc0ae 100644 --- a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json @@ -1 +1 @@ -{"1": {"BLOCK_M": 16, "BLOCK_DIM": 256, "num_warps": 4, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}} \ No newline at end of file +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json new file mode 100644 index 000000000..f68930f63 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 4, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json index 5d23e0323..0fdbd462f 100644 --- a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json @@ -1 +1 @@ -{"1": {"BLOCK_M": 32, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 2}} \ No newline at end of file +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json new file mode 100644 index 000000000..18ea04bd8 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 4}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json index 26b63a807..6f2352659 100644 --- a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json @@ -1 +1 @@ -{"1": {"BLOCK_M": 8, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 2, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}} \ No newline at end of file +{"1": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 4, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json new file mode 100644 index 000000000..af3e4937d --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json index 02da9e4cd..180bfeb57 100644 --- a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json @@ -1 +1 @@ -{"1": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 5}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 2}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}} \ No newline at end of file +{"1": {"BLOCK_M": 16, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json new file mode 100644 index 000000000..dac590c1d --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 5}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json index df605213f..70aa39ba3 100644 --- a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json @@ -1 +1 @@ -{"1": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 5}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "256": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}} \ No newline at end of file +{"1": {"BLOCK_M": 2, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json new file mode 100644 index 000000000..41782fee0 --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 3}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json index d1e5567bf..20ef120fb 100644 --- a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json @@ -1 +1 @@ -{"1": {"BLOCK_M": 16, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}} \ No newline at end of file +{"1": {"BLOCK_M": 4, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json new file mode 100644 index 000000000..3410f7b3d --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "8192": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json index ba33b4641..26f18f2e6 100644 --- a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json @@ -1 +1 @@ -{"1": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 4, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}} \ No newline at end of file +{"1": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 16, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json new file mode 100644 index 000000000..959a797dc --- /dev/null +++ b/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 4, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}} \ No newline at end of file diff --git a/test/kernel/moe_silu_and_mul_tuning_bf16.py b/test/kernel/moe_silu_and_mul_tuning_bf16.py index 2c8bb1cb9..950d30d24 100644 --- a/test/kernel/moe_silu_and_mul_tuning_bf16.py +++ b/test/kernel/moe_silu_and_mul_tuning_bf16.py @@ -198,7 +198,7 @@ def tuning_configs( # tuning to get silu and mul for n in [128, 192, 256, 512, 1024, 1408, 2048, 4096, 8192]: json_dict = {} - for m in [1, 8, 64, 128, 256, 512, 1024, 2048, 4096, 8192]: + for m in [1, 8, 64, 128, 200, 256, 512, 1024, 2048, 4096, 8192]: ans = mp_tuning( tuning_configs, { diff --git a/test/kernel/moe_sum_reduce_tuning_bf16.py b/test/kernel/moe_sum_reduce_tuning_bf16.py index 20e86b5d2..9b52efdaf 100644 --- a/test/kernel/moe_sum_reduce_tuning_bf16.py +++ b/test/kernel/moe_sum_reduce_tuning_bf16.py @@ -107,7 +107,7 @@ def get_test_configs(split_id, split_count): [1, 2, 4, 8, 16, 32], [64, 128, 256, 512, 1024], [1, 2, 4, 8, 16], - [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5, 6, 7, 8], ) for BLOCK_M, BLOCK_DIM, num_warps, NUM_STAGE in result: @@ -211,7 +211,7 @@ def tuning_configs( for hidden_dim in [1024, 2048, 4096, 5120, 8192]: for topk_num in [1, 6]: json_dict = {} - for m in [1, 8, 64, 128, 256, 512, 1024, 2048, 4096, 8192]: + for m in [1, 8, 64, 128, 200, 256, 512, 1024, 2048, 4096, 8192]: ans = mp_tuning( tuning_configs, {