forked from littsk/test_attn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
profiler_output.txt
160 lines (156 loc) · 12 KB
/
profiler_output.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
====================
TRITON KERNELS BANDWIDTH INFO (./profiler-outputs/wk/cwkjglrfoeirfwgrjwutbt23brjyvzz75f3mlghtsiafcnoziuai.py)
0.160ms 0.453 GB 2826.25GB/s 10.80% triton_red_fused_5
0.152ms 0.201 GB 1327.19GB/s 10.22% triton_poi_fused_gelu_19
0.108ms 0.283 GB 2633.37GB/s 7.24% triton_poi_fused_cat_30
0.094ms 0.123 GB 1309.51GB/s 6.31% triton_red_fused__scaled_dot_product_flash_attention__to_copy_mean_mul_pow_28
0.085ms 0.227 GB 2661.66GB/s 5.74% triton_red_fused_29
0.082ms 0.227 GB 2766.68GB/s 5.52% triton_red_fused_25
0.075ms 0.113 GB 1513.28GB/s 5.05% triton_red_fused_add_mul_native_layer_norm_31
0.063ms 0.123 GB 1949.09GB/s 4.24% triton_poi_fused__scaled_dot_product_flash_attention__to_copy_add_cat_mul_14
0.062ms 0.101 GB 1626.00GB/s 4.18% triton_red_fused_add_mul_native_layer_norm_17
0.062ms 0.101 GB 1633.49GB/s 4.16% triton_red_fused_add_mul_native_layer_norm_23
0.061ms 0.101 GB 1645.53GB/s 4.13% triton_red_fused_add_mul_native_layer_norm_20
0.057ms 0.085 GB 1487.19GB/s 3.85% triton_poi_fused_add_mul_33
0.055ms 0.114 GB 2075.44GB/s 3.68% triton_red_fused_add_cat_mul_native_layer_norm_27
0.027ms 0.057 GB 2072.48GB/s 1.84% triton_red_fused_add_4
0.027ms 0.057 GB 2088.64GB/s 1.84% triton_poi_fused_cat_13
0.026ms 0.057 GB 2141.25GB/s 1.78% triton_red_fused_26
0.025ms 0.050 GB 2005.12GB/s 1.69% triton_red_fused_add_mul_native_layer_norm_34
0.025ms 0.025 GB 1018.37GB/s 1.67% triton_poi_fused_gelu_21
0.025ms 0.050 GB 2047.42GB/s 1.66% triton_red_fused_add_mul_native_layer_norm_6
0.025ms 0.057 GB 2307.00GB/s 1.65% triton_poi_fused__scaled_dot_product_flash_attention__to_copy_cat_15
0.024ms 0.057 GB 2334.40GB/s 1.63% triton_poi_fused_clone_16
0.020ms 0.038 GB 1898.08GB/s 1.34% triton_red_fused_32
0.017ms 0.013 GB 774.31GB/s 1.11% triton_red_fused_add_mul_native_layer_norm_22
0.017ms 0.026 GB 1546.78GB/s 1.11% triton_per_fused__to_copy_mean_pow_12
0.016ms 0.013 GB 784.70GB/s 1.09% triton_red_fused_add_mul_native_layer_norm_18
0.016ms 0.013 GB 785.87GB/s 1.09% triton_red_fused_add_mul_native_layer_norm_24
0.016ms 0.004 GB 266.71GB/s 1.05% triton_poi_fused_stack_10
0.015ms 0.004 GB 272.87GB/s 1.03% triton_poi_fused_stack_9
0.009ms 0.006 GB 695.33GB/s 0.62% triton_red_fused_add_mul_native_layer_norm_7
0.008ms 0.001 GB 142.37GB/s 0.57% triton_poi_fused_stack_8
0.007ms 0.005 GB 632.00GB/s 0.50% triton_per_fused_3
0.007ms 0.002 GB 226.34GB/s 0.47% triton_per_fused_1
0.007ms 0.003 GB 483.25GB/s 0.45% triton_per_fused__to_copy_mean_pow_11
0.005ms 0.000 GB 0.20GB/s 0.35% triton_poi_fused_cat_0
0.005ms 0.000 GB 0.21GB/s 0.33% triton_poi_fused_cat_2
SUMMARY (./profiler-outputs/wk/cwkjglrfoeirfwgrjwutbt23brjyvzz75f3mlghtsiafcnoziuai.py)
1.48ms 2.79 GB 1877.71GB/s
====================
TRITON KERNELS BANDWIDTH INFO (./profiler-outputs/wk/cwkjglrfoeirfwgrjwutbt23brjyvzz75f3mlghtsiafcnoziuai.py)
0.160ms 0.453 GB 2825.69GB/s 10.84% triton_red_fused_5
0.152ms 0.201 GB 1327.19GB/s 10.26% triton_poi_fused_gelu_19
0.107ms 0.283 GB 2635.72GB/s 7.26% triton_poi_fused_cat_30
0.093ms 0.123 GB 1322.16GB/s 6.27% triton_red_fused__scaled_dot_product_flash_attention__to_copy_mean_mul_pow_28
0.085ms 0.227 GB 2664.67GB/s 5.75% triton_red_fused_29
0.081ms 0.227 GB 2801.14GB/s 5.47% triton_red_fused_25
0.075ms 0.113 GB 1518.47GB/s 5.05% triton_red_fused_add_mul_native_layer_norm_31
0.063ms 0.123 GB 1961.55GB/s 4.23% triton_poi_fused__scaled_dot_product_flash_attention__to_copy_add_cat_mul_14
0.062ms 0.101 GB 1630.96GB/s 4.18% triton_red_fused_add_mul_native_layer_norm_23
0.062ms 0.101 GB 1632.75GB/s 4.17% triton_red_fused_add_mul_native_layer_norm_17
0.061ms 0.101 GB 1647.25GB/s 4.14% triton_red_fused_add_mul_native_layer_norm_20
0.057ms 0.085 GB 1479.74GB/s 3.88% triton_poi_fused_add_mul_33
0.055ms 0.114 GB 2080.92GB/s 3.69% triton_red_fused_add_cat_mul_native_layer_norm_27
0.027ms 0.057 GB 2078.90GB/s 1.86% triton_poi_fused_cat_13
0.027ms 0.057 GB 2109.49GB/s 1.82% triton_red_fused_add_4
0.026ms 0.057 GB 2191.60GB/s 1.75% triton_red_fused_26
0.025ms 0.050 GB 2005.74GB/s 1.70% triton_red_fused_add_mul_native_layer_norm_6
0.025ms 0.057 GB 2298.02GB/s 1.67% triton_poi_fused__scaled_dot_product_flash_attention__to_copy_cat_15
0.025ms 0.025 GB 1026.34GB/s 1.66% triton_poi_fused_gelu_21
0.024ms 0.050 GB 2060.23GB/s 1.65% triton_red_fused_add_mul_native_layer_norm_34
0.024ms 0.057 GB 2339.02GB/s 1.64% triton_poi_fused_clone_16
0.020ms 0.038 GB 1879.95GB/s 1.36% triton_red_fused_32
0.016ms 0.013 GB 776.57GB/s 1.12% triton_red_fused_add_mul_native_layer_norm_22
0.016ms 0.013 GB 790.95GB/s 1.09% triton_red_fused_add_mul_native_layer_norm_18
0.016ms 0.013 GB 793.73GB/s 1.08% triton_red_fused_add_mul_native_layer_norm_24
0.016ms 0.026 GB 1604.24GB/s 1.08% triton_per_fused__to_copy_mean_pow_12
0.016ms 0.004 GB 266.98GB/s 1.05% triton_poi_fused_stack_9
0.015ms 0.004 GB 276.36GB/s 1.02% triton_poi_fused_stack_10
0.009ms 0.006 GB 676.54GB/s 0.64% triton_red_fused_add_mul_native_layer_norm_7
0.008ms 0.001 GB 149.71GB/s 0.55% triton_poi_fused_stack_8
0.008ms 0.005 GB 593.93GB/s 0.54% triton_per_fused_3
0.007ms 0.003 GB 454.69GB/s 0.48% triton_per_fused__to_copy_mean_pow_11
0.007ms 0.002 GB 241.80GB/s 0.44% triton_per_fused_1
0.005ms 0.000 GB 0.20GB/s 0.35% triton_poi_fused_cat_0
0.004ms 0.000 GB 0.23GB/s 0.30% triton_poi_fused_cat_2
SUMMARY (./profiler-outputs/wk/cwkjglrfoeirfwgrjwutbt23brjyvzz75f3mlghtsiafcnoziuai.py)
1.48ms 2.79 GB 1884.33GB/s
====================
TRITON KERNELS BANDWIDTH INFO (./targets/54/c5443b7pdjj3aeabbbp3qgeklde4nhum4vqd3ihavfschzw6nv62.py)
0.160ms 0.453 GB 2834.74GB/s 10.78% triton_red_fused_5
0.152ms 0.201 GB 1327.19GB/s 10.23% triton_poi_fused_gelu_19
0.108ms 0.283 GB 2621.67GB/s 7.28% triton_poi_fused_cat_30
0.093ms 0.123 GB 1315.35GB/s 6.29% triton_red_fused__scaled_dot_product_flash_attention__to_copy_mean_mul_pow_28
0.085ms 0.227 GB 2665.67GB/s 5.73% triton_red_fused_29
0.081ms 0.227 GB 2786.27GB/s 5.48% triton_red_fused_25
0.075ms 0.113 GB 1511.67GB/s 5.06% triton_red_fused_add_mul_native_layer_norm_31
0.063ms 0.123 GB 1954.06GB/s 4.23% triton_poi_fused__scaled_dot_product_flash_attention__to_copy_add_cat_mul_14
0.062ms 0.101 GB 1625.16GB/s 4.18% triton_red_fused_add_mul_native_layer_norm_17
0.062ms 0.101 GB 1634.34GB/s 4.16% triton_red_fused_add_mul_native_layer_norm_23
0.061ms 0.101 GB 1648.11GB/s 4.13% triton_red_fused_add_mul_native_layer_norm_20
0.057ms 0.085 GB 1479.74GB/s 3.87% triton_poi_fused_add_mul_33
0.054ms 0.114 GB 2088.27GB/s 3.66% triton_red_fused_add_cat_mul_native_layer_norm_27
0.027ms 0.057 GB 2091.09GB/s 1.84% triton_poi_fused_cat_13
0.027ms 0.057 GB 2082.22GB/s 1.84% triton_red_fused_add_4
0.027ms 0.057 GB 2130.95GB/s 1.79% triton_red_fused_26
0.025ms 0.057 GB 2237.01GB/s 1.71% triton_poi_fused__scaled_dot_product_flash_attention__to_copy_cat_15
0.025ms 0.050 GB 2017.97GB/s 1.68% triton_red_fused_add_mul_native_layer_norm_34
0.025ms 0.050 GB 2042.11GB/s 1.67% triton_red_fused_add_mul_native_layer_norm_6
0.025ms 0.025 GB 1023.67GB/s 1.66% triton_poi_fused_gelu_21
0.024ms 0.057 GB 2334.40GB/s 1.64% triton_poi_fused_clone_16
0.020ms 0.038 GB 1925.95GB/s 1.32% triton_red_fused_32
0.016ms 0.013 GB 778.83GB/s 1.11% triton_red_fused_add_mul_native_layer_norm_22
0.016ms 0.013 GB 789.38GB/s 1.09% triton_red_fused_add_mul_native_layer_norm_18
0.016ms 0.013 GB 791.35GB/s 1.08% triton_red_fused_add_mul_native_layer_norm_24
0.016ms 0.026 GB 1601.03GB/s 1.08% triton_per_fused__to_copy_mean_pow_12
0.016ms 0.004 GB 267.26GB/s 1.05% triton_poi_fused_stack_10
0.016ms 0.004 GB 267.53GB/s 1.05% triton_poi_fused_stack_9
0.010ms 0.006 GB 656.58GB/s 0.66% triton_red_fused_add_mul_native_layer_norm_7
0.008ms 0.001 GB 146.80GB/s 0.55% triton_poi_fused_stack_8
0.008ms 0.005 GB 596.32GB/s 0.53% triton_per_fused_3
0.007ms 0.003 GB 485.59GB/s 0.44% triton_per_fused__to_copy_mean_pow_11
0.007ms 0.002 GB 241.80GB/s 0.44% triton_per_fused_1
0.005ms 0.000 GB 0.19GB/s 0.37% triton_poi_fused_cat_0
0.005ms 0.000 GB 0.21GB/s 0.33% triton_poi_fused_cat_2
SUMMARY (./targets/54/c5443b7pdjj3aeabbbp3qgeklde4nhum4vqd3ihavfschzw6nv62.py)
1.48ms 2.79 GB 1879.05GB/s
====================
TRITON KERNELS BANDWIDTH INFO (./targets/54/c5443b7pdjj3aeabbbp3qgeklde4nhum4vqd3ihavfschzw6nv62.py)
0.160ms 0.453 GB 2829.64GB/s 10.88% triton_red_fused_5
0.151ms 0.201 GB 1331.97GB/s 10.27% triton_poi_fused_gelu_19
0.108ms 0.283 GB 2632.59GB/s 7.31% triton_poi_fused_cat_30
0.094ms 0.123 GB 1309.96GB/s 6.36% triton_red_fused__scaled_dot_product_flash_attention__to_copy_mean_mul_pow_28
0.084ms 0.227 GB 2685.89GB/s 5.73% triton_red_fused_29
0.081ms 0.227 GB 2788.46GB/s 5.52% triton_red_fused_25
0.074ms 0.113 GB 1534.25GB/s 5.02% triton_red_fused_add_mul_native_layer_norm_31
0.062ms 0.123 GB 1964.07GB/s 4.24% triton_poi_fused__scaled_dot_product_flash_attention__to_copy_add_cat_mul_14
0.062ms 0.101 GB 1632.75GB/s 4.19% triton_red_fused_add_mul_native_layer_norm_17
0.061ms 0.101 GB 1654.08GB/s 4.14% triton_red_fused_add_mul_native_layer_norm_23
0.061ms 0.101 GB 1662.01GB/s 4.12% triton_red_fused_add_mul_native_layer_norm_20
0.057ms 0.085 GB 1501.49GB/s 3.85% triton_poi_fused_add_mul_33
0.055ms 0.114 GB 2067.58GB/s 3.73% triton_red_fused_add_cat_mul_native_layer_norm_27
0.027ms 0.057 GB 2100.94GB/s 1.85% triton_poi_fused_cat_13
0.027ms 0.057 GB 2099.49GB/s 1.83% triton_red_fused_add_4
0.026ms 0.057 GB 2149.05GB/s 1.79% triton_red_fused_26
0.025ms 0.057 GB 2248.38GB/s 1.71% triton_poi_fused__scaled_dot_product_flash_attention__to_copy_cat_15
0.025ms 0.050 GB 2058.11GB/s 1.66% triton_red_fused_add_mul_native_layer_norm_6
0.024ms 0.057 GB 2313.04GB/s 1.66% triton_poi_fused_clone_16
0.024ms 0.050 GB 2079.28GB/s 1.65% triton_red_fused_add_mul_native_layer_norm_34
0.024ms 0.025 GB 1049.60GB/s 1.63% triton_poi_fused_gelu_21
0.020ms 0.038 GB 1925.95GB/s 1.33% triton_red_fused_32
0.016ms 0.013 GB 808.73GB/s 1.08% triton_red_fused_add_mul_native_layer_norm_22
0.016ms 0.026 GB 1630.43GB/s 1.07% triton_per_fused__to_copy_mean_pow_12
0.016ms 0.013 GB 814.92GB/s 1.06% triton_red_fused_add_mul_native_layer_norm_24
0.015ms 0.013 GB 821.21GB/s 1.05% triton_red_fused_add_mul_native_layer_norm_18
0.015ms 0.004 GB 272.30GB/s 1.04% triton_poi_fused_stack_10
0.015ms 0.004 GB 277.54GB/s 1.02% triton_poi_fused_stack_9
0.009ms 0.006 GB 712.65GB/s 0.61% triton_red_fused_add_mul_native_layer_norm_7
0.008ms 0.001 GB 149.71GB/s 0.55% triton_poi_fused_stack_8
0.007ms 0.005 GB 654.37GB/s 0.49% triton_per_fused_3
0.007ms 0.002 GB 231.63GB/s 0.47% triton_per_fused_1
0.007ms 0.003 GB 467.44GB/s 0.47% triton_per_fused__to_copy_mean_pow_11
0.005ms 0.000 GB 0.21GB/s 0.33% triton_poi_fused_cat_0
0.005ms 0.000 GB 0.22GB/s 0.31% triton_poi_fused_cat_2
SUMMARY (./targets/54/c5443b7pdjj3aeabbbp3qgeklde4nhum4vqd3ihavfschzw6nv62.py)
1.47ms 2.79 GB 1893.38GB/s