Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

几类循环模式的指令集角度观察(2) #33

Open
meton-robean opened this issue Feb 25, 2020 · 2 comments
Open

几类循环模式的指令集角度观察(2) #33

meton-robean opened this issue Feb 25, 2020 · 2 comments

Comments

@meton-robean
Copy link
Owner

meton-robean commented Feb 25, 2020

往期 #31 几类循环模式的指令集角度观察

simple sgemm

这个例子来自论文XLOOP 提供的benchmark

/* Returns the sum of a[t * stride] * b[t * stride] for t in [0, count). */
static int strided_dot_int( const int *a, const int *b, int stride, int count )
{
  int sum = 0;
  for ( int t = 0; t < count; ++t ) {
    sum += a[t * stride] * b[t * stride];
  }
  return sum;
}

/*
 * Scalar integer matrix-multiply kernel over flat arrays.
 *
 * For every pair (m, p) with 0 <= m, p < size this stores
 *   C[m + p*size] = sum over i in [0, size) of A[m + i*size] * B[p + i*size].
 * Nothing is read or written when size <= 0.
 */
void sgemm_scalar_int( int C[], int A[], int B[], int size )
{
  for ( int m = 0; m < size; ++m ) {
    for ( int p = 0; p < size; ++p ) {
      /* A + m and B + p select the starting elements; both walk with stride `size`. */
      C[m + p * size] = strided_dot_int( A + m, B + p, size, size );
    }
  }
}

000000008000110a <sgemm_scalar_int>:
    8000110a:	06d05363          	blez	a3,80001170 <sgemm_scalar_int+0x66>
    8000110e:	1141                	addi	sp,sp,-16
    80001110:	e422                	sd	s0,8(sp)
    80001112:	82aa                	mv	t0,a0
    80001114:	8336                	mv	t1,a3
    80001116:	8432                	mv	s0,a2
    80001118:	8fae                	mv	t6,a1
    8000111a:	00269513          	slli	a0,a3,0x2
    8000111e:	4381                	li	t2,0
    80001120:	8f22                	mv	t5,s0
    80001122:	8e96                	mv	t4,t0
    80001124:	4e01                	li	t3,0
    80001126:	867a                	mv	a2,t5
    80001128:	86fe                	mv	a3,t6
    8000112a:	4781                	li	a5,0
    8000112c:	4581                	li	a1,0

    ## 最内层循环的汇编代码 ------------------------------------
    8000112e:	4298                	lw	a4,0(a3)
    80001130:	00062803          	lw	a6,0(a2)
    80001134:	88be                mv	a7,a5
    80001136:	2785                addiw	a5,a5,1
    80001138:	0307073b          	mulw	a4,a4,a6
    8000113c:	96aa                add	a3,a3,a0
    8000113e:	962a                add	a2,a2,a0
    80001140:	9db9                addw	a1,a1,a4
    80001142:	fef316e3          	bne	t1,a5,8000112e <sgemm_scalar_int+0x24>

    ## 最内层循环的汇编代码 end --------------------------------

    80001146:	00bea023          	sw	a1,0(t4)
    8000114a:	001e079b          	addiw	a5,t3,1
    8000114e:	9eaa                	add	t4,t4,a0
    80001150:	0f11                	addi	t5,t5,4
    80001152:	011e0463          	beq	t3,a7,8000115a <sgemm_scalar_int+0x50>
    80001156:	8e3e                	mv	t3,a5
    80001158:	b7f9                	j	80001126 <sgemm_scalar_int+0x1c>
    8000115a:	0013879b          	addiw	a5,t2,1


    8000115e:	0f91                	addi	t6,t6,4
    80001160:	0291                	addi	t0,t0,4
    80001162:	01c38463          	beq	t2,t3,8000116a <sgemm_scalar_int+0x60>
    80001166:	83be                	mv	t2,a5
    80001168:	bf65                	j	80001120 <sgemm_scalar_int+0x16>
    8000116a:	6422                	ld	s0,8(sp)


    8000116c:	0141                	addi	sp,sp,16
    8000116e:	8082                	ret
    80001170:	8082                	ret
Repository owner locked and limited conversation to collaborators Feb 25, 2020
@meton-robean
Copy link
Owner Author

meton-robean commented Mar 10, 2020

dynamic programming (这个例子来自 xloop 的 dynprog)

 /*
  * Dynamic-programming kernel over flat arrays indexed as [row * length + col].
  *
  * For each i in [0, length - 1) and each j in (i, length):
  *   s               = sum over k in (i, j) of c[i*length + k] + c[k*length + j]
  *   c[i*length + j] = s + W[i*length + j]
  * After finishing each i, c[length - 1] is added to a running total, and that
  * total is stored to *out when the function returns.
  *
  * NOTE(review): `s` and `out_l` are declared int while c, W and *out are
  * long long, so both accumulations are narrowed to int on every update.
  * Confirm this is intended rather than a leftover from an int version.
  */
 __attribute__((noinline))
  void dynprog_scalar_longlong( int length, long long *c, long long *W, long long *out )
  {

    int out_l = 0;  // running total that is written to *out at the end
    int i, j, k;
    for (i = 0; i < length - 1; i++)
    {
      
      for (j = i + 1; j < length; j++)
      {

        int s = 0;  // per-(i, j) partial sum, reset for every j

        // The two fences bracket the innermost loop; presumably they serve as
        // markers for locating that loop in the disassembly - TODO confirm.
        // NOTE(review): this first fence has no "memory" clobber, unlike the
        // one after the loop.
        asm volatile ("fence");
        for (k = i + 1; k < j; k++)
          s += c[i * length + k] + c[k * length + j];
        asm volatile ("fence" ::: "memory");

        c[i * length + j] = s + W[i * length + j];
        

      }
      
      // Accumulate element length - 1 of c once per outer iteration.
      out_l += c[length - 1];
    }

    *out = out_l;
  }
000000008000115e <dynprog_scalar_longlong>:
    8000115e:	7159                	addi	sp,sp,-112
    80001160:	f4a2                	sd	s0,104(sp)
    80001162:	f0a6                	sd	s1,96(sp)
    80001164:	ecca                	sd	s2,88(sp)
    80001166:	e8ce                	sd	s3,80(sp)
    80001168:	e4d2                	sd	s4,72(sp)
    8000116a:	e0d6                	sd	s5,64(sp)
    8000116c:	fc5a                	sd	s6,56(sp)
    8000116e:	f85e                	sd	s7,48(sp)
    80001170:	f462                	sd	s8,40(sp)
    80001172:	f066                	sd	s9,32(sp)
    80001174:	ec6a                	sd	s10,24(sp)
    80001176:	e86e                	sd	s11,16(sp)
    80001178:	4785                	li	a5,1
    8000117a:	e436                	sd	a3,8(sp)
    8000117c:	0ea7de63          	bge	a5,a0,80001278 <dynprog_scalar_longlong+0x11a>
    80001180:	00351813          	slli	a6,a0,0x3
    80001184:	ff880d93          	addi	s11,a6,-8
    80001188:	00050d1b          	sext.w	s10,a0
    8000118c:	9f89                	subw	a5,a5,a0
    8000118e:	8f2a                	mv	t5,a0
    80001190:	8fae                	mv	t6,a1
    80001192:	83b2                	mv	t2,a2
    80001194:	9dae                	add	s11,s11,a1
    80001196:	e03e                	sd	a5,0(sp)
    80001198:	8c6a                	mv	s8,s10
    8000119a:	4b01                	li	s6,0
    8000119c:	4a85                	li	s5,1
    8000119e:	4a01                	li	s4,0
    800011a0:	4b81                	li	s7,0
    800011a2:	4c81                	li	s9,0
    800011a4:	01058993          	addi	s3,a1,16
    800011a8:	000a889b          	sext.w	a7,s5
    800011ac:	82c6                	mv	t0,a7
    800011ae:	09e8d663          	bge	a7,t5,8000123a <dynprog_scalar_longlong+0xdc>
    800011b2:	015a0333          	add	t1,s4,s5
    800011b6:	030e                	slli	t1,t1,0x3
    800011b8:	414c0eb3          	sub	t4,s8,s4
    800011bc:	006f8e33          	add	t3,t6,t1
    800011c0:	0e8e                	slli	t4,t4,0x3
    800011c2:	8972                	mv	s2,t3
    800011c4:	9efe                	add	t4,t4,t6
    800011c6:	ffeb049b          	addiw	s1,s6,-2
    800011ca:	017a0433          	add	s0,s4,s7
    800011ce:	0ff0000f          	fence

    800011d2:	0512d763          	bge	t0,a7,80001220 <dynprog_scalar_longlong+0xc2>
    800011d6:	0114853b          	addw	a0,s1,a7
    800011da:	1502                	slli	a0,a0,0x20
    800011dc:	9101                	srli	a0,a0,0x20
    800011de:	9522                	add	a0,a0,s0
    800011e0:	050e                	slli	a0,a0,0x3
    800011e2:	01d306b3          	add	a3,t1,t4
    800011e6:	954e                	add	a0,a0,s3
    800011e8:	87ca                	mv	a5,s2
    800011ea:	4601                	li	a2,0
    
    ## 最内层循环 ------------------------------------------
    800011ec:	6398                	ld	a4,0(a5)
    800011ee:	628c                	ld	a1,0(a3)
    800011f0:	07a1                	addi	a5,a5,8
    800011f2:	96c2                	add	a3,a3,a6
    800011f4:	9f2d                	addw	a4,a4,a1
    800011f6:	9e39                	addw	a2,a2,a4
    800011f8:	fef51ae3          	bne	a0,a5,800011ec <dynprog_scalar_longlong+0x8e>
    ## 最内层循环 end----------------------------------------
    800011fc:	0ff0000f          	fence


    80001200:	006387b3          	add	a5,t2,t1
    80001204:	639c                	ld	a5,0(a5)
    80001206:	2885                	addiw	a7,a7,1
    80001208:	0321                	addi	t1,t1,8
    8000120a:	963e                	add	a2,a2,a5
    8000120c:	00ce3023          	sd	a2,0(t3)
    80001210:	031f0563          	beq	t5,a7,8000123a <dynprog_scalar_longlong+0xdc>
    80001214:	006f8e33          	add	t3,t6,t1
    80001218:	0ff0000f          	fence
    8000121c:	fb12cde3          	blt	t0,a7,800011d6 <dynprog_scalar_longlong+0x78>
    80001220:	4601                	li	a2,0
    80001222:	0ff0000f          	fence
    80001226:	006387b3          	add	a5,t2,t1
    8000122a:	639c                	ld	a5,0(a5)
    8000122c:	2885                	addiw	a7,a7,1
    8000122e:	0321                	addi	t1,t1,8
    80001230:	963e                	add	a2,a2,a5
    80001232:	00ce3023          	sd	a2,0(t3)
    80001236:	fd1f1fe3          	bne	t5,a7,80001214 <dynprog_scalar_longlong+0xb6>
    8000123a:	000db783          	ld	a5,0(s11)
    8000123e:	3b7d                	addiw	s6,s6,-1
    80001240:	0b85                	addi	s7,s7,1
    80001242:	01978cbb          	addw	s9,a5,s9
    80001246:	6782                	ld	a5,0(sp)
    80001248:	01aa0a3b          	addw	s4,s4,s10
    8000124c:	0a85                	addi	s5,s5,1
    8000124e:	018d0c3b          	addw	s8,s10,s8
    80001252:	f4fb1be3          	bne	s6,a5,800011a8 <dynprog_scalar_longlong+0x4a>
    80001256:	67a2                	ld	a5,8(sp)
    80001258:	7426                	ld	s0,104(sp)
    8000125a:	7486                	ld	s1,96(sp)
    8000125c:	0197b023          	sd	s9,0(a5)
    80001260:	6966                	ld	s2,88(sp)
    80001262:	69c6                	ld	s3,80(sp)
    80001264:	6a26                	ld	s4,72(sp)
    80001266:	6a86                	ld	s5,64(sp)
    80001268:	7b62                	ld	s6,56(sp)
    8000126a:	7bc2                	ld	s7,48(sp)
    8000126c:	7c22                	ld	s8,40(sp)
    8000126e:	7c82                	ld	s9,32(sp)
    80001270:	6d62                	ld	s10,24(sp)
    80001272:	6dc2                	ld	s11,16(sp)
    80001274:	6165                	addi	sp,sp,112
    80001276:	8082                	ret
    80001278:	4c81                	li	s9,0
    8000127a:	bff1                	j	80001256 <dynprog_scalar_longlong+0xf8>

@meton-robean
Copy link
Owner Author

warshall computing 例子来自xloop的benchmark

  /*
   * Barrier placed immediately before and after the innermost loop.
   * On RISC-V it emits a `fence` instruction; elsewhere it degrades to a pure
   * compiler barrier so the kernel still builds. `__asm__` is used instead of
   * `asm` so the code also compiles in strict ISO modes such as -std=c11.
   */
  #if defined(__riscv)
  #define WARSHALL_LOOP_FENCE() __asm__ volatile ("fence" ::: "memory")
  #else
  #define WARSHALL_LOOP_FENCE() __asm__ volatile ("" ::: "memory")
  #endif

  /*
   * Min-plus relaxation over an n x n matrix stored flat as [row * n + col].
   *
   * Copies path_in into path, then for every k, i, j in [0, n) replaces
   * path[i*n + j] with path[i*n + k] + path[k*n + j] unless the current value
   * is strictly smaller than that sum.
   *
   * n       - matrix dimension; the loops do nothing when n <= 0
   * path    - output matrix, n * n floats, overwritten
   * path_in - input matrix, n * n floats, only read
   *
   * The i loop is explicitly braced so that it encloses both fences and the
   * j loop. In the earlier unbraced form the i loop's body was only the first
   * fence, and the j loop then ran with i == n, indexing path[n*n + j] out of
   * bounds. The disassembly that accompanies this snippet was produced from
   * that earlier form.
   */
  __attribute__((noinline))
  void warshall_scalar( int n, float *path, float *path_in )
  {
    // initially copy the input path to path
    memcpy( path, path_in, sizeof(float) * n * n );

    for (int k = 0; k < n; k++)
    {
      for (int i = 0; i < n; i++)
      {
        WARSHALL_LOOP_FENCE();  // the innermost loop contains a conditional
        for (int j = 0; j < n; j++)
        {
          // Candidate value going through intermediate index k.
          const float via_k = path[i*n+k] + path[k*n+j];
          path[i*n+j] = path[i*n+j] < via_k ? path[i*n+j] : via_k;
        }
        WARSHALL_LOOP_FENCE();
      }
    }
  }

    (注:此列表对应 i 循环未加花括号的写法——80001098–800010a0 这个循环的循环体只有一条 fence,其后才是 j 循环。)
    80001098:	0ff0000f          	fence
    8000109c:	86be                	mv	a3,a5
    8000109e:	2785                	addiw	a5,a5,1
    800010a0:	fef41ce3          	bne	s0,a5,80001098 <warshall_scalar+0x50>
    800010a4:	879a                	mv	a5,t1


    800010a6:	00b78733          	add	a4,a5,a1
    800010aa:	9742                	add	a4,a4,a6
    800010ac:	00072787          	flw	fa5,0(a4)
    800010b0:	00062687          	flw	fa3,0(a2)
    800010b4:	0007a707          	flw	fa4,0(a5)
    800010b8:	00d7f7d3          	fadd.s	fa5,fa5,fa3
    800010bc:	a0f71753          	flt.s	a4,fa4,fa5
    800010c0:	e319                	bnez	a4,800010c6 <warshall_scalar+0x7e>

    800010c2:	20f78753          	fmv.s	fa4,fa5
    800010c6:	0791                	addi	a5,a5,4


    800010c8:	fee7ae27          	fsw	fa4,-4(a5)
    800010cc:	fcf51de3          	bne	a0,a5,800010a6 <warshall_scalar+0x5e>
    800010d0:	0ff0000f          	fence

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Projects
None yet
Development

No branches or pull requests

1 participant