Unrolled 1

This is the shortest form I’ve found so far, although it does not balance the work well between the odd and even pipelines.

// Each pass loads three quadwords starting at indexes[j], turns them into
// eight shuffled three-element vectors and hands up to eight of those to
// func(); j advances by 24 per pass.
for (j = 0; j < num_indexes; j += 24) {
 qword *src = (qword *)&indexes[j];
 qword in0 = src[0];
 qword in1 = src[1];
 qword in2 = src[2];

 // Copies of each input quadword shifted right by two bytes.
 // NOTE(review): presumably indexes are 16-bit, so the shifted copy lines
 // up the halfwords that si_mpya skips in the unshifted one -- confirm
 // against the declaration of indexes.
 qword in0_shr = si_rotqmbyi(in0, -2);
 qword in1_shr = si_rotqmbyi(in1, -2);
 qword in2_shr = si_rotqmbyi(in2, -2);

 // Multiply-add by vertex_sizes and verticess, for both the plain and the
 // shifted copy of every input quadword.
 qword adr0     = si_mpya(in0,     vertex_sizes, verticess);
 qword adr0_shr = si_mpya(in0_shr, vertex_sizes, verticess);
 qword adr1     = si_mpya(in1,     vertex_sizes, verticess);
 qword adr1_shr = si_mpya(in1_shr, vertex_sizes, verticess);
 qword adr2     = si_mpya(in2,     vertex_sizes, verticess);
 qword adr2_shr = si_mpya(in2_shr, vertex_sizes, verticess);

 // Gather three words into each output. Hardly any shuffle constants can be
 // shared: only SHUFB4(A,B,a,0) is used twice.
 // NOTE(review): SHUFB4 is defined elsewhere; upper/lower case presumably
 // picks a word from the first/second operand -- confirm against the macro.
 qword out0 = si_shufb(adr0_shr, adr0, SHUFB4(A,a,B,0));
 qword out1 = si_shufb(adr0_shr, adr0, SHUFB4(b,C,c,0));

 // out2 and out5 straddle two input quadwords, so they take two shuffles.
 qword out2_head = si_shufb(adr0_shr, adr0, SHUFB4(D,d,0,0));
 qword out2 = si_shufb(out2_head, adr1_shr, SHUFB4(A,B,a,0));

 qword out3 = si_shufb(adr1, adr1_shr, SHUFB4(A,b,B,0));
 qword out4 = si_shufb(adr1, adr1_shr, SHUFB4(c,C,d,0));

 qword out5_head = si_shufb(adr1, adr2_shr, SHUFB4(D,a,0,0));
 qword out5 = si_shufb(out5_head, adr2, SHUFB4(A,B,a,0));

 qword out6 = si_shufb(adr2_shr, adr2, SHUFB4(B,b,C,0));
 qword out7 = si_shufb(adr2_shr, adr2, SHUFB4(c,D,d,0));

 // Deliberate fall-through: a remainder of 3, 6, ... 21 enters part-way
 // down and runs every case below it; any other remainder (24 or more
 // included) starts at default and makes all eight calls.
 switch (num_indexes - j) {
  default:
   func(out7);
   /* fallthrough */
  case 21:
   func(out6);
   /* fallthrough */
  case 18:
   func(out5);
   /* fallthrough */
  case 15:
   func(out4);
   /* fallthrough */
  case 12:
   func(out3);
   /* fallthrough */
  case 9:
   func(out2);
   /* fallthrough */
  case 6:
   func(out1);
   /* fallthrough */
  case 3:
   func(out0);
 }
}

Leave a Reply

Your email address will not be published.