Unrolled 2

This version is longer than the other one, but it has better odd/even balance and needs only one shuffle constant, so it is probably faster.

/*
 * Process the index list in batches of 24 indexes: three quadwords are
 * loaded per iteration and split into eight groups (vs0..vs7), each of
 * which is handed to func().
 *
 * NOTE(review): the byte shifts below advance in steps of 6, which is
 * consistent with three 16-bit indexes per group and 16-byte quadwords --
 * confirm against the declaration of `indexes`.
 */
for (j = 0; j < num_indexes; j += 24) {
 qword* lower_qword = (qword*)&indexes[j];
 qword indices0 = lower_qword[0];
 qword indices1 = lower_qword[1];
 qword indices2 = lower_qword[2];

 /* Groups that lie entirely inside one source quadword: a single byte
  * shift moves the group to the front of the register. */
 qword vs0 = indices0;
 qword vs1 = si_shlqbyi(indices0, 6);
 qword vs3 = si_shlqbyi(indices1, 2);
 qword vs4 = si_shlqbyi(indices1, 8);
 qword vs6 = si_shlqbyi(indices2, 4);
 qword vs7 = si_shlqbyi(indices2, 10);

 /* Group 2 straddles indices0/indices1: take the tail of indices0 and
  * merge in the head of indices1 with a halfword select mask.
  * NOTE(review): the `|16` on the rotqmbyi immediate presumably encodes a
  * right shift by (16 - n) bytes -- confirm against the SPU ISA. */
 qword tmp2a = si_shlqbyi(indices0, 12);
 qword tmp2b = si_rotqmbyi(indices1, 12|16);
 qword vs2 = si_selb(tmp2a, tmp2b, si_fsmh(0x20));

 /* Group 5 straddles indices1/indices2 and is assembled the same way. */
 qword tmp5a = si_shlqbyi(indices1, 14);
 qword tmp5b = si_rotqmbyi(indices2, 14|16);
 qword vs5 = si_selb(tmp5a, tmp5b, si_fsmh(0x60));

 /* Every group now starts at the same position, so one shuffle constant
  * serves all eight. Presumably this spreads the three indexes into
  * separate word slots for the multiply below -- see SHUFB8. */
 vs0 = si_shufb(vs0, vs0, SHUFB8(0,A,0,B,0,C,0,0));
 vs1 = si_shufb(vs1, vs1, SHUFB8(0,A,0,B,0,C,0,0));
 vs2 = si_shufb(vs2, vs2, SHUFB8(0,A,0,B,0,C,0,0));
 vs3 = si_shufb(vs3, vs3, SHUFB8(0,A,0,B,0,C,0,0));
 vs4 = si_shufb(vs4, vs4, SHUFB8(0,A,0,B,0,C,0,0));
 vs5 = si_shufb(vs5, vs5, SHUFB8(0,A,0,B,0,C,0,0));
 vs6 = si_shufb(vs6, vs6, SHUFB8(0,A,0,B,0,C,0,0));
 vs7 = si_shufb(vs7, vs7, SHUFB8(0,A,0,B,0,C,0,0));

 /* Multiply-add each group with vertex_sizes and verticess; presumably
  * index * vertex_size + base, i.e. a vertex address per slot. */
 vs0 = si_mpya(vs0, vertex_sizes, verticess);
 vs1 = si_mpya(vs1, vertex_sizes, verticess);
 vs2 = si_mpya(vs2, vertex_sizes, verticess);
 vs3 = si_mpya(vs3, vertex_sizes, verticess);
 vs4 = si_mpya(vs4, vertex_sizes, verticess);
 vs5 = si_mpya(vs5, vertex_sizes, verticess);
 vs6 = si_mpya(vs6, vertex_sizes, verticess);
 vs7 = si_mpya(vs7, vertex_sizes, verticess);

 /* Dispatch on the number of indexes still remaining. Each case
  * deliberately falls through so that exactly the groups that exist are
  * passed to func(), highest group first.
  * NOTE(review): any remainder that is not one of 3..21 in steps of 3
  * (including a full batch of 24 or more) takes the default path and
  * processes all eight groups. */
 switch(num_indexes - j) {
  default: func(vs7); /* fallthrough */
  case 21: func(vs6); /* fallthrough */
  case 18: func(vs5); /* fallthrough */
  case 15: func(vs4); /* fallthrough */
  case 12: func(vs3); /* fallthrough */
  case 9:  func(vs2); /* fallthrough */
  case 6:  func(vs1); /* fallthrough */
  case 3:  func(vs0);
 }
}

Leave a Reply

Your email address will not be published.