Longer than the other one, but with better odd/even balance and only one shuffle constant. Probably faster.
/*
 * Walk the index buffer three quadwords (48 bytes) at a time and split each
 * batch into eight 6-byte groups (vs0..vs7), i.e. eight groups of three
 * 16-bit indexes. Each group is expanded with one shared shuffle pattern,
 * run through a multiply-add against vertex_sizes/verticess, and passed to
 * func().
 *
 * NOTE(review): assumes indexes[] holds 16-bit elements and is 16-byte
 * aligned (the qword* cast and the j += 24 stride only line up in that
 * case) -- confirm against the declaration.
 * NOTE(review): all three quadwords are loaded even when fewer than 24
 * indexes remain, so the final iteration may read past the end of the
 * buffer; the switch below keeps those surplus lanes from reaching func()
 * only when the remaining count is a multiple of 3.
 */
for (j = 0; j < num_indexes; j += 24) {
    qword* lower_qword = (qword*)&indexes[j];
    qword indices0 = lower_qword[0];
    qword indices1 = lower_qword[1];
    qword indices2 = lower_qword[2];

    /* Groups that lie entirely inside one quadword: shift the group's first
     * byte (byte 6*k of the 48-byte batch) down to byte 0. */
    qword vs0 = indices0;                    /* batch bytes  0..5  */
    qword vs1 = si_shlqbyi(indices0, 6);     /* batch bytes  6..11 */
    qword vs3 = si_shlqbyi(indices1, 2);     /* batch bytes 18..23 */
    qword vs4 = si_shlqbyi(indices1, 8);     /* batch bytes 24..29 */
    qword vs6 = si_shlqbyi(indices2, 4);     /* batch bytes 36..41 */
    qword vs7 = si_shlqbyi(indices2, 10);    /* batch bytes 42..47 */

    /* Group 2 straddles indices0/indices1 (batch bytes 12..17).
     * rotqmbyi with 12|16 = 28 is a right shift by (-28 & 31) = 4 bytes, so
     * the first halfword of indices1 lands in halfword 2; mask 0x20 selects
     * exactly that halfword from tmp2b.
     * The original also declared tmp2a/tmp2b/tmp5a/tmp5b uninitialized just
     * before these definitions, which is a redeclaration error; only the
     * initialized declarations are kept.
     * NOTE(review): si_fsmh is passed a bare integer here and below; confirm
     * the toolchain accepts that (otherwise wrap it, e.g. with si_from_int). */
    qword tmp2a = si_shlqbyi(indices0, 12);
    qword tmp2b = si_rotqmbyi(indices1, 12|16);
    qword vs2 = si_selb(tmp2a, tmp2b, si_fsmh(0x20));

    /* Group 5 straddles indices1/indices2 (batch bytes 30..35).
     * 14|16 = 30 is a right shift by 2 bytes; mask 0x60 takes halfwords 1
     * and 2 from tmp5b. */
    qword tmp5a = si_shlqbyi(indices1, 14);
    qword tmp5b = si_rotqmbyi(indices2, 14|16);
    qword vs5 = si_selb(tmp5a, tmp5b, si_fsmh(0x60));

    /* Same shuffle pattern for every group.
     * NOTE(review): SHUFB8 is defined elsewhere; presumably this spreads the
     * first three halfwords into the low halves of words 0..2 with the other
     * halfwords zeroed -- confirm against the macro. */
    vs0 = si_shufb(vs0, vs0, SHUFB8(0,A,0,B,0,C,0,0));
    vs1 = si_shufb(vs1, vs1, SHUFB8(0,A,0,B,0,C,0,0));
    vs2 = si_shufb(vs2, vs2, SHUFB8(0,A,0,B,0,C,0,0));
    vs3 = si_shufb(vs3, vs3, SHUFB8(0,A,0,B,0,C,0,0));
    vs4 = si_shufb(vs4, vs4, SHUFB8(0,A,0,B,0,C,0,0));
    vs5 = si_shufb(vs5, vs5, SHUFB8(0,A,0,B,0,C,0,0));
    vs6 = si_shufb(vs6, vs6, SHUFB8(0,A,0,B,0,C,0,0));
    vs7 = si_shufb(vs7, vs7, SHUFB8(0,A,0,B,0,C,0,0));

    /* Per-word multiply-add: vs = vs * vertex_sizes + verticess.
     * NOTE(review): mpya multiplies the low 16 bits of each word as signed
     * values; presumably this turns each index into a vertex address --
     * confirm, and confirm indexes never exceed 32767. */
    vs0 = si_mpya(vs0, vertex_sizes, verticess);
    vs1 = si_mpya(vs1, vertex_sizes, verticess);
    vs2 = si_mpya(vs2, vertex_sizes, verticess);
    vs3 = si_mpya(vs3, vertex_sizes, verticess);
    vs4 = si_mpya(vs4, vertex_sizes, verticess);
    vs5 = si_mpya(vs5, vertex_sizes, verticess);
    vs6 = si_mpya(vs6, vertex_sizes, verticess);
    vs7 = si_mpya(vs7, vertex_sizes, verticess);

    /* Dispatch only the groups that are still inside the buffer. Every case
     * falls through on purpose, so the groups are emitted from the highest
     * valid one down to vs0. Any remaining count without its own label
     * (24 or more, or one that is not a multiple of 3) takes the default
     * path and emits all eight groups. */
    switch (num_indexes - j) {
    default: func(vs7); /* fall through */
    case 21: func(vs6); /* fall through */
    case 18: func(vs5); /* fall through */
    case 15: func(vs4); /* fall through */
    case 12: func(vs3); /* fall through */
    case 9:  func(vs2); /* fall through */
    case 6:  func(vs1); /* fall through */
    case 3:  func(vs0);
    }
}