Longer than the other one, but with better odd/even balance and only one shuffle constant. Probably faster.
/*
 * Walk the index buffer 24 indices (three qwords) at a time and hand each
 * group of three indices to func() as a vector of addresses.
 *
 * Layout assumption (TODO confirm against the declaration of `indexes`):
 * indices are 16-bit, so 24 of them fill exactly three 16-byte qwords and each
 * triangle (3 indices) occupies 6 bytes.  Triangle k of the batch therefore
 * starts at byte 6*k of the 48-byte window; vs<k> is that triangle moved to
 * the front of a qword.
 *
 * NOTE(review): all three qwords are loaded even when fewer than 24 indices
 * remain, so the final iteration may read past the last index -- confirm the
 * buffer is padded to a multiple of 48 bytes.
 */
for (j = 0; j < num_indexes; j += 24) {
    qword* lower_qword = (qword*)&indexes[j];
    qword indices0 = lower_qword[0];
    qword indices1 = lower_qword[1];
    qword indices2 = lower_qword[2];

    /* Triangles that lie entirely inside one source qword: a plain byte
     * shift left brings them to the front. */
    qword vs0 = indices0;
    qword vs1 = si_shlqbyi(indices0, 6);
    qword vs3 = si_shlqbyi(indices1, 2);
    qword vs4 = si_shlqbyi(indices1, 8);
    qword vs6 = si_shlqbyi(indices2, 4);
    qword vs7 = si_shlqbyi(indices2, 10);

    /* Triangle 2 straddles indices0/indices1 (byte offset 12): two indices
     * come from the tail of indices0, the third from the head of indices1.
     * Per the SPU ISA, rotqmbyi shifts right by (-imm & 0x1f) bytes, so
     * 12|16 (= 28) is a right shift of 4 bytes, which lines the head of
     * indices1 up with halfword 2.  fsmh(0x20) selects just that halfword
     * from tmp2b.
     *
     * The temporaries are declared once, at first use; a separate
     * uninitialised declaration of the same names in this scope would be a
     * redeclaration error. */
    qword tmp2a = si_shlqbyi(indices0, 12);
    qword tmp2b = si_rotqmbyi(indices1, 12|16);
    qword vs2 = si_selb(tmp2a, tmp2b, si_fsmh(0x20));

    /* Triangle 5 straddles indices1/indices2 (byte offset 30): one index
     * from the tail of indices1, two from the head of indices2.  14|16
     * (= 30) is a right shift of 2 bytes; fsmh(0x60) selects halfwords 1-2
     * from tmp5b. */
    qword tmp5a = si_shlqbyi(indices1, 14);
    qword tmp5b = si_rotqmbyi(indices2, 14|16);
    qword vs5 = si_selb(tmp5a, tmp5b, si_fsmh(0x60));

    /* Spread the three leading halfwords into separate word slots using a
     * single shuffle pattern.  Presumably SHUFB8(0,A,0,B,0,C,0,0)
     * zero-extends each index into its own 32-bit word -- verify against the
     * SHUFB8 macro definition. */
    vs0 = si_shufb(vs0, vs0, SHUFB8(0,A,0,B,0,C,0,0));
    vs1 = si_shufb(vs1, vs1, SHUFB8(0,A,0,B,0,C,0,0));
    vs2 = si_shufb(vs2, vs2, SHUFB8(0,A,0,B,0,C,0,0));
    vs3 = si_shufb(vs3, vs3, SHUFB8(0,A,0,B,0,C,0,0));
    vs4 = si_shufb(vs4, vs4, SHUFB8(0,A,0,B,0,C,0,0));
    vs5 = si_shufb(vs5, vs5, SHUFB8(0,A,0,B,0,C,0,0));
    vs6 = si_shufb(vs6, vs6, SHUFB8(0,A,0,B,0,C,0,0));
    vs7 = si_shufb(vs7, vs7, SHUFB8(0,A,0,B,0,C,0,0));

    /* Multiply-add: index * vertex_sizes + verticess, i.e. turn each index
     * into an address-like value (presumably a pointer to the vertex). */
    vs0 = si_mpya(vs0, vertex_sizes, verticess);
    vs1 = si_mpya(vs1, vertex_sizes, verticess);
    vs2 = si_mpya(vs2, vertex_sizes, verticess);
    vs3 = si_mpya(vs3, vertex_sizes, verticess);
    vs4 = si_mpya(vs4, vertex_sizes, verticess);
    vs5 = si_mpya(vs5, vertex_sizes, verticess);
    vs6 = si_mpya(vs6, vertex_sizes, verticess);
    vs7 = si_mpya(vs7, vertex_sizes, verticess);

    /* Dispatch only the triangles that exist in this batch.  A full batch
     * (24 or more indices remaining) enters at `default` and falls through
     * all eight calls; a short final batch enters at the case matching the
     * remaining index count.  Every fall-through below is intentional. */
    switch(num_indexes - j) {
    default: func(vs7);
        /* fallthrough */
    case 21: func(vs6);
        /* fallthrough */
    case 18: func(vs5);
        /* fallthrough */
    case 15: func(vs4);
        /* fallthrough */
    case 12: func(vs3);
        /* fallthrough */
    case 9: func(vs2);
        /* fallthrough */
    case 6: func(vs1);
        /* fallthrough */
    case 3: func(vs0);
    }
}