Extract three adjacent ushorts from an arbitrary array location.
(Would do a lot better unrolled, I think)
for (j = 0; j < num_indexes; j += 3) {
    // Address of indexes[j], held in a quadword register.
    qword addr = si_from_ptr(&indexes[j]);

    // Byte offset of indexes[j] from 16-byte alignment (&indexes[j] & 15).
    qword byte_off = si_andi(addr, 15);

    // Byte-select mask formed from (1 << byte_off) - 1: it marks the
    // bytes of the merged result that must come from the upper quadword.
    qword k_one = si_from_uint(1);
    qword pow2 = si_shl(k_one, byte_off);
    qword take_upper = si_fsmb(si_sf(k_one, pow2));

    // Load the aligned quadword containing indexes[j] and the quadword
    // 16 bytes after it, since the three indexes may straddle both.
    qword lower_raw = si_lqd(addr, 0);
    qword upper_raw = si_lqd(addr, 16);

    // Slide each quadword so the wanted bytes line up: the lower one is
    // shifted left by byte_off bytes; the upper one goes through rotqmby
    // with a count of (byte_off | 16), intended to shift it right by
    // (16 - byte_off) bytes. This pair of shifts is the heart of the
    // routine and is a likely candidate for further tuning.
    qword lower_part = si_shlqby(lower_raw, byte_off);
    qword upper_part = si_rotqmby(upper_raw, si_ori(byte_off, 16));

    // Merge the two parts; indexes[j], [j+1], [j+2] now lead the quadword.
    qword packed = si_selb(lower_part, upper_part, take_upper);

    // Expand the ushort indexes to uint positioning. SHUFB8 is defined
    // outside this fragment; presumably it builds the halfword shuffle
    // pattern (0 = zero fill, A/B/C = first three halfwords) - confirm.
    qword widened = si_shufb(packed, packed, SHUFB8(0,A,0,B,0,C,0,0));

    // Per slot: index * vertex_size + vertices, handed on to func().
    // NOTE(review): mpya multiplies signed 16-bit operands per the SPU
    // ISA - confirm index values and vertex_size stay within that range.
    qword vertex_addrs = si_mpya(widened,
                                 (qword)spu_splats(vertex_size),
                                 (qword)spu_splats((unsigned)vertices));
    func(vertex_addrs);
}