SPU unaligned loads

Extract three adjacent ushorts from an arbitrary array location.

(Would do a lot better unrolled, I think)

// For each group of three consecutive entries of indexes[] (the loop
// advances j by 3), gather the entries from an arbitrarily aligned
// address into one qword, widen them to one value per 32-bit word, and
// pass func() the per-word result of si_mpya(index, vertex_size, vertices).
// NOTE(review): if num_indexes is not a multiple of 3 the final iteration
// also picks up entries past num_indexes - confirm callers pad or guard.
for (j = 0; j < num_indexes; j += 3) {
 // Put the (possibly unaligned) address &indexes[j] into a qword so it
 // can be used both as the load base and as the source of the offset.
 qword lower_qword = si_from_ptr(&indexes[j]);

 // Load the qword containing indexes[j] and the qword 16 bytes after it.
 // NOTE(review): relies on si_lqd ignoring the low 4 address bits so that
 // both loads are 16-byte aligned - confirm against the SPU ISA. The
 // second load is issued even when all three entries fit in the first.
 qword first = si_lqd(lower_qword, 0);
 qword second = si_lqd(lower_qword, 16);

 // Calculate &indexes[j]&15 - offset of index from 16 byte alignment
 qword offset = si_andi(lower_qword, 15);

 // Generate a mask to select the appropriate parts of first and second:
 // form a byte select mask from (1 << offset) - 1.
 // NOTE(review): this presumably marks the last `offset` bytes of the
 // result as coming from second - confirm si_fsmb bit-to-byte ordering.
 qword one = si_from_uint(1);
 qword mask = si_fsmb(si_sf(one, si_shl(one, offset)));

 // Rotate first and second parts to desired locations
 // This is the key interesting bit, but I'd like to
 // think this could be improved upon...
 // NOTE(review): offset|16 appears intended to make si_rotqmby shift
 // second right by (16 - offset) bytes - confirm against the SPU ISA.
 first = si_shlqby(first, offset);
 second = si_rotqmby(second, si_ori(offset, 16));

 // Merge the two shifted halves so that indexes[j],[j+1],[j+2] end up
 // together in is.
 qword is = si_selb(first, second, mask);

 // Expand is to uint positioning
 is = si_shufb(is, is, SHUFB8(0,A,0,B,0,C,0,0));

 // Per word: combine the index with vertex_size and the vertices base
 // via si_mpya (multiply, then add).
 // NOTE(review): si_mpya is believed to be a signed 16-bit multiply, so
 // index values or vertex_size >= 32768 may give wrong results - confirm.
 qword vs = si_mpya(is, (qword)spu_splats(vertex_size),
                 (qword)spu_splats((unsigned)vertices));

 func(vs);
}

Leave a Reply

Your email address will not be published.