#ifndef INVERSE_V6_H #define INVERSE_V5_H #define SHUFFLE_FLOAT_X 0x00, 0x01, 0x02, 0x03 #define SHUFFLE_FLOAT_Y 0x04, 0x05, 0x06, 0x07 #define SHUFFLE_FLOAT_Z 0x08, 0x09, 0x0A, 0x0B #define SHUFFLE_FLOAT_W 0x0C, 0x0D, 0x0E, 0x0F #define SHUFFLE_FLOAT_A 0x10, 0x11, 0x12, 0x13 #define SHUFFLE_FLOAT_B 0x14, 0x15, 0x16, 0x17 #define SHUFFLE_FLOAT_C 0x18, 0x19, 0x1A, 0x1B #define SHUFFLE_FLOAT_D 0x1C, 0x1D, 0x1E, 0x1F #define SHUFFLE_MERGE4(A, B, C, D) (vector unsigned char)(A, B, C, D) #define SHUFFLE_YZWX SHUFFLE_MERGE4(SHUFFLE_FLOAT_Y, SHUFFLE_FLOAT_Z, SHUFFLE_FLOAT_W, SHUFFLE_FLOAT_X) #define SHUFFLE_ZWXY SHUFFLE_MERGE4(SHUFFLE_FLOAT_Z, SHUFFLE_FLOAT_W, SHUFFLE_FLOAT_X, SHUFFLE_FLOAT_Y) #define SHUFFLE_WXYZ SHUFFLE_MERGE4(SHUFFLE_FLOAT_W, SHUFFLE_FLOAT_X, SHUFFLE_FLOAT_Y, SHUFFLE_FLOAT_Z) #define SHUFFLE_XAYB SHUFFLE_MERGE4(SHUFFLE_FLOAT_X, SHUFFLE_FLOAT_A, SHUFFLE_FLOAT_Y, SHUFFLE_FLOAT_B) #define SHUFFLE_ZCWD SHUFFLE_MERGE4(SHUFFLE_FLOAT_Z, SHUFFLE_FLOAT_C, SHUFFLE_FLOAT_W, SHUFFLE_FLOAT_D) inline void inverse_v6(s_matrix_spu * const restrict output, const s_matrix_spu * const restrict source) { // InvM = (1/det(M)) * Transpose(Cofactor(M)) //======================================================================== // Calcul of the Cofactor Matrix //======================================================================== // Let's keep the calcul the det of the 3x3 matrix in mind: // // [ a b c ] // [ d e f ] = aei - ahf + dhc - dbi + gbf - gec = (aei + dhc + gbf) - (ahf + dbi + gec) // [ g h i ] // // By looking at the version 2 of the code, code we can see that the // multiplication to get the positive and negative part of the cofactor is // done using a rotation of the rows. const vector float c0 = source->cols[0]; const vector float c1 = source->cols[1]; const vector float c2 = source->cols[2]; const vector float c3 = source->cols[3]; // Load the shuffling pattern: const vector unsigned char patternYZWX = SHUFFLE_YZWX; const vector unsigned char patternZWXY = SHUFFLE_ZWXY; const vector unsigned char patternWXYZ = SHUFFLE_WXYZ; const vector unsigned char patternXAYB = SHUFFLE_XAYB; const vector unsigned char patternZCWD = SHUFFLE_ZCWD; // between the different rows Let's create the different roration, using the following notatoin: // rNuM = row N rotate up by M float. const vector float c0u1 = spu_shuffle(c0, c0, patternYZWX); const vector float c0u2 = spu_shuffle(c0, c0, patternZWXY); const vector float c0u3 = spu_shuffle(c0, c0, patternWXYZ); const vector float c1u1 = spu_shuffle(c1, c1, patternYZWX); const vector float c1u2 = spu_shuffle(c1, c1, patternZWXY); const vector float c1u3 = spu_shuffle(c1, c1, patternWXYZ); const vector float c2u1 = spu_shuffle(c2, c2, patternYZWX); const vector float c2u2 = spu_shuffle(c2, c2, patternZWXY); const vector float c2u3 = spu_shuffle(c2, c2, patternWXYZ); const vector float c3u1 = spu_shuffle(c3, c3, patternYZWX); const vector float c3u2 = spu_shuffle(c3, c3, patternZWXY); const vector float c3u3 = spu_shuffle(c3, c3, patternWXYZ); // Some useful constant: const vector unsigned int u_znzn = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; const vector unsigned int u_nznz = { 0x80000000, 0x00000000, 0x80000000, 0x00000000 }; const vector float znzn = (const vector float)u_znzn; const vector float nznz = (const vector float)u_nznz; //------------------------------------------------------------------------- // Column0: col0 = c1, col1 = c2, col2 = c3 //----------------------------------------- // pos_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row1] * mat->cols[col2].row[row2]; // aei const vector float m_c2u2_c3u3 = spu_mul(c2u2, c3u3); const vector float m_c1u1_c2u2_c3u3 = spu_mul(c1u1, m_c2u2_c3u3); // pos_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row2] * mat->cols[col2].row[row0]; // dhc const vector float m_c2u3_c3u1 = spu_mul(c2u3, c3u1); const vector float a_m_c1u2_c2u3_c3u1_m_c1u1_c2u2_c3u3 = spu_madd(c1u2, m_c2u3_c3u1, m_c1u1_c2u2_c3u3); // dhc + aei // pos_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row0] * mat->cols[col2].row[row1]; // gbf const vector float m_c2u1_c3u2 = spu_shuffle(m_c2u2_c3u3, m_c2u2_c3u3, patternWXYZ); //spu_mul(c2u1, c3u2); const vector float pos_part_det3x3_c0 = spu_madd(c1u3, m_c2u1_c3u2, a_m_c1u2_c2u3_c3u1_m_c1u1_c2u2_c3u3); // aei + dhc + gbf // neg_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row2] * mat->cols[col2].row[row1]; // ahf const vector float m_c2u3_c3u2 = spu_mul(c2u3, c3u2); const vector float m_c1u1_c2u3_c3u2 = spu_mul(c1u1, m_c2u3_c3u2); // neg_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row0] * mat->cols[col2].row[row2]; // dbi const vector float m_c2u1_c3u3 = spu_shuffle(m_c2u3_c3u1, m_c2u3_c3u1, patternZWXY); // spu_mul(c2u1, c3u3); const vector float a_m_c1u2_c2u1_c3u3_m_c1u1_c2u3_c3u2 = spu_madd(c1u2, m_c2u1_c3u3, m_c1u1_c2u3_c3u2); // dbi + ahf // neg_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row1] * mat->cols[col2].row[row0]; // gec const vector float m_c2u2_c3u1 = spu_shuffle(m_c2u3_c3u2, m_c2u3_c3u2, patternWXYZ); // spu_mul(c2u2, c3u1); const vector float neg_part_det3x3_c0 = spu_madd(c1u3, m_c2u2_c3u1, a_m_c1u2_c2u1_c3u3_m_c1u1_c2u3_c3u2); // ahf + dbi + gec // det3x3 = pos_part - neg_part; const vector float det3x3_c0 = spu_sub(pos_part_det3x3_c0, neg_part_det3x3_c0); // Column1: col0 = c0, col1 = c2, col2 = c3 //----------------------------------------- // pos_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row1] * mat->cols[col2].row[row2]; // aei const vector float m_c0u1_c2u2_c3u3 = spu_mul(c0u1, m_c2u2_c3u3); // pos_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row2] * mat->cols[col2].row[row0]; // dhc const vector float a_m_c0u2_c2u3_c3u1_m_c0u1_c2u2_c3u3 = spu_madd(c0u2, m_c2u3_c3u1, m_c0u1_c2u2_c3u3); // dhc + aei // pos_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row0] * mat->cols[col2].row[row1]; // gbf const vector float pos_part_det3x3_c1 = spu_madd(c0u3, m_c2u1_c3u2, a_m_c0u2_c2u3_c3u1_m_c0u1_c2u2_c3u3); // aei + dhc + gbf // neg_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row2] * mat->cols[col2].row[row1]; // ahf const vector float m_c0u1_c2u3_c3u2 = spu_mul(c0u1, m_c2u3_c3u2); // neg_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row0] * mat->cols[col2].row[row2]; // dbi const vector float a_m_c0u2_c2u1_c3u3_m_c0u1_c2u3_c3u2 = spu_madd(c0u2, m_c2u1_c3u3, m_c0u1_c2u3_c3u2); // dbi + ahf // neg_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row1] * mat->cols[col2].row[row0]; // gec const vector float neg_part_det3x3_c1 = spu_madd(c0u3, m_c2u2_c3u1, a_m_c0u2_c2u1_c3u3_m_c0u1_c2u3_c3u2); // ahf + dbi + gec // det3x3 = pos_part - neg_part; const vector float det3x3_c1 = spu_sub(pos_part_det3x3_c1, neg_part_det3x3_c1); // Column3: col2 = c0, col1 = c1, col2 = c2 //----------------------------------------- // pos_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row1] * mat->cols[col2].row[row2]; // aei const vector float m_c0u1_c1u2 = spu_mul(c0u1, c1u2); const vector float m_c0u1_c1u2_c2u3 = spu_mul(m_c0u1_c1u2, c2u3); // aei // pos_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row2] * mat->cols[col2].row[row0]; // dhc const vector float m_c0u2_c1u3 = spu_shuffle(m_c0u1_c1u2, m_c0u1_c1u2, patternYZWX); // spu_mul(c0u2, c1u3); const vector float a_m_c0u2_c1u3_c2u1_m_c0u1_c1u2_c2u3 = spu_madd(m_c0u2_c1u3, c2u1, m_c0u1_c1u2_c2u3); // dhc + aei // pos_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row0] * mat->cols[col2].row[row1]; // gbf const vector float m_c0u3_c1u1 = spu_mul(c0u3, c1u1); const vector float pos_part_det3x3_c3 = spu_madd(m_c0u3_c1u1, c2u2, a_m_c0u2_c1u3_c2u1_m_c0u1_c1u2_c2u3); // aei + dhc + gbf // neg_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row2] * mat->cols[col2].row[row1]; // ahf const vector float m_c0u1_c1u3 = spu_shuffle(m_c0u3_c1u1, m_c0u3_c1u1, patternZWXY); // spu_mul(c0u1, c1u3); const vector float m_c0u1_c1u3_c2u2 = spu_mul(m_c0u1_c1u3, c2u2); // neg_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row0] * mat->cols[col2].row[row2]; // dbi const vector float m_c0u2_c1u1 = spu_mul(c0u2, c1u1); const vector float a_m_c0u2_c1u1_c2u3_m_c0u1_c1u3_c2u2 = spu_madd(m_c0u2_c1u1, c2u3, m_c0u1_c1u3_c2u2); // dbi + ahf // neg_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row1] * mat->cols[col2].row[row0]; // gec const vector float m_c0u3_c1u2 = spu_shuffle(m_c0u2_c1u1, m_c0u2_c1u1, patternYZWX); // spu_mul(c0u3, c1u2); const vector float neg_part_det3x3_c3 = spu_madd(m_c0u3_c1u2, c2u1, a_m_c0u2_c1u1_c2u3_m_c0u1_c1u3_c2u2); // ahf + dbi + gec // det3x3 = pos_part - neg_part; const vector float det3x3_c3 = spu_sub(pos_part_det3x3_c3, neg_part_det3x3_c3); // Column2: col2 = c0, col1 = c1, col2 = c3 //----------------------------------------- // pos_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row1] * mat->cols[col2].row[row2]; // aei const vector float m_c0u1_c1u2_c3u3 = spu_mul(m_c0u1_c1u2, c3u3); // aei // pos_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row2] * mat->cols[col2].row[row0]; // dhc const vector float a_m_c0u2_c1u3_c3u1_m_c0u1_c1u2_c3u3 = spu_madd(m_c0u2_c1u3, c3u1, m_c0u1_c1u2_c3u3); // dhc + aei // pos_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row0] * mat->cols[col2].row[row1]; // gbf const vector float pos_part_det3x3_c2 = spu_madd(m_c0u3_c1u1, c3u2, a_m_c0u2_c1u3_c3u1_m_c0u1_c1u2_c3u3); // aei + dhc + gbf // neg_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row2] * mat->cols[col2].row[row1]; // ahf const vector float m_c0u1_c1u3_c3u2 = spu_mul(m_c0u1_c1u3, c3u2); // neg_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row0] * mat->cols[col2].row[row2]; // dbi const vector float a_m_c0u2_c1u1_c3u3_m_c0u1_c1u3_c3u2 = spu_madd(m_c0u2_c1u1, c3u3, m_c0u1_c1u3_c3u2); // dbi + ahf // neg_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row1] * mat->cols[col2].row[row0]; // gec const vector float neg_part_det3x3_c2 = spu_madd(m_c0u3_c1u2, c3u1, a_m_c0u2_c1u1_c3u3_m_c0u1_c1u3_c3u2); // ahf + dbi + gec // det3x3 = pos_part - neg_part; const vector float det3x3_c2 = spu_sub(pos_part_det3x3_c2, neg_part_det3x3_c2); //------------------------------------------------------------------------- // Fix the sign of each column: const vector float cofactor_c0 = spu_xor(det3x3_c0, znzn); const vector float cofactor_c1 = spu_xor(det3x3_c1, nznz); const vector float cofactor_c2 = spu_xor(det3x3_c2, znzn); const vector float cofactor_c3 = spu_xor(det3x3_c3, nznz); //======================================================================== // Calcul of 1 / Determinant //======================================================================== const vector float sc0_ac0 = spu_mul(c0, cofactor_c0); const vector float up1 = spu_shuffle(sc0_ac0, sc0_ac0, patternYZWX); const vector float oddeven = spu_add(sc0_ac0, up1); const vector float up2 = spu_shuffle(oddeven, oddeven, patternZWXY); const vector float det = spu_add(up2, oddeven); const vector float oodet = spu_re(det); //======================================================================== // Calcul of the transpose of the Cofactor matrix //======================================================================== const vector float hi_part_c0_c2 = spu_shuffle(cofactor_c0, cofactor_c2, patternXAYB ); const vector float hi_part_c1_c3 = spu_shuffle(cofactor_c1, cofactor_c3, patternXAYB ); const vector float lo_part_c0_c2 = spu_shuffle(cofactor_c0, cofactor_c2, patternZCWD ); const vector float lo_part_c1_c3 = spu_shuffle(cofactor_c1, cofactor_c3, patternZCWD); const vector float adjoint_c0 = spu_shuffle(hi_part_c0_c2, hi_part_c1_c3, patternXAYB ); const vector float adjoint_c1 = spu_shuffle(hi_part_c0_c2, hi_part_c1_c3, patternZCWD); const vector float adjoint_c2 = spu_shuffle(lo_part_c0_c2, lo_part_c1_c3, patternXAYB ); const vector float adjoint_c3 = spu_shuffle(lo_part_c0_c2, lo_part_c1_c3, patternZCWD); //======================================================================== // Final step the transposed matrix time oodet //======================================================================== output->cols[0] = spu_mul(adjoint_c0, oodet); output->cols[1] = spu_mul(adjoint_c1, oodet); output->cols[2] = spu_mul(adjoint_c2, oodet); output->cols[3] = spu_mul(adjoint_c3, oodet); } #endif // INVERSE_V6_H