#ifndef INVERSE_V3_H #define INVERSE_V3_H inline void cofactor_v3(s_matrix_ppu * const restrict output, const s_matrix_ppu * const restrict source) { // Let's keep the calcul the det of the 3x3 matrix in mind: // // [ a b c ] // [ d e f ] = aei - ahf + dhc - dbi + gbf - gec = (aei + dhc + gbf) - (ahf + dbi + gec) // [ g h i ] // // By looking at the version 2 of the code, code we can see that the // multiplication to get the positive and negative part of the cofactor is // done using a rotation of the rows. const vector float zero = vec_xor(zero, zero); const vector float c0 = source->cols[0]; const vector float c1 = source->cols[1]; const vector float c2 = source->cols[2]; const vector float c3 = source->cols[3]; // between the different rows Let's create the different roration, using the following notatoin: // rNuM = row N rotate up by M float. const vector float c0u1 = vec_sld(c0, c0, 4); const vector float c0u2 = vec_sld(c0, c0, 8); const vector float c0u3 = vec_sld(c0, c0, 12); const vector float c1u1 = vec_sld(c1, c1, 4); const vector float c1u2 = vec_sld(c1, c1, 8); const vector float c1u3 = vec_sld(c1, c1, 12); const vector float c2u1 = vec_sld(c2, c2, 4); const vector float c2u2 = vec_sld(c2, c2, 8); const vector float c2u3 = vec_sld(c2, c2, 12); const vector float c3u1 = vec_sld(c3, c3, 4); const vector float c3u2 = vec_sld(c3, c3, 8); const vector float c3u3 = vec_sld(c3, c3, 12); // Some useful constant: const vector unsigned int u_znzn = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; const vector unsigned int u_nznz = { 0x80000000, 0x00000000, 0x80000000, 0x00000000 }; const vector float znzn = (const vector float)u_znzn; const vector float nznz = (const vector float)u_nznz; //------------------------------------------------------------------------- // Column0: col0 = c1, col1 = c2, col2 = c3 //----------------------------------------- // pos_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row1] * mat->cols[col2].row[row2]; // aei const vector float m_c2u2_c3u3 = vec_madd(c2u2, c3u3, zero); const vector float m_c1u1_c2u2_c3u3 = vec_madd(c1u1, m_c2u2_c3u3, zero); // pos_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row2] * mat->cols[col2].row[row0]; // dhc const vector float m_c2u3_c3u1 = vec_madd(c2u3, c3u1, zero); const vector float a_m_c1u2_c2u3_c3u1_m_c1u1_c2u2_c3u3 = vec_madd(c1u2, m_c2u3_c3u1, m_c1u1_c2u2_c3u3); // dhc + aei // pos_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row0] * mat->cols[col2].row[row1]; // gbf const vector float m_c2u1_c3u2 = vec_madd(c2u1, c3u2, zero); const vector float pos_part_det3x3_c0 = vec_madd(c1u3, m_c2u1_c3u2, a_m_c1u2_c2u3_c3u1_m_c1u1_c2u2_c3u3); // aei + dhc + gbf // neg_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row2] * mat->cols[col2].row[row1]; // ahf const vector float m_c2u3_c3u2 = vec_madd(c2u3, c3u2, zero); const vector float m_c1u1_c2u3_c3u2 = vec_madd(c1u1, m_c2u3_c3u2, zero); // neg_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row0] * mat->cols[col2].row[row2]; // dbi const vector float m_c2u1_c3u3 = vec_madd(c2u1, c3u3, zero); const vector float a_m_c1u2_c2u1_c3u3_m_c1u1_c2u3_c3u2 = vec_madd(c1u2, m_c2u1_c3u3, m_c1u1_c2u3_c3u2); // dbi + ahf // neg_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row1] * mat->cols[col2].row[row0]; // gec const vector float m_c2u2_c3u1 = vec_madd(c2u2, c3u1, zero); const vector float neg_part_det3x3_c0 = vec_madd(c1u3, m_c2u2_c3u1, a_m_c1u2_c2u1_c3u3_m_c1u1_c2u3_c3u2); // ahf + dbi + gec // det3x3 = pos_part - neg_part; const vector float det3x3_c0 = vec_sub(pos_part_det3x3_c0, neg_part_det3x3_c0); // Column1: col0 = c0, col1 = c2, col2 = c3 //----------------------------------------- // pos_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row1] * mat->cols[col2].row[row2]; // aei const vector float m_c0u1_c2u2_c3u3 = vec_madd(c0u1, m_c2u2_c3u3, zero); // pos_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row2] * mat->cols[col2].row[row0]; // dhc const vector float a_m_c0u2_c2u3_c3u1_m_c0u1_c2u2_c3u3 = vec_madd(c0u2, m_c2u3_c3u1, m_c0u1_c2u2_c3u3); // dhc + aei // pos_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row0] * mat->cols[col2].row[row1]; // gbf const vector float pos_part_det3x3_c1 = vec_madd(c0u3, m_c2u1_c3u2, a_m_c0u2_c2u3_c3u1_m_c0u1_c2u2_c3u3); // aei + dhc + gbf // neg_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row2] * mat->cols[col2].row[row1]; // ahf const vector float m_c0u1_c2u3_c3u2 = vec_madd(c0u1, m_c2u3_c3u2, zero); // neg_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row0] * mat->cols[col2].row[row2]; // dbi const vector float a_m_c0u2_c2u1_c3u3_m_c0u1_c2u3_c3u2 = vec_madd(c0u2, m_c2u1_c3u3, m_c0u1_c2u3_c3u2); // dbi + ahf // neg_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row1] * mat->cols[col2].row[row0]; // gec const vector float neg_part_det3x3_c1 = vec_madd(c0u3, m_c2u2_c3u1, a_m_c0u2_c2u1_c3u3_m_c0u1_c2u3_c3u2); // ahf + dbi + gec // det3x3 = pos_part - neg_part; const vector float det3x3_c1 = vec_sub(pos_part_det3x3_c1, neg_part_det3x3_c1); // Column3: col2 = c0, col1 = c1, col2 = c2 //----------------------------------------- // pos_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row1] * mat->cols[col2].row[row2]; // aei const vector float m_c0u1_c1u2 = vec_madd(c0u1, c1u2, zero); const vector float m_c0u1_c1u2_c2u3 = vec_madd(m_c0u1_c1u2, c2u3, zero); // aei // pos_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row2] * mat->cols[col2].row[row0]; // dhc const vector float m_c0u2_c1u3 = vec_madd(c0u2, c1u3, zero); const vector float a_m_c0u2_c1u3_c2u1_m_c0u1_c1u2_c2u3 = vec_madd(m_c0u2_c1u3, c2u1, m_c0u1_c1u2_c2u3); // dhc + aei // pos_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row0] * mat->cols[col2].row[row1]; // gbf const vector float m_c0u3_c1u1 = vec_madd(c0u3, c1u1, zero); const vector float pos_part_det3x3_c3 = vec_madd(m_c0u3_c1u1, c2u2, a_m_c0u2_c1u3_c2u1_m_c0u1_c1u2_c2u3); // aei + dhc + gbf // neg_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row2] * mat->cols[col2].row[row1]; // ahf const vector float m_c0u1_c1u3 = vec_madd(c0u1, c1u3, zero); const vector float m_c0u1_c1u3_c2u2 = vec_madd(m_c0u1_c1u3, c2u2, zero); // neg_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row0] * mat->cols[col2].row[row2]; // dbi const vector float m_c0u2_c1u1 = vec_madd(c0u2, c1u1, zero); const vector float a_m_c0u2_c1u1_c2u3_m_c0u1_c1u3_c2u2 = vec_madd(m_c0u2_c1u1, c2u3, m_c0u1_c1u3_c2u2); // dbi + ahf // neg_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row1] * mat->cols[col2].row[row0]; // gec const vector float m_c0u3_c1u2 = vec_madd(c0u3, c1u2, zero); const vector float neg_part_det3x3_c3 = vec_madd(m_c0u3_c1u2, c2u1, a_m_c0u2_c1u1_c2u3_m_c0u1_c1u3_c2u2); // ahf + dbi + gec // det3x3 = pos_part - neg_part; const vector float det3x3_c3 = vec_sub(pos_part_det3x3_c3, neg_part_det3x3_c3); // Column2: col2 = c0, col1 = c1, col2 = c3 //----------------------------------------- // pos_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row1] * mat->cols[col2].row[row2]; // aei const vector float m_c0u1_c1u2_c3u3 = vec_madd(m_c0u1_c1u2, c3u3, zero); // aei // pos_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row2] * mat->cols[col2].row[row0]; // dhc const vector float a_m_c0u2_c1u3_c3u1_m_c0u1_c1u2_c3u3 = vec_madd(m_c0u2_c1u3, c3u1, m_c0u1_c1u2_c3u3); // dhc + aei // pos_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row0] * mat->cols[col2].row[row1]; // gbf const vector float pos_part_det3x3_c2 = vec_madd(m_c0u3_c1u1, c3u2, a_m_c0u2_c1u3_c3u1_m_c0u1_c1u2_c3u3); // aei + dhc + gbf // neg_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row2] * mat->cols[col2].row[row1]; // ahf const vector float m_c0u1_c1u3_c3u2 = vec_madd(m_c0u1_c1u3, c3u2, zero); // neg_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row0] * mat->cols[col2].row[row2]; // dbi const vector float a_m_c0u2_c1u1_c3u3_m_c0u1_c1u3_c3u2 = vec_madd(m_c0u2_c1u1, c3u3, m_c0u1_c1u3_c3u2); // dbi + ahf // neg_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row1] * mat->cols[col2].row[row0]; // gec const vector float neg_part_det3x3_c2 = vec_madd(m_c0u3_c1u2, c3u1, a_m_c0u2_c1u1_c3u3_m_c0u1_c1u3_c3u2); // ahf + dbi + gec // det3x3 = pos_part - neg_part; const vector float det3x3_c2 = vec_sub(pos_part_det3x3_c2, neg_part_det3x3_c2); //------------------------------------------------------------------------- // Fix the sign of each column: const vector float cofactor_c0 = vec_xor(det3x3_c0, znzn); const vector float cofactor_c1 = vec_xor(det3x3_c1, nznz); const vector float cofactor_c2 = vec_xor(det3x3_c2, znzn); const vector float cofactor_c3 = vec_xor(det3x3_c3, nznz); // Store the result: output->cols[0] = cofactor_c0; output->cols[1] = cofactor_c1; output->cols[2] = cofactor_c2; output->cols[3] = cofactor_c3; } inline vector float determinant_v3(const s_matrix_ppu * const restrict source, const s_matrix_ppu * const restrict cofactor) { const vector float zero = vec_xor(zero, zero); const vector float sc0_ac0 = vec_madd(source->cols[0], cofactor->cols[0], zero); const vector float up1 = vec_sld(sc0_ac0, sc0_ac0,4); const vector float oddeven = vec_add(sc0_ac0, up1); const vector float up2 = vec_sld(oddeven, oddeven, 8); const vector float det = vec_add(up2, oddeven); return det; } inline void transpose_v3(s_matrix_ppu * const restrict output, const s_matrix_ppu * const restrict source) { const vector float c0 = source->cols[0]; const vector float c1 = source->cols[1]; const vector float c2 = source->cols[2]; const vector float c3 = source->cols[3]; const vector float hi_part_c0_c2 = vec_mergeh(c0, c2); const vector float hi_part_c1_c3 = vec_mergeh(c1, c3); const vector float lo_part_c0_c2 = vec_mergel(c0, c2); const vector float lo_part_c1_c3 = vec_mergel(c1, c3); output->cols[0] = vec_mergeh(hi_part_c0_c2, hi_part_c1_c3); output->cols[1] = vec_mergel(hi_part_c0_c2, hi_part_c1_c3); output->cols[2] = vec_mergeh(lo_part_c0_c2, lo_part_c1_c3); output->cols[3] = vec_mergel(lo_part_c0_c2, lo_part_c1_c3); } inline void mul_v3(s_matrix_ppu * const restrict output, const s_matrix_ppu * const restrict source, const vector float factor) { const vector float zero = vec_xor(zero, zero); output->cols[0] = vec_madd(source->cols[0], factor, zero); output->cols[1] = vec_madd(source->cols[1], factor, zero); output->cols[2] = vec_madd(source->cols[2], factor, zero); output->cols[3] = vec_madd(source->cols[3], factor, zero); } inline void inverse_v3(s_matrix_ppu * const restrict output, const s_matrix_ppu * const restrict source) { s_matrix_ppu adj; s_matrix_ppu cof; // InvM = (1/det(M)) * Transpose(Cofactor(M)) cofactor_v3(&cof, source); const vector float det = determinant_v3(source, &cof); const vector float oodet = vec_re(det); transpose_v3(&adj, &cof); mul_v3(output, &adj, oodet); //dump_matrix_ppu("Cofactor", &cof); } #endif // INVERSE_V3_H