#ifndef INVERSE_V2_H #define INVERSE_V2_H typedef union { float f; int i; } signadj; inline void cofactor_column_v2(float cols[4], const s_matrix * const restrict mat, unsigned int col) { static const unsigned int sel0[] = { 1,0,0,0 }; static const unsigned int sel1[] = { 2,2,1,1 }; static const unsigned int sel2[] = { 3,3,3,2 }; // Let's first define the 3x3 matrix: const unsigned int col0 = sel0[col]; const unsigned int col1 = sel1[col]; const unsigned int col2 = sel2[col]; static const unsigned int znzn[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; static const unsigned int nznz[] = { 0x80000000, 0x00000000, 0x80000000, 0x00000000 }; // Computer the det of the 3x3 matrix: // // [ a b c ] // [ d e f ] = aei - ahf + dhc - dbi + gbf - gec = (aei + dhc + gbf) - (ahf + dbi + gec) // [ g h i ] // // pos_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row1] * mat->cols[col2].row[row2]; // aei const float r0_pos_part1 = mat->cols[col0].row[1] * mat->cols[col1].row[2] * mat->cols[col2].row[3]; const float r1_pos_part1 = mat->cols[col0].row[2] * mat->cols[col1].row[3] * mat->cols[col2].row[0]; const float r2_pos_part1 = mat->cols[col0].row[3] * mat->cols[col1].row[0] * mat->cols[col2].row[1]; const float r3_pos_part1 = mat->cols[col0].row[0] * mat->cols[col1].row[1] * mat->cols[col2].row[2]; // pos_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row2] * mat->cols[col2].row[row0]; // dhc const float r0_pos_part2 = mat->cols[col0].row[2] * mat->cols[col1].row[3] * mat->cols[col2].row[1]; const float r1_pos_part2 = mat->cols[col0].row[3] * mat->cols[col1].row[0] * mat->cols[col2].row[2]; const float r2_pos_part2 = mat->cols[col0].row[0] * mat->cols[col1].row[1] * mat->cols[col2].row[3]; const float r3_pos_part2 = mat->cols[col0].row[1] * mat->cols[col1].row[2] * mat->cols[col2].row[0]; // pos_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row0] * mat->cols[col2].row[row1]; // gbf const float r0_pos_part3 = mat->cols[col0].row[3] * mat->cols[col1].row[1] * mat->cols[col2].row[2]; const float r1_pos_part3 = mat->cols[col0].row[0] * mat->cols[col1].row[2] * mat->cols[col2].row[3]; const float r2_pos_part3 = mat->cols[col0].row[1] * mat->cols[col1].row[3] * mat->cols[col2].row[0]; const float r3_pos_part3 = mat->cols[col0].row[2] * mat->cols[col1].row[0] * mat->cols[col2].row[1]; // neg_part1 = mat->cols[col0].row[row0] * mat->cols[col1].row[row2] * mat->cols[col2].row[row1]; // ahf const float r0_neg_part1 = mat->cols[col0].row[1] * mat->cols[col1].row[3] * mat->cols[col2].row[2]; const float r1_neg_part1 = mat->cols[col0].row[2] * mat->cols[col1].row[0] * mat->cols[col2].row[3]; const float r2_neg_part1 = mat->cols[col0].row[3] * mat->cols[col1].row[1] * mat->cols[col2].row[0]; const float r3_neg_part1 = mat->cols[col0].row[0] * mat->cols[col1].row[2] * mat->cols[col2].row[1]; // neg_part2 = mat->cols[col0].row[row1] * mat->cols[col1].row[row0] * mat->cols[col2].row[row2]; // dbi const float r0_neg_part2 = mat->cols[col0].row[2] * mat->cols[col1].row[1] * mat->cols[col2].row[3]; const float r1_neg_part2 = mat->cols[col0].row[3] * mat->cols[col1].row[2] * mat->cols[col2].row[0]; const float r2_neg_part2 = mat->cols[col0].row[0] * mat->cols[col1].row[3] * mat->cols[col2].row[1]; const float r3_neg_part2 = mat->cols[col0].row[1] * mat->cols[col1].row[0] * mat->cols[col2].row[2]; // neg_part3 = mat->cols[col0].row[row2] * mat->cols[col1].row[row1] * mat->cols[col2].row[row0]; // gec const float r0_neg_part3 = mat->cols[col0].row[3] * mat->cols[col1].row[2] * mat->cols[col2].row[1]; const float r1_neg_part3 = mat->cols[col0].row[0] * mat->cols[col1].row[3] * mat->cols[col2].row[2]; const float r2_neg_part3 = mat->cols[col0].row[1] * mat->cols[col1].row[0] * mat->cols[col2].row[3]; const float r3_neg_part3 = mat->cols[col0].row[2] * mat->cols[col1].row[1] * mat->cols[col2].row[0]; // pos_part = pos_part1 + pos_part2 + pos_part3; const float r0_pos_part = r0_pos_part1 + r0_pos_part2 + r0_pos_part3; const float r1_pos_part = r1_pos_part1 + r1_pos_part2 + r1_pos_part3; const float r2_pos_part = r2_pos_part1 + r2_pos_part2 + r2_pos_part3; const float r3_pos_part = r3_pos_part1 + r3_pos_part2 + r3_pos_part3; // neg_part = neg_part1 + neg_part2 + neg_part3; const float r0_neg_part = r0_neg_part1 + r0_neg_part2 + r0_neg_part3; const float r1_neg_part = r1_neg_part1 + r1_neg_part2 + r1_neg_part3; const float r2_neg_part = r2_neg_part1 + r2_neg_part2 + r2_neg_part3; const float r3_neg_part = r3_neg_part1 + r3_neg_part2 + r3_neg_part3; // det3x3 = pos_part - neg_part; const float r0_det3x3 = r0_pos_part - r0_neg_part; const float r1_det3x3 = r1_pos_part - r1_neg_part; const float r2_det3x3 = r2_pos_part - r2_neg_part; const float r3_det3x3 = r3_pos_part - r3_neg_part; // Now let's adjust the sign of the cofactor: signadj r0_cofactor; signadj r1_cofactor; signadj r2_cofactor; signadj r3_cofactor; // First we choose the mask depending on the column: const unsigned int col_mask = (const unsigned int)(((const int)((col & 1) << 31)) >> 31); const unsigned int u_znzn = (const unsigned int)(&znzn[0]); const unsigned int u_nznz = (const unsigned int)(&nznz[0]); union { unsigned int u; unsigned int* p; } mask; mask.u = (u_nznz&col_mask)|(u_znzn&~col_mask); r0_cofactor.f = r0_det3x3; r1_cofactor.f = r1_det3x3; r2_cofactor.f = r2_det3x3; r3_cofactor.f = r3_det3x3; r0_cofactor.i ^= mask.p[0]; r1_cofactor.i ^= mask.p[1]; r2_cofactor.i ^= mask.p[2]; r3_cofactor.i ^= mask.p[3]; // Let's fill the output buffer: cols[0] = r0_cofactor.f; cols[1] = r1_cofactor.f; cols[2] = r2_cofactor.f; cols[3] = r3_cofactor.f; } inline void cofactor_v2(s_matrix * const restrict output, const s_matrix * const restrict source) { cofactor_column_v2(output->cols[0].row, source, 0); cofactor_column_v2(output->cols[1].row, source, 1); cofactor_column_v2(output->cols[2].row, source, 2); cofactor_column_v2(output->cols[3].row, source, 3); } inline float determinant_v2(const s_matrix * const restrict source, const s_matrix * const restrict cofactor) { return source->cols[0].row[0] * cofactor->cols[0].row[0] + source->cols[1].row[0] * cofactor->cols[1].row[0] + source->cols[2].row[0] * cofactor->cols[2].row[0] + source->cols[3].row[0] * cofactor->cols[3].row[0]; } inline void transpose_v2(s_matrix * const restrict output, const s_matrix * const restrict source) { output->cols[0].row[0] = source->cols[0].row[0]; output->cols[1].row[0] = source->cols[0].row[1]; output->cols[2].row[0] = source->cols[0].row[2]; output->cols[3].row[0] = source->cols[0].row[3]; output->cols[0].row[1] = source->cols[1].row[0]; output->cols[1].row[1] = source->cols[1].row[1]; output->cols[2].row[1] = source->cols[1].row[2]; output->cols[3].row[1] = source->cols[1].row[3]; output->cols[0].row[2] = source->cols[2].row[0]; output->cols[1].row[2] = source->cols[2].row[1]; output->cols[2].row[2] = source->cols[2].row[2]; output->cols[3].row[2] = source->cols[2].row[3]; output->cols[0].row[3] = source->cols[3].row[0]; output->cols[1].row[3] = source->cols[3].row[1]; output->cols[2].row[3] = source->cols[3].row[2]; output->cols[3].row[3] = source->cols[3].row[3]; } inline void mul_v2(s_matrix * const restrict output, const s_matrix * const restrict source, const float factor) { output->cols[0].row[0] = source->cols[0].row[0] * factor; output->cols[1].row[0] = source->cols[1].row[0] * factor; output->cols[2].row[0] = source->cols[2].row[0] * factor; output->cols[3].row[0] = source->cols[3].row[0] * factor; output->cols[0].row[1] = source->cols[0].row[1] * factor; output->cols[1].row[1] = source->cols[1].row[1] * factor; output->cols[2].row[1] = source->cols[2].row[1] * factor; output->cols[3].row[1] = source->cols[3].row[1] * factor; output->cols[0].row[2] = source->cols[0].row[2] * factor; output->cols[1].row[2] = source->cols[1].row[2] * factor; output->cols[2].row[2] = source->cols[2].row[2] * factor; output->cols[3].row[2] = source->cols[3].row[2] * factor; output->cols[0].row[3] = source->cols[0].row[3] * factor; output->cols[1].row[3] = source->cols[1].row[3] * factor; output->cols[2].row[3] = source->cols[2].row[3] * factor; output->cols[3].row[3] = source->cols[3].row[3] * factor; } inline void inverse_v2(s_matrix * const restrict output, const s_matrix * const restrict source) { s_matrix cof; s_matrix adj; float oodet; // InvM = (1/det(M)) * Transpose(Cofactor(M)) cofactor_v2(&cof, source); oodet = 1.0f / determinant_v2(source, &cof); transpose_v2(&adj, &cof); mul_v2(output, &adj, oodet); //dump_matrix("Cofactor", &cof); } #endif // INVERSE_V2_H