// ## cp_fatan-cbe-spu.h (C99) // ## Version 1.0 // ## // ## Copyright (c) 2006 Mike Acton // ## // ## SIGNIFICANT REFERENCES: // ## // ## [1] Cephes Math Library Release 2.8: June, 2000 // ## Copyright 1984, 1995, 2000, Stephen L. Moshier // ## [2] Numerical Computation Guide (PDF) // ## Copyright 2000, Sun Microsystems, Inc. // ## [3] IEEE 754 Support in C99 (PDF) // ## Copyright 2001, Jim Thomas // ## [4] Solaris 10 Reference Manual : atan2(3M) // ## Copyright 1994-2005, Sun Microsystems, Inc. // ## // ## Permission is hereby granted, free of charge, to any person obtaining // ## a copy of this software and associated documentation files // ## (the "Software"), to deal in the Software without restriction, including // ## without limitation the rights to use, copy, modify, merge, publish, // ## distribute, sublicense, and/or sell copies of the Software, and to permit // ## persons to whom the Software is furnished to do so, subject to the // ## following conditions: // ## // ## The above copyright notice and this permission notice shall be included // ## in all copies or substantial portions of the Software. // ## // ## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS // ## OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // ## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // ## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // ## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // ## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // ## THE SOFTWARE. // ## #ifndef CP_FATAN_CBE_SPU_H #define CP_FATAN_CBE_SPU_H #include #include // ## // ## Global Floating-point constants (32 bit) // ## // ## Constant is loaded in each element of 32 bit floating-point vector // ## from local store. // ## // ## cp_flpio4() +PI/+4 // ## cp_flt3p8() tan( +3.0 * PI / +8.0 ) // ## cp_flnpio2() -PI/+2 // ## cp_flpio2() +PI/+2 // ## cp_flpt66() +0.66 // ## cp_flpi() +PI // ## cp_flnpi() -PI extern const vector unsigned int _cp_f_pio4; extern const vector unsigned int _cp_f_t3p8; extern const vector unsigned int _cp_f_npio2; extern const vector unsigned int _cp_f_pio2; extern const vector unsigned int _cp_f_pt66; extern const vector unsigned int _cp_f_pi; extern const vector unsigned int _cp_f_npi; static inline qword cp_flpio4( void ) { return si_lqa( (intptr_t)&_cp_f_pio4 ); } static inline qword cp_flt3p8( void ) { return si_lqa( (intptr_t)&_cp_f_t3p8 ); } static inline qword cp_flnpio2( void ) { return si_lqa( (intptr_t)&_cp_f_npio2 ); } static inline qword cp_flpio2( void ) { return si_lqa( (intptr_t)&_cp_f_pio2 ); } static inline qword cp_flpt66( void ) { return si_lqa( (intptr_t)&_cp_f_pt66 ); } static inline qword cp_flpi( void ) { return si_lqa( (intptr_t)&_cp_f_pi ); } static inline qword cp_flnpi( void ) { return si_lqa( (intptr_t)&_cp_f_npi ); } // ## // ## Load-Immediate Floating-point constants (32 bit) // ## // ## Constant is loaded in each element of 32 bit floating-point vector // ## using immediate values. i.e. No loads // ## // ## cp_filzero() +0.0 +0x00000000 // ## cp_filnzero() -0.0 +0x80000000 // ## cp_filone() +1.0 +0x3f800000 // ## cp_filtwo() +2.0 +0x40000000 // ## cp_filinf() +INF +0x7f800000 // ## cp_filninf() -INF +0xff800000 // ## cp_filnan() NaN +0x7fc00000 // ## static inline qword cp_filzero( void ) { return si_ilhu( (int16_t)0x0000 ); } static inline qword cp_filnzero( void ) { return si_ilhu( (int16_t)0x8000 ); } static inline qword cp_filone( void ) { return si_ilhu( (int16_t)0x3f80 ); } static inline qword cp_filtwo( void ) { return si_ilhu( (int16_t)0x4000 ); } static inline qword cp_filinf( void ) { return si_ilhu( (int16_t)0x7f80 ); } static inline qword cp_filninf( void ) { return si_ilhu( (int16_t)0xff80 ); } static inline qword cp_filnan( void ) { return si_ilhu( (int16_t)0x7fc0 ); } // ## // ## cp_fatan() Coefficients and other constants // ## extern const vector unsigned int _cp_f_atan_q4; extern const vector unsigned int _cp_f_atan_q3; extern const vector unsigned int _cp_f_atan_q2; extern const vector unsigned int _cp_f_atan_q1; extern const vector unsigned int _cp_f_atan_q0; extern const vector unsigned int _cp_f_atan_p4; extern const vector unsigned int _cp_f_atan_p3; extern const vector unsigned int _cp_f_atan_p2; extern const vector unsigned int _cp_f_atan_p1; extern const vector unsigned int _cp_f_atan_p0; extern const vector unsigned int _cp_f_hmorebits; extern const vector unsigned int _cp_f_morebits; // ## cp_fatan(x) // ## // ## 0 <= x <= 0.66 // ## -PI/2 <= cp_fatan(x) <= +PI/2 // ## // ## Each floating-point component of the result is a function of // ## the corresponding components of x: // ## // ## 0.0 { x == 0.0 // ## // ## +PI { // ## --- { x == INF // ## 2.0 { // ## // ## -PI { // ## --- { x == -INF // ## 2.0 { // ## // ## // ## 2 4 6 8 { // ## P + P x + P x + P x + P x { // ## 2 0 1 2 3 4 { // ## x x ----------------------------------- + x { otherwise // ## 2 4 6 8 10 { // ## Q + Q x + Q x + Q x + Q x + x { // ## 0 1 2 3 4 { static inline qword _cp_fatan( const qword x ) { // ## // ## Load constants // ## const qword f_one = cp_filone(); const qword f_inf = cp_filinf(); const qword f_ninf = cp_filninf(); const qword f_msb = cp_filnzero(); const qword f_zero = cp_filzero(); const qword f_pt66 = si_lqa( (intptr_t)&_cp_f_pt66 ); const qword f_pio2 = si_lqa( (intptr_t)&_cp_f_pio2 ); const qword f_npio2 = si_lqa( (intptr_t)&_cp_f_npio2 ); const qword f_pio4 = si_lqa( (intptr_t)&_cp_f_pio4 ); const qword f_t3p8 = si_lqa( (intptr_t)&_cp_f_t3p8 ); const qword f_atan_p0 = si_lqa( (intptr_t)&_cp_f_atan_p0 ); const qword f_atan_p1 = si_lqa( (intptr_t)&_cp_f_atan_p1 ); const qword f_atan_p2 = si_lqa( (intptr_t)&_cp_f_atan_p2 ); const qword f_atan_p3 = si_lqa( (intptr_t)&_cp_f_atan_p3 ); const qword f_atan_p4 = si_lqa( (intptr_t)&_cp_f_atan_p4 ); const qword f_atan_q0 = si_lqa( (intptr_t)&_cp_f_atan_q0 ); const qword f_atan_q1 = si_lqa( (intptr_t)&_cp_f_atan_q1 ); const qword f_atan_q2 = si_lqa( (intptr_t)&_cp_f_atan_q2 ); const qword f_atan_q3 = si_lqa( (intptr_t)&_cp_f_atan_q3 ); const qword f_atan_q4 = si_lqa( (intptr_t)&_cp_f_atan_q4 ); const qword f_morebits = si_lqa( (intptr_t)&_cp_f_morebits ); const qword f_hmorebits = si_lqa( (intptr_t)&_cp_f_hmorebits ); // ## // ## pos_x = -x { x < 0 // ## x { otherwise // ## const qword neg_x = si_xor( x, f_msb ); const qword sign_mask = si_fcgt( f_zero, x ); const qword pos_x = si_selb( x, neg_x, sign_mask ); // ## // ## Range reduction // ## // ## // ## range0_mask = ( pos_x > tan( 3.0 * PI / 8.0 ) ) // ## range1_mask = ( pos_x <= 0.66 ) // ## range2_mask = !( range0_mask || range1_mask ) // ## const qword range0_mask = si_fcgt( pos_x, f_t3p8 ); const qword range1_gt_mask = si_fcgt( f_pt66, pos_x ); const qword range1_eq_mask = si_fceq( f_pt66, pos_x ); const qword range1_mask = si_or( range1_gt_mask, range1_eq_mask ); const qword range2_mask = si_nor( range0_mask, range1_mask ); // ## // ## range0_x = -1.0 // ## ----- // ## pos_x // ## // ## range0_y = PI // ## --- // ## 2.0 // ## const qword range0_x0 = si_frest( pos_x ); const qword range0_x1 = si_fi( pos_x, range0_x0 ); const qword range0_x2 = si_fnms( range0_x1, pos_x, f_one ); const qword range0_x3 = si_fma( range0_x2, range0_x1, range0_x1 ); const qword range0_x = si_xor( range0_x3, f_msb ); const qword range0_y = f_pio2; // ## // ## range1_x = pos_x // ## range1_y = 0.0 // ## const qword range1_x = pos_x; const qword range1_y = f_zero; // ## // ## range2_x = (pos_x-1.0) // ## ----------- // ## (pos_x+1.0) // ## // ## range2_y = PI // ## --- // ## 4.0 // ## const qword range2_y = f_pio4; const qword range2_x0num = si_fs( pos_x, f_one ); const qword range2_x0den = si_fa( pos_x, f_one ); const qword range2_x0 = si_frest( range2_x0den ); const qword range2_x1 = si_fnms( range2_x0, range2_x0den, f_one ); const qword range2_x2 = si_fma( range2_x1, range2_x0, range2_x0 ); const qword range2_x = si_fm( range2_x0num, range2_x2 ); // ## // ## range_x = range0_x { range0_mask // ## range1_x { range1_mask // ## range2_x { range2_mask // ## // ## range_y = range0_y { range0_mask // ## range1_y { range1_mask // ## range2_y { range2_mask // ## const qword range_x0 = si_selb( range2_x, range0_x, range0_mask ); const qword range_x = si_selb( range_x0, range1_x, range1_mask ); const qword range_y0 = si_selb( range2_y, range0_y, range0_mask ); const qword range_y = si_selb( range_y0, range1_y, range1_mask ); // ## // ## 2 // ## xp2 = range_x // ## 2 3 4 // ## P + P xp2 + P xp2 + P xp2 + P xp2 // ## 0 1 2 3 4 // ## zdiv = ------------------------------------------ // ## 2 3 4 5 // ## Q + Q xp2 + Q xp2 + Q xp2 + Q xp2 + xp2 // ## 0 1 2 3 4 // ## // ## z1 = range_x * ( xp2 * zdiv ) + range_x // ## const qword xp2 = si_fm( range_x, range_x ); const qword znum0 = f_atan_p0; const qword znum1 = si_fma( znum0, xp2, f_atan_p1 ); const qword znum2 = si_fma( znum1, xp2, f_atan_p2 ); const qword znum3 = si_fma( znum2, xp2, f_atan_p3 ); const qword znum = si_fma( znum3, xp2, f_atan_p4 ); const qword zden0 = si_fa( xp2, f_atan_q0 ); const qword zden1 = si_fma( zden0, xp2, f_atan_q1 ); const qword zden2 = si_fma( zden1, xp2, f_atan_q2 ); const qword zden3 = si_fma( zden2, xp2, f_atan_q3 ); const qword zden = si_fma( zden3, xp2, f_atan_q4 ); const qword zden_r0 = si_frest( zden ); const qword zden_r1 = si_fnms( zden_r0, zden, f_one ); const qword zden_r = si_fma( zden_r1, zden_r0, zden_r0 ); const qword zdiv = si_fm( znum, zden_r ); const qword z0 = si_fm( xp2, zdiv ); const qword z1 = si_fma( range_x, z0, range_x ); // ## // ## zadd = z1 + 0.5 * MOREBITS { range2_mask // ## z1 + MOREBITS { range1_mask // ## z1 { otherwise // ## // ## yaddz = range_y + zadd // ## // ## pos_yaddz = yaddz { yaddz >= 0 // ## -yaddz { yaddz < 0 // ## const qword zadd0 = si_selb( f_zero, f_hmorebits, range2_mask ); const qword zadd1 = si_selb( zadd0, f_morebits, range1_mask ); const qword zadd = si_fa( z1, zadd1 ); const qword yaddz = si_fa( range_y, zadd ); const qword neg_yaddz = si_xor( yaddz, f_msb ); const qword pos_yaddz = si_selb( yaddz, neg_yaddz, sign_mask ); // ## // ## result_y0 = 0.0 { x == 0.0 // ## pos_yaddz { otherwise // ## const qword x_eqz_mask = si_fceq( f_zero, x ); const qword result_y0 = si_selb( pos_yaddz, x, x_eqz_mask ); // ## // ## result_y2 = +PI { // ## --- { x == INF // ## 2.0 { // ## // ## -PI { // ## --- { x == -INF // ## 2.0 { // ## // ## result_y0 { otherwise // ## const qword x_eqinf_mask = si_fceq( f_inf, x ); const qword x_eqninf_mask = si_fceq( f_ninf, x ); const qword result_y1 = si_selb( result_y0, f_pio2, x_eqinf_mask ); const qword result = si_selb( result_y1, f_npio2, x_eqninf_mask ); return (result); } static inline vector float cp_fatan( const vector float x ) { return (vector float)( _cp_fatan( (qword)x ) ); } static inline float cp_fatan_scalar( const float x ) { const qword vx = si_from_float( x ); const qword vresult = _cp_fatan( vx ); const float result = si_to_float( vresult ); return (result); } // ## cp_fatan2(y,x) // ## // ## -INF <= x <= INF // ## -INF <= y <= INF // ## -PI <= cp_fatan2(y,x) <= +PI // ## // ## Each floating-point component of the result is a function of // ## the corresponding components of y and x: // ## // ## +PI { (y == +0.0) && (x < 0.0) // ## // ## -PI { (y == -0.0) && (x < 0.0) // ## // ## +0.0 { (y == +0.0) && (x > 0.0) // ## // ## -0.0 { (y == -0.0) && (x > 0.0) // ## // ## -PI { // ## ---- { (y < 0.0) && (x == 0.0) // ## +2.0 { // ## // ## +PI { // ## ---- { (y > 0.0) && (x == 0.0) // ## +2.0 { // ## // ## NaN { (y == NaN) || (x == NaN) // ## // ## +PI { (y == +0.0) && (x == -0.0) // ## // ## -PI { (y == -0.0) && (x == -0.0) // ## // ## +0.0 { (y == +0.0) && (x == +0.0) // ## // ## -0.0 { (y == -0.0) && (x == +0.0) // ## // ## +PI { // ## --- { (y == +INF) && (x == +INF) // ## 4.0 { // ## // ## -PI { // ## --- { (y == -INF) && (x == +INF) // ## 4.0 { // ## // ## +3.0 PI { // ## ------- { (y == +INF) && (x == -INF) // ## +4.0 { // ## // ## -3.0 PI { // ## ------- { (y == -INF) && (x == -INF) // ## +4.0 { // ## // ## +PI { isfinite(y) && (+y > 0) && (x == -INF) // ## // ## -PI { isfinite(y) && (-y > 0) && (x == -INF) // ## // ## +0.0 { isfinite(y) && (+y > 0) && (x == +INF) // ## // ## -0.0 { isfinite(y) && (-y > 0) && (x == +INF) // ## // ## +PI { // ## ---- { (isfinite(x) && (y == +INF) // ## +2.0 { // ## // ## -PI { // ## --- { (isfinite(x) && (y == -INF) // ## +2.0 { // ## // ## ( y ) { // ## +PI + cp_atan( - ) { ( x < 0.0 ) && ( y >= 0.0 ) // ## ( x ) { // ## // ## ( y ) { // ## -PI + cp_atan( - ) { ( x < 0.0 ) && ( y < 0.0 ) // ## ( x ) { // ## // ## ( y ) { // ## +0.0 + cp_atan( - ) { otherwise // ## ( x ) { // ## qword _cp_fatan2( qword y, qword x ) { const qword f_one = cp_filone(); const qword f_zero = cp_filzero(); const qword f_pi = si_lqa( (intptr_t)&_cp_f_pi ); const qword f_npi = si_lqa( (intptr_t)&_cp_f_npi ); // ## // ## yox = y // ## - // ## x // ## // ## z = +PI + cp_atan( yox ) { ( x < 0.0 ) && ( y >= 0.0 ) // ## -PI + cp_atan( yox ) { ( x < 0.0 ) && ( y < 0.0 ) // ## 0.0 + cp_atan( yox ) { otherwise const qword x_ltz_mask = si_fcgt( f_zero, x ); const qword y_ltz_mask = si_fcgt( f_zero, y ); const qword xy_ltz_mask = si_and( x_ltz_mask, y_ltz_mask ); const qword zadd0 = si_selb( f_zero, f_pi, x_ltz_mask ); const qword zadd = si_selb( zadd0, f_npi, xy_ltz_mask ); const qword x_r0 = si_frest( x ); const qword x_r1 = si_fnms( x_r0, x, f_one ); const qword x_r = si_fma( x_r1, x_r0, x_r0 ); const qword yox = si_fm( y, x_r ); const qword atan_yox = _cp_fatan( yox ); const qword result = si_fa( zadd, atan_yox ); return (result); } vector float cp_fatan2( vector float arg0 /* y */, vector float arg1 /* x */ ) { const qword y = (qword)arg0; const qword x = (qword)arg1; const qword result = _cp_fatan2( y, x ); return (vector float)(result); } float cp_fatan2_scalar( float arg0 /* y */, float arg1 /* x */ ) { const qword y = si_from_float( arg0 ); const qword x = si_from_float( arg1 ); const qword z = _cp_fatan2( y, x ); const float result = si_to_float( z ); return( result ); } #endif /* CP_FATAN_CBE_SPU_H */