qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using s


From: Peter Maydell
Subject: Re: [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only
Date: Thu, 28 Jun 2012 19:25:13 +0100

On 28 June 2012 00:00, Catalin Patulea <address@hidden> wrote:
> Hey guys,
>
> I've been hacking up the FPREM and FPREM1 i386 instructions (without KVM) to 
> be implemented using SoftFloat only. This was motivated by some correctness 
> issues
> that we noticed with the current implementation which follows the AMD 
> datasheet.I believe the datasheet explains the operation as if intermediate 
> results had "infinite" precision, but in this case intermediate rounding 
> causes a loss of precision at the final output.
>
> My reference was TestFloat [1], specifically the floatx80_rem suite, which 
> tests the FPREM instruction quite thoroughly (FPREM1 is technically untested 
> but very similar). The current code fails about half the suite; with the 
> patch, all tests pass.
>
> There are still lots of dark corners - the code originates from Bochs' copy 
> of SoftFloat and I tried to port the Bochs exception handling logic as much 
> as I could. Many details still elude me though (see comments in the code). 
> TestFloat does test some of the exception logic but not as thoroughly as I 
> would have liked. If anyone can offer some guidance here, I am happy to fix 
> up the patch.
>
> That's about it.. let me know what you think.
>
> Catalin
>
> [1] http://www.jhauser.us/arithmetic/TestFloat.html
>
> ---
>  fpu/softfloat.c         |  182 
> +++++++++++++++++++++++++++++++++++++++++++++++
>  fpu/softfloat.h         |    3 +
>  target-i386/op_helper.c |  163 +++++++++++++++++-------------------------
>  3 files changed, 249 insertions(+), 99 deletions(-)
>
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index b29256a..9c6e4a3 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -5234,6 +5234,16 @@ int floatx80_unordered_quiet( floatx80 a, floatx80 b 
> STATUS_PARAM )
>  }
>
>  
> /*----------------------------------------------------------------------------
> +| Returns 1 if the extended double-precision floating-point value `a' is an
> +| unsupported; otherwise returns 0.

an unsupported what?

> +*----------------------------------------------------------------------------*/
> +int floatx80_is_unsupported(floatx80 a)
> +{
> +    return (extractFloatx80Exp(a) &&
> +            !(extractFloatx80Frac(a) & LIT64(0x8000000000000000)));
> +}
> +
> +/*----------------------------------------------------------------------------
>  | Returns the result of converting the quadruple-precision floating-point
>  | value `a' to the 32-bit two's complement integer format.  The conversion
>  | is performed according to the IEC/IEEE Standard for Binary Floating-Point
> @@ -6828,6 +6838,178 @@ floatx80 floatx80_scalbn( floatx80 a, int n 
> STATUS_PARAM )
>                                           aSign, aExp, aSig, 0 STATUS_VAR );
>  }
>
> +/*
> + * BEGIN from Bochs rev 11224 dated 2012-06-21 10:33:37 -0700
> + *
> + * Converted to use Qemu type aliases, use C features only, etc.
> + */

I'm not convinced we want these markers in the source code
(though they should probably be in the commit message).

Can you confirm that the Bochs licence is compatible with
the QEMU one? In particular, is it compatible with/the same
as the license as stated at the top of softfloat.c?

> +
> +/* executes single exponent reduction cycle */
> +static uint64 remainder_kernel(uint64 aSig0, uint64 bSig, int expDiff, 
> uint64 *zSig0, uint64 *zSig1)
> +{
> +    uint64 term0, term1;
> +    uint64 aSig1 = 0;

No new code should be using the uint64 &c types (which are
"at least NN bits wide") -- uint64_t or uint_fast64_t please.

> +
> +    shortShift128Left(aSig1, aSig0, expDiff, &aSig1, &aSig0);
> +    uint64 q = estimateDiv128To64(aSig1, aSig0, bSig);
> +    mul64To128(bSig, q, &term0, &term1);
> +    sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
> +    while ((int64)(*zSig1) < 0) {
> +        --q;
> +        add128(*zSig1, *zSig0, 0, bSig, zSig1, zSig0);
> +    }
> +    return q;
> +}
> +
> +static int do_fprem(floatx80 a, floatx80 b, floatx80 *r, uint64 *q, int 
> rounding_mode STATUS_PARAM )
> +{
> +    int32 aExp, bExp, zExp, expDiff;
> +    uint64 aSig0, aSig1, bSig;
> +    int aSign;
> +    *q = 0;
> +
> +    // handle unsupported extended double-precision floating encodings
> +    if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
> +    {
> +        float_raise(float_flag_invalid, status);
> +        *r = floatx80_default_nan;
> +        return -1;
> +    }

So are we mishandling unsupported 80bit floats in other operations
(eg addition), or do those functions just opencode things?

> +    aSig0 = extractFloatx80Frac(a);
> +    aExp = extractFloatx80Exp(a);
> +    aSign = extractFloatx80Sign(a);
> +    bSig = extractFloatx80Frac(b);
> +    bExp = extractFloatx80Exp(b);
> +
> +    if (aExp == 0x7FFF) {
> +        if ((uint64) (aSig0<<1) || ((bExp == 0x7FFF) && (uint64) (bSig<<1))) 
> {
> +            *r = propagateFloatx80NaN(a, b, status);
> +            return -1;
> +        }
> +        float_raise(float_flag_invalid, status);
> +        *r = floatx80_default_nan;
> +        return -1;
> +    }
> +    if (bExp == 0x7FFF) {
> +        if ((uint64) (bSig<<1)) {
> +            *r = propagateFloatx80NaN(a, b, status);
> +            return -1;
> +        }
> +        if (aExp == 0 && aSig0) {
> +            float_raise(float_flag_input_denormal, status);
> +            normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
> +            *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
> +                    packFloatx80(aSign, aExp, aSig0) : a;
> +            return 0;
> +        }
> +        *r = a;
> +        return 0;
> +
> +    }
> +    if (bExp == 0) {
> +        if (bSig == 0) {
> +            float_raise(float_flag_invalid, status);
> +            *r = floatx80_default_nan;
> +            return -1;
> +        }
> +        float_raise(float_flag_input_denormal, status);
> +        normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
> +    }
> +    if (aExp == 0) {
> +        if (aSig0 == 0) {
> +            *r = a;
> +            return 0;
> +        }
> +        float_raise(float_flag_input_denormal, status);
> +        normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
> +    }
> +    expDiff = aExp - bExp;
> +    aSig1 = 0;
> +
> +    uint32 overflow = 0;
> +
> +    if (expDiff >= 64) {
> +        int n = (expDiff & 0x1f) | 0x20;
> +        remainder_kernel(aSig0, bSig, n, &aSig0, &aSig1);
> +        zExp = aExp - n;
> +        overflow = 1;
> +    }
> +    else {
> +        zExp = bExp;
> +
> +        if (expDiff < 0) {
> +            if (expDiff < -1) {
> +               *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
> +                    packFloatx80(aSign, aExp, aSig0) : a;
> +               return 0;
> +            }
> +            shift128Right(aSig0, 0, 1, &aSig0, &aSig1);
> +            expDiff = 0;
> +        }
> +
> +        if (expDiff > 0) {
> +            *q = remainder_kernel(aSig0, bSig, expDiff, &aSig0, &aSig1);
> +        }
> +        else {
> +            if (bSig <= aSig0) {
> +               aSig0 -= bSig;
> +               *q = 1;
> +            }
> +        }
> +
> +        if (rounding_mode == float_round_nearest_even)
> +        {
> +            uint64 term0, term1;
> +            shift128Right(bSig, 0, 1, &term0, &term1);
> +
> +            if (! lt128(aSig0, aSig1, term0, term1))
> +            {
> +               int lt = lt128(term0, term1, aSig0, aSig1);
> +               int eq = eq128(aSig0, aSig1, term0, term1);
> +
> +               if ((eq && (*q & 1)) || lt) {
> +                  aSign = !aSign;
> +                  ++*q;
> +               }
> +               if (lt) sub128(bSig, 0, aSig0, aSig1, &aSig0, &aSig1);
> +            }
> +        }
> +    }
> +
> +    *r = normalizeRoundAndPackFloatx80(80, aSign, zExp, aSig0, aSig1, 
> status);
> +    return overflow;
> +}
> +
> +/*----------------------------------------------------------------------------
> +| Returns the remainder of the extended double-precision floating-point value
> +| `a' with respect to the corresponding value `b'.  The operation is 
> performed
> +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
> +*----------------------------------------------------------------------------*/
> +
> +int floatx80_ieee754_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64 
> *q STATUS_PARAM )
> +{
> +    return do_fprem(a, b, r, q, float_round_nearest_even STATUS_VAR );
> +}

This claims to return the remainder, but do_fprem only returns
0, 1 or -1...

> +
> +/*----------------------------------------------------------------------------
> +| Returns the remainder of the extended double-precision floating-point value
> +| `a' with  respect to  the corresponding value `b'. Unlike previous function
> +| the  function  does not compute  the remainder  specified  in  the IEC/IEEE
> +| Standard  for Binary  Floating-Point  Arithmetic.  This  function  operates
> +| differently  from the  previous  function in  the way  that it  rounds  the
> +| quotient of 'a' divided by 'b' to an integer.
> +*----------------------------------------------------------------------------*/
> +
> +int floatx80_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64 *q 
> STATUS_PARAM )
> +{
> +    return do_fprem(a, b, r, q, float_round_to_zero STATUS_VAR );
> +}
> +
> +/*
> + * END from Bochs rev 11224 dated 2012-06-21 10:33:37 -0700
> + */
> +
>  float128 float128_scalbn( float128 a, int n STATUS_PARAM )
>  {
>     flag aSign;
> diff --git a/fpu/softfloat.h b/fpu/softfloat.h
> index feec3a1..53d7827 100644
> --- a/fpu/softfloat.h
> +++ b/fpu/softfloat.h
> @@ -497,10 +497,13 @@ int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM 
> );
>  int floatx80_unordered_quiet( floatx80, floatx80 STATUS_PARAM );
>  int floatx80_compare( floatx80, floatx80 STATUS_PARAM );
>  int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM );
> +int floatx80_is_unsupported( floatx80 );
>  int floatx80_is_quiet_nan( floatx80 );
>  int floatx80_is_signaling_nan( floatx80 );
>  floatx80 floatx80_maybe_silence_nan( floatx80 );
>  floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM );
> +int floatx80_ieee754_remainder( floatx80, floatx80, floatx80 *, uint64 * 
> STATUS_PARAM );
> +int floatx80_remainder( floatx80, floatx80, floatx80 *, uint64 * 
> STATUS_PARAM );
>
>  INLINE floatx80 floatx80_abs(floatx80 a)
>  {
> diff --git a/target-i386/op_helper.c b/target-i386/op_helper.c
> index 2862ea4..ae0c877 100644
> --- a/target-i386/op_helper.c
> +++ b/target-i386/op_helper.c
> @@ -3620,6 +3620,23 @@ static void fpu_set_exception(int mask)
>         env->fpus |= FPUS_SE | FPUS_B;
>  }
>
> +static int fpu_exception_from_fp_status(void)
> +{
> +    int mask;
> +
> +    const int float_flag_denormals = float_flag_input_denormal |
> +                                     float_flag_output_denormal;
> +
> +    // Most flags have the same value in the enum and 387 status word, except
> +    // the denormal flags.
> +    mask = env->fp_status.float_exception_flags & ~float_flag_denormals;
> +    if (env->fp_status.float_exception_flags & float_flag_denormals) {
> +        mask |= FPUS_DE;
> +    }
> +
> +    return mask;
> +}
> +
>  static inline floatx80 helper_fdiv(floatx80 a, floatx80 b)
>  {
>     if (floatx80_is_zero(b)) {
> @@ -4210,119 +4227,67 @@ void helper_fxtract(void)
>
>  void helper_fprem1(void)
>  {
> -    double st0, st1, dblq, fpsrcop, fptemp;
> -    CPU_LDoubleU fpsrcop1, fptemp1;
> -    int expdif;
> -    signed long long int q;
> -
> -    st0 = floatx80_to_double(ST0);
> -    st1 = floatx80_to_double(ST1);
> -
> -    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
> -        ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
> -        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -        return;
> -    }
> -
> -    fpsrcop = st0;
> -    fptemp = st1;
> -    fpsrcop1.d = ST0;
> -    fptemp1.d = ST1;
> -    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
> -
> -    if (expdif < 0) {
> -        /* optimisation? taken from the AMD docs */
> +    floatx80 result;
> +    uint64_t quotient;
> +
> +    /* TODO(catalinp):
> +     *
> +     * - What is the defined purpose of fp_status in Qemu? Seems many things
> +     *   write into it, but few if any translate into env->fpus.

fp_status is a float_status structure which has two purposes:
 * tells the softfloat code how to behave in certain circumstances
   (how to handle denormals, rounding modes, etc)
 * tracks the IEEE "cumulative exception flags" (inexact, overflow, etc)
Every softfloat routine which needs to set exception flags or behave
differently according to the flags in fp_status will take a float_status*
argument. (This is obscured slightly by the silly STATUS_PARAM #define.)

(Note that for some CPU architectures there may be more than one float_status
struct; eg ARM has both an fp_status and a standard_fp_status, because Neon
operations behave differently from VFP ones; see comments in target-arm/cpu.h.
The Neon operations pass the softfloat code the standard_fp_status, and
the VFP ops pass in the fp_status.)

The way I would recommend using it is:
 * the "master copy" of this information is in the float_status
 * when we do a read of whatever CPU register(s) expose this info
   to the guest, construct that register value on the fly from the
   float_status struct
 * when we do a write of those registers (or a CPU reset), change
   the float_status struct accordingly
 * for most floating point arithmetic operations, no messing with
   float_status should then be necessary (we just pass a pointer to
   fp_status into the softfloat function)
The idea here is that fp operations are common but reading or writing
the registers which change rounding mode or reveal the exception
flags are rare, so we do the conversion only when we have to.

target-i386 is a bit of a latecomer to using softfloat and not
particularly maintained either, so it is quite possible it is
buggy in this regard. (Very few programs actually care about
the cumulative exception flags so it is quite easy for bugs in
this area to go unnoticed.) target-arm gets it right.

Also the functions which are still doing native arithmetic
(like the ones you're trying to convert) have to do their own
detection and handling of exception conditions. In theory at
least some of that code just vanishes into the generic softfloat
handling.

> +     *
> +     * - Bochs' FPU_pre_exception_handling resets a few more fields in 
> fp_status
> +     *   before doing the operation. What is the purpose of that and is this
> +     *   necessary here?
> +     */
> +    env->fp_status.float_exception_flags = 0;

This will trash all the cumulative exception flags and is very unlikely
to be correct.

> +    int flags = floatx80_ieee754_remainder(ST0, ST1, &result, &quotient,
> +                                           &env->fp_status);
> +
> +    /* TODO(catalinp): Bochs also checks for unmasked exceptions before 
> storing
> +     * these flags. Should we also do this?
> +     */
> +    if (flags >= 0) {
>         env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -        /* ST0 is unchanged */
> -        return;
> +        if (flags) {
> +            env->fpus |= (1 << 10);
> +        } else {            /* (C0,C3,C1) <-- (q2,q1,q0) */
> +            if (quotient & 1) env->fpus |= (1 << 9);
> +            if (quotient & 2) env->fpus |= (1 << 14);
> +            if (quotient & 4) env->fpus |= (1 << 8);
> +        }
>     }
>
> -    if (expdif < 53) {
> -        dblq = fpsrcop / fptemp;
> -        /* round dblq towards nearest integer */
> -        dblq = rint(dblq);
> -        st0 = fpsrcop - fptemp * dblq;
> -
> -        /* convert dblq to q by truncating towards zero */
> -        if (dblq < 0.0)
> -           q = (signed long long int)(-dblq);
> -        else
> -           q = (signed long long int)dblq;
> +    ST0 = result;
>
> -        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -                                /* (C0,C3,C1) <-- (q2,q1,q0) */
> -        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
> -        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
> -        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
> -    } else {
> -        env->fpus |= 0x400;  /* C2 <-- 1 */
> -        fptemp = pow(2.0, expdif - 50);
> -        fpsrcop = (st0 / st1) / fptemp;
> -        /* fpsrcop = integer obtained by chopping */
> -        fpsrcop = (fpsrcop < 0.0) ?
> -                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
> -        st0 -= (st1 * fpsrcop * fptemp);
> -    }
> -    ST0 = double_to_floatx80(st0);
> +    /* TODO(catalinp): Set only unmasked exceptions? */
> +    fpu_set_exception(fpu_exception_from_fp_status());

Check the implementation of fpu_set_exception -- it handles
the check for whether the exceptions are masked or not.

>  }
>
>  void helper_fprem(void)
>  {
> -    double st0, st1, dblq, fpsrcop, fptemp;
> -    CPU_LDoubleU fpsrcop1, fptemp1;
> -    int expdif;
> -    signed long long int q;
> -
> -    st0 = floatx80_to_double(ST0);
> -    st1 = floatx80_to_double(ST1);
> +    floatx80 result;
> +    uint64_t quotient;
>
> -    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
> -       ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
> -       env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -       return;
> -    }
> -
> -    fpsrcop = st0;
> -    fptemp = st1;
> -    fpsrcop1.d = ST0;
> -    fptemp1.d = ST1;
> -    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
> +    /* TODO(catalinp): See comments in helper_fprem1() about lots of 
> potential
> +     * semantic ambiguities/differences between this and Bochs' 
> implementation.
> +     */
> +    env->fp_status.float_exception_flags = 0;
> +    int flags = floatx80_remainder(ST0, ST1, &result, &quotient, 
> &env->fp_status);
>
> -    if (expdif < 0) {
> -        /* optimisation? taken from the AMD docs */
> +    if (flags >= 0) {
>         env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -        /* ST0 is unchanged */
> -        return;
> +        if (flags) {
> +            env->fpus |= (1 << 10);
> +        } else {            /* (C0,C3,C1) <-- (q2,q1,q0) */
> +            if (quotient & 1) env->fpus |= (1 << 9);
> +            if (quotient & 2) env->fpus |= (1 << 14);
> +            if (quotient & 4) env->fpus |= (1 << 8);
> +        }
>     }
>
> -    if ( expdif < 53 ) {
> -        dblq = fpsrcop/*ST0*/ / fptemp/*ST1*/;
> -        /* round dblq towards zero */
> -        dblq = (dblq < 0.0) ? ceil(dblq) : floor(dblq);
> -        st0 = fpsrcop/*ST0*/ - fptemp * dblq;
> +    ST0 = result;
>
> -        /* convert dblq to q by truncating towards zero */
> -        if (dblq < 0.0)
> -           q = (signed long long int)(-dblq);
> -        else
> -           q = (signed long long int)dblq;
> -
> -        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -                                /* (C0,C3,C1) <-- (q2,q1,q0) */
> -        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
> -        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
> -        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
> -    } else {
> -        int N = 32 + (expdif % 32); /* as per AMD docs */
> -        env->fpus |= 0x400;  /* C2 <-- 1 */
> -        fptemp = pow(2.0, (double)(expdif - N));
> -        fpsrcop = (st0 / st1) / fptemp;
> -        /* fpsrcop = integer obtained by chopping */
> -        fpsrcop = (fpsrcop < 0.0) ?
> -                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
> -        st0 -= (st1 * fpsrcop * fptemp);
> -    }
> -    ST0 = double_to_floatx80(st0);
> +    fpu_set_exception(fpu_exception_from_fp_status());
>  }
>
>  void helper_fyl2xp1(void)
> --
> 1.7.7.3
>


-- PMM



reply via email to

[Prev in Thread] Current Thread [Next in Thread]