From ef3ec53e4f74f32a705489b332b037569680d28e Mon Sep 17 00:00:00 2001
From: Guillaume Melquiond
Date: Fri, 2 Oct 2020 18:17:23 +0200
Subject: Hardcode next_up and next_down instead of relying on nextafter.

Unfortunately, compilers are currently unable to optimize the nextafter
function, even in the degenerate case where the second argument is
explicitly infinite. So, this commit implements this case by hand.

On the following testcase, this gives a 15% speedup.

From Coq Require Import Int63 BinPos PrimFloat.
Definition foo n :=
  let eps := sub (next_up one) one in
  Pos.iter (fun x => next_down (add x eps)) two n.
Time Eval vm_compute in foo 100000000.

And when looking at the cost of just the allocation-free version of
next_down, the speedup is 1500%. In other words, the latency of
next_down is now on par with the measurement noise due to cache misses
and the like.
---
 kernel/byterun/coq_float64.c | 32 ++++++++++++++++++++++++++++++--
 kernel/byterun/coq_interp.c  | 11 +++++++----
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/kernel/byterun/coq_float64.c b/kernel/byterun/coq_float64.c
index ebbfe35fa6..bea47dd47e 100644
--- a/kernel/byterun/coq_float64.c
+++ b/kernel/byterun/coq_float64.c
@@ -9,12 +9,40 @@
 /************************************************************************/
 
 #include <math.h>
+#include <stdint.h>
 
 #define CAML_INTERNALS
 #include <caml/mlvalues.h>
 
 #include "coq_values.h"
 
+union double_bits {
+  double d;
+  uint64_t u;
+};
+
+static double next_up(double x) {
+  union double_bits bits;
+  if (!(x < INFINITY)) return x; // x is +oo or NaN
+  bits.d = x;
+  int64_t i = bits.u;
+  if (i >= 0) ++bits.u; // x >= +0.0, go away from zero
+  else if (bits.u + bits.u == 0) bits.u = 1; // x is -0.0, should almost never happen
+  else --bits.u; // x < 0.0, go toward zero
+  return bits.d;
+}
+
+static double next_down(double x) {
+  union double_bits bits;
+  if (!(x > -INFINITY)) return x; // x is -oo or NaN
+  bits.d = x;
+  int64_t i = bits.u;
+  if (i == 0) bits.u = INT64_MIN + 1; // x is +0.0
+  else if (i < 0) ++bits.u; // x <= -0.0, go away from zero
+  else --bits.u; // x > 0.0, go toward zero
+  return bits.d;
+}
+
 #define DECLARE_FBINOP(name, e) \
   double coq_##name(double x, double y) { \
     return e; \
@@ -38,8 +66,8 @@ DECLARE_FBINOP(fadd, x + y)
 DECLARE_FBINOP(fsub, x - y)
 DECLARE_FBINOP(fdiv, x / y)
 DECLARE_FUNOP(fsqrt, sqrt(x))
-DECLARE_FUNOP(next_up, nextafter(x, INFINITY))
-DECLARE_FUNOP(next_down, nextafter(x, -INFINITY))
+DECLARE_FUNOP(next_up, next_up(x))
+DECLARE_FUNOP(next_down, next_down(x))
 
 value coq_is_double(value x) {
   return Val_long(Is_double(x));
diff --git a/kernel/byterun/coq_interp.c b/kernel/byterun/coq_interp.c
index 3c3f45f7e6..8990743de2 100644
--- a/kernel/byterun/coq_interp.c
+++ b/kernel/byterun/coq_interp.c
@@ -238,6 +238,9 @@ extern intnat volatile caml_pending_signals[];
 extern void caml_process_pending_signals(void);
 #endif
 
+extern double coq_next_up(double);
+extern double coq_next_down(double);
+
 /* The interpreter itself */
 
 value coq_interprete
@@ -1742,28 +1745,28 @@ value coq_interprete
       Instruct (CHECKNEXTUPFLOAT) {
         print_instr("CHECKNEXTUPFLOAT");
         CheckFloat1();
-        Coq_copy_double(nextafter(Double_val(accu), INFINITY));
+        Coq_copy_double(coq_next_up(Double_val(accu)));
         Next;
       }
 
       Instruct (CHECKNEXTDOWNFLOAT) {
         print_instr("CHECKNEXTDOWNFLOAT");
         CheckFloat1();
-        Coq_copy_double(nextafter(Double_val(accu), -INFINITY));
+        Coq_copy_double(coq_next_down(Double_val(accu)));
         Next;
       }
 
       Instruct (CHECKNEXTUPFLOATINPLACE) {
         print_instr("CHECKNEXTUPFLOATINPLACE");
         CheckFloat1();
-        Store_double_val(accu, nextafter(Double_val(accu), INFINITY));
+        Store_double_val(accu, coq_next_up(Double_val(accu)));
         Next;
       }
 
       Instruct (CHECKNEXTDOWNFLOATINPLACE) {
         print_instr("CHECKNEXTDOWNFLOATINPLACE");
         CheckFloat1();
-        Store_double_val(accu, nextafter(Double_val(accu), -INFINITY));
+        Store_double_val(accu, coq_next_down(Double_val(accu)));
         Next;
       }
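A quick way to see why the bit manipulation above is a faithful replacement for
nextafter is to check the two side by side. The following standalone harness is
not part of the commit: it copies the logic of next_up and next_down from
coq_float64.c and compares them bit for bit with nextafter(x, +/-INFINITY) on
boundary values (signed zeros, subnormals, DBL_MAX, infinities). The file name
and build line are arbitrary; NaN inputs are omitted because both versions just
return them unchanged.

/* build: cc -O2 -o check_next check_next.c -lm  (names are arbitrary) */
#include <float.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>

union double_bits { double d; uint64_t u; };

/* Same logic as the functions added to coq_float64.c by this commit. */
static double next_up(double x) {
  union double_bits bits;
  if (!(x < INFINITY)) return x; /* x is +oo or NaN */
  bits.d = x;
  int64_t i = bits.u;
  if (i >= 0) ++bits.u;                      /* x >= +0.0: move away from zero */
  else if (bits.u + bits.u == 0) bits.u = 1; /* x is -0.0: smallest positive subnormal */
  else --bits.u;                             /* x < 0.0: move toward zero */
  return bits.d;
}

static double next_down(double x) {
  union double_bits bits;
  if (!(x > -INFINITY)) return x; /* x is -oo or NaN */
  bits.d = x;
  int64_t i = bits.u;
  if (i == 0) bits.u = INT64_MIN + 1; /* x is +0.0: smallest negative subnormal */
  else if (i < 0) ++bits.u;           /* x <= -0.0: move away from zero */
  else --bits.u;                      /* x > 0.0: move toward zero */
  return bits.d;
}

/* Bit-for-bit equality, so +0.0 and -0.0 are told apart. */
static int same_bits(double a, double b) {
  union double_bits x, y;
  x.d = a;
  y.d = b;
  return x.u == y.u;
}

int main(void) {
  const double samples[] = {
    0.0, -0.0, 1.0, -1.0, 0x1p-1074, -0x1p-1074,
    DBL_MIN, -DBL_MIN, DBL_MAX, -DBL_MAX, INFINITY, -INFINITY
  };
  int errors = 0;
  for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i++) {
    double x = samples[i];
    if (!same_bits(next_up(x), nextafter(x, INFINITY))) {
      printf("next_up mismatch at %a\n", x);
      errors++;
    }
    if (!same_bits(next_down(x), nextafter(x, -INFINITY))) {
      printf("next_down mismatch at %a\n", x);
      errors++;
    }
  }
  if (errors) printf("%d mismatches\n", errors);
  else printf("all samples agree with nextafter\n");
  return errors != 0;
}

Comparing bit patterns rather than using == is deliberate: == would treat +0.0
and -0.0 as equal, and the signed-zero cases are exactly the corners that the
dedicated branches in next_up and next_down are there to handle.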