From 5c7f63fa7a88cf2cb9b6837eb2797268c5843030 Mon Sep 17 00:00:00 2001
From: Guillaume Melquiond
Date: Fri, 2 Oct 2020 08:16:05 +0200
Subject: Add allocation-free variants of the nextup and nextdown
 floating-point operations.

Floating-point values are boxed, which means that any operation causes an
allocation. While short-lived, they nonetheless cause the minor heap to
fill, which in turn triggers the garbage collector.

To reduce the number of allocations, I initially went with a shadow stack
mechanism for storing floating-point values. But assuming the CoqInterval
library is representative, this was way too complicated in practice, as
most stack-located values ended up being passed to nextdown and nextup
before being stored in memory.

So, this commit implements a different mechanism. Variants of nextdown and
nextup are added, which reuse the allocation of their input argument.
Obviously, this is only correct if there is no other reference to this
argument. To ensure this property, the commit only uses these opcode
during a peephole optimization. If two primitive operations follow one
another, then the second one can reuse the allocation of the first one,
since it never had time to even reach the stack.

For CoqInterval, this divides the number of allocations due to
floating-point operations by about two.

The following snippet is made 4% faster by this commit (and 13% faster if
we consider only the floating-point operations).

    From Coq Require Import Int63 BinPos PrimFloat.
    Definition foo n :=
      let eps := sub (next_up one) one in
      Pos.iter (fun x => next_down (add x eps)) two n.
    Time Eval vm_compute in foo 100000000.
---
 kernel/byterun/coq_interp.c | 14 ++++++++++++++
 kernel/genOpcodeFiles.ml    |  2 ++
 kernel/vmemitcodes.ml       | 15 +++++++++++++++
 3 files changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/kernel/byterun/coq_interp.c b/kernel/byterun/coq_interp.c
index 0f7713a6bb..3c3f45f7e6 100644
--- a/kernel/byterun/coq_interp.c
+++ b/kernel/byterun/coq_interp.c
@@ -1753,6 +1753,20 @@ value coq_interprete
         Next;
       }
 
+      Instruct (CHECKNEXTUPFLOATINPLACE) {
+        print_instr("CHECKNEXTUPFLOATINPLACE");
+        CheckFloat1();
+        Store_double_val(accu, nextafter(Double_val(accu), INFINITY));
+        Next;
+      }
+
+      Instruct (CHECKNEXTDOWNFLOATINPLACE) {
+        print_instr("CHECKNEXTDOWNFLOATINPLACE");
+        CheckFloat1();
+        Store_double_val(accu, nextafter(Double_val(accu), -INFINITY));
+        Next;
+      }
+
       Instruct(CHECKCAMLCALL2_1) {
         // arity-2 callback, the last argument can be an accumulator
         value arg;
diff --git a/kernel/genOpcodeFiles.ml b/kernel/genOpcodeFiles.ml
index 63abb50cc2..dc2cd349ce 100644
--- a/kernel/genOpcodeFiles.ml
+++ b/kernel/genOpcodeFiles.ml
@@ -147,6 +147,8 @@ let opcodes =
     "CHECKLDSHIFTEXP";
     "CHECKNEXTUPFLOAT";
     "CHECKNEXTDOWNFLOAT";
+    "CHECKNEXTUPFLOATINPLACE";
+    "CHECKNEXTDOWNFLOATINPLACE";
     "CHECKCAMLCALL2_1";
     "CHECKCAMLCALL1";
     "CHECKCAMLCALL2";
diff --git a/kernel/vmemitcodes.ml b/kernel/vmemitcodes.ml
index 592bdf775f..c1d8fcb855 100644
--- a/kernel/vmemitcodes.ml
+++ b/kernel/vmemitcodes.ml
@@ -257,6 +257,15 @@ let check_prim_op = function
   | Arraydefault | Arraycopy | Arraylength ->
       opCHECKCAMLCALL1
 
+let inplace_prim_op = function
+  | Float64next_up | Float64next_down -> true
+  | _ -> false
+
+let check_prim_op_inplace = function
+  | Float64next_up   -> opCHECKNEXTUPFLOATINPLACE
+  | Float64next_down -> opCHECKNEXTDOWNFLOATINPLACE
+  | _ -> assert false
+
 let emit_instr env = function
   | Klabel lbl -> define_label env lbl
   | Kacc n ->
@@ -392,6 +401,12 @@ let rec emit env insns remaining = match insns with
       out env opRETURN; out_int env n; emit env c remaining
   | Ksequence c1 :: c ->
       emit env c1 (c :: remaining)
+  | Kprim (op1, (q1, _)) :: Kprim (op2, (q2, _)) :: c when inplace_prim_op op2 ->
+      out env (check_prim_op op1);
+      slot_for_getglobal env q1;
+      out env (check_prim_op_inplace op2);
+      slot_for_getglobal env q2;
+      emit env c remaining
   (* Default case *)
   | instr :: c ->
       emit_instr env instr; emit env c remaining
-- 
cgit v1.2.3