src/lj_opt_loop.c - luajit-2.0-src

/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2015 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_err.h"
#include "lj_buf.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"

/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
** contains a mix of invariant and variant instructions and performs
** exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
** all remaining loop iterations.
**
** On first sight that looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/
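
/* Editor's sketch (not actual trace output; refs are made up): unrolling
** a simple accumulation 's = s + x' with invariant x by copy-substitution
** yields roughly:
**
**   pre-roll:   t1 = load(x)       ; invariant, stays before LOOP
**               s1 = add(s0, t1)   ; one full iteration
**   LOOP
**   body:       s2 = add(s1, t1)   ; only the variant copy remains; the
**                                  ; re-emitted load(x) is CSEd to t1
**   PHI(s1, s2)                    ; loop-carried dependency, emitted
**                                  ; below the loop body
**
** The substitution table records subst[t1] = t1 (invariant shortcut) and
** subst[s1] = s2, so later uses of s1 are rewritten to s2 in the body.
*/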

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)			(&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- PHI elimination ----------------------------------------------------- */

/* Emit or eliminate collected PHIs. */
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi,
                          SnapNo onsnap)
{
  int passx = 0;
  IRRef i, j, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0, j = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_clearphi(IR(lref)->t);
    } else {
      phi[j++] = (IRRef1)lref;
      if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
        /* Quick check for simple recurrences failed, need pass2. */
        irt_setmark(IR(lref)->t);
        passx = 1;
      }
    }
  }
  nphi = j;
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (passx) {
    SnapNo s;
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
      if (!irref_isk(ir->op1)) {
        irt_clearmark(IR(ir->op1)->t);
        if (ir->op1 < invar &&
            ir->o >= IR_CALLN && ir->o <= IR_CARG) {  /* ORDER IR */
          ir = IR(ir->op1);
          while (ir->o == IR_CARG) {
            if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
            if (irref_isk(ir->op1)) break;
            ir = IR(ir->op1);
            irt_clearmark(ir->t);
          }
        }
      }
    }
    for (s = J->cur.nsnap-1; s >= onsnap; s--) {
      SnapShot *snap = &J->cur.snap[s];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      for (n = 0; n < nent; n++) {
        IRRef ref = snap_ref(map[n]);
        if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
      }
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
        break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
        lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
        break;
    }
  }
  /* Pass #4: propagate non-redundant PHIs. */
  while (passx) {
    passx = 0;
    for (i = 0; i < nphi; i++) {
      IRRef lref = phi[i];
      IRIns *ir = IR(lref);
      if (!irt_ismarked(ir->t)) {  /* Propagate only from unmarked PHIs. */
        IRIns *irr = IR(subst[lref]);
        if (irt_ismarked(irr->t)) {  /* Right ref points to other PHI? */
          irt_clearmark(irr->t);  /* Mark that PHI as non-redundant. */
          passx = 1;  /* Retry. */
        }
      }
    }
  }
  /* Pass #5: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
        irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}
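
/* Editor's sketch of the passes above, with made-up refs: suppose phi[]
** holds {a, b} after unrolling, with subst[a] = a2 and subst[b] = b.
** Pass #1 drops b as redundant (lref == rref means it turned out to be
** invariant) and keeps a; since a2 directly uses a (a simple recurrence
** like i2 = i + 1), the quick check succeeds and pass #2 isn't needed.
** Pass #5 then emits a single PHI(a, a2) below the loop body.
*/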

/* -- Loop unrolling using copy-substitution ------------------------------ */

/* Copy-substitute snapshot. */
static void loop_subst_snap(jit_State *J, SnapShot *osnap,
                            SnapEntry *loopmap, IRRef1 *subst)
{
  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
  SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
  MSize nmapofs;
  MSize on, ln, nn, onent = osnap->nent;
  BCReg nslots = osnap->nslots;
  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
  if (irt_isguard(J->guardemit)) {  /* Guard inbetween? */
    nmapofs = J->cur.nsnapmap;
    J->cur.nsnap++;  /* Add new snapshot. */
  } else {  /* Otherwise overwrite previous snapshot. */
    snap--;
    nmapofs = snap->mapofs;
  }
  J->guardemit.irt = 0;
  /* Setup new snapshot. */
  snap->mapofs = (uint16_t)nmapofs;
  snap->ref = (IRRef1)J->cur.nins;
  snap->nslots = nslots;
  snap->topslot = osnap->topslot;
  snap->count = 0;
  nmap = &J->cur.snapmap[nmapofs];
  /* Substitute snapshot slots. */
  on = ln = nn = 0;
  while (on < onent) {
    SnapEntry osn = omap[on], lsn = loopmap[ln];
    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
      nmap[nn++] = lsn;
      ln++;
    } else {  /* Copy substituted slot from snapshot map. */
      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
      if (!irref_isk(snap_ref(osn)))
        osn = snap_setref(osn, subst[snap_ref(osn)]);
      nmap[nn++] = osn;
      on++;
    }
  }
  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
    nmap[nn++] = loopmap[ln++];
  snap->nent = (uint8_t)nn;
  omap += onent;
  nmap += nn;
  while (omap < nextmap)  /* Copy PC + frame links. */
    *nmap++ = *omap++;
  J->cur.nsnapmap = (uint16_t)(nmap - J->cur.snapmap);
}
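
/* Editor's sketch of the merge above, with made-up slots and refs. Both
** maps are sorted by slot, so this is a merge of two sorted lists:
**
**   loopmap:  [slot1 = refA] [slot3 = refB]
**   omap:     [slot3 = refC] [slot4 = refD]
**   nmap:     [slot1 = refA] [slot3 = subst[refC]] [slot4 = subst[refD]]
**
** Slot 1 falls back to the loop snapshot entry, the shadowed loop entry
** for slot 3 is skipped, and non-constant refs from the old map are
** rewritten through the substitution table.
*/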

typedef struct LoopState {
  jit_State *J;
  IRRef1 *subst;
  MSize sizesubst;
} LoopState;

/* Unroll loop. */
static void loop_unroll(LoopState *lps)
{
  jit_State *J = lps->J;
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapNo onsnap;
  SnapShot *osnap, *loopsnap;
  SnapEntry *loopmap, *psentinel;
  IRRef ins, invar;

  /* Allocate substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  */
  invar = J->cur.nins;
  lps->sizesubst = invar - REF_BIAS;
  lps->subst = lj_mem_newvec(J->L, lps->sizesubst, IRRef1);
  subst = lps->subst - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Grow snapshot buffer and map for copy-substituted snapshots.
  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
  ** Need up to twice the number of entries plus fallback substitutions
  ** from the loop snapshot entries for each new snapshot.
  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
  */
  onsnap = J->cur.nsnap;
  lj_snap_grow_buf(J, 2*onsnap-2);
  lj_snap_grow_map(J, J->cur.nsnapmap*2+(onsnap-2)*J->cur.snap[onsnap-1].nent);

  /* The loop snapshot is used for fallback substitutions. */
  loopsnap = &J->cur.snap[onsnap-1];
  loopmap = &J->cur.snapmap[loopsnap->mapofs];
  /* The PC of snapshot #0 and the loop snapshot must match. */
  psentinel = &loopmap[loopsnap->nent];
  lua_assert(*psentinel == J->cur.snapmap[J->cur.snap[0].nent]);
  *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
        op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins) {
        IRIns *irr = IR(ref);
        if (ref < invar) {  /* Loop-carried dependency? */
          /* Potential PHI? */
          if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
          /* Check all loop-carried dependencies for type instability. */
          if (!irt_sametype(t, irr->t)) {
            if (irt_isinteger(t) && irt_isinteger(irr->t))
              continue;
            else if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num. */
              ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
            else if (irt_isnum(irr->t) && irt_isinteger(t))  /* Fix num->int. */
              ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
                                    IRCONV_INT_NUM|IRCONV_CHECK));
            else
              lj_trace_err(J, LJ_TRERR_TYPEINS);
            subst[ins] = (IRRef1)ref;
            irr = IR(ref);
            goto phiconv;
          }
        } else if (ref != REF_DROP && irr->o == IR_CONV &&
                   ref > invar && irr->op1 < invar) {
          /* May need an extra PHI for a CONV. */
          ref = irr->op1;
          irr = IR(ref);
        phiconv:
          if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
        }
      }
    }
  }
  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
    J->cur.nsnapmap = (uint16_t)J->cur.snap[--J->cur.nsnap].mapofs;
  lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */

  loop_emit_phi(J, subst, phi, nphi, onsnap);
}
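
/* Editor's note on the type-instability fix above, sketched informally:
** if the pre-roll defines a value as a number but its re-emitted copy
** narrows to an integer (or vice versa), the loop-carried pair would have
** mismatched types. The integer copy is widened with a num<-int CONV; the
** opposite direction needs a checked int<-num CONV
** (IRCONV_INT_NUM|IRCONV_CHECK), since it can fail at run time. The
** fixed-up ref is then re-examined as a PHI candidate via the phiconv
** path, and any other type mix aborts with LJ_TRERR_TYPEINS.
*/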

/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap)
{
  ptrdiff_t i;
  SnapShot *snap = &J->cur.snap[nsnap-1];
  SnapEntry *map = J->cur.snapmap;
  map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent];  /* Restore PC. */
  J->cur.nsnapmap = (uint16_t)nsnapmap;
  J->cur.nsnap = nsnap;
  J->guardemit.irt = 0;
  lj_ir_rollback(J, ins);
  for (i = 0; i < BPROP_SLOTS; i++) {  /* Remove backprop. cache entries. */
    BPropEntry *bp = &J->bpropcache[i];
    if (bp->val >= ins)
      bp->key = 0;
  }
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}

/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((LoopState *)ud);
  return NULL;
}

/* Loop optimization. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  SnapNo nsnap = J->cur.nsnap;
  MSize nsnapmap = J->cur.nsnapmap;
  LoopState lps;
  int errcode;
  lps.J = J;
  lps.subst = NULL;
  lps.sizesubst = 0;
  errcode = lj_vm_cpcall(J->L, NULL, &lps, cploop_opt);
  lj_mem_freevec(J2G(J), lps.subst, lps.sizesubst, IRRef1);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) {  /* Trace error? */
      int32_t e = numberVint(L->top-1);
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
        /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
        if (--J->instunroll < 0)  /* But do not unroll forever. */
          break;
        L->top--;  /* Remove error object. */
        loop_undo(J, nins, nsnap, nsnapmap);
        return 1;  /* Loop optimization failed, continue recording. */
      default:
        break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}
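
/* Editor's note on the protocol above: lj_opt_loop runs loop_unroll under
** lj_vm_cpcall, so a trace error thrown mid-optimization cannot leave the
** IR half-rewritten; loop_undo restores the recorder state first. A
** return of 0 means the loop was unrolled successfully, 1 means the
** caller should keep recording and unroll the loop via the recorder
** instead (bounded by J->instunroll); all other errors are re-thrown.
*/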

#undef IR
#undef emitir
#undef emitir_raw

#endif