src/lj_opt_fold.c - luajit-2.0-src


/*
** FOLD: Constant Folding, Algebraic Simplifications and Reassociation.
** ABCelim: Array Bounds Check Elimination.
** CSE: Common-Subexpression Elimination.
** Copyright (C) 2005-2015 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_fold_c
#define LUA_CORE

#include <math.h>

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_buf.h"
#include "lj_str.h"
#include "lj_tab.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_ircall.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#if LJ_HASFFI
#include "lj_ctype.h"
#include "lj_carith.h"
#endif
#include "lj_vm.h"
#include "lj_strscan.h"
#include "lj_strfmt.h"

/* Here's a short description of how the FOLD engine processes instructions:
**
** The FOLD engine receives a single instruction stored in fins (J->fold.ins).
** The instruction and its operands are used to select matching fold rules.
** These are applied iteratively until a fixed point is reached.
**
** The 8 bit opcode of the instruction itself plus the opcodes of the
** two instructions referenced by its operands form a 24 bit key
** 'ins left right' (unused operands -> 0, literals -> lowest 8 bits).
**
** This key is used for partial matching against the fold rules. The
** left/right operand fields of the key are successively masked with
** the 'any' wildcard, from most specific to least specific:
**
**   ins left right
**   ins any  right
**   ins left any
**   ins any  any
**
** The masked key is used to look up a matching fold rule in a semi-perfect
** hash table. If a matching rule is found, the related fold function is run.
** Multiple rules can share the same fold function. A fold rule may return
** one of several special values:
**
** - NEXTFOLD means no folding was applied, because an additional test
**   inside the fold function failed. Matching continues against less
**   specific fold rules. Finally the instruction is passed on to CSE.
**
** - RETRYFOLD means the instruction was modified in-place. Folding is
**   retried as if this instruction had just been received.
**
** All other return values are terminal actions -- no further folding is
** applied:
**
** - INTFOLD(i) returns a reference to the integer constant i.
**
** - LEFTFOLD and RIGHTFOLD return the left/right operand reference
**   without emitting an instruction.
**
** - CSEFOLD and EMITFOLD pass the instruction directly to CSE or emit
**   it without passing through any further optimizations.
**
** - FAILFOLD, DROPFOLD and CONDFOLD only apply to instructions which have
**   no result (e.g. guarded assertions): FAILFOLD means the guard would
**   always fail, i.e. the current trace is pointless. DROPFOLD means
**   the guard is always true and has been eliminated. CONDFOLD is a
**   shortcut for FAILFOLD + cond (i.e. drop if true, otherwise fail).
**
** - Any other return value is interpreted as an IRRef or TRef. This
**   can be a reference to an existing or a newly created instruction.
**   Only the least-significant 16 bits (IRRef1) are used to form a TRef
**   which is finally returned to the caller.
**
** The FOLD engine receives instructions both from the trace recorder and
** substituted instructions from LOOP unrolling. This means all types
** of instructions may end up here, even though the recorder bypasses
** FOLD in some cases. Thus all loads, stores and allocations must have
** an any/any rule to avoid being passed on to CSE.
**
** Carefully read the following requirements before adding or modifying
** any fold rules:
**
** Requirement #1: All fold rules must preserve their destination type.
**
** Consistently use INTFOLD() (KINT result) or lj_ir_knum() (KNUM result).
** Never use lj_ir_knumint() which can have either a KINT or KNUM result.
**
** Requirement #2: Fold rules should not create *new* instructions which
** reference operands *across* PHIs.
**
** E.g. a RETRYFOLD with 'fins->op1 = fleft->op1' is invalid if the
** left operand is a PHI. Then fleft->op1 would point across the PHI
** frontier to an invariant instruction. Adding a PHI for this instruction
** would be counterproductive. The solution is to add a barrier which
** prevents folding across PHIs, i.e. 'PHIBARRIER(fleft)' in this case.
** The only exception is for recurrences with high latencies like
** repeated int->num->int conversions.
**
** One could relax this condition a bit if the referenced instruction is
** a PHI, too. But this often leads to worse code due to excessive
** register shuffling.
**
** Note: returning *existing* instructions (e.g. LEFTFOLD) is ok, though.
** Even returning fleft->op1 would be ok, because a new PHI will be added,
** if needed. But again, this leads to excessive register shuffling and
** should be avoided.
**
** Requirement #3: The set of all fold rules must be monotonic to guarantee
** termination.
**
** The goal is optimization, so one primarily wants to add strength-reducing
** rules. This means eliminating an instruction or replacing an instruction
** with one or more simpler instructions. Don't add fold rules which point
** in the other direction.
**
** Some rules (like commutativity) do not directly reduce the strength of
** an instruction, but enable other fold rules (e.g. by moving constants
** to the right operand). These rules must be made unidirectional to avoid
** cycles.
**
** Rule of thumb: the trace recorder expands the IR and FOLD shrinks it.
*/
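
/* Illustrative sketch (not part of the original source): one way the
** masked rule lookup described above could be written out. The key
** packing ((ins << 16) | (left << 8) | right), FOLD_ANY and
** fold_hash_lookup() are hypothetical names for this example; the real
** engine uses a buildvm-generated semi-perfect hash table and a retry
** loop inside lj_opt_fold().
*/
#if 0
static IRRef fold_lookup_sketch(jit_State *J, uint32_t ins, uint32_t left,
                                uint32_t right)
{
  enum { FOLD_ANY = 0xff };  /* Hypothetical 'any' wildcard encoding. */
  uint32_t keys[4] = {
    (ins << 16) | (left     << 8) | right,     /* ins left right */
    (ins << 16) | (FOLD_ANY << 8) | right,     /* ins any  right */
    (ins << 16) | (left     << 8) | FOLD_ANY,  /* ins left any   */
    (ins << 16) | (FOLD_ANY << 8) | FOLD_ANY   /* ins any  any   */
  };
  int i;
  for (i = 0; i < 4; i++) {  /* Most specific key first. */
    FoldFunc ff = fold_hash_lookup(keys[i]);  /* Hypothetical lookup. */
    if (ff) {
      IRRef ref = ff(J);
      if (ref != NEXTFOLD)
        return ref;  /* Terminal action or RETRYFOLD result. */
    }  /* NEXTFOLD: keep matching against less specific rules. */
  }
  return NEXTFOLD;  /* No rule applied: pass the instruction on to CSE. */
}
#endif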

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)			(&J->cur.ir[(ref)])
#define fins			(&J->fold.ins)
#define fleft			(&J->fold.left)
#define fright			(&J->fold.right)
#define knumleft		(ir_knum(fleft)->n)
#define knumright		(ir_knum(fright)->n)

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Fold function type. Fastcall on x86 significantly reduces their size. */
typedef IRRef (LJ_FASTCALL *FoldFunc)(jit_State *J);

/* Macros for the fold specs, so buildvm can recognize them. */
#define LJFOLD(x)
#define LJFOLDX(x)
#define LJFOLDF(name)	static TRef LJ_FASTCALL fold_##name(jit_State *J)
/* Note: They must be at the start of a line or buildvm ignores them! */

/* Barrier to prevent using operands across PHIs. */
#define PHIBARRIER(ir)	if (irt_isphi((ir)->t)) return NEXTFOLD

/* Barrier to prevent folding across a GC step.
** GC steps can only happen at the head of a trace and at LOOP.
** And the GC is only driven forward if there's at least one allocation.
*/
#define gcstep_barrier(J, ref) \
  ((ref) < J->chain[IR_LOOP] && \
   (J->chain[IR_SNEW] || J->chain[IR_XSNEW] || \
    J->chain[IR_TNEW] || J->chain[IR_TDUP] || \
    J->chain[IR_CNEW] || J->chain[IR_CNEWI] || \
    J->chain[IR_BUFSTR] || J->chain[IR_TOSTR] || J->chain[IR_CALLA]))
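
/* Illustrative usage sketch (not part of the original source): a
** forwarding rule that wants to return some earlier reference 'ref'
** instead of emitting its instruction would have to give up when a GC
** step may have happened in between:
**
**   if (gcstep_barrier(J, ref))
**     return NEXTFOLD;
*/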

/* -- Constant folding for FP numbers ------------------------------------- */

LJFOLD(ADD KNUM KNUM)
LJFOLD(SUB KNUM KNUM)
LJFOLD(MUL KNUM KNUM)
LJFOLD(DIV KNUM KNUM)
LJFOLD(NEG KNUM KNUM)
LJFOLD(ABS KNUM KNUM)
LJFOLD(ATAN2 KNUM KNUM)
LJFOLD(LDEXP KNUM KNUM)
LJFOLD(MIN KNUM KNUM)
LJFOLD(MAX KNUM KNUM)
LJFOLDF(kfold_numarith)
{
  lua_Number a = knumleft;
  lua_Number b = knumright;
  lua_Number y = lj_vm_foldarith(a, b, fins->o - IR_ADD);
  return lj_ir_knum(J, y);
}

LJFOLD(LDEXP KNUM KINT)
LJFOLDF(kfold_ldexp)
{
#if LJ_TARGET_X86ORX64
  UNUSED(J);
  return NEXTFOLD;
#else
  return lj_ir_knum(J, ldexp(knumleft, fright->i));
#endif
}

LJFOLD(FPMATH KNUM any)
LJFOLDF(kfold_fpmath)
{
  lua_Number a = knumleft;
  lua_Number y = lj_vm_foldfpm(a, fins->op2);
  return lj_ir_knum(J, y);
}

LJFOLD(POW KNUM KINT)
LJFOLDF(kfold_numpow)
{
  lua_Number a = knumleft;
  lua_Number b = (lua_Number)fright->i;
  lua_Number y = lj_vm_foldarith(a, b, IR_POW - IR_ADD);
  return lj_ir_knum(J, y);
}

/* Must not use kfold_kref for numbers (could be NaN). */
LJFOLD(EQ KNUM KNUM)
LJFOLD(NE KNUM KNUM)
LJFOLD(LT KNUM KNUM)
LJFOLD(GE KNUM KNUM)
LJFOLD(LE KNUM KNUM)
LJFOLD(GT KNUM KNUM)
LJFOLD(ULT KNUM KNUM)
LJFOLD(UGE KNUM KNUM)
LJFOLD(ULE KNUM KNUM)
LJFOLD(UGT KNUM KNUM)
LJFOLDF(kfold_numcomp)
{
  return CONDFOLD(lj_ir_numcmp(knumleft, knumright, (IROp)fins->o));
}

/* -- Constant folding for 32 bit integers -------------------------------- */

static int32_t kfold_intop(int32_t k1, int32_t k2, IROp op)
{
  switch (op) {
  case IR_ADD: k1 += k2; break;
  case IR_SUB: k1 -= k2; break;
  case IR_MUL: k1 *= k2; break;
  case IR_MOD: k1 = lj_vm_modi(k1, k2); break;
  case IR_NEG: k1 = -k1; break;
  case IR_BAND: k1 &= k2; break;
  case IR_BOR: k1 |= k2; break;
  case IR_BXOR: k1 ^= k2; break;
  case IR_BSHL: k1 <<= (k2 & 31); break;
  case IR_BSHR: k1 = (int32_t)((uint32_t)k1 >> (k2 & 31)); break;
  case IR_BSAR: k1 >>= (k2 & 31); break;
  case IR_BROL: k1 = (int32_t)lj_rol((uint32_t)k1, (k2 & 31)); break;
  case IR_BROR: k1 = (int32_t)lj_ror((uint32_t)k1, (k2 & 31)); break;
  case IR_MIN: k1 = k1 < k2 ? k1 : k2; break;
  case IR_MAX: k1 = k1 > k2 ? k1 : k2; break;
  default: lua_assert(0); break;
  }
  return k1;
}
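
/* Example: kfold_intop(1, 33, IR_BSHL) masks the shift count to
** 33 & 31 == 1 and returns 2, i.e. shifts and rotates in the IR are
** defined modulo the operand width, independent of the target CPU.
*/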

LJFOLD(ADD KINT KINT)
LJFOLD(SUB KINT KINT)
LJFOLD(MUL KINT KINT)
LJFOLD(MOD KINT KINT)
LJFOLD(NEG KINT KINT)
LJFOLD(BAND KINT KINT)
LJFOLD(BOR KINT KINT)
LJFOLD(BXOR KINT KINT)
LJFOLD(BSHL KINT KINT)
LJFOLD(BSHR KINT KINT)
LJFOLD(BSAR KINT KINT)
LJFOLD(BROL KINT KINT)
LJFOLD(BROR KINT KINT)
LJFOLD(MIN KINT KINT)
LJFOLD(MAX KINT KINT)
LJFOLDF(kfold_intarith)
{
  return INTFOLD(kfold_intop(fleft->i, fright->i, (IROp)fins->o));
}

LJFOLD(ADDOV KINT KINT)
LJFOLD(SUBOV KINT KINT)
LJFOLD(MULOV KINT KINT)
LJFOLDF(kfold_intovarith)
{
  lua_Number n = lj_vm_foldarith((lua_Number)fleft->i, (lua_Number)fright->i,
                                 fins->o - IR_ADDOV);
  int32_t k = lj_num2int(n);
  if (n != (lua_Number)k)
    return FAILFOLD;
  return INTFOLD(k);
}

LJFOLD(BNOT KINT)
LJFOLDF(kfold_bnot)
{
  return INTFOLD(~fleft->i);
}

LJFOLD(BSWAP KINT)
LJFOLDF(kfold_bswap)
{
  return INTFOLD((int32_t)lj_bswap((uint32_t)fleft->i));
}

LJFOLD(LT KINT KINT)
LJFOLD(GE KINT KINT)
LJFOLD(LE KINT KINT)
LJFOLD(GT KINT KINT)
LJFOLD(ULT KINT KINT)
LJFOLD(UGE KINT KINT)
LJFOLD(ULE KINT KINT)
LJFOLD(UGT KINT KINT)
LJFOLD(ABC KINT KINT)
LJFOLDF(kfold_intcomp)
{
  int32_t a = fleft->i, b = fright->i;
  switch ((IROp)fins->o) {
  case IR_LT: return CONDFOLD(a < b);
  case IR_GE: return CONDFOLD(a >= b);
  case IR_LE: return CONDFOLD(a <= b);
  case IR_GT: return CONDFOLD(a > b);
  case IR_ULT: return CONDFOLD((uint32_t)a < (uint32_t)b);
  case IR_UGE: return CONDFOLD((uint32_t)a >= (uint32_t)b);
  case IR_ULE: return CONDFOLD((uint32_t)a <= (uint32_t)b);
  case IR_ABC:
  case IR_UGT: return CONDFOLD((uint32_t)a > (uint32_t)b);
  default: lua_assert(0); return FAILFOLD;
  }
}

LJFOLD(UGE any KINT)
LJFOLDF(kfold_intcomp0)
{
  if (fright->i == 0)
    return DROPFOLD;
  return NEXTFOLD;
}

/* -- Constant folding for 64 bit integers -------------------------------- */

static uint64_t kfold_int64arith(uint64_t k1, uint64_t k2, IROp op)
{
  switch (op) {
#if LJ_HASFFI
  case IR_ADD: k1 += k2; break;
  case IR_SUB: k1 -= k2; break;
  case IR_MUL: k1 *= k2; break;
  case IR_BAND: k1 &= k2; break;
  case IR_BOR: k1 |= k2; break;
  case IR_BXOR: k1 ^= k2; break;
#endif
  default: UNUSED(k2); lua_assert(0); break;
  }
  return k1;
}

LJFOLD(ADD KINT64 KINT64)
LJFOLD(SUB KINT64 KINT64)
LJFOLD(MUL KINT64 KINT64)
LJFOLD(BAND KINT64 KINT64)
LJFOLD(BOR KINT64 KINT64)
LJFOLD(BXOR KINT64 KINT64)
LJFOLDF(kfold_int64arith)
{
  return INT64FOLD(kfold_int64arith(ir_k64(fleft)->u64,
                                    ir_k64(fright)->u64, (IROp)fins->o));
}

LJFOLD(DIV KINT64 KINT64)
LJFOLD(MOD KINT64 KINT64)
LJFOLD(POW KINT64 KINT64)
LJFOLDF(kfold_int64arith2)
{
#if LJ_HASFFI
  uint64_t k1 = ir_k64(fleft)->u64, k2 = ir_k64(fright)->u64;
  if (irt_isi64(fins->t)) {
    k1 = fins->o == IR_DIV ? lj_carith_divi64((int64_t)k1, (int64_t)k2) :
         fins->o == IR_MOD ? lj_carith_modi64((int64_t)k1, (int64_t)k2) :
                             lj_carith_powi64((int64_t)k1, (int64_t)k2);
  } else {
    k1 = fins->o == IR_DIV ? lj_carith_divu64(k1, k2) :
         fins->o == IR_MOD ? lj_carith_modu64(k1, k2) :
                             lj_carith_powu64(k1, k2);
  }
  return INT64FOLD(k1);
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}

LJFOLD(BSHL KINT64 KINT)
LJFOLD(BSHR KINT64 KINT)
LJFOLD(BSAR KINT64 KINT)
LJFOLD(BROL KINT64 KINT)
LJFOLD(BROR KINT64 KINT)
LJFOLDF(kfold_int64shift)
{
#if LJ_HASFFI
  uint64_t k = ir_k64(fleft)->u64;
  int32_t sh = (fright->i & 63);
  return INT64FOLD(lj_carith_shift64(k, sh, fins->o - IR_BSHL));
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}

LJFOLD(BNOT KINT64)
LJFOLDF(kfold_bnot64)
{
#if LJ_HASFFI
  return INT64FOLD(~ir_k64(fleft)->u64);
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}

LJFOLD(BSWAP KINT64)
LJFOLDF(kfold_bswap64)
{
#if LJ_HASFFI
  return INT64FOLD(lj_bswap64(ir_k64(fleft)->u64));
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}

LJFOLD(LT KINT64 KINT64)
LJFOLD(GE KINT64 KINT64)
LJFOLD(LE KINT64 KINT64)
LJFOLD(GT KINT64 KINT64)
LJFOLD(ULT KINT64 KINT64)
LJFOLD(UGE KINT64 KINT64)
LJFOLD(ULE KINT64 KINT64)
LJFOLD(UGT KINT64 KINT64)
LJFOLDF(kfold_int64comp)
{
#if LJ_HASFFI
  uint64_t a = ir_k64(fleft)->u64, b = ir_k64(fright)->u64;
  switch ((IROp)fins->o) {
  case IR_LT: return CONDFOLD(a < b);
  case IR_GE: return CONDFOLD(a >= b);
  case IR_LE: return CONDFOLD(a <= b);
  case IR_GT: return CONDFOLD(a > b);
  case IR_ULT: return CONDFOLD((uint64_t)a < (uint64_t)b);
  case IR_UGE: return CONDFOLD((uint64_t)a >= (uint64_t)b);
  case IR_ULE: return CONDFOLD((uint64_t)a <= (uint64_t)b);
  case IR_UGT: return CONDFOLD((uint64_t)a > (uint64_t)b);
  default: lua_assert(0); return FAILFOLD;
  }
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}

LJFOLD(UGE any KINT64)
LJFOLDF(kfold_int64comp0)
{
#if LJ_HASFFI
  if (ir_k64(fright)->u64 == 0)
    return DROPFOLD;
  return NEXTFOLD;
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}

/* -- Constant folding for strings ---------------------------------------- */

LJFOLD(SNEW KKPTR KINT)
LJFOLDF(kfold_snew_kptr)
{
  GCstr *s = lj_str_new(J->L, (const char *)ir_kptr(fleft), (size_t)fright->i);
  return lj_ir_kstr(J, s);
}

LJFOLD(SNEW any KINT)
LJFOLDF(kfold_snew_empty)
{
  if (fright->i == 0)
    return lj_ir_kstr(J, &J2G(J)->strempty);
  return NEXTFOLD;
}

LJFOLD(STRREF KGC KINT)
LJFOLDF(kfold_strref)
{
  GCstr *str = ir_kstr(fleft);
  lua_assert((MSize)fright->i <= str->len);
  return lj_ir_kkptr(J, (char *)strdata(str) + fright->i);
}

LJFOLD(STRREF SNEW any)
LJFOLDF(kfold_strref_snew)
{
  PHIBARRIER(fleft);
  if (irref_isk(fins->op2) && fright->i == 0) {
    return fleft->op1;  /* strref(snew(ptr, len), 0) ==> ptr */
  } else {
    /* Reassociate: strref(snew(strref(str, a), len), b) ==> strref(str, a+b) */
    IRIns *ir = IR(fleft->op1);
    if (ir->o == IR_STRREF) {
      IRRef1 str = ir->op1;  /* IRIns * is not valid across emitir. */
      PHIBARRIER(ir);
      fins->op2 = emitir(IRTI(IR_ADD), ir->op2, fins->op2); /* Clobbers fins! */
      fins->op1 = str;
      fins->ot = IRT(IR_STRREF, IRT_P32);
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}

LJFOLD(CALLN CARG IRCALL_lj_str_cmp)
LJFOLDF(kfold_strcmp)
{
  if (irref_isk(fleft->op1) && irref_isk(fleft->op2)) {
    GCstr *a = ir_kstr(IR(fleft->op1));
    GCstr *b = ir_kstr(IR(fleft->op2));
    return INTFOLD(lj_str_cmp(a, b));
  }
  return NEXTFOLD;
}

/* -- Constant folding and forwarding for buffers ------------------------- */

/*
** Buffer ops perform stores, but their effect is limited to the buffer
** itself. Also, buffer ops are chained: a use of an op implies a use of
** all other ops up the chain. Conversely, if an op is unused, all ops
** up the chain can go unused. This largely eliminates the need to treat
** them as stores.
**
** Alas, treating them as normal (IRM_N) ops doesn't work, because they
** cannot be CSEd in isolation. CSE for IRM_N is implicitly done in LOOP
** or if FOLD is disabled.
**
** The compromise is to declare them as loads, emit them like stores and
** CSE whole chains manually when the BUFSTR is to be emitted. Any chain
** fragments left over from CSE are eliminated by DCE.
*/
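
/* Illustrative example (not part of the original source): a Lua
** concatenation like
**
**   local s = "x = "..x.."\n"
**
** records roughly as the chain
**
**   t1 = BUFHDR
**   t2 = BUFPUT t1 "x = "
**   t3 = BUFPUT t2 x
**   t4 = BUFPUT t3 "\n"
**   s  = BUFSTR t4
**
** Using s implies using every op up the chain; if s turns out to be
** unused, the whole chain is dead and DCE removes it.
*/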

/* BUFHDR is emitted like a store, see below. */

LJFOLD(BUFPUT BUFHDR BUFSTR)
LJFOLDF(bufput_append)
{
  /* New buffer, no other buffer op in between and same buffer? */
  if ((J->flags & JIT_F_OPT_FWD) &&
      !(fleft->op2 & IRBUFHDR_APPEND) &&
      fleft->prev == fright->op2 &&
      fleft->op1 == IR(fright->op2)->op1) {
    IRRef ref = fins->op1;
    IR(ref)->op2 = (fleft->op2 | IRBUFHDR_APPEND);  /* Modify BUFHDR. */
    IR(ref)->op1 = fright->op1;
    return ref;
  }
  return EMITFOLD;  /* Always emit, CSE later. */
}

LJFOLD(BUFPUT any any)
LJFOLDF(bufput_kgc)
{
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && fright->o == IR_KGC) {
    GCstr *s2 = ir_kstr(fright);
    if (s2->len == 0) {  /* Empty string? */
      return LEFTFOLD;
    } else {
      if (fleft->o == IR_BUFPUT && irref_isk(fleft->op2) &&
          !irt_isphi(fleft->t)) {  /* Join two constant string puts in a row. */
        GCstr *s1 = ir_kstr(IR(fleft->op2));
        IRRef kref = lj_ir_kstr(J, lj_buf_cat2str(J->L, s1, s2));
        /* lj_ir_kstr() may realloc the IR and invalidates any IRIns *. */
        IR(fins->op1)->op2 = kref;  /* Modify previous BUFPUT. */
        return fins->op1;
      }
    }
  }
  return EMITFOLD;  /* Always emit, CSE later. */
}

LJFOLD(BUFSTR any any)
LJFOLDF(bufstr_kfold_cse)
{
  lua_assert(fleft->o == IR_BUFHDR || fleft->o == IR_BUFPUT ||
             fleft->o == IR_CALLL);
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD)) {
    if (fleft->o == IR_BUFHDR) {  /* No put operations? */
      if (!(fleft->op2 & IRBUFHDR_APPEND))  /* Empty buffer? */
        return lj_ir_kstr(J, &J2G(J)->strempty);
      fins->op1 = fleft->op1;
      fins->op2 = fleft->prev;  /* Relies on checks in bufput_append. */
      return CSEFOLD;
    } else if (fleft->o == IR_BUFPUT) {
      IRIns *irb = IR(fleft->op1);
      if (irb->o == IR_BUFHDR && !(irb->op2 & IRBUFHDR_APPEND))
        return fleft->op2;  /* Shortcut for a single put operation. */
    }
  }
  /* Try to CSE the whole chain. */
  if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
    IRRef ref = J->chain[IR_BUFSTR];
    while (ref) {
      IRIns *irs = IR(ref), *ira = fleft, *irb = IR(irs->op1);
      while (ira->o == irb->o && ira->op2 == irb->op2) {
        lua_assert(ira->o == IR_BUFHDR || ira->o == IR_BUFPUT ||
                   ira->o == IR_CALLL || ira->o == IR_CARG);
        if (ira->o == IR_BUFHDR && !(ira->op2 & IRBUFHDR_APPEND))
          return ref;  /* CSE succeeded. */
        if (ira->o == IR_CALLL && ira->op2 == IRCALL_lj_buf_puttab)
          break;
        ira = IR(ira->op1);
        irb = IR(irb->op1);
      }
      ref = irs->prev;
    }
  }
  return EMITFOLD;  /* No CSE possible. */
}

LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_reverse)
LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_upper)
LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_lower)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putquoted)
LJFOLDF(bufput_kfold_op)
{
  if (irref_isk(fleft->op2)) {
    const CCallInfo *ci = &lj_ir_callinfo[fins->op2];
    SBuf *sb = lj_buf_tmp_(J->L);
    sb = ((SBuf * (LJ_FASTCALL *)(SBuf *, GCstr *))ci->func)(sb,
                                                       ir_kstr(IR(fleft->op2)));
    fins->o = IR_BUFPUT;
    fins->op1 = fleft->op1;
    fins->op2 = lj_ir_kstr(J, lj_buf_tostr(sb));
    return RETRYFOLD;
  }
  return EMITFOLD;  /* Always emit, CSE later. */
}

LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_rep)
LJFOLDF(bufput_kfold_rep)
{
  if (irref_isk(fleft->op2)) {
    IRIns *irc = IR(fleft->op1);
    if (irref_isk(irc->op2)) {
      SBuf *sb = lj_buf_tmp_(J->L);
      sb = lj_buf_putstr_rep(sb, ir_kstr(IR(irc->op2)), IR(fleft->op2)->i);
      fins->o = IR_BUFPUT;
      fins->op1 = irc->op1;
      fins->op2 = lj_ir_kstr(J, lj_buf_tostr(sb));
      return RETRYFOLD;
    }
  }
  return EMITFOLD;  /* Always emit, CSE later. */
}

LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfxint)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfnum_int)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfnum_uint)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfnum)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfstr)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfchar)
LJFOLDF(bufput_kfold_fmt)
{
  IRIns *irc = IR(fleft->op1);
  lua_assert(irref_isk(irc->op2));  /* SFormat must be const. */
  if (irref_isk(fleft->op2)) {
    SFormat sf = (SFormat)IR(irc->op2)->i;
    IRIns *ira = IR(fleft->op2);
    SBuf *sb = lj_buf_tmp_(J->L);
    switch (fins->op2) {
    case IRCALL_lj_strfmt_putfxint:
      sb = lj_strfmt_putfxint(sb, sf, ir_k64(ira)->u64);
      break;
    case IRCALL_lj_strfmt_putfstr:
      sb = lj_strfmt_putfstr(sb, sf, ir_kstr(ira));
      break;
    case IRCALL_lj_strfmt_putfchar:
      sb = lj_strfmt_putfchar(sb, sf, ira->i);
      break;
    case IRCALL_lj_strfmt_putfnum_int:
    case IRCALL_lj_strfmt_putfnum_uint:
    case IRCALL_lj_strfmt_putfnum:
    default: {
      const CCallInfo *ci = &lj_ir_callinfo[fins->op2];
      sb = ((SBuf * (*)(SBuf *, SFormat, lua_Number))ci->func)(sb, sf,
                                                         ir_knum(ira)->n);
      break;
      }
    }
    fins->o = IR_BUFPUT;
    fins->op1 = irc->op1;
    fins->op2 = lj_ir_kstr(J, lj_buf_tostr(sb));
    return RETRYFOLD;
  }
  return EMITFOLD;  /* Always emit, CSE later. */
}

/* -- Constant folding of pointer arithmetic ------------------------------ */

LJFOLD(ADD KGC KINT)
LJFOLD(ADD KGC KINT64)
LJFOLDF(kfold_add_kgc)
{
  GCobj *o = ir_kgc(fleft);
#if LJ_64
  ptrdiff_t ofs = (ptrdiff_t)ir_kint64(fright)->u64;
#else
  ptrdiff_t ofs = fright->i;
#endif
#if LJ_HASFFI
  if (irt_iscdata(fleft->t)) {
    CType *ct = ctype_raw(ctype_ctsG(J2G(J)), gco2cd(o)->ctypeid);
    if (ctype_isnum(ct->info) || ctype_isenum(ct->info) ||
        ctype_isptr(ct->info) || ctype_isfunc(ct->info) ||
        ctype_iscomplex(ct->info) || ctype_isvector(ct->info))
      return lj_ir_kkptr(J, (char *)o + ofs);
  }
#endif
  return lj_ir_kptr(J, (char *)o + ofs);
}

LJFOLD(ADD KPTR KINT)
LJFOLD(ADD KPTR KINT64)
LJFOLD(ADD KKPTR KINT)
LJFOLD(ADD KKPTR KINT64)
LJFOLDF(kfold_add_kptr)
{
  void *p = ir_kptr(fleft);
#if LJ_64
  ptrdiff_t ofs = (ptrdiff_t)ir_kint64(fright)->u64;
#else
  ptrdiff_t ofs = fright->i;
#endif
  return lj_ir_kptr_(J, fleft->o, (char *)p + ofs);
}

LJFOLD(ADD any KGC)
LJFOLD(ADD any KPTR)
LJFOLD(ADD any KKPTR)
LJFOLDF(kfold_add_kright)
{
  if (fleft->o == IR_KINT || fleft->o == IR_KINT64) {
    IRRef1 tmp = fins->op1; fins->op1 = fins->op2; fins->op2 = tmp;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

/* -- Constant folding of conversions ------------------------------------- */

LJFOLD(TOBIT KNUM KNUM)
LJFOLDF(kfold_tobit)
{
  return INTFOLD(lj_num2bit(knumleft));
}

LJFOLD(CONV KINT IRCONV_NUM_INT)
LJFOLDF(kfold_conv_kint_num)
{
  return lj_ir_knum(J, (lua_Number)fleft->i);
}

LJFOLD(CONV KINT IRCONV_NUM_U32)
LJFOLDF(kfold_conv_kintu32_num)
{
  return lj_ir_knum(J, (lua_Number)(uint32_t)fleft->i);
}

LJFOLD(CONV KINT IRCONV_INT_I8)
LJFOLD(CONV KINT IRCONV_INT_U8)
LJFOLD(CONV KINT IRCONV_INT_I16)
LJFOLD(CONV KINT IRCONV_INT_U16)
LJFOLDF(kfold_conv_kint_ext)
{
  int32_t k = fleft->i;
  if ((fins->op2 & IRCONV_SRCMASK) == IRT_I8) k = (int8_t)k;
  else if ((fins->op2 & IRCONV_SRCMASK) == IRT_U8) k = (uint8_t)k;
  else if ((fins->op2 & IRCONV_SRCMASK) == IRT_I16) k = (int16_t)k;
  else k = (uint16_t)k;
  return INTFOLD(k);
}

LJFOLD(CONV KINT IRCONV_I64_INT)
LJFOLD(CONV KINT IRCONV_U64_INT)
LJFOLD(CONV KINT IRCONV_I64_U32)
LJFOLD(CONV KINT IRCONV_U64_U32)
LJFOLDF(kfold_conv_kint_i64)
{
  if ((fins->op2 & IRCONV_SEXT))
    return INT64FOLD((uint64_t)(int64_t)fleft->i);
  else
    return INT64FOLD((uint64_t)(int64_t)(uint32_t)fleft->i);
}

LJFOLD(CONV KINT64 IRCONV_NUM_I64)
LJFOLDF(kfold_conv_kint64_num_i64)
{
  return lj_ir_knum(J, (lua_Number)(int64_t)ir_kint64(fleft)->u64);
}

LJFOLD(CONV KINT64 IRCONV_NUM_U64)
LJFOLDF(kfold_conv_kint64_num_u64)
{
  return lj_ir_knum(J, (lua_Number)ir_kint64(fleft)->u64);
}

LJFOLD(CONV KINT64 IRCONV_INT_I64)
LJFOLD(CONV KINT64 IRCONV_U32_I64)
LJFOLDF(kfold_conv_kint64_int_i64)
{
  return INTFOLD((int32_t)ir_kint64(fleft)->u64);
}

LJFOLD(CONV KNUM IRCONV_INT_NUM)
LJFOLDF(kfold_conv_knum_int_num)
{
  lua_Number n = knumleft;
  int32_t k = lj_num2int(n);
  if (irt_isguard(fins->t) && n != (lua_Number)k) {
    /* We're about to create a guard which always fails, like CONV +1.5.
    ** Some pathological loops cause this during LICM, e.g.:
    **   local x,k,t = 0,1.5,{1,[1.5]=2}
    **   for i=1,200 do x = x+ t[k]; k = k == 1 and 1.5 or 1 end
    **   assert(x == 300)
    */
    return FAILFOLD;
  }
  return INTFOLD(k);
}

LJFOLD(CONV KNUM IRCONV_U32_NUM)
LJFOLDF(kfold_conv_knum_u32_num)
{
#ifdef _MSC_VER
  {  /* Workaround for MSVC bug. */
    volatile uint32_t u = (uint32_t)knumleft;
    return INTFOLD((int32_t)u);
  }
#else
  return INTFOLD((int32_t)(uint32_t)knumleft);
#endif
}

LJFOLD(CONV KNUM IRCONV_I64_NUM)
LJFOLDF(kfold_conv_knum_i64_num)
{
  return INT64FOLD((uint64_t)(int64_t)knumleft);
}

LJFOLD(CONV KNUM IRCONV_U64_NUM)
LJFOLDF(kfold_conv_knum_u64_num)
{
  return INT64FOLD(lj_num2u64(knumleft));
}

LJFOLD(TOSTR KNUM any)
LJFOLDF(kfold_tostr_knum)
{
  return lj_ir_kstr(J, lj_strfmt_num(J->L, ir_knum(fleft)));
}

LJFOLD(TOSTR KINT any)
LJFOLDF(kfold_tostr_kint)
{
  return lj_ir_kstr(J, fins->op2 == IRTOSTR_INT ?
                       lj_strfmt_int(J->L, fleft->i) :
                       lj_strfmt_char(J->L, fleft->i));
}

LJFOLD(STRTO KGC)
LJFOLDF(kfold_strto)
{
  TValue n;
  if (lj_strscan_num(ir_kstr(fleft), &n))
    return lj_ir_knum(J, numV(&n));
  return FAILFOLD;
}

/* -- Constant folding of equality checks --------------------------------- */

/* Don't constant-fold away FLOAD checks against KNULL. */
LJFOLD(EQ FLOAD KNULL)
LJFOLD(NE FLOAD KNULL)
LJFOLDX(lj_opt_cse)

/* But fold all other KNULL compares, since only KNULL is equal to KNULL. */
LJFOLD(EQ any KNULL)
LJFOLD(NE any KNULL)
LJFOLD(EQ KNULL any)
LJFOLD(NE KNULL any)
LJFOLD(EQ KINT KINT)  /* Constants are unique, so same refs <==> same value. */
LJFOLD(NE KINT KINT)
LJFOLD(EQ KINT64 KINT64)
LJFOLD(NE KINT64 KINT64)
LJFOLD(EQ KGC KGC)
LJFOLD(NE KGC KGC)
LJFOLDF(kfold_kref)
{
  return CONDFOLD((fins->op1 == fins->op2) ^ (fins->o == IR_NE));
}

/* -- Algebraic shortcuts ------------------------------------------------- */

LJFOLD(FPMATH FPMATH IRFPM_FLOOR)
LJFOLD(FPMATH FPMATH IRFPM_CEIL)
LJFOLD(FPMATH FPMATH IRFPM_TRUNC)
LJFOLDF(shortcut_round)
{
  IRFPMathOp op = (IRFPMathOp)fleft->op2;
  if (op == IRFPM_FLOOR || op == IRFPM_CEIL || op == IRFPM_TRUNC)
    return LEFTFOLD;  /* round(round_left(x)) = round_left(x) */
  return NEXTFOLD;
}

LJFOLD(ABS ABS KNUM)
LJFOLDF(shortcut_left)
{
  return LEFTFOLD;  /* f(g(x)) ==> g(x) */
}

LJFOLD(ABS NEG KNUM)
LJFOLDF(shortcut_dropleft)
{
  PHIBARRIER(fleft);
  fins->op1 = fleft->op1;  /* abs(neg(x)) ==> abs(x) */
  return RETRYFOLD;
}

/* Note: no safe shortcuts with STRTO and TOSTR ("1e2" ==> +100 ==> "100"). */
LJFOLD(NEG NEG any)
LJFOLD(BNOT BNOT)
LJFOLD(BSWAP BSWAP)
LJFOLDF(shortcut_leftleft)
{
  PHIBARRIER(fleft);  /* See above. Fold would be ok, but not beneficial. */
  return fleft->op1;  /* f(g(x)) ==> x */
}

/* -- FP algebraic simplifications ---------------------------------------- */

/* FP arithmetic is tricky -- there's not much to simplify.
** Please note the following common pitfalls before sending "improvements":
**   x+0 ==> x  is INVALID for x=-0
**   0-x ==> -x is INVALID for x=+0
**   x*0 ==> 0  is INVALID for x=-0, x=+-Inf or x=NaN
*/
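
/* Worked example for the first pitfall (illustrative only): in IEEE 754
** arithmetic -0.0 + 0.0 evaluates to +0.0, so folding x+0 ==> x would
** change the result for x = -0. The sign of zero is observable from
** Lua, e.g. 1/(-0.0 + 0.0) is +Inf while 1/(-0.0) is -Inf.
*/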

LJFOLD(ADD NEG any)
LJFOLDF(simplify_numadd_negx)
{
  PHIBARRIER(fleft);
  fins->o = IR_SUB;  /* (-a) + b ==> b - a */
  fins->op1 = fins->op2;
  fins->op2 = fleft->op1;
  return RETRYFOLD;
}

LJFOLD(ADD any NEG)
LJFOLDF(simplify_numadd_xneg)
{
  PHIBARRIER(fright);
  fins->o = IR_SUB;  /* a + (-b) ==> a - b */
  fins->op2 = fright->op1;
  return RETRYFOLD;
}

LJFOLD(SUB any KNUM)
LJFOLDF(simplify_numsub_k)
{
  lua_Number n = knumright;
  if (n == 0.0)  /* x - (+-0) ==> x */
    return LEFTFOLD;
  return NEXTFOLD;
}

LJFOLD(SUB NEG KNUM)
LJFOLDF(simplify_numsub_negk)
{
  PHIBARRIER(fleft);
  fins->op2 = fleft->op1;  /* (-x) - k ==> (-k) - x */
  fins->op1 = (IRRef1)lj_ir_knum(J, -knumright);
  return RETRYFOLD;
}

LJFOLD(SUB any NEG)
LJFOLDF(simplify_numsub_xneg)
{
  PHIBARRIER(fright);
  fins->o = IR_ADD;  /* a - (-b) ==> a + b */
  fins->op2 = fright->op1;
  return RETRYFOLD;
}

LJFOLD(MUL any KNUM)
LJFOLD(DIV any KNUM)
LJFOLDF(simplify_nummuldiv_k)
{
  lua_Number n = knumright;
  if (n == 1.0) {  /* x o 1 ==> x */
    return LEFTFOLD;
  } else if (n == -1.0) {  /* x o -1 ==> -x */
    fins->o = IR_NEG;
    fins->op2 = (IRRef1)lj_ir_knum_neg(J);
    return RETRYFOLD;
  } else if (fins->o == IR_MUL && n == 2.0) {  /* x * 2 ==> x + x */
    fins->o = IR_ADD;
    fins->op2 = fins->op1;
    return RETRYFOLD;
  } else if (fins->o == IR_DIV) {  /* x / 2^k ==> x * 2^-k */
    uint64_t u = ir_knum(fright)->u64;
    uint32_t ex = ((uint32_t)(u >> 52) & 0x7ff);
    if ((u & U64x(000fffff,ffffffff)) == 0 && ex - 1 < 0x7fd) {
      u = (u & ((uint64_t)1 << 63)) | ((uint64_t)(0x7fe - ex) << 52);
      fins->o = IR_MUL;  /* Multiply by exact reciprocal. */
      fins->op2 = lj_ir_knum_u64(J, u);
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}
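
/* Worked example for the x / 2^k case above (illustrative only): for
** n = 4.0 the bit pattern is u = 0x4010000000000000, so the mantissa
** bits are zero and ex = 0x401. The replacement constant
** ((uint64_t)(0x7fe - 0x401) << 52) = 0x3fd0000000000000 is exactly
** 0.25, so x/4 is rewritten to x*0.25 without changing any result.
*/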

LJFOLD(MUL NEG KNUM)
LJFOLD(DIV NEG KNUM)
LJFOLDF(simplify_nummuldiv_negk)
{
  PHIBARRIER(fleft);
  fins->op1 = fleft->op1;  /* (-a) o k ==> a o (-k) */
  fins->op2 = (IRRef1)lj_ir_knum(J, -knumright);
  return RETRYFOLD;
}

LJFOLD(MUL NEG NEG)
LJFOLD(DIV NEG NEG)
LJFOLDF(simplify_nummuldiv_negneg)
{
  PHIBARRIER(fleft);
  PHIBARRIER(fright);
  fins->op1 = fleft->op1;  /* (-a) o (-b) ==> a o b */
  fins->op2 = fright->op1;
  return RETRYFOLD;
}

LJFOLD(POW any KINT)
LJFOLDF(simplify_numpow_xk)
{
  int32_t k = fright->i;
  TRef ref = fins->op1;
  if (k == 0)  /* x ^ 0 ==> 1 */
    return lj_ir_knum_one(J);  /* Result must be a number, not an int. */
  if (k == 1)  /* x ^ 1 ==> x */
    return LEFTFOLD;
  if ((uint32_t)(k+65536) > 2*65536u)  /* Limit code explosion. */
    return NEXTFOLD;
  if (k < 0) {  /* x ^ (-k) ==> (1/x) ^ k. */
    ref = emitir(IRTN(IR_DIV), lj_ir_knum_one(J), ref);
    k = -k;
  }
  /* Unroll x^k for 1 <= k <= 65536. */
  for (; (k & 1) == 0; k >>= 1)  /* Handle leading zeros. */
    ref = emitir(IRTN(IR_MUL), ref, ref);
  if ((k >>= 1) != 0) {  /* Handle trailing bits. */
    TRef tmp = emitir(IRTN(IR_MUL), ref, ref);
    for (; k != 1; k >>= 1) {
      if (k & 1)
        ref = emitir(IRTN(IR_MUL), ref, tmp);
      tmp = emitir(IRTN(IR_MUL), tmp, tmp);
    }
    ref = emitir(IRTN(IR_MUL), ref, tmp);
  }
  return ref;
}
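
/* Worked example for the unrolling above (illustrative only): k = 5 is
** binary 101, so the 'leading zeros' loop does nothing (5 is odd);
** k >>= 1 leaves 2 and tmp starts as x^2. The inner loop squares tmp
** to x^4, and the final multiply gives ref = x * x^4 = x^5, i.e. three
** IR_MUL instructions instead of a pow() call.
*/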

LJFOLD(POW KNUM any)
LJFOLDF(simplify_numpow_kx)
{
  lua_Number n = knumleft;
  if (n == 2.0) {  /* 2.0 ^ i ==> ldexp(1.0, tonum(i)) */
    fins->o = IR_CONV;
#if LJ_TARGET_X86ORX64
    fins->op1 = fins->op2;
    fins->op2 = IRCONV_NUM_INT;
    fins->op2 = (IRRef1)lj_opt_fold(J);
#endif
    fins->op1 = (IRRef1)lj_ir_knum_one(J);
    fins->o = IR_LDEXP;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

/* -- Simplify conversions ------------------------------------------------ */

LJFOLD(CONV CONV IRCONV_NUM_INT)  /* _NUM */
LJFOLDF(shortcut_conv_num_int)
{
  PHIBARRIER(fleft);
  /* Only safe with a guarded conversion to int. */
  if ((fleft->op2 & IRCONV_SRCMASK) == IRT_NUM && irt_isguard(fleft->t))
    return fleft->op1;  /* f(g(x)) ==> x */
  return NEXTFOLD;
}

LJFOLD(CONV CONV IRCONV_INT_NUM)  /* _INT */
LJFOLD(CONV CONV IRCONV_U32_NUM)  /* _U32 */
LJFOLDF(simplify_conv_int_num)
{
  /* Fold even across PHI to avoid expensive num->int conversions in loop. */
  if ((fleft->op2 & IRCONV_SRCMASK) ==
      ((fins->op2 & IRCONV_DSTMASK) >> IRCONV_DSH))
    return fleft->op1;
  return NEXTFOLD;
}

LJFOLD(CONV CONV IRCONV_I64_NUM)  /* _INT or _U32 */
LJFOLD(CONV CONV IRCONV_U64_NUM)  /* _INT or _U32 */
LJFOLDF(simplify_conv_i64_num)
{
  PHIBARRIER(fleft);
  if ((fleft->op2 & IRCONV_SRCMASK) == IRT_INT) {
    /* Reduce to a sign-extension. */
    fins->op1 = fleft->op1;
    fins->op2 = ((IRT_I64<<5)|IRT_INT|IRCONV_SEXT);
    return RETRYFOLD;
  } else if ((fleft->op2 & IRCONV_SRCMASK) == IRT_U32) {
#if LJ_TARGET_X64
    return fleft->op1;
#else
    /* Reduce to a zero-extension. */
    fins->op1 = fleft->op1;
    fins->op2 = (IRT_I64<<5)|IRT_U32;
    return RETRYFOLD;
#endif
  }
  return NEXTFOLD;
}

LJFOLD(CONV CONV IRCONV_INT_I64)  /* _INT or _U32 */
LJFOLD(CONV CONV IRCONV_INT_U64)  /* _INT or _U32 */
LJFOLD(CONV CONV IRCONV_U32_I64)  /* _INT or _U32 */
LJFOLD(CONV CONV IRCONV_U32_U64)  /* _INT or _U32 */
LJFOLDF(simplify_conv_int_i64)
{
  int src;
  PHIBARRIER(fleft);
  src = (fleft->op2 & IRCONV_SRCMASK);
  if (src == IRT_INT || src == IRT_U32) {
    if (src == ((fins->op2 & IRCONV_DSTMASK) >> IRCONV_DSH)) {
      return fleft->op1;
    } else {
      fins->op2 = ((fins->op2 & IRCONV_DSTMASK) | src);
      fins->op1 = fleft->op1;
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}

LJFOLD(CONV CONV IRCONV_FLOAT_NUM)  /* _FLOAT */
LJFOLDF(simplify_conv_flt_num)
{
  PHIBARRIER(fleft);
  if ((fleft->op2 & IRCONV_SRCMASK) == IRT_FLOAT)
    return fleft->op1;
  return NEXTFOLD;
}

/* Shortcut TOBIT + IRT_NUM <- IRT_INT/IRT_U32 conversion. */
LJFOLD(TOBIT CONV KNUM)
LJFOLDF(simplify_tobit_conv)
{
  /* Fold even across PHI to avoid expensive num->int conversions in loop. */
  if ((fleft->op2 & IRCONV_SRCMASK) == IRT_INT) {
    lua_assert(irt_isnum(fleft->t));
    return fleft->op1;
  } else if ((fleft->op2 & IRCONV_SRCMASK) == IRT_U32) {
    lua_assert(irt_isnum(fleft->t));
    fins->o = IR_CONV;
    fins->op1 = fleft->op1;
    fins->op2 = (IRT_INT<<5)|IRT_U32;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

/* Shortcut floor/ceil/round + IRT_NUM <- IRT_INT/IRT_U32 conversion. */
LJFOLD(FPMATH CONV IRFPM_FLOOR)
LJFOLD(FPMATH CONV IRFPM_CEIL)
LJFOLD(FPMATH CONV IRFPM_TRUNC)
LJFOLDF(simplify_floor_conv)
{
  if ((fleft->op2 & IRCONV_SRCMASK) == IRT_INT ||
      (fleft->op2 & IRCONV_SRCMASK) == IRT_U32)
    return LEFTFOLD;
  return NEXTFOLD;
}

/* Strength reduction of widening. */
LJFOLD(CONV any IRCONV_I64_INT)
LJFOLD(CONV any IRCONV_U64_INT)
LJFOLDF(simplify_conv_sext)
{
  IRRef ref = fins->op1;
  int64_t ofs = 0;
  if (!(fins->op2 & IRCONV_SEXT))
    return NEXTFOLD;
  PHIBARRIER(fleft);
  if (fleft->o == IR_XLOAD && (irt_isu8(fleft->t) || irt_isu16(fleft->t)))
    goto ok_reduce;
  if (fleft->o == IR_ADD && irref_isk(fleft->op2)) {
    ofs = (int64_t)IR(fleft->op2)->i;
    ref = fleft->op1;
  }
  /* Use scalar evolution analysis results to strength-reduce sign-extension. */
  if (ref == J->scev.idx) {
    IRRef lo = J->scev.dir ? J->scev.start : J->scev.stop;
    lua_assert(irt_isint(J->scev.t));
    if (lo && IR(lo)->i + ofs >= 0) {
    ok_reduce:
#if LJ_TARGET_X64
      /* Eliminate widening. All 32 bit ops do an implicit zero-extension. */
      return LEFTFOLD;
#else
      /* Reduce to a (cheaper) zero-extension. */
      fins->op2 &= ~IRCONV_SEXT;
      return RETRYFOLD;
#endif
    }
  }
  return NEXTFOLD;
}

/* Strength reduction of narrowing. */
LJFOLD(CONV ADD IRCONV_INT_I64)
LJFOLD(CONV SUB IRCONV_INT_I64)
LJFOLD(CONV MUL IRCONV_INT_I64)
LJFOLD(CONV ADD IRCONV_INT_U64)
LJFOLD(CONV SUB IRCONV_INT_U64)
LJFOLD(CONV MUL IRCONV_INT_U64)
LJFOLD(CONV ADD IRCONV_U32_I64)
LJFOLD(CONV SUB IRCONV_U32_I64)
LJFOLD(CONV MUL IRCONV_U32_I64)
LJFOLD(CONV ADD IRCONV_U32_U64)
LJFOLD(CONV SUB IRCONV_U32_U64)
LJFOLD(CONV MUL IRCONV_U32_U64)
LJFOLDF(simplify_conv_narrow)
{
  IROp op = (IROp)fleft->o;
  IRType t = irt_type(fins->t);
  IRRef op1 = fleft->op1, op2 = fleft->op2, mode = fins->op2;
  PHIBARRIER(fleft);
  op1 = emitir(IRTI(IR_CONV), op1, mode);
  op2 = emitir(IRTI(IR_CONV), op2, mode);
  fins->ot = IRT(op, t);
  fins->op1 = op1;
  fins->op2 = op2;
  return RETRYFOLD;
}

/* Special CSE rule for CONV. */
LJFOLD(CONV any any)
LJFOLDF(cse_conv)
{
  if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
    IRRef op1 = fins->op1, op2 = (fins->op2 & IRCONV_MODEMASK);
    uint8_t guard = irt_isguard(fins->t);
    IRRef ref = J->chain[IR_CONV];
    while (ref > op1) {
      IRIns *ir = IR(ref);
      /* Commoning with stronger checks is ok. */
      if (ir->op1 == op1 && (ir->op2 & IRCONV_MODEMASK) == op2 &&
          irt_isguard(ir->t) >= guard)
        return ref;
      ref = ir->prev;
    }
  }
  return EMITFOLD;  /* No fallthrough to regular CSE. */
}

/* FP conversion narrowing. */
LJFOLD(TOBIT ADD KNUM)
LJFOLD(TOBIT SUB KNUM)
LJFOLD(CONV ADD IRCONV_INT_NUM)
LJFOLD(CONV SUB IRCONV_INT_NUM)
LJFOLD(CONV ADD IRCONV_I64_NUM)
LJFOLD(CONV SUB IRCONV_I64_NUM)
LJFOLDF(narrow_convert)
{
  PHIBARRIER(fleft);
  /* Narrowing ignores PHIs and repeating it inside the loop is not useful. */
  if (J->chain[IR_LOOP])
    return NEXTFOLD;
  lua_assert(fins->o != IR_CONV || (fins->op2&IRCONV_CONVMASK) != IRCONV_TOBIT);
  return lj_opt_narrow_convert(J);
}

/* -- Integer algebraic simplifications ----------------------------------- */

LJFOLD(ADD any KINT)
LJFOLD(ADDOV any KINT)
LJFOLD(SUBOV any KINT)
LJFOLDF(simplify_intadd_k)
{
  if (fright->i == 0)  /* i o 0 ==> i */
    return LEFTFOLD;
  return NEXTFOLD;
}

LJFOLD(MULOV any KINT)
LJFOLDF(simplify_intmul_k)
{
  if (fright->i == 0)  /* i * 0 ==> 0 */
    return RIGHTFOLD;
  if (fright->i == 1)  /* i * 1 ==> i */
    return LEFTFOLD;
  if (fright->i == 2) {  /* i * 2 ==> i + i */
    fins->o = IR_ADDOV;
    fins->op2 = fins->op1;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

LJFOLD(SUB any KINT)
LJFOLDF(simplify_intsub_k)
{
  if (fright->i == 0)  /* i - 0 ==> i */
    return LEFTFOLD;
  fins->o = IR_ADD;  /* i - k ==> i + (-k) */
  fins->op2 = (IRRef1)lj_ir_kint(J, -fright->i);  /* Overflow for -2^31 ok. */
  return RETRYFOLD;
}

LJFOLD(SUB KINT any)
LJFOLD(SUB KINT64 any)
LJFOLDF(simplify_intsub_kleft)
{
  if (fleft->o == IR_KINT ? (fleft->i == 0) : (ir_kint64(fleft)->u64 == 0)) {
    fins->o = IR_NEG;  /* 0 - i ==> -i */
    fins->op1 = fins->op2;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

LJFOLD(ADD any KINT64)
LJFOLDF(simplify_intadd_k64)
{
  if (ir_kint64(fright)->u64 == 0)  /* i + 0 ==> i */
    return LEFTFOLD;
  return NEXTFOLD;
}

LJFOLD(SUB any KINT64)
LJFOLDF(simplify_intsub_k64)
{
  uint64_t k = ir_kint64(fright)->u64;
  if (k == 0)  /* i - 0 ==> i */
    return LEFTFOLD;
  fins->o = IR_ADD;  /* i - k ==> i + (-k) */
  fins->op2 = (IRRef1)lj_ir_kint64(J, (uint64_t)-(int64_t)k);
  return RETRYFOLD;
}

static TRef simplify_intmul_k(jit_State *J, int32_t k)
{
  /* Note: many more simplifications are possible, e.g. 2^k1 +- 2^k2.
  ** But this is mainly intended for simple address arithmetic.
  ** Also it's easier for the backend to optimize the original multiplies.
  */
  if (k == 0) {  /* i * 0 ==> 0 */
    return RIGHTFOLD;
  } else if (k == 1) {  /* i * 1 ==> i */
    return LEFTFOLD;
  } else if ((k & (k-1)) == 0) {  /* i * 2^k ==> i << k */
    fins->o = IR_BSHL;
    fins->op2 = lj_ir_kint(J, lj_fls((uint32_t)k));
    return RETRYFOLD;
  }
  return NEXTFOLD;
}
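
/* Example for the 2^k case above: for i * 8, (8 & 7) == 0 and
** lj_fls(8) == 3, so the multiply is strength-reduced to i << 3.
*/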

LJFOLD(MUL any KINT)
LJFOLDF(simplify_intmul_k32)
{
  if (fright->i >= 0)
    return simplify_intmul_k(J, fright->i);
  return NEXTFOLD;
}

LJFOLD(MUL any KINT64)
LJFOLDF(simplify_intmul_k64)
{
#if LJ_HASFFI
  if (ir_kint64(fright)->u64 < 0x80000000u)
    return simplify_intmul_k(J, (int32_t)ir_kint64(fright)->u64);
  return NEXTFOLD;
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}

LJFOLD(MOD any KINT)
LJFOLDF(simplify_intmod_k)
{
  int32_t k = fright->i;
  lua_assert(k != 0);
  if (k > 0 && (k & (k-1)) == 0) {  /* i % (2^k) ==> i & (2^k-1) */
    fins->o = IR_BAND;
    fins->op2 = lj_ir_kint(J, k-1);
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

LJFOLD(MOD KINT any)
LJFOLDF(simplify_intmod_kleft)
{
  if (fleft->i == 0)
    return INTFOLD(0);
  return NEXTFOLD;
}

LJFOLD(SUB any any)
LJFOLD(SUBOV any any)
LJFOLDF(simplify_intsub)
{
  if (fins->op1 == fins->op2 && !irt_isnum(fins->t))  /* i - i ==> 0 */
    return irt_is64(fins->t) ? INT64FOLD(0) : INTFOLD(0);
  return NEXTFOLD;
}

LJFOLD(SUB ADD any)
LJFOLDF(simplify_intsubadd_leftcancel)
{
  if (!irt_isnum(fins->t)) {
    PHIBARRIER(fleft);
    if (fins->op2 == fleft->op1)  /* (i + j) - i ==> j */
      return fleft->op2;
    if (fins->op2 == fleft->op2)  /* (i + j) - j ==> i */
      return fleft->op1;
  }
  return NEXTFOLD;
}

LJFOLD(SUB SUB any)
LJFOLDF(simplify_intsubsub_leftcancel)
{
  if (!irt_isnum(fins->t)) {
    PHIBARRIER(fleft);
    if (fins->op2 == fleft->op1) {  /* (i - j) - i ==> 0 - j */
      fins->op1 = (IRRef1)lj_ir_kint(J, 0);
      fins->op2 = fleft->op2;
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}

LJFOLD(SUB any SUB)
LJFOLDF(simplify_intsubsub_rightcancel)
{
  if (!irt_isnum(fins->t)) {
    PHIBARRIER(fright);
    if (fins->op1 == fright->op1)  /* i - (i - j) ==> j */
      return fright->op2;
  }
  return NEXTFOLD;
}

LJFOLD(SUB any ADD)
LJFOLDF(simplify_intsubadd_rightcancel)
{
  if (!irt_isnum(fins->t)) {
    PHIBARRIER(fright);
    if (fins->op1 == fright->op1) {  /* i - (i + j) ==> 0 - j */
      fins->op2 = fright->op2;
      fins->op1 = (IRRef1)lj_ir_kint(J, 0);
      return RETRYFOLD;
    }
    if (fins->op1 == fright->op2) {  /* i - (j + i) ==> 0 - j */
      fins->op2 = fright->op1;
      fins->op1 = (IRRef1)lj_ir_kint(J, 0);
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}

LJFOLD(SUB ADD ADD)
LJFOLDF(simplify_intsubaddadd_cancel)
{
  if (!irt_isnum(fins->t)) {
    PHIBARRIER(fleft);
    PHIBARRIER(fright);
    if (fleft->op1 == fright->op1) {  /* (i + j1) - (i + j2) ==> j1 - j2 */
      fins->op1 = fleft->op2;
      fins->op2 = fright->op2;
      return RETRYFOLD;
    }
    if (fleft->op1 == fright->op2) {  /* (i + j1) - (j2 + i) ==> j1 - j2 */
      fins->op1 = fleft->op2;
      fins->op2 = fright->op1;
      return RETRYFOLD;
    }
    if (fleft->op2 == fright->op1) {  /* (j1 + i) - (i + j2) ==> j1 - j2 */
      fins->op1 = fleft->op1;
      fins->op2 = fright->op2;
      return RETRYFOLD;
    }
    if (fleft->op2 == fright->op2) {  /* (j1 + i) - (j2 + i) ==> j1 - j2 */
      fins->op1 = fleft->op1;
      fins->op2 = fright->op1;
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}

LJFOLD(BAND any KINT)
LJFOLD(BAND any KINT64)
LJFOLDF(simplify_band_k)
{
  int64_t k = fright->o == IR_KINT ? (int64_t)fright->i :
                                     (int64_t)ir_k64(fright)->u64;
  if (k == 0)  /* i & 0 ==> 0 */
    return RIGHTFOLD;
  if (k == -1)  /* i & -1 ==> i */
    return LEFTFOLD;
  return NEXTFOLD;
}

LJFOLD(BOR any KINT)
LJFOLD(BOR any KINT64)
LJFOLDF(simplify_bor_k)
{
  int64_t k = fright->o == IR_KINT ? (int64_t)fright->i :
                                     (int64_t)ir_k64(fright)->u64;
  if (k == 0)  /* i | 0 ==> i */
    return LEFTFOLD;
  if (k == -1)  /* i | -1 ==> -1 */
    return RIGHTFOLD;
  return NEXTFOLD;
}

LJFOLD(BXOR any KINT)
LJFOLD(BXOR any KINT64)
LJFOLDF(simplify_bxor_k)
{
  int64_t k = fright->o == IR_KINT ? (int64_t)fright->i :
                                     (int64_t)ir_k64(fright)->u64;
  if (k == 0)  /* i xor 0 ==> i */
    return LEFTFOLD;
  if (k == -1) {  /* i xor -1 ==> ~i */
    fins->o = IR_BNOT;
    fins->op2 = 0;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

  1436. LJFOLD(BSHL any KINT)
  1437. LJFOLD(BSHR any KINT)
  1438. LJFOLD(BSAR any KINT)
  1439. LJFOLD(BROL any KINT)
  1440. LJFOLD(BROR any KINT)
  1441. LJFOLDF(simplify_shift_ik)
  1442. {
  1443.   int32_t mask = irt_is64(fins->t) ? 63 : 31;
  1444.   int32_t k = (fright->i & mask);
  1445.   if (k == 0/* i o 0 ==> i */
  1446.     return LEFTFOLD;
  1447.   if (k == 1 && fins->o == IR_BSHL) {  /* i << 1 ==> i + i */
  1448.     fins->o = IR_ADD;
  1449.     fins->op2 = fins->op1;
  1450.     return RETRYFOLD;
  1451.   }
  1452.   if (k != fright->i) {  /* i o k ==> i o (k & mask) */
  1453.     fins->op2 = (IRRef1)lj_ir_kint(J, k);
  1454.     return RETRYFOLD;
  1455.   }
  1456. #ifndef LJ_TARGET_UNIFYROT
  1457.   if (fins->o == IR_BROR) {  /* bror(i, k) ==> brol(i, (-k)&mask) */
  1458.     fins->o = IR_BROL;
  1459.     fins->op2 = (IRRef1)lj_ir_kint(J, (-k)&mask);
  1460.     return RETRYFOLD;
  1461.   }
  1462. #endif
  1463.   return NEXTFOLD;
  1464. }

  1465. LJFOLD(BSHL any BAND)
  1466. LJFOLD(BSHR any BAND)
  1467. LJFOLD(BSAR any BAND)
  1468. LJFOLD(BROL any BAND)
  1469. LJFOLD(BROR any BAND)
  1470. LJFOLDF(simplify_shift_andk)
  1471. {
  1472.   IRIns *irk = IR(fright->op2);
  1473.   PHIBARRIER(fright);
  1474.   if ((fins->o < IR_BROL ? LJ_TARGET_MASKSHIFT : LJ_TARGET_MASKROT) &&
  1475.       irk->o == IR_KINT) {  /* i o (j & mask) ==> i o j */
  1476.     int32_t mask = irt_is64(fins->t) ? 63 : 31;
  1477.     int32_t k = irk->i & mask;
  1478.     if (k == mask) {
  1479.       fins->op2 = fright->op1;
  1480.       return RETRYFOLD;
  1481.     }
  1482.   }
  1483.   return NEXTFOLD;
  1484. }

  1485. LJFOLD(BSHL KINT any)
  1486. LJFOLD(BSHR KINT any)
  1487. LJFOLD(BSHL KINT64 any)
  1488. LJFOLD(BSHR KINT64 any)
  1489. LJFOLDF(simplify_shift1_ki)
  1490. {
  1491.   int64_t k = fleft->o == IR_KINT ? (int64_t)fleft->i :
  1492.                                     (int64_t)ir_k64(fleft)->u64;
  1493.   if (k == 0/* 0 o i ==> 0 */
  1494.     return LEFTFOLD;
  1495.   return NEXTFOLD;
  1496. }

  1497. LJFOLD(BSAR KINT any)
  1498. LJFOLD(BROL KINT any)
  1499. LJFOLD(BROR KINT any)
  1500. LJFOLD(BSAR KINT64 any)
  1501. LJFOLD(BROL KINT64 any)
  1502. LJFOLD(BROR KINT64 any)
  1503. LJFOLDF(simplify_shift2_ki)
  1504. {
  1505.   int64_t k = fleft->o == IR_KINT ? (int64_t)fleft->i :
  1506.                                     (int64_t)ir_k64(fleft)->u64;
  1507.   if (k == 0 || k == -1/* 0 o i ==> 0; -1 o i ==> -1 */
  1508.     return LEFTFOLD;
  1509.   return NEXTFOLD;
  1510. }

  1511. LJFOLD(BSHL BAND KINT)
  1512. LJFOLD(BSHR BAND KINT)
  1513. LJFOLD(BROL BAND KINT)
  1514. LJFOLD(BROR BAND KINT)
  1515. LJFOLDF(simplify_shiftk_andk)
  1516. {
  1517.   IRIns *irk = IR(fleft->op2);
  1518.   PHIBARRIER(fleft);
  1519.   if (irk->o == IR_KINT) {  /* (i & k1) o k2 ==> (i o k2) & (k1 o k2) */
  1520.     int32_t k = kfold_intop(irk->i, fright->i, (IROp)fins->o);
  1521.     fins->op1 = fleft->op1;
  1522.     fins->op1 = (IRRef1)lj_opt_fold(J);
  1523.     fins->op2 = (IRRef1)lj_ir_kint(J, k);
  1524.     fins->ot = IRTI(IR_BAND);
  1525.     return RETRYFOLD;
  1526.   }
  1527.   return NEXTFOLD;
  1528. }
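
Note the double assignment to fins->op1 above: the first installs the
un-ANDed operand, the recursive lj_opt_fold() call then folds the inner
shift, and its result becomes the new left operand of the BAND. The
rewrite itself holds because shifts distribute over AND, as this
standalone sketch (editorial, not LuaJIT code) checks:

#include <assert.h>
#include <stdint.h>

int main(void)
{
  uint32_t i = 0x12345678u, k1 = 0x00ff00ffu, k2 = 4;
  assert(((i & k1) << k2) == ((i << k2) & (k1 << k2)));
  assert(((i & k1) >> k2) == ((i >> k2) & (k1 >> k2)));
  return 0;
}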

  1529. LJFOLD(BAND BSHL KINT)
  1530. LJFOLD(BAND BSHR KINT)
  1531. LJFOLDF(simplify_andk_shiftk)
  1532. {
  1533.   IRIns *irk = IR(fleft->op2);
  1534.   if (irk->o == IR_KINT &&
  1535.       kfold_intop(-1, irk->i, (IROp)fleft->o) == fright->i)
  1536.     return LEFTFOLD;  /* (i o k1) & k2 ==> i, if (-1 o k1) == k2 */
  1537.   return NEXTFOLD;
  1538. }

  1539. /* -- Reassociation ------------------------------------------------------- */

  1540. LJFOLD(ADD ADD KINT)
  1541. LJFOLD(MUL MUL KINT)
  1542. LJFOLD(BAND BAND KINT)
  1543. LJFOLD(BOR BOR KINT)
  1544. LJFOLD(BXOR BXOR KINT)
  1545. LJFOLDF(reassoc_intarith_k)
  1546. {
  1547.   IRIns *irk = IR(fleft->op2);
  1548.   if (irk->o == IR_KINT) {
  1549.     int32_t k = kfold_intop(irk->i, fright->i, (IROp)fins->o);
  1550.     if (k == irk->i)  /* (i o k1) o k2 ==> i o k1, if (k1 o k2) == k1. */
  1551.       return LEFTFOLD;
  1552.     PHIBARRIER(fleft);
  1553.     fins->op1 = fleft->op1;
  1554.     fins->op2 = (IRRef1)lj_ir_kint(J, k);
  1555.     return RETRYFOLD;  /* (i o k1) o k2 ==> i o (k1 o k2) */
  1556.   }
  1557.   return NEXTFOLD;
  1558. }
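
reassoc_intarith_k is sound because two's-complement 32 bit ADD and MUL
wrap around and therefore stay associative, while BAND/BOR/BXOR are
associative outright; the k == irk->i early-out catches a constant that
absorbs the other. A standalone sketch (editorial, not LuaJIT code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
  uint32_t i = 0xfffffff0u, k1 = 0x20u, k2 = 0x30u;
  assert((i + k1) + k2 == i + (k1 + k2));         /* Holds even on wrap. */
  assert(((i & 0xffu) & 0xfffu) == (i & 0xffu));  /* (k1 o k2) == k1 ==> LEFTFOLD */
  return 0;
}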

  1559. LJFOLD(ADD ADD KINT64)
  1560. LJFOLD(MUL MUL KINT64)
  1561. LJFOLD(BAND BAND KINT64)
  1562. LJFOLD(BOR BOR KINT64)
  1563. LJFOLD(BXOR BXOR KINT64)
  1564. LJFOLDF(reassoc_intarith_k64)
  1565. {
  1566. #if LJ_HASFFI
  1567.   IRIns *irk = IR(fleft->op2);
  1568.   if (irk->o == IR_KINT64) {
  1569.     uint64_t k = kfold_int64arith(ir_k64(irk)->u64,
  1570.                                   ir_k64(fright)->u64, (IROp)fins->o);
  1571.     PHIBARRIER(fleft);
  1572.     fins->op1 = fleft->op1;
  1573.     fins->op2 = (IRRef1)lj_ir_kint64(J, k);
  1574.     return RETRYFOLD;  /* (i o k1) o k2 ==> i o (k1 o k2) */
  1575.   }
  1576.   return NEXTFOLD;
  1577. #else
  1578.   UNUSED(J); lua_assert(0); return FAILFOLD;
  1579. #endif
  1580. }

  1581. LJFOLD(MIN MIN any)
  1582. LJFOLD(MAX MAX any)
  1583. LJFOLD(BAND BAND any)
  1584. LJFOLD(BOR BOR any)
  1585. LJFOLDF(reassoc_dup)
  1586. {
  1587.   if (fins->op2 == fleft->op1 || fins->op2 == fleft->op2)
  1588.     return LEFTFOLD;  /* (a o b) o a ==> a o b; (a o b) o b ==> a o b */
  1589.   return NEXTFOLD;
  1590. }

  1591. LJFOLD(BXOR BXOR any)
  1592. LJFOLDF(reassoc_bxor)
  1593. {
  1594.   PHIBARRIER(fleft);
  1595.   if (fins->op2 == fleft->op1)  /* (a xor b) xor a ==> b */
  1596.     return fleft->op2;
  1597.   if (fins->op2 == fleft->op2)  /* (a xor b) xor b ==> a */
  1598.     return fleft->op1;
  1599.   return NEXTFOLD;
  1600. }
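
reassoc_bxor is the self-inverse property of xor; the PHIBARRIER keeps the
rule from looking through a loop-carried PHI, where fleft would not denote
the same value on every iteration. A standalone sketch (editorial, not
LuaJIT code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
  uint32_t a = 0xcafebabeu, b = 0x00001234u;
  assert(((a ^ b) ^ a) == b);  /* (a xor b) xor a ==> b */
  assert(((a ^ b) ^ b) == a);  /* (a xor b) xor b ==> a */
  return 0;
}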

  1601. LJFOLD(BSHL BSHL KINT)
  1602. LJFOLD(BSHR BSHR KINT)
  1603. LJFOLD(BSAR BSAR KINT)
  1604. LJFOLD(BROL BROL KINT)
  1605. LJFOLD(BROR BROR KINT)
  1606. LJFOLDF(reassoc_shift)
  1607. {
  1608.   IRIns *irk = IR(fleft->op2);
  1609.   PHIBARRIER(fleft);  /* The (shift any KINT) rule covers k2 == 0 and more. */
  1610.   if (irk->o == IR_KINT) {  /* (i o k1) o k2 ==> i o (k1 + k2) */
  1611.     int32_t mask = irt_is64(fins->t) ? 63 : 31;
  1612.     int32_t k = (irk->i & mask) + (fright->i & mask);
  1613.     if (k > mask) {  /* Combined shift too wide? */
  1614.       if (fins->o == IR_BSHL || fins->o == IR_BSHR)
  1615.         return mask == 31 ? INTFOLD(0) : INT64FOLD(0);
  1616.       else if (fins->o == IR_BSAR)
  1617.         k = mask;
  1618.       else
  1619.         k &= mask;
  1620.     }
  1621.     fins->op1 = fleft->op1;
  1622.     fins->op2 = (IRRef1)lj_ir_kint(J, k);
  1623.     return RETRYFOLD;
  1624.   }
  1625.   return NEXTFOLD;
  1626. }
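
The three branches for an over-wide combined count mirror how the shift
families differ: logical shifts have already discarded every bit, an
arithmetic right shift saturates at width-1 (all sign bits), and rotates
wrap modulo the width. A standalone sketch (editorial, not LuaJIT code;
it assumes the usual arithmetic >> on signed values, which ISO C leaves
implementation-defined):

#include <assert.h>
#include <stdint.h>

int main(void)
{
  uint32_t u = 0xdeadbeefu;
  int32_t s = -100;
  assert(((u << 20) << 20) == 0);          /* BSHL: k1+k2 > 31 ==> 0 */
  assert(((u >> 20) >> 20) == 0);          /* BSHR: k1+k2 > 31 ==> 0 */
  assert(((s >> 20) >> 20) == (s >> 31));  /* BSAR: count clamps at 31 */
  return 0;
}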

  1627. LJFOLD(MIN MIN KNUM)
  1628. LJFOLD(MAX MAX KNUM)
  1629. LJFOLD(MIN MIN KINT)
  1630. LJFOLD(MAX MAX KINT)
  1631. LJFOLDF(reassoc_minmax_k)
  1632. {
  1633.   IRIns *irk = IR(fleft->op2);
  1634.   if (irk->o == IR_KNUM) {
  1635.     lua_Number a = ir_knum(irk)->n;
  1636.     lua_Number y = lj_vm_foldarith(a, knumright, fins->o - IR_ADD);
  1637.     if (a == y)  /* (x o k1) o k2 ==> x o k1, if (k1 o k2) == k1. */
  1638.       return LEFTFOLD;
  1639.     PHIBARRIER(fleft);
  1640.     fins->op1 = fleft->op1;
  1641.     fins->op2 = (IRRef1)lj_ir_knum(J, y);
  1642.     return RETRYFOLD;  /* (x o k1) o k2 ==> x o (k1 o k2) */
  1643.   } else if (irk->o == IR_KINT) {
  1644.     int32_t a = irk->i;
  1645.     int32_t y = kfold_intop(a, fright->i, fins->o);
  1646.     if (a == y)  /* (x o k1) o k2 ==> x o k1, if (k1 o k2) == k1. */
  1647.       return LEFTFOLD;
  1648.     PHIBARRIER(fleft);
  1649.     fins->op1 = fleft->op1;
  1650.     fins->op2 = (IRRef1)lj_ir_kint(J, y);
  1651.     return RETRYFOLD;  /* (x o k1) o k2 ==> x o (k1 o k2) */
  1652.   }
  1653.   return NEXTFOLD;
  1654. }

  1655. LJFOLD(MIN MAX any)
  1656. LJFOLD(MAX MIN any)
  1657. LJFOLDF(reassoc_minmax_left)
  1658. {
  1659.   if (fins->op2 == fleft->op1 || fins->op2 == fleft->op2)
  1660.     return RIGHTFOLD;  /* (b o1 a) o2 b ==> b; (a o1 b) o2 b ==> b */
  1661.   return NEXTFOLD;
  1662. }

  1663. LJFOLD(MIN any MAX)
  1664. LJFOLD(MAX any MIN)
  1665. LJFOLDF(reassoc_minmax_right)
  1666. {
  1667.   if (fins->op1 == fright->op1 || fins->op1 == fright->op2)
  1668.     return LEFTFOLD;  /* a o2 (a o1 b) ==> a; a o2 (b o1 a) ==> a */
  1669.   return NEXTFOLD;
  1670. }

  1671. /* -- Array bounds check elimination -------------------------------------- */

  1672. /* Eliminate ABC across PHIs to handle t[i-1] forwarding case.
  1673. ** ABC(asize, (i+k)+(-k)) ==> ABC(asize, i), but only if it already exists.
  1674. ** Could be generalized to (i+k1)+k2 ==> i+(k1+k2), but needs better disambig.
  1675. */
  1676. LJFOLD(ABC any ADD)
  1677. LJFOLDF(abc_fwd)
  1678. {
  1679.   if (LJ_LIKELY(J->flags & JIT_F_OPT_ABC)) {
  1680.     if (irref_isk(fright->op2)) {
  1681.       IRIns *add2 = IR(fright->op1);
  1682.       if (add2->o == IR_ADD && irref_isk(add2->op2) &&
  1683.           IR(fright->op2)->i == -IR(add2->op2)->i) {
  1684.         IRRef ref = J->chain[IR_ABC];
  1685.         IRRef lim = add2->op1;
  1686.         if (fins->op1 > lim) lim = fins->op1;
  1687.         while (ref > lim) {
  1688.           IRIns *ir = IR(ref);
  1689.           if (ir->op1 == fins->op1 && ir->op2 == add2->op1)
  1690.             return DROPFOLD;
  1691.           ref = ir->prev;
  1692.         }
  1693.       }
  1694.     }
  1695.   }
  1696.   return NEXTFOLD;
  1697. }

  1698. /* Eliminate ABC for constants.
  1699. ** ABC(asize, k1), ABC(asize, k2) ==> ABC(asize, max(k1, k2))
  1700. ** Drop second ABC if k2 is lower. Otherwise patch first ABC with k2.
  1701. */
  1702. LJFOLD(ABC any KINT)
  1703. LJFOLDF(abc_k)
  1704. {
  1705.   if (LJ_LIKELY(J->flags & JIT_F_OPT_ABC)) {
  1706.     IRRef ref = J->chain[IR_ABC];
  1707.     IRRef asize = fins->op1;
  1708.     while (ref > asize) {
  1709.       IRIns *ir = IR(ref);
  1710.       if (ir->op1 == asize && irref_isk(ir->op2)) {
  1711.         int32_t k = IR(ir->op2)->i;
  1712.         if (fright->i > k)
  1713.           ir->op2 = fins->op2;
  1714.         return DROPFOLD;
  1715.       }
  1716.       ref = ir->prev;
  1717.     }
  1718.     return EMITFOLD;  /* Already performed CSE. */
  1719.   }
  1720.   return NEXTFOLD;
  1721. }
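
The merge in abc_k is valid because, for the bounds comparison the guard
performs, k1 < asize and k2 < asize hold together exactly when
max(k1, k2) < asize; so the earlier check is widened to the larger
constant and the new one is dropped. A toy model (editorial; abc() is a
hypothetical stand-in for the guard, not a LuaJIT function):

#include <assert.h>
#include <stdint.h>

static int abc(uint32_t asize, uint32_t k) { return k < asize; }

int main(void)
{
  uint32_t asize = 16, k1 = 3, k2 = 11;
  int both = abc(asize, k1) && abc(asize, k2);
  int merged = abc(asize, k1 > k2 ? k1 : k2);
  assert(both == merged);
  return 0;
}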

  1722. /* Eliminate invariant ABC inside loop. */
  1723. LJFOLD(ABC any any)
  1724. LJFOLDF(abc_invar)
  1725. {
  1726.   /* Invariant ABC marked as PTR. Drop if op1 is invariant, too. */
  1727.   if (!irt_isint(fins->t) && fins->op1 < J->chain[IR_LOOP] &&
  1728.       !irt_isphi(IR(fins->op1)->t))
  1729.     return DROPFOLD;
  1730.   return NEXTFOLD;
  1731. }

  1732. /* -- Commutativity ------------------------------------------------------- */

  1733. /* The refs of commutative ops are canonicalized. Lower refs go to the right.
  1734. ** Rationale behind this:
  1735. ** - It (also) moves constants to the right.
  1736. ** - It reduces the number of FOLD rules (e.g. (BOR any KINT) suffices).
  1737. ** - It helps CSE to find more matches.
  1738. ** - The assembler generates better code with constants at the right.
  1739. */

  1740. LJFOLD(ADD any any)
  1741. LJFOLD(MUL any any)
  1742. LJFOLD(ADDOV any any)
  1743. LJFOLD(MULOV any any)
  1744. LJFOLDF(comm_swap)
  1745. {
  1746.   if (fins->op1 < fins->op2) {  /* Move lower ref to the right. */
  1747.     IRRef1 tmp = fins->op1;
  1748.     fins->op1 = fins->op2;
  1749.     fins->op2 = tmp;
  1750.     return RETRYFOLD;
  1751.   }
  1752.   return NEXTFOLD;
  1753. }

  1754. LJFOLD(EQ any any)
  1755. LJFOLD(NE any any)
  1756. LJFOLDF(comm_equal)
  1757. {
  1758.   /* For non-numbers only: x == x ==> drop; x ~= x ==> fail */
  1759.   if (fins->op1 == fins->op2 && !irt_isnum(fins->t))
  1760.     return CONDFOLD(fins->o == IR_EQ);
  1761.   return fold_comm_swap(J);
  1762. }

  1763. LJFOLD(LT any any)
  1764. LJFOLD(GE any any)
  1765. LJFOLD(LE any any)
  1766. LJFOLD(GT any any)
  1767. LJFOLD(ULT any any)
  1768. LJFOLD(UGE any any)
  1769. LJFOLD(ULE any any)
  1770. LJFOLD(UGT any any)
  1771. LJFOLDF(comm_comp)
  1772. {
  1773.   /* For non-numbers only: x <=> x ==> drop; x <> x ==> fail */
  1774.   if (fins->op1 == fins->op2 && !irt_isnum(fins->t))
  1775.     return CONDFOLD((fins->o ^ (fins->o >> 1)) & 1);
  1776.   if (fins->op1 < fins->op2) {  /* Move lower ref to the right. */
  1777.     IRRef1 tmp = fins->op1;
  1778.     fins->op1 = fins->op2;
  1779.     fins->op2 = tmp;
  1780.     fins->o ^= 3; /* GT <-> LT, GE <-> LE, does not affect U */
  1781.     return RETRYFOLD;
  1782.   }
  1783.   return NEXTFOLD;
  1784. }
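
The two bit tricks in comm_comp lean on the IR opcode layout (a documented
constraint in lj_ir.h): LT GE LE GT ULT UGE ULE UGT are consecutive and the
block is 4-aligned, so o ^ 3 swaps a comparison's operand order and the
Gray-code bit (o ^ (o >> 1)) & 1 is exactly the truth value of x o x. A
mock enum demonstrates this (editorial sketch; the base value 0 is an
assumption, only the 4-alignment matters):

#include <assert.h>

enum { LT = 0, GE, LE, GT, ULT, UGE, ULE, UGT };  /* Mock, 4-aligned. */

static const int self_cmp[8] = { 0, 1, 1, 0, 0, 1, 1, 0 };  /* x o x */

int main(void)
{
  int o;
  for (o = LT; o <= UGT; o++)
    assert(((o ^ (o >> 1)) & 1) == self_cmp[o]);  /* CONDFOLD argument. */
  assert((LT ^ 3) == GT && (GE ^ 3) == LE);  /* a o b ==> b (o^3) a */
  assert((ULT ^ 3) == UGT);                  /* Unsigned variants too. */
  return 0;
}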

  1785. LJFOLD(BAND any any)
  1786. LJFOLD(BOR any any)
  1787. LJFOLD(MIN any any)
  1788. LJFOLD(MAX any any)
  1789. LJFOLDF(comm_dup)
  1790. {
  1791.   if (fins->op1 == fins->op2)  /* x o x ==> x */
  1792.     return LEFTFOLD;
  1793.   return fold_comm_swap(J);
  1794. }

  1795. LJFOLD(BXOR any any)
  1796. LJFOLDF(comm_bxor)
  1797. {
  1798.   if (fins->op1 == fins->op2)  /* i xor i ==> 0 */
  1799.     return irt_is64(fins->t) ? INT64FOLD(0) : INTFOLD(0);
  1800.   return fold_comm_swap(J);
  1801. }

  1802. /* -- Simplification of compound expressions ------------------------------ */

  1803. static TRef kfold_xload(jit_State *J, IRIns *ir, const void *p)
  1804. {
  1805.   int32_t k;
  1806.   switch (irt_type(ir->t)) {
  1807.   case IRT_NUM: return lj_ir_knum_u64(J, *(uint64_t *)p);
  1808.   case IRT_I8: k = (int32_t)*(int8_t *)p; break;
  1809.   case IRT_U8: k = (int32_t)*(uint8_t *)p; break;
  1810.   case IRT_I16: k = (int32_t)(int16_t)lj_getu16(p); break;
  1811.   case IRT_U16: k = (int32_t)(uint16_t)lj_getu16(p); break;
  1812.   case IRT_INT: case IRT_U32: k = (int32_t)lj_getu32(p); break;
  1813.   case IRT_I64: case IRT_U64: return lj_ir_kint64(J, *(uint64_t *)p);
  1814.   default: return 0;
  1815.   }
  1816.   return lj_ir_kint(J, k);
  1817. }

  1818. /* Turn: string.sub(str, a, b) == kstr
  1819. ** into: string.byte(str, a) == string.byte(kstr, 1) etc.
  1820. ** Note: this creates unaligned XLOADs on x86/x64.
  1821. */
  1822. LJFOLD(EQ SNEW KGC)
  1823. LJFOLD(NE SNEW KGC)
  1824. LJFOLDF(merge_eqne_snew_kgc)
  1825. {
  1826.   GCstr *kstr = ir_kstr(fright);
  1827.   int32_t len = (int32_t)kstr->len;
  1828.   lua_assert(irt_isstr(fins->t));

  1829. #if LJ_TARGET_UNALIGNED
  1830. #define FOLD_SNEW_MAX_LEN        4  /* Handle string lengths 0, 1, 2, 3, 4. */
  1831. #define FOLD_SNEW_TYPE8                IRT_I8        /* Creates shorter immediates. */
  1832. #else
  1833. #define FOLD_SNEW_MAX_LEN        1  /* Handle string lengths 0 or 1. */
  1834. #define FOLD_SNEW_TYPE8                IRT_U8  /* Prefer unsigned loads. */
  1835. #endif

  1836.   PHIBARRIER(fleft);
  1837.   if (len <= FOLD_SNEW_MAX_LEN) {
  1838.     IROp op = (IROp)fins->o;
  1839.     IRRef strref = fleft->op1;
  1840.     if (IR(strref)->o != IR_STRREF)
  1841.       return NEXTFOLD;
  1842.     if (op == IR_EQ) {
  1843.       emitir(IRTGI(IR_EQ), fleft->op2, lj_ir_kint(J, len));
  1844.       /* Caveat: fins/fleft/fright is no longer valid after emitir. */
  1845.     } else {
  1846.       /* NE is not expanded since this would need an OR of two conds. */
  1847.       if (!irref_isk(fleft->op2))  /* Only handle the constant length case. */
  1848.         return NEXTFOLD;
  1849.       if (IR(fleft->op2)->i != len)
  1850.         return DROPFOLD;
  1851.     }
  1852.     if (len > 0) {
  1853.       /* A 4 byte load for length 3 is ok -- all strings have an extra NUL. */
  1854.       uint16_t ot = (uint16_t)(len == 1 ? IRT(IR_XLOAD, FOLD_SNEW_TYPE8) :
  1855.                                len == 2 ? IRT(IR_XLOAD, IRT_U16) :
  1856.                                IRTI(IR_XLOAD));
  1857.       TRef tmp = emitir(ot, strref,
  1858.                         IRXLOAD_READONLY | (len > 1 ? IRXLOAD_UNALIGNED : 0));
  1859.       TRef val = kfold_xload(J, IR(tref_ref(tmp)), strdata(kstr));
  1860.       if (len == 3)
  1861.         tmp = emitir(IRTI(IR_BAND), tmp,
  1862.                      lj_ir_kint(J, LJ_ENDIAN_SELECT(0x00ffffff, 0xffffff00)));
  1863.       fins->op1 = (IRRef1)tmp;
  1864.       fins->op2 = (IRRef1)val;
  1865.       fins->ot = (IROpT)IRTGI(op);
  1866.       return RETRYFOLD;
  1867.     } else {
  1868.       return DROPFOLD;
  1869.     }
  1870.   }
  1871.   return NEXTFOLD;
  1872. }
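
For len == 3 the fold loads 4 bytes (safe, as the comment notes, because
every GCstr is NUL-terminated) and masks off the byte that is not part of
the substring; the constant side already carries a NUL there, so only the
loaded side needs the BAND. A standalone little-endian sketch of the same
comparison (editorial; on big-endian LJ_ENDIAN_SELECT picks 0xffffff00
instead):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
  const char src[] = "abcdef";  /* Substring "abc" is followed by 'd'. */
  const char kstr[8] = "abc";   /* The constant: a NUL follows 'c'. */
  uint32_t a, b;
  memcpy(&a, src, 4);           /* The (unaligned) 4 byte XLOAD. */
  memcpy(&b, kstr, 4);          /* kfold_xload of the constant. */
  assert(a != b);                  /* The stray 4th byte differs... */
  assert((a & 0x00ffffffu) == b);  /* ...the BAND mask drops it. */
  return 0;
}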

  1873. /* -- Loads --------------------------------------------------------------- */

  1874. /* Loads cannot be folded or passed on to CSE in general.
  1875. ** Alias analysis is needed to check for forwarding opportunities.
  1876. **
  1877. ** Caveat: *all* loads must be listed here or they end up at CSE!
  1878. */

  1879. LJFOLD(ALOAD any)
  1880. LJFOLDX(lj_opt_fwd_aload)

  1881. /* From HREF fwd (see below). Must eliminate, not supported by fwd/backend. */
  1882. LJFOLD(HLOAD KKPTR)
  1883. LJFOLDF(kfold_hload_kkptr)
  1884. {
  1885.   UNUSED(J);
  1886.   lua_assert(ir_kptr(fleft) == niltvg(J2G(J)));
  1887.   return TREF_NIL;
  1888. }

  1889. LJFOLD(HLOAD any)
  1890. LJFOLDX(lj_opt_fwd_hload)

  1891. LJFOLD(ULOAD any)
  1892. LJFOLDX(lj_opt_fwd_uload)

  1893. LJFOLD(CALLL any IRCALL_lj_tab_len)
  1894. LJFOLDX(lj_opt_fwd_tab_len)

  1895. /* Upvalue refs are really loads, but there are no corresponding stores.
  1896. ** So CSE is ok for them, except for UREFO across a GC step (see below).
  1897. ** If the referenced function is const, its upvalue addresses are const, too.
  1898. ** This can be used to improve CSE by looking for the same address,
  1899. ** even if the upvalues originate from a different function.
  1900. */
  1901. LJFOLD(UREFO KGC any)
  1902. LJFOLD(UREFC KGC any)
  1903. LJFOLDF(cse_uref)
  1904. {
  1905.   if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
  1906.     IRRef ref = J->chain[fins->o];
  1907.     GCfunc *fn = ir_kfunc(fleft);
  1908.     GCupval *uv = gco2uv(gcref(fn->l.uvptr[(fins->op2 >> 8)]));
  1909.     while (ref > 0) {
  1910.       IRIns *ir = IR(ref);
  1911.       if (irref_isk(ir->op1)) {
  1912.         GCfunc *fn2 = ir_kfunc(IR(ir->op1));
  1913.         if (gco2uv(gcref(fn2->l.uvptr[(ir->op2 >> 8)])) == uv) {
  1914.           if (fins->o == IR_UREFO && gcstep_barrier(J, ref))
  1915.             break;
  1916.           return ref;
  1917.         }
  1918.       }
  1919.       ref = ir->prev;
  1920.     }
  1921.   }
  1922.   return EMITFOLD;
  1923. }

  1924. LJFOLD(HREFK any any)
  1925. LJFOLDX(lj_opt_fwd_hrefk)

  1926. LJFOLD(HREF TNEW any)
  1927. LJFOLDF(fwd_href_tnew)
  1928. {
  1929.   if (lj_opt_fwd_href_nokey(J))
  1930.     return lj_ir_kkptr(J, niltvg(J2G(J)));
  1931.   return NEXTFOLD;
  1932. }

  1933. LJFOLD(HREF TDUP KPRI)
  1934. LJFOLD(HREF TDUP KGC)
  1935. LJFOLD(HREF TDUP KNUM)
  1936. LJFOLDF(fwd_href_tdup)
  1937. {
  1938.   TValue keyv;
  1939.   lj_ir_kvalue(J->L, &keyv, fright);
  1940.   if (lj_tab_get(J->L, ir_ktab(IR(fleft->op1)), &keyv) == niltvg(J2G(J)) &&
  1941.       lj_opt_fwd_href_nokey(J))
  1942.     return lj_ir_kkptr(J, niltvg(J2G(J)));
  1943.   return NEXTFOLD;
  1944. }

  1945. /* We can safely FOLD/CSE array/hash refs and field loads, since there
  1946. ** are no corresponding stores. But we need to check for any NEWREF with
  1947. ** an aliased table, as it may invalidate all of the pointers and fields.
  1948. ** Only HREF needs the NEWREF check -- AREF and HREFK already depend on
  1949. ** FLOADs. And NEWREF itself is treated like a store (see below).
  1950. ** LREF is constant (per trace) since coroutine switches are not inlined.
  1951. */
  1952. LJFOLD(FLOAD TNEW IRFL_TAB_ASIZE)
  1953. LJFOLDF(fload_tab_tnew_asize)
  1954. {
  1955.   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
  1956.     return INTFOLD(fleft->op1);
  1957.   return NEXTFOLD;
  1958. }

  1959. LJFOLD(FLOAD TNEW IRFL_TAB_HMASK)
  1960. LJFOLDF(fload_tab_tnew_hmask)
  1961. {
  1962.   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
  1963.     return INTFOLD((1 << fleft->op2)-1);
  1964.   return NEXTFOLD;
  1965. }

  1966. LJFOLD(FLOAD TDUP IRFL_TAB_ASIZE)
  1967. LJFOLDF(fload_tab_tdup_asize)
  1968. {
  1969.   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
  1970.     return INTFOLD((int32_t)ir_ktab(IR(fleft->op1))->asize);
  1971.   return NEXTFOLD;
  1972. }

  1973. LJFOLD(FLOAD TDUP IRFL_TAB_HMASK)
  1974. LJFOLDF(fload_tab_tdup_hmask)
  1975. {
  1976.   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
  1977.     return INTFOLD((int32_t)ir_ktab(IR(fleft->op1))->hmask);
  1978.   return NEXTFOLD;
  1979. }

  1980. LJFOLD(HREF any any)
  1981. LJFOLD(FLOAD any IRFL_TAB_ARRAY)
  1982. LJFOLD(FLOAD any IRFL_TAB_NODE)
  1983. LJFOLD(FLOAD any IRFL_TAB_ASIZE)
  1984. LJFOLD(FLOAD any IRFL_TAB_HMASK)
  1985. LJFOLDF(fload_tab_ah)
  1986. {
  1987.   TRef tr = lj_opt_cse(J);
  1988.   return lj_opt_fwd_tptr(J, tref_ref(tr)) ? tr : EMITFOLD;
  1989. }

  1990. /* Strings are immutable, so we can safely FOLD/CSE the related FLOAD. */
  1991. LJFOLD(FLOAD KGC IRFL_STR_LEN)
  1992. LJFOLDF(fload_str_len_kgc)
  1993. {
  1994.   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
  1995.     return INTFOLD((int32_t)ir_kstr(fleft)->len);
  1996.   return NEXTFOLD;
  1997. }

  1998. LJFOLD(FLOAD SNEW IRFL_STR_LEN)
  1999. LJFOLDF(fload_str_len_snew)
  2000. {
  2001.   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD)) {
  2002.     PHIBARRIER(fleft);
  2003.     return fleft->op2;
  2004.   }
  2005.   return NEXTFOLD;
  2006. }

  2007. LJFOLD(FLOAD TOSTR IRFL_STR_LEN)
  2008. LJFOLDF(fload_str_len_tostr)
  2009. {
  2010.   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && fleft->op2 == IRTOSTR_CHAR)
  2011.     return INTFOLD(1);
  2012.   return NEXTFOLD;
  2013. }

  2014. /* The C type ID of cdata objects is immutable. */
  2015. LJFOLD(FLOAD KGC IRFL_CDATA_CTYPEID)
  2016. LJFOLDF(fload_cdata_typeid_kgc)
  2017. {
  2018.   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
  2019.     return INTFOLD((int32_t)ir_kcdata(fleft)->ctypeid);
  2020.   return NEXTFOLD;
  2021. }

  2022. /* Get the contents of immutable cdata objects. */
  2023. LJFOLD(FLOAD KGC IRFL_CDATA_PTR)
  2024. LJFOLD(FLOAD KGC IRFL_CDATA_INT)
  2025. LJFOLD(FLOAD KGC IRFL_CDATA_INT64)
  2026. LJFOLDF(fload_cdata_int64_kgc)
  2027. {
  2028.   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD)) {
  2029.     void *p = cdataptr(ir_kcdata(fleft));
  2030.     if (irt_is64(fins->t))
  2031.       return INT64FOLD(*(uint64_t *)p);
  2032.     else
  2033.       return INTFOLD(*(int32_t *)p);
  2034.   }
  2035.   return NEXTFOLD;
  2036. }

  2037. LJFOLD(FLOAD CNEW IRFL_CDATA_CTYPEID)
  2038. LJFOLD(FLOAD CNEWI IRFL_CDATA_CTYPEID)
  2039. LJFOLDF(fload_cdata_typeid_cnew)
  2040. {
  2041.   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
  2042.     return fleft->op1;  /* No PHI barrier needed. CNEW/CNEWI op1 is const. */
  2043.   return NEXTFOLD;
  2044. }

  2045. /* Pointer, int and int64 cdata objects are immutable. */
  2046. LJFOLD(FLOAD CNEWI IRFL_CDATA_PTR)
  2047. LJFOLD(FLOAD CNEWI IRFL_CDATA_INT)
  2048. LJFOLD(FLOAD CNEWI IRFL_CDATA_INT64)
  2049. LJFOLDF(fload_cdata_ptr_int64_cnew)
  2050. {
  2051.   if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
  2052.     return fleft->op2;  /* Fold even across PHI to avoid allocations. */
  2053.   return NEXTFOLD;
  2054. }

  2055. LJFOLD(FLOAD any IRFL_STR_LEN)
  2056. LJFOLD(FLOAD any IRFL_FUNC_ENV)
  2057. LJFOLD(FLOAD any IRFL_THREAD_ENV)
  2058. LJFOLD(FLOAD any IRFL_CDATA_CTYPEID)
  2059. LJFOLD(FLOAD any IRFL_CDATA_PTR)
  2060. LJFOLD(FLOAD any IRFL_CDATA_INT)
  2061. LJFOLD(FLOAD any IRFL_CDATA_INT64)
  2062. LJFOLD(VLOAD any any)  /* Vararg loads have no corresponding stores. */
  2063. LJFOLDX(lj_opt_cse)

  2064. /* All other field loads need alias analysis. */
  2065. LJFOLD(FLOAD any any)
  2066. LJFOLDX(lj_opt_fwd_fload)

  2067. /* This is for LOOP only. Recording handles SLOADs internally. */
  2068. LJFOLD(SLOAD any any)
  2069. LJFOLDF(fwd_sload)
  2070. {
  2071.   if ((fins->op2 & IRSLOAD_FRAME)) {
  2072.     TRef tr = lj_opt_cse(J);
  2073.     return tref_ref(tr) < J->chain[IR_RETF] ? EMITFOLD : tr;
  2074.   } else {
  2075.     lua_assert(J->slot[fins->op1] != 0);
  2076.     return J->slot[fins->op1];
  2077.   }
  2078. }

  2079. /* Only fold for KKPTR. The pointer _and_ the contents must be const. */
  2080. LJFOLD(XLOAD KKPTR any)
  2081. LJFOLDF(xload_kptr)
  2082. {
  2083.   TRef tr = kfold_xload(J, fins, ir_kptr(fleft));
  2084.   return tr ? tr : NEXTFOLD;
  2085. }

  2086. LJFOLD(XLOAD any any)
  2087. LJFOLDX(lj_opt_fwd_xload)

  2088. /* -- Write barriers ------------------------------------------------------ */

  2089. /* Write barriers are amenable to CSE, but not across any incremental
  2090. ** GC steps.
  2091. **
  2092. ** The same logic applies to open upvalue references, because a stack
  2093. ** may be resized during a GC step (not the current stack, but maybe that
  2094. ** of a coroutine).
  2095. */
  2096. LJFOLD(TBAR any)
  2097. LJFOLD(OBAR any any)
  2098. LJFOLD(UREFO any any)
  2099. LJFOLDF(barrier_tab)
  2100. {
  2101.   TRef tr = lj_opt_cse(J);
  2102.   if (gcstep_barrier(J, tref_ref(tr)))  /* CSE across GC step? */
  2103.     return EMITFOLD;  /* Raw emit. Assumes fins is left intact by CSE. */
  2104.   return tr;
  2105. }

  2106. LJFOLD(TBAR TNEW)
  2107. LJFOLD(TBAR TDUP)
  2108. LJFOLDF(barrier_tnew_tdup)
  2109. {
  2110.   /* New tables are always white and never need a barrier. */
  2111.   if (fins->op1 < J->chain[IR_LOOP])  /* Except across a GC step. */
  2112.     return NEXTFOLD;
  2113.   return DROPFOLD;
  2114. }

  2115. /* -- Profiling ----------------------------------------------------------- */

  2116. LJFOLD(PROF any any)
  2117. LJFOLDF(prof)
  2118. {
  2119.   IRRef ref = J->chain[IR_PROF];
  2120.   if (ref+1 == J->cur.nins)  /* Drop neighbouring IR_PROF. */
  2121.     return ref;
  2122.   return EMITFOLD;
  2123. }

  2124. /* -- Stores and allocations ---------------------------------------------- */

  2125. /* Stores and allocations cannot be folded or passed on to CSE in general.
  2126. ** But some stores can be eliminated with dead-store elimination (DSE).
  2127. **
  2128. ** Caveat: *all* stores and allocs must be listed here or they end up at CSE!
  2129. */

  2130. LJFOLD(ASTORE any any)
  2131. LJFOLD(HSTORE any any)
  2132. LJFOLDX(lj_opt_dse_ahstore)

  2133. LJFOLD(USTORE any any)
  2134. LJFOLDX(lj_opt_dse_ustore)

  2135. LJFOLD(FSTORE any any)
  2136. LJFOLDX(lj_opt_dse_fstore)

  2137. LJFOLD(XSTORE any any)
  2138. LJFOLDX(lj_opt_dse_xstore)

  2139. LJFOLD(NEWREF any any)  /* Treated like a store. */
  2140. LJFOLD(CALLA any any)
  2141. LJFOLD(CALLL any any)  /* Safeguard fallback. */
  2142. LJFOLD(CALLS any any)
  2143. LJFOLD(CALLXS any any)
  2144. LJFOLD(XBAR)
  2145. LJFOLD(RETF any any)  /* Modifies BASE. */
  2146. LJFOLD(TNEW any any)
  2147. LJFOLD(TDUP any)
  2148. LJFOLD(CNEW any any)
  2149. LJFOLD(XSNEW any any)
  2150. LJFOLD(BUFHDR any any)
  2151. LJFOLDX(lj_ir_emit)

  2152. /* ------------------------------------------------------------------------ */

  2153. /* Every entry in the generated hash table is a 32 bit pattern:
  2154. **
  2155. ** xxxxxxxx iiiiiii lllllll rrrrrrrrrr
  2156. **
  2157. **   xxxxxxxx = 8 bit index into fold function table
  2158. **    iiiiiii = 7 bit folded instruction opcode
  2159. **    lllllll = 7 bit left instruction opcode
  2160. ** rrrrrrrrrr = 8 bit right instruction opcode or 10 bits from literal field
  2161. */

  2162. #include "lj_folddef.h"

  2163. /* ------------------------------------------------------------------------ */

  2164. /* Fold IR instruction. */
  2165. TRef LJ_FASTCALL lj_opt_fold(jit_State *J)
  2166. {
  2167.   uint32_t key, any;
  2168.   IRRef ref;

  2169.   if (LJ_UNLIKELY((J->flags & JIT_F_OPT_MASK) != JIT_F_OPT_DEFAULT)) {
  2170.     lua_assert(((JIT_F_OPT_FOLD|JIT_F_OPT_FWD|JIT_F_OPT_CSE|JIT_F_OPT_DSE) |
  2171.                 JIT_F_OPT_DEFAULT) == JIT_F_OPT_DEFAULT);
  2172.     /* Folding disabled? Chain to CSE, but not for loads/stores/allocs. */
  2173.     if (!(J->flags & JIT_F_OPT_FOLD) && irm_kind(lj_ir_mode[fins->o]) == IRM_N)
  2174.       return lj_opt_cse(J);

  2175.     /* No FOLD, forwarding or CSE? Emit raw IR for loads, except for SLOAD. */
  2176.     if ((J->flags & (JIT_F_OPT_FOLD|JIT_F_OPT_FWD|JIT_F_OPT_CSE)) !=
  2177.                     (JIT_F_OPT_FOLD|JIT_F_OPT_FWD|JIT_F_OPT_CSE) &&
  2178.         irm_kind(lj_ir_mode[fins->o]) == IRM_L && fins->o != IR_SLOAD)
  2179.       return lj_ir_emit(J);

  2180.     /* No FOLD or DSE? Emit raw IR for stores. */
  2181.     if ((J->flags & (JIT_F_OPT_FOLD|JIT_F_OPT_DSE)) !=
  2182.                     (JIT_F_OPT_FOLD|JIT_F_OPT_DSE) &&
  2183.         irm_kind(lj_ir_mode[fins->o]) == IRM_S)
  2184.       return lj_ir_emit(J);
  2185.   }

  2186.   /* Fold engine start/retry point. */
  2187. retry:
  2188.   /* Construct key from opcode and operand opcodes (unless literal/none). */
  2189.   key = ((uint32_t)fins->o << 17);
  2190.   if (fins->op1 >= J->cur.nk) {
  2191.     key += (uint32_t)IR(fins->op1)->o << 10;
  2192.     *fleft = *IR(fins->op1);
  2193.   }
  2194.   if (fins->op2 >= J->cur.nk) {
  2195.     key += (uint32_t)IR(fins->op2)->o;
  2196.     *fright = *IR(fins->op2);
  2197.   } else {
  2198.     key += (fins->op2 & 0x3ffu);  /* Literal mask. Must include IRCONV_*MASK. */
  2199.   }

  2200.   /* Check for a match in order from most specific to least specific. */
  2201.   any = 0;
  2202.   for (;;) {
  2203.     uint32_t k = key | (any & 0x1ffff);
  2204.     uint32_t h = fold_hashkey(k);
  2205.     uint32_t fh = fold_hash[h];  /* Lookup key in semi-perfect hash table. */
  2206.     if ((fh & 0xffffff) == k || (fh = fold_hash[h+1], (fh & 0xffffff) == k)) {
  2207.       ref = (IRRef)tref_ref(fold_func[fh >> 24](J));
  2208.       if (ref != NEXTFOLD)
  2209.         break;
  2210.     }
  2211.     if (any == 0xfffff)  /* Exhausted folding. Pass on to CSE. */
  2212.       return lj_opt_cse(J);
  2213.     any = (any | (any >> 10)) ^ 0xffc00;
  2214.   }

  2215.   /* Return value processing, ordered by frequency. */
  2216.   if (LJ_LIKELY(ref >= MAX_FOLD))
  2217.     return TREF(ref, irt_t(IR(ref)->t));
  2218.   if (ref == RETRYFOLD)
  2219.     goto retry;
  2220.   if (ref == KINTFOLD)
  2221.     return lj_ir_kint(J, fins->i);
  2222.   if (ref == FAILFOLD)
  2223.     lj_trace_err(J, LJ_TRERR_GFAIL);
  2224.   lua_assert(ref == DROPFOLD);
  2225.   return REF_DROP;
  2226. }
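
The masking loop above visits the four key variants in the documented
order: exact, 'ins any right', 'ins left any', 'ins any any'. The update
any = (any | (any >> 10)) ^ 0xffc00 steps through 0, 0xffc00, 0x3ff and
0xfffff, and ANDing with 0x1ffff turns that into the left-field and
right-field wildcards. A standalone sketch printing the sequence
(editorial; the opcode values are invented):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
  /* ins in bits 17-23, left opcode in bits 10-16, right in bits 0-9. */
  uint32_t key = (0x25u << 17) | (0x12u << 10) | 0x07u;
  uint32_t any = 0;
  for (;;) {
    printf("lookup key: %06x\n", (unsigned)(key | (any & 0x1ffff)));
    if (any == 0xfffff) break;            /* Exhausted: chain to CSE. */
    any = (any | (any >> 10)) ^ 0xffc00;  /* Next, less specific mask. */
  }
  return 0;
}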

  2227. /* -- Common-Subexpression Elimination ------------------------------------ */

  2228. /* CSE an IR instruction. This is very fast due to the skip-list chains. */
  2229. TRef LJ_FASTCALL lj_opt_cse(jit_State *J)
  2230. {
  2231.   /* Avoid narrow to wide store-to-load forwarding stall */
  2232.   IRRef2 op12 = (IRRef2)fins->op1 + ((IRRef2)fins->op2 << 16);
  2233.   IROp op = fins->o;
  2234.   if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
  2235.     /* Limited search for same operands in per-opcode chain. */
  2236.     IRRef ref = J->chain[op];
  2237.     IRRef lim = fins->op1;
  2238.     if (fins->op2 > lim) lim = fins->op2;  /* Relies on lit < REF_BIAS. */
  2239.     while (ref > lim) {
  2240.       if (IR(ref)->op12 == op12)
  2241.         return TREF(ref, irt_t(IR(ref)->t));  /* Common subexpression found. */
  2242.       ref = IR(ref)->prev;
  2243.     }
  2244.   }
  2245.   /* Otherwise emit IR (inlined for speed). */
  2246.   {
  2247.     IRRef ref = lj_ir_nextins(J);
  2248.     IRIns *ir = IR(ref);
  2249.     ir->prev = J->chain[op];
  2250.     ir->op12 = op12;
  2251.     J->chain[op] = (IRRef1)ref;
  2252.     ir->o = fins->o;
  2253.     J->guardemit.irt |= fins->t.irt;
  2254.     return TREF(ref, irt_t((ir->t = fins->t)));
  2255.   }
  2256. }
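
Two details above make lj_opt_cse cheap: each opcode has its own chain
(J->chain[op] heads it, ir->prev links it), so only instructions of the
same opcode are visited, and the walk stops at max(op1, op2) because a
matching instruction must come after both of its operands. A toy model of
that structure (editorial; the types and names are invented):

#include <assert.h>
#include <stdint.h>

typedef struct { uint8_t o; uint16_t op1, op2, prev; } Ins;

enum { OP_ADD = 1, NOPS = 8 };
static Ins ins[64];
static uint16_t nins = 1;  /* Ref 0 reserved. */
static uint16_t chain[NOPS];

static uint16_t emit_cse(uint8_t o, uint16_t op1, uint16_t op2)
{
  uint16_t lim = op1 > op2 ? op1 : op2;  /* Search limit. */
  uint16_t ref = chain[o];
  while (ref > lim) {
    if (ins[ref].op1 == op1 && ins[ref].op2 == op2)
      return ref;  /* Common subexpression found. */
    ref = ins[ref].prev;
  }
  ref = nins++;  /* Otherwise append and link into the chain. */
  ins[ref].o = o; ins[ref].op1 = op1; ins[ref].op2 = op2;
  ins[ref].prev = chain[o];
  chain[o] = ref;
  return ref;
}

int main(void)
{
  uint16_t a = emit_cse(OP_ADD, 0, 0);
  uint16_t b = emit_cse(OP_ADD, 0, 0);
  assert(a == b);  /* The second ADD is eliminated. */
  return 0;
}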

  2257. /* CSE with explicit search limit. */
  2258. TRef LJ_FASTCALL lj_opt_cselim(jit_State *J, IRRef lim)
  2259. {
  2260.   IRRef ref = J->chain[fins->o];
  2261.   IRRef2 op12 = (IRRef2)fins->op1 + ((IRRef2)fins->op2 << 16);
  2262.   while (ref > lim) {
  2263.     if (IR(ref)->op12 == op12)
  2264.       return ref;
  2265.     ref = IR(ref)->prev;
  2266.   }
  2267.   return lj_ir_emit(J);
  2268. }

  2269. /* ------------------------------------------------------------------------ */

  2270. #undef IR
  2271. #undef fins
  2272. #undef fleft
  2273. #undef fright
  2274. #undef knumleft
  2275. #undef knumright
  2276. #undef emitir

  2277. #endif