src/lj_strscan.c - luajit-2.0-src

Functions defined

Macros defined

Source code

  1. /*
  2. ** String scanning.
  3. ** Copyright (C) 2005-2015 Mike Pall. See Copyright Notice in luajit.h
  4. */

  5. #include <math.h>

  6. #define lj_strscan_c
  7. #define LUA_CORE

  8. #include "lj_obj.h"
  9. #include "lj_char.h"
  10. #include "lj_strscan.h"

  11. /* -- Scanning numbers ---------------------------------------------------- */

  12. /*
  13. ** Rationale for the builtin string to number conversion library:
  14. **
  15. ** It removes a dependency on libc's strtod(), which is a true portability
  16. ** nightmare. Mainly due to the plethora of supported OS and toolchain
  17. ** combinations. Sadly, the various implementations
  18. ** a) are often buggy, incomplete (no hex floats) and/or imprecise,
  19. ** b) sometimes crash or hang on certain inputs,
  20. ** c) return non-standard NaNs that need to be filtered out, and
  21. ** d) fail if the locale-specific decimal separator is not a dot,
  22. **    which can only be fixed with atrocious workarounds.
  23. **
  24. ** Also, most of the strtod() implementations are hopelessly bloated,
  25. ** which is not just an I-cache hog, but a problem for static linkage
  26. ** on embedded systems, too.
  27. **
  28. ** OTOH the builtin conversion function is very compact. Even though it
  29. ** does a lot more, like parsing long longs, octal or imaginary numbers
  30. ** and returning the result in different formats:
  31. ** a) It needs less than 3 KB (!) of machine code (on x64 with -Os),
  32. ** b) it doesn't perform any dynamic allocation and,
  33. ** c) it needs only around 600 bytes of stack space.
  34. **
  35. ** The builtin function is faster than strtod() for typical inputs, e.g.
  36. ** "123", "1.5" or "1e6". Arguably, it's slower for very large exponents,
  37. ** which are not very common (this could be fixed, if needed).
  38. **
  39. ** And most importantly, the builtin function is equally precise on all
  40. ** platforms. It correctly converts and rounds any input to a double.
  41. ** If this is not the case, please send a bug report -- but PLEASE verify
  42. ** that the implementation you're comparing to is not the culprit!
  43. **
  44. ** The implementation quickly pre-scans the entire string first and
  45. ** handles simple integers on-the-fly. Otherwise, it dispatches to the
  46. ** base-specific parser. Hex and octal is straightforward.
  47. **
  48. ** Decimal to binary conversion uses a fixed-length circular buffer in
  49. ** base 100. Some simple cases are handled directly. For other cases, the
  50. ** number in the buffer is up-scaled or down-scaled until the integer part
  51. ** is in the proper range. Then the integer part is rounded and converted
  52. ** to a double which is finally rescaled to the result. Denormals need
  53. ** special treatment to prevent incorrect 'double rounding'.
  54. */

  55. /* Definitions for circular decimal digit buffer (base 100 = 2 digits/byte). */
  56. #define STRSCAN_DIG        1024
  57. #define STRSCAN_MAXDIG        800                /* 772 + extra are sufficient. */
  58. #define STRSCAN_DDIG        (STRSCAN_DIG/2)
  59. #define STRSCAN_DMASK        (STRSCAN_DDIG-1)

  60. /* Helpers for circular buffer. */
  61. #define DNEXT(a)        (((a)+1) & STRSCAN_DMASK)
  62. #define DPREV(a)        (((a)-1) & STRSCAN_DMASK)
  63. #define DLEN(lo, hi)        ((int32_t)(((lo)-(hi)) & STRSCAN_DMASK))

  64. #define casecmp(c, k)        (((c) | 0x20) == k)

  65. /* Final conversion to double. */
  66. static void strscan_double(uint64_t x, TValue *o, int32_t ex2, int32_t neg)
  67. {
  68.   double n;

  69.   /* Avoid double rounding for denormals. */
  70.   if (LJ_UNLIKELY(ex2 <= -1075 && x != 0)) {
  71.     /* NYI: all of this generates way too much code on 32 bit CPUs. */
  72. #if defined(__GNUC__) && LJ_64
  73.     int32_t b = (int32_t)(__builtin_clzll(x)^63);
  74. #else
  75.     int32_t b = (x>>32) ? 32+(int32_t)lj_fls((uint32_t)(x>>32)) :
  76.                           (int32_t)lj_fls((uint32_t)x);
  77. #endif
  78.     if ((int32_t)b + ex2 <= -1023 && (int32_t)b + ex2 >= -1075) {
  79.       uint64_t rb = (uint64_t)1 << (-1075-ex2);
  80.       if ((x & rb) && ((x & (rb+rb+rb-1)))) x += rb+rb;
  81.       x = (x & ~(rb+rb-1));
  82.     }
  83.   }

  84.   /* Convert to double using a signed int64_t conversion, then rescale. */
  85.   lua_assert((int64_t)x >= 0);
  86.   n = (double)(int64_t)x;
  87.   if (neg) n = -n;
  88.   if (ex2) n = ldexp(n, ex2);
  89.   o->n = n;
  90. }

  91. /* Parse hexadecimal number. */
  92. static StrScanFmt strscan_hex(const uint8_t *p, TValue *o,
  93.                               StrScanFmt fmt, uint32_t opt,
  94.                               int32_t ex2, int32_t neg, uint32_t dig)
  95. {
  96.   uint64_t x = 0;
  97.   uint32_t i;

  98.   /* Scan hex digits. */
  99.   for (i = dig > 16 ? 16 : dig ; i; i--, p++) {
  100.     uint32_t d = (*p != '.' ? *p : *++p); if (d > '9') d += 9;
  101.     x = (x << 4) + (d & 15);
  102.   }

  103.   /* Summarize rounding-effect of excess digits. */
  104.   for (i = 16; i < dig; i++, p++)
  105.     x |= ((*p != '.' ? *p : *++p) != '0'), ex2 += 4;

  106.   /* Format-specific handling. */
  107.   switch (fmt) {
  108.   case STRSCAN_INT:
  109.     if (!(opt & STRSCAN_OPT_TONUM) && x < 0x80000000u+neg) {
  110.       o->i = neg ? -(int32_t)x : (int32_t)x;
  111.       return STRSCAN_INT;  /* Fast path for 32 bit integers. */
  112.     }
  113.     if (!(opt & STRSCAN_OPT_C)) { fmt = STRSCAN_NUM; break; }
  114.     /* fallthrough */
  115.   case STRSCAN_U32:
  116.     if (dig > 8) return STRSCAN_ERROR;
  117.     o->i = neg ? -(int32_t)x : (int32_t)x;
  118.     return STRSCAN_U32;
  119.   case STRSCAN_I64:
  120.   case STRSCAN_U64:
  121.     if (dig > 16) return STRSCAN_ERROR;
  122.     o->u64 = neg ? (uint64_t)-(int64_t)x : x;
  123.     return fmt;
  124.   default:
  125.     break;
  126.   }

  127.   /* Reduce range then convert to double. */
  128.   if ((x & U64x(c0000000,0000000))) { x = (x >> 2) | (x & 3); ex2 += 2; }
  129.   strscan_double(x, o, ex2, neg);
  130.   return fmt;
  131. }

  132. /* Parse octal number. */
  133. static StrScanFmt strscan_oct(const uint8_t *p, TValue *o,
  134.                               StrScanFmt fmt, int32_t neg, uint32_t dig)
  135. {
  136.   uint64_t x = 0;

  137.   /* Scan octal digits. */
  138.   if (dig > 22 || (dig == 22 && *p > '1')) return STRSCAN_ERROR;
  139.   while (dig-- > 0) {
  140.     if (!(*p >= '0' && *p <= '7')) return STRSCAN_ERROR;
  141.     x = (x << 3) + (*p++ & 7);
  142.   }

  143.   /* Format-specific handling. */
  144.   switch (fmt) {
  145.   case STRSCAN_INT:
  146.     if (x >= 0x80000000u+neg) fmt = STRSCAN_U32;
  147.     /* fallthrough */
  148.   case STRSCAN_U32:
  149.     if ((x >> 32)) return STRSCAN_ERROR;
  150.     o->i = neg ? -(int32_t)x : (int32_t)x;
  151.     break;
  152.   default:
  153.   case STRSCAN_I64:
  154.   case STRSCAN_U64:
  155.     o->u64 = neg ? (uint64_t)-(int64_t)x : x;
  156.     break;
  157.   }
  158.   return fmt;
  159. }

  160. /* Parse decimal number. */
  161. static StrScanFmt strscan_dec(const uint8_t *p, TValue *o,
  162.                               StrScanFmt fmt, uint32_t opt,
  163.                               int32_t ex10, int32_t neg, uint32_t dig)
  164. {
  165.   uint8_t xi[STRSCAN_DDIG], *xip = xi;

  166.   if (dig) {
  167.     uint32_t i = dig;
  168.     if (i > STRSCAN_MAXDIG) {
  169.       ex10 += (int32_t)(i - STRSCAN_MAXDIG);
  170.       i = STRSCAN_MAXDIG;
  171.     }
  172.     /* Scan unaligned leading digit. */
  173.     if (((ex10^i) & 1))
  174.       *xip++ = ((*p != '.' ? *p : *++p) & 15), i--, p++;
  175.     /* Scan aligned double-digits. */
  176.     for ( ; i > 1; i -= 2) {
  177.       uint32_t d = 10 * ((*p != '.' ? *p : *++p) & 15); p++;
  178.       *xip++ = d + ((*p != '.' ? *p : *++p) & 15); p++;
  179.     }
  180.     /* Scan and realign trailing digit. */
  181.     if (i) *xip++ = 10 * ((*p != '.' ? *p : *++p) & 15), ex10--, dig++, p++;

  182.     /* Summarize rounding-effect of excess digits. */
  183.     if (dig > STRSCAN_MAXDIG) {
  184.       do {
  185.         if ((*p != '.' ? *p : *++p) != '0') { xip[-1] |= 1; break; }
  186.         p++;
  187.       } while (--dig > STRSCAN_MAXDIG);
  188.       dig = STRSCAN_MAXDIG;
  189.     } else/* Simplify exponent. */
  190.       while (ex10 > 0 && dig <= 18) *xip++ = 0, ex10 -= 2, dig += 2;
  191.     }
  192.   } else/* Only got zeros. */
  193.     ex10 = 0;
  194.     xi[0] = 0;
  195.   }

  196.   /* Fast path for numbers in integer format (but handles e.g. 1e6, too). */
  197.   if (dig <= 20 && ex10 == 0) {
  198.     uint8_t *xis;
  199.     uint64_t x = xi[0];
  200.     double n;
  201.     for (xis = xi+1; xis < xip; xis++) x = x * 100 + *xis;
  202.     if (!(dig == 20 && (xi[0] > 18 || (int64_t)x >= 0))) {  /* No overflow? */
  203.       /* Format-specific handling. */
  204.       switch (fmt) {
  205.       case STRSCAN_INT:
  206.         if (!(opt & STRSCAN_OPT_TONUM) && x < 0x80000000u+neg) {
  207.           o->i = neg ? -(int32_t)x : (int32_t)x;
  208.           return STRSCAN_INT;  /* Fast path for 32 bit integers. */
  209.         }
  210.         if (!(opt & STRSCAN_OPT_C)) { fmt = STRSCAN_NUM; goto plainnumber; }
  211.         /* fallthrough */
  212.       case STRSCAN_U32:
  213.         if ((x >> 32) != 0) return STRSCAN_ERROR;
  214.         o->i = neg ? -(int32_t)x : (int32_t)x;
  215.         return STRSCAN_U32;
  216.       case STRSCAN_I64:
  217.       case STRSCAN_U64:
  218.         o->u64 = neg ? (uint64_t)-(int64_t)x : x;
  219.         return fmt;
  220.       default:
  221.       plainnumber:  /* Fast path for plain numbers < 2^63. */
  222.         if ((int64_t)x < 0) break;
  223.         n = (double)(int64_t)x;
  224.         if (neg) n = -n;
  225.         o->n = n;
  226.         return fmt;
  227.       }
  228.     }
  229.   }

  230.   /* Slow non-integer path. */
  231.   if (fmt == STRSCAN_INT) {
  232.     if ((opt & STRSCAN_OPT_C)) return STRSCAN_ERROR;
  233.     fmt = STRSCAN_NUM;
  234.   } else if (fmt > STRSCAN_INT) {
  235.     return STRSCAN_ERROR;
  236.   }
  237.   {
  238.     uint32_t hi = 0, lo = (uint32_t)(xip-xi);
  239.     int32_t ex2 = 0, idig = (int32_t)lo + (ex10 >> 1);

  240.     lua_assert(lo > 0 && (ex10 & 1) == 0);

  241.     /* Handle simple overflow/underflow. */
  242.     if (idig > 310/2) { if (neg) setminfV(o); else setpinfV(o); return fmt; }
  243.     else if (idig < -326/2) { o->n = neg ? -0.0 : 0.0; return fmt; }

  244.     /* Scale up until we have at least 17 or 18 integer part digits. */
  245.     while (idig < 9 && idig < DLEN(lo, hi)) {
  246.       uint32_t i, cy = 0;
  247.       ex2 -= 6;
  248.       for (i = DPREV(lo); ; i = DPREV(i)) {
  249.         uint32_t d = (xi[i] << 6) + cy;
  250.         cy = (((d >> 2) * 5243) >> 17); d = d - cy * 100/* Div/mod 100. */
  251.         xi[i] = (uint8_t)d;
  252.         if (i == hi) break;
  253.         if (d == 0 && i == DPREV(lo)) lo = i;
  254.       }
  255.       if (cy) {
  256.         hi = DPREV(hi);
  257.         if (xi[DPREV(lo)] == 0) lo = DPREV(lo);
  258.         else if (hi == lo) { lo = DPREV(lo); xi[DPREV(lo)] |= xi[lo]; }
  259.         xi[hi] = (uint8_t)cy; idig++;
  260.       }
  261.     }

  262.     /* Scale down until no more than 17 or 18 integer part digits remain. */
  263.     while (idig > 9) {
  264.       uint32_t i = hi, cy = 0;
  265.       ex2 += 6;
  266.       do {
  267.         cy += xi[i];
  268.         xi[i] = (cy >> 6);
  269.         cy = 100 * (cy & 0x3f);
  270.         if (xi[i] == 0 && i == hi) hi = DNEXT(hi), idig--;
  271.         i = DNEXT(i);
  272.       } while (i != lo);
  273.       while (cy) {
  274.         if (hi == lo) { xi[DPREV(lo)] |= 1; break; }
  275.         xi[lo] = (cy >> 6); lo = DNEXT(lo);
  276.         cy = 100 * (cy & 0x3f);
  277.       }
  278.     }

  279.     /* Collect integer part digits and convert to rescaled double. */
  280.     {
  281.       uint64_t x = xi[hi];
  282.       uint32_t i;
  283.       for (i = DNEXT(hi); --idig > 0 && i != lo; i = DNEXT(i))
  284.         x = x * 100 + xi[i];
  285.       if (i == lo) {
  286.         while (--idig >= 0) x = x * 100;
  287.       } else/* Gather round bit from remaining digits. */
  288.         x <<= 1; ex2--;
  289.         do {
  290.           if (xi[i]) { x |= 1; break; }
  291.           i = DNEXT(i);
  292.         } while (i != lo);
  293.       }
  294.       strscan_double(x, o, ex2, neg);
  295.     }
  296.   }
  297.   return fmt;
  298. }

  299. /* Scan string containing a number. Returns format. Returns value in o. */
  300. StrScanFmt lj_strscan_scan(const uint8_t *p, TValue *o, uint32_t opt)
  301. {
  302.   int32_t neg = 0;

  303.   /* Remove leading space, parse sign and non-numbers. */
  304.   if (LJ_UNLIKELY(!lj_char_isdigit(*p))) {
  305.     while (lj_char_isspace(*p)) p++;
  306.     if (*p == '+' || *p == '-') neg = (*p++ == '-');
  307.     if (LJ_UNLIKELY(*p >= 'A')) {  /* Parse "inf", "infinity" or "nan". */
  308.       TValue tmp;
  309.       setnanV(&tmp);
  310.       if (casecmp(p[0],'i') && casecmp(p[1],'n') && casecmp(p[2],'f')) {
  311.         if (neg) setminfV(&tmp); else setpinfV(&tmp);
  312.         p += 3;
  313.         if (casecmp(p[0],'i') && casecmp(p[1],'n') && casecmp(p[2],'i') &&
  314.             casecmp(p[3],'t') && casecmp(p[4],'y')) p += 5;
  315.       } else if (casecmp(p[0],'n') && casecmp(p[1],'a') && casecmp(p[2],'n')) {
  316.         p += 3;
  317.       }
  318.       while (lj_char_isspace(*p)) p++;
  319.       if (*p) return STRSCAN_ERROR;
  320.       o->u64 = tmp.u64;
  321.       return STRSCAN_NUM;
  322.     }
  323.   }

  324.   /* Parse regular number. */
  325.   {
  326.     StrScanFmt fmt = STRSCAN_INT;
  327.     int cmask = LJ_CHAR_DIGIT;
  328.     int base = (opt & STRSCAN_OPT_C) && *p == '0' ? 0 : 10;
  329.     const uint8_t *sp, *dp = NULL;
  330.     uint32_t dig = 0, hasdig = 0, x = 0;
  331.     int32_t ex = 0;

  332.     /* Determine base and skip leading zeros. */
  333.     if (LJ_UNLIKELY(*p <= '0')) {
  334.       if (*p == '0' && casecmp(p[1], 'x'))
  335.         base = 16, cmask = LJ_CHAR_XDIGIT, p += 2;
  336.       for ( ; ; p++) {
  337.         if (*p == '0') {
  338.           hasdig = 1;
  339.         } else if (*p == '.') {
  340.           if (dp) return STRSCAN_ERROR;
  341.           dp = p;
  342.         } else {
  343.           break;
  344.         }
  345.       }
  346.     }

  347.     /* Preliminary digit and decimal point scan. */
  348.     for (sp = p; ; p++) {
  349.       if (LJ_LIKELY(lj_char_isa(*p, cmask))) {
  350.         x = x * 10 + (*p & 15);  /* For fast path below. */
  351.         dig++;
  352.       } else if (*p == '.') {
  353.         if (dp) return STRSCAN_ERROR;
  354.         dp = p;
  355.       } else {
  356.         break;
  357.       }
  358.     }
  359.     if (!(hasdig | dig)) return STRSCAN_ERROR;

  360.     /* Handle decimal point. */
  361.     if (dp) {
  362.       fmt = STRSCAN_NUM;
  363.       if (dig) {
  364.         ex = (int32_t)(dp-(p-1)); dp = p-1;
  365.         while (ex < 0 && *dp-- == '0') ex++, dig--;  /* Skip trailing zeros. */
  366.         if (base == 16) ex *= 4;
  367.       }
  368.     }

  369.     /* Parse exponent. */
  370.     if (casecmp(*p, (uint32_t)(base == 16 ? 'p' : 'e'))) {
  371.       uint32_t xx;
  372.       int negx = 0;
  373.       fmt = STRSCAN_NUM; p++;
  374.       if (*p == '+' || *p == '-') negx = (*p++ == '-');
  375.       if (!lj_char_isdigit(*p)) return STRSCAN_ERROR;
  376.       xx = (*p++ & 15);
  377.       while (lj_char_isdigit(*p)) {
  378.         if (xx < 65536) xx = xx * 10 + (*p & 15);
  379.         p++;
  380.       }
  381.       ex += negx ? -(int32_t)xx : (int32_t)xx;
  382.     }

  383.     /* Parse suffix. */
  384.     if (*p) {
  385.       /* I (IMAG), U (U32), LL (I64), ULL/LLU (U64), L (long), UL/LU (ulong). */
  386.       /* NYI: f (float). Not needed until cp_number() handles non-integers. */
  387.       if (casecmp(*p, 'i')) {
  388.         if (!(opt & STRSCAN_OPT_IMAG)) return STRSCAN_ERROR;
  389.         p++; fmt = STRSCAN_IMAG;
  390.       } else if (fmt == STRSCAN_INT) {
  391.         if (casecmp(*p, 'u')) p++, fmt = STRSCAN_U32;
  392.         if (casecmp(*p, 'l')) {
  393.           p++;
  394.           if (casecmp(*p, 'l')) p++, fmt += STRSCAN_I64 - STRSCAN_INT;
  395.           else if (!(opt & STRSCAN_OPT_C)) return STRSCAN_ERROR;
  396.           else if (sizeof(long) == 8) fmt += STRSCAN_I64 - STRSCAN_INT;
  397.         }
  398.         if (casecmp(*p, 'u') && (fmt == STRSCAN_INT || fmt == STRSCAN_I64))
  399.           p++, fmt += STRSCAN_U32 - STRSCAN_INT;
  400.         if ((fmt == STRSCAN_U32 && !(opt & STRSCAN_OPT_C)) ||
  401.             (fmt >= STRSCAN_I64 && !(opt & STRSCAN_OPT_LL)))
  402.           return STRSCAN_ERROR;
  403.       }
  404.       while (lj_char_isspace(*p)) p++;
  405.       if (*p) return STRSCAN_ERROR;
  406.     }

  407.     /* Fast path for decimal 32 bit integers. */
  408.     if (fmt == STRSCAN_INT && base == 10 &&
  409.         (dig < 10 || (dig == 10 && *sp <= '2' && x < 0x80000000u+neg))) {
  410.       int32_t y = neg ? -(int32_t)x : (int32_t)x;
  411.       if ((opt & STRSCAN_OPT_TONUM)) {
  412.         o->n = (double)y;
  413.         return STRSCAN_NUM;
  414.       } else {
  415.         o->i = y;
  416.         return STRSCAN_INT;
  417.       }
  418.     }

  419.     /* Dispatch to base-specific parser. */
  420.     if (base == 0 && !(fmt == STRSCAN_NUM || fmt == STRSCAN_IMAG))
  421.       return strscan_oct(sp, o, fmt, neg, dig);
  422.     if (base == 16)
  423.       fmt = strscan_hex(sp, o, fmt, opt, ex, neg, dig);
  424.     else
  425.       fmt = strscan_dec(sp, o, fmt, opt, ex, neg, dig);

  426.     /* Try to convert number to integer, if requested. */
  427.     if (fmt == STRSCAN_NUM && (opt & STRSCAN_OPT_TOINT)) {
  428.       double n = o->n;
  429.       int32_t i = lj_num2int(n);
  430.       if (n == (lua_Number)i) { o->i = i; return STRSCAN_INT; }
  431.     }
  432.     return fmt;
  433.   }
  434. }

  435. int LJ_FASTCALL lj_strscan_num(GCstr *str, TValue *o)
  436. {
  437.   StrScanFmt fmt = lj_strscan_scan((const uint8_t *)strdata(str), o,
  438.                                    STRSCAN_OPT_TONUM);
  439.   lua_assert(fmt == STRSCAN_ERROR || fmt == STRSCAN_NUM);
  440.   return (fmt != STRSCAN_ERROR);
  441. }

  442. #if LJ_DUALNUM
  443. int LJ_FASTCALL lj_strscan_number(GCstr *str, TValue *o)
  444. {
  445.   StrScanFmt fmt = lj_strscan_scan((const uint8_t *)strdata(str), o,
  446.                                    STRSCAN_OPT_TOINT);
  447.   lua_assert(fmt == STRSCAN_ERROR || fmt == STRSCAN_NUM || fmt == STRSCAN_INT);
  448.   if (fmt == STRSCAN_INT) setitype(o, LJ_TISNUM);
  449.   return (fmt != STRSCAN_ERROR);
  450. }
  451. #endif

  452. #undef DNEXT
  453. #undef DPREV
  454. #undef DLEN