src/lj_lex.c - luajit-2.0-src

Global variables defined

Functions defined

Macros defined

Source code

  1. /*
  2. ** Lexical analyzer.
  3. ** Copyright (C) 2005-2015 Mike Pall. See Copyright Notice in luajit.h
  4. **
  5. ** Major portions taken verbatim or adapted from the Lua interpreter.
  6. ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
  7. */

  8. #define lj_lex_c
  9. #define LUA_CORE

  10. #include "lj_obj.h"
  11. #include "lj_gc.h"
  12. #include "lj_err.h"
  13. #include "lj_buf.h"
  14. #include "lj_str.h"
  15. #if LJ_HASFFI
  16. #include "lj_tab.h"
  17. #include "lj_ctype.h"
  18. #include "lj_cdata.h"
  19. #include "lualib.h"
  20. #endif
  21. #include "lj_state.h"
  22. #include "lj_lex.h"
  23. #include "lj_parse.h"
  24. #include "lj_char.h"
  25. #include "lj_strscan.h"
  26. #include "lj_strfmt.h"

  27. /* Lua lexer token names. */
  28. static const char *const tokennames[] = {
  29. #define TKSTR1(name)                #name,
  30. #define TKSTR2(name, sym)        #sym,
  31. TKDEF(TKSTR1, TKSTR2)
  32. #undef TKSTR1
  33. #undef TKSTR2
  34.   NULL
  35. };

  36. /* -- Buffer handling ----------------------------------------------------- */

  /* Pseudo-character signalling the end of the input stream. */
  #define LEX_EOF			(-1)
  /*
  ** True if the current character of lexer state `ls' is a line break.
  ** The argument is parenthesized so any pointer expression may be passed;
  ** note that it is still evaluated twice.
  */
  #define lex_iseol(ls)		((ls)->c == '\n' || (ls)->c == '\r')

  39. /* Get more input from reader. */
  40. static LJ_NOINLINE LexChar lex_more(LexState *ls)
  41. {
  42.   size_t sz;
  43.   const char *p = ls->rfunc(ls->L, ls->rdata, &sz);
  44.   if (p == NULL || sz == 0) return LEX_EOF;
  45.   ls->pe = p + sz;
  46.   ls->p = p + 1;
  47.   return (LexChar)(uint8_t)p[0];
  48. }

  49. /* Get next character. */
  50. static LJ_AINLINE LexChar lex_next(LexState *ls)
  51. {
  52.   return (ls->c = ls->p < ls->pe ? (LexChar)(uint8_t)*ls->p++ : lex_more(ls));
  53. }

  54. /* Save character. */
  55. static LJ_AINLINE void lex_save(LexState *ls, LexChar c)
  56. {
  57.   lj_buf_putb(&ls->sb, c);
  58. }

  59. /* Save previous character and get next character. */
  60. static LJ_AINLINE LexChar lex_savenext(LexState *ls)
  61. {
  62.   lex_save(ls, ls->c);
  63.   return lex_next(ls);
  64. }

  65. /* Skip line break. Handles "\n", "\r", "\r\n" or "\n\r". */
  66. static void lex_newline(LexState *ls)
  67. {
  68.   LexChar old = ls->c;
  69.   lua_assert(lex_iseol(ls));
  70.   lex_next(ls);  /* Skip "\n" or "\r". */
  71.   if (lex_iseol(ls) && ls->c != old) lex_next(ls);  /* Skip "\n\r" or "\r\n". */
  72.   if (++ls->linenumber >= LJ_MAX_LINE)
  73.     lj_lex_error(ls, ls->tok, LJ_ERR_XLINES);
  74. }

  75. /* -- Scanner for terminals ----------------------------------------------- */

  76. /* Parse a number literal. */
  77. static void lex_number(LexState *ls, TValue *tv)
  78. {
  79.   StrScanFmt fmt;
  80.   LexChar c, xp = 'e';
  81.   lua_assert(lj_char_isdigit(ls->c));
  82.   if ((c = ls->c) == '0' && (lex_savenext(ls) | 0x20) == 'x')
  83.     xp = 'p';
  84.   while (lj_char_isident(ls->c) || ls->c == '.' ||
  85.          ((ls->c == '-' || ls->c == '+') && (c | 0x20) == xp)) {
  86.     c = ls->c;
  87.     lex_savenext(ls);
  88.   }
  89.   lex_save(ls, '\0');
  90.   fmt = lj_strscan_scan((const uint8_t *)sbufB(&ls->sb), tv,
  91.           (LJ_DUALNUM ? STRSCAN_OPT_TOINT : STRSCAN_OPT_TONUM) |
  92.           (LJ_HASFFI ? (STRSCAN_OPT_LL|STRSCAN_OPT_IMAG) : 0));
  93.   if (LJ_DUALNUM && fmt == STRSCAN_INT) {
  94.     setitype(tv, LJ_TISNUM);
  95.   } else if (fmt == STRSCAN_NUM) {
  96.     /* Already in correct format. */
  97. #if LJ_HASFFI
  98.   } else if (fmt != STRSCAN_ERROR) {
  99.     lua_State *L = ls->L;
  100.     GCcdata *cd;
  101.     lua_assert(fmt == STRSCAN_I64 || fmt == STRSCAN_U64 || fmt == STRSCAN_IMAG);
  102.     if (!ctype_ctsG(G(L))) {
  103.       ptrdiff_t oldtop = savestack(L, L->top);
  104.       luaopen_ffi(L);  /* Load FFI library on-demand. */
  105.       L->top = restorestack(L, oldtop);
  106.     }
  107.     if (fmt == STRSCAN_IMAG) {
  108.       cd = lj_cdata_new_(L, CTID_COMPLEX_DOUBLE, 2*sizeof(double));
  109.       ((double *)cdataptr(cd))[0] = 0;
  110.       ((double *)cdataptr(cd))[1] = numV(tv);
  111.     } else {
  112.       cd = lj_cdata_new_(L, fmt==STRSCAN_I64 ? CTID_INT64 : CTID_UINT64, 8);
  113.       *(uint64_t *)cdataptr(cd) = tv->u64;
  114.     }
  115.     lj_parse_keepcdata(ls, tv, cd);
  116. #endif
  117.   } else {
  118.     lua_assert(fmt == STRSCAN_ERROR);
  119.     lj_lex_error(ls, TK_number, LJ_ERR_XNUMBER);
  120.   }
  121. }

  122. /* Skip equal signs for "[=...=[" and "]=...=]" and return their count. */
  123. static int lex_skipeq(LexState *ls)
  124. {
  125.   int count = 0;
  126.   LexChar s = ls->c;
  127.   lua_assert(s == '[' || s == ']');
  128.   while (lex_savenext(ls) == '=')
  129.     count++;
  130.   return (ls->c == s) ? count : (-count) - 1;
  131. }

  132. /* Parse a long string or long comment (tv set to NULL). */
  133. static void lex_longstring(LexState *ls, TValue *tv, int sep)
  134. {
  135.   lex_savenext(ls);  /* Skip second '['. */
  136.   if (lex_iseol(ls))  /* Skip initial newline. */
  137.     lex_newline(ls);
  138.   for (;;) {
  139.     switch (ls->c) {
  140.     case LEX_EOF:
  141.       lj_lex_error(ls, TK_eof, tv ? LJ_ERR_XLSTR : LJ_ERR_XLCOM);
  142.       break;
  143.     case ']':
  144.       if (lex_skipeq(ls) == sep) {
  145.         lex_savenext(ls);  /* Skip second ']'. */
  146.         goto endloop;
  147.       }
  148.       break;
  149.     case '\n':
  150.     case '\r':
  151.       lex_save(ls, '\n');
  152.       lex_newline(ls);
  153.       if (!tv) lj_buf_reset(&ls->sb);  /* Don't waste space for comments. */
  154.       break;
  155.     default:
  156.       lex_savenext(ls);
  157.       break;
  158.     }
  159.   } endloop:
  160.   if (tv) {
  161.     GCstr *str = lj_parse_keepstr(ls, sbufB(&ls->sb) + (2 + (MSize)sep),
  162.                                       sbuflen(&ls->sb) - 2*(2 + (MSize)sep));
  163.     setstrV(ls->L, tv, str);
  164.   }
  165. }

  166. /* Parse a string. */
  167. static void lex_string(LexState *ls, TValue *tv)
  168. {
  169.   LexChar delim = ls->c;  /* Delimiter is '\'' or '"'. */
  170.   lex_savenext(ls);
  171.   while (ls->c != delim) {
  172.     switch (ls->c) {
  173.     case LEX_EOF:
  174.       lj_lex_error(ls, TK_eof, LJ_ERR_XSTR);
  175.       continue;
  176.     case '\n':
  177.     case '\r':
  178.       lj_lex_error(ls, TK_string, LJ_ERR_XSTR);
  179.       continue;
  180.     case '\\': {
  181.       LexChar c = lex_next(ls);  /* Skip the '\\'. */
  182.       switch (c) {
  183.       case 'a': c = '\a'; break;
  184.       case 'b': c = '\b'; break;
  185.       case 'f': c = '\f'; break;
  186.       case 'n': c = '\n'; break;
  187.       case 'r': c = '\r'; break;
  188.       case 't': c = '\t'; break;
  189.       case 'v': c = '\v'; break;
  190.       case 'x'/* Hexadecimal escape '\xXX'. */
  191.         c = (lex_next(ls) & 15u) << 4;
  192.         if (!lj_char_isdigit(ls->c)) {
  193.           if (!lj_char_isxdigit(ls->c)) goto err_xesc;
  194.           c += 9 << 4;
  195.         }
  196.         c += (lex_next(ls) & 15u);
  197.         if (!lj_char_isdigit(ls->c)) {
  198.           if (!lj_char_isxdigit(ls->c)) goto err_xesc;
  199.           c += 9;
  200.         }
  201.         break;
  202.       case 'z'/* Skip whitespace. */
  203.         lex_next(ls);
  204.         while (lj_char_isspace(ls->c))
  205.           if (lex_iseol(ls)) lex_newline(ls); else lex_next(ls);
  206.         continue;
  207.       case '\n': case '\r': lex_save(ls, '\n'); lex_newline(ls); continue;
  208.       case '\\': case '\"': case '\'': break;
  209.       case LEX_EOF: continue;
  210.       default:
  211.         if (!lj_char_isdigit(c))
  212.           goto err_xesc;
  213.         c -= '0'/* Decimal escape '\ddd'. */
  214.         if (lj_char_isdigit(lex_next(ls))) {
  215.           c = c*10 + (ls->c - '0');
  216.           if (lj_char_isdigit(lex_next(ls))) {
  217.             c = c*10 + (ls->c - '0');
  218.             if (c > 255) {
  219.             err_xesc:
  220.               lj_lex_error(ls, TK_string, LJ_ERR_XESC);
  221.             }
  222.             lex_next(ls);
  223.           }
  224.         }
  225.         lex_save(ls, c);
  226.         continue;
  227.       }
  228.       lex_save(ls, c);
  229.       lex_next(ls);
  230.       continue;
  231.       }
  232.     default:
  233.       lex_savenext(ls);
  234.       break;
  235.     }
  236.   }
  237.   lex_savenext(ls);  /* Skip trailing delimiter. */
  238.   setstrV(ls->L, tv,
  239.           lj_parse_keepstr(ls, sbufB(&ls->sb)+1, sbuflen(&ls->sb)-2));
  240. }

  241. /* -- Main lexical scanner ------------------------------------------------ */

  242. /* Get next lexical token. */
  243. static LexToken lex_scan(LexState *ls, TValue *tv)
  244. {
  245.   lj_buf_reset(&ls->sb);
  246.   for (;;) {
  247.     if (lj_char_isident(ls->c)) {
  248.       GCstr *s;
  249.       if (lj_char_isdigit(ls->c)) {  /* Numeric literal. */
  250.         lex_number(ls, tv);
  251.         return TK_number;
  252.       }
  253.       /* Identifier or reserved word. */
  254.       do {
  255.         lex_savenext(ls);
  256.       } while (lj_char_isident(ls->c));
  257.       s = lj_parse_keepstr(ls, sbufB(&ls->sb), sbuflen(&ls->sb));
  258.       setstrV(ls->L, tv, s);
  259.       if (s->reserved > 0/* Reserved word? */
  260.         return TK_OFS + s->reserved;
  261.       return TK_name;
  262.     }
  263.     switch (ls->c) {
  264.     case '\n':
  265.     case '\r':
  266.       lex_newline(ls);
  267.       continue;
  268.     case ' ':
  269.     case '\t':
  270.     case '\v':
  271.     case '\f':
  272.       lex_next(ls);
  273.       continue;
  274.     case '-':
  275.       lex_next(ls);
  276.       if (ls->c != '-') return '-';
  277.       lex_next(ls);
  278.       if (ls->c == '[') {  /* Long comment "--[=*[...]=*]". */
  279.         int sep = lex_skipeq(ls);
  280.         lj_buf_reset(&ls->sb);  /* `lex_skipeq' may dirty the buffer */
  281.         if (sep >= 0) {
  282.           lex_longstring(ls, NULL, sep);
  283.           lj_buf_reset(&ls->sb);
  284.           continue;
  285.         }
  286.       }
  287.       /* Short comment "--.*\n". */
  288.       while (!lex_iseol(ls) && ls->c != LEX_EOF)
  289.         lex_next(ls);
  290.       continue;
  291.     case '[': {
  292.       int sep = lex_skipeq(ls);
  293.       if (sep >= 0) {
  294.         lex_longstring(ls, tv, sep);
  295.         return TK_string;
  296.       } else if (sep == -1) {
  297.         return '[';
  298.       } else {
  299.         lj_lex_error(ls, TK_string, LJ_ERR_XLDELIM);
  300.         continue;
  301.       }
  302.       }
  303.     case '=':
  304.       lex_next(ls);
  305.       if (ls->c != '=') return '='; else { lex_next(ls); return TK_eq; }
  306.     case '<':
  307.       lex_next(ls);
  308.       if (ls->c != '=') return '<'; else { lex_next(ls); return TK_le; }
  309.     case '>':
  310.       lex_next(ls);
  311.       if (ls->c != '=') return '>'; else { lex_next(ls); return TK_ge; }
  312.     case '~':
  313.       lex_next(ls);
  314.       if (ls->c != '=') return '~'; else { lex_next(ls); return TK_ne; }
  315.     case ':':
  316.       lex_next(ls);
  317.       if (ls->c != ':') return ':'; else { lex_next(ls); return TK_label; }
  318.     case '"':
  319.     case '\'':
  320.       lex_string(ls, tv);
  321.       return TK_string;
  322.     case '.':
  323.       if (lex_savenext(ls) == '.') {
  324.         lex_next(ls);
  325.         if (ls->c == '.') {
  326.           lex_next(ls);
  327.           return TK_dots;   /* ... */
  328.         }
  329.         return TK_concat;   /* .. */
  330.       } else if (!lj_char_isdigit(ls->c)) {
  331.         return '.';
  332.       } else {
  333.         lex_number(ls, tv);
  334.         return TK_number;
  335.       }
  336.     case LEX_EOF:
  337.       return TK_eof;
  338.     default: {
  339.       LexChar c = ls->c;
  340.       lex_next(ls);
  341.       return c;  /* Single-char tokens (+ - / ...). */
  342.     }
  343.     }
  344.   }
  345. }

  346. /* -- Lexer API ----------------------------------------------------------- */

  347. /* Setup lexer state. */
  348. int lj_lex_setup(lua_State *L, LexState *ls)
  349. {
  350.   int header = 0;
  351.   ls->L = L;
  352.   ls->fs = NULL;
  353.   ls->pe = ls->p = NULL;
  354.   ls->vstack = NULL;
  355.   ls->sizevstack = 0;
  356.   ls->vtop = 0;
  357.   ls->bcstack = NULL;
  358.   ls->sizebcstack = 0;
  359.   ls->tok = 0;
  360.   ls->lookahead = TK_eof;  /* No look-ahead token. */
  361.   ls->linenumber = 1;
  362.   ls->lastline = 1;
  363.   lex_next(ls);  /* Read-ahead first char. */
  364.   if (ls->c == 0xef && ls->p + 2 <= ls->pe && (uint8_t)ls->p[0] == 0xbb &&
  365.       (uint8_t)ls->p[1] == 0xbf) {  /* Skip UTF-8 BOM (if buffered). */
  366.     ls->p += 2;
  367.     lex_next(ls);
  368.     header = 1;
  369.   }
  370.   if (ls->c == '#') {  /* Skip POSIX #! header line. */
  371.     do {
  372.       lex_next(ls);
  373.       if (ls->c == LEX_EOF) return 0;
  374.     } while (!lex_iseol(ls));
  375.     lex_newline(ls);
  376.     header = 1;
  377.   }
  378.   if (ls->c == LUA_SIGNATURE[0]) {  /* Bytecode dump. */
  379.     if (header) {
  380.       /*
  381.       ** Loading bytecode with an extra header is disabled for security
  382.       ** reasons. This may circumvent the usual check for bytecode vs.
  383.       ** Lua code by looking at the first char. Since this is a potential
  384.       ** security violation no attempt is made to echo the chunkname either.
  385.       */
  386.       setstrV(L, L->top++, lj_err_str(L, LJ_ERR_BCBAD));
  387.       lj_err_throw(L, LUA_ERRSYNTAX);
  388.     }
  389.     return 1;
  390.   }
  391.   return 0;
  392. }

  393. /* Cleanup lexer state. */
  394. void lj_lex_cleanup(lua_State *L, LexState *ls)
  395. {
  396.   global_State *g = G(L);
  397.   lj_mem_freevec(g, ls->bcstack, ls->sizebcstack, BCInsLine);
  398.   lj_mem_freevec(g, ls->vstack, ls->sizevstack, VarInfo);
  399.   lj_buf_free(g, &ls->sb);
  400. }

  401. /* Return next lexical token. */
  402. void lj_lex_next(LexState *ls)
  403. {
  404.   ls->lastline = ls->linenumber;
  405.   if (LJ_LIKELY(ls->lookahead == TK_eof)) {  /* No lookahead token? */
  406.     ls->tok = lex_scan(ls, &ls->tokval);  /* Get next token. */
  407.   } else/* Otherwise return lookahead token. */
  408.     ls->tok = ls->lookahead;
  409.     ls->lookahead = TK_eof;
  410.     ls->tokval = ls->lookaheadval;
  411.   }
  412. }

  413. /* Look ahead for the next token. */
  414. LexToken lj_lex_lookahead(LexState *ls)
  415. {
  416.   lua_assert(ls->lookahead == TK_eof);
  417.   ls->lookahead = lex_scan(ls, &ls->lookaheadval);
  418.   return ls->lookahead;
  419. }

  420. /* Convert token to string. */
  421. const char *lj_lex_token2str(LexState *ls, LexToken tok)
  422. {
  423.   if (tok > TK_OFS)
  424.     return tokennames[tok-TK_OFS-1];
  425.   else if (!lj_char_iscntrl(tok))
  426.     return lj_strfmt_pushf(ls->L, "%c", tok);
  427.   else
  428.     return lj_strfmt_pushf(ls->L, "char(%d)", tok);
  429. }

  430. /* Lexer error. */
  431. void lj_lex_error(LexState *ls, LexToken tok, ErrMsg em, ...)
  432. {
  433.   const char *tokstr;
  434.   va_list argp;
  435.   if (tok == 0) {
  436.     tokstr = NULL;
  437.   } else if (tok == TK_name || tok == TK_string || tok == TK_number) {
  438.     lex_save(ls, '\0');
  439.     tokstr = sbufB(&ls->sb);
  440.   } else {
  441.     tokstr = lj_lex_token2str(ls, tok);
  442.   }
  443.   va_start(argp, em);
  444.   lj_err_lex(ls->L, ls->chunkname, tokstr, ls->linenumber, em, argp);
  445.   va_end(argp);
  446. }

  447. /* Initialize strings for reserved words. */
  448. void lj_lex_init(lua_State *L)
  449. {
  450.   uint32_t i;
  451.   for (i = 0; i < TK_RESERVED; i++) {
  452.     GCstr *s = lj_str_newz(L, tokennames[i]);
  453.     fixstring(s);  /* Reserved words are never collected. */
  454.     s->reserved = (uint8_t)(i+1);
  455.   }
  456. }