// stapregex-parse.cxx - systemtap
// Functions defined
// Macros defined
// Source code
#include "util.h"
#include "stapregex-tree.h"
#include "stapregex-parse.h"
#include <cstdlib>
#include <cstring>
#include <string>
using namespace std;
namespace stapregex {
// Map the low three bits of C to the matching octal digit character.
char octCh(unsigned c)
{
  const unsigned low_bits = c & 7u;   // same value as c % 8 for unsigned
  return '0' + low_bits;
}
// Write character code C to O in a C-literal-like escaped form:
//  - quotes, backslash and the common control characters use their
//    two-character backslash escapes;
//  - other printable codes below 256 are written verbatim;
//  - everything else is written as a backslash followed by three
//    octal digits produced by octCh().
void prtCh(std::ostream& o, unsigned c)
{
  const int code = (int)(c);

  // Look for a named escape first.
  const char *escape = NULL;
  switch (code)
    {
    case '\'': escape = "\\'";  break;
    case '"':  escape = "\\\""; break;
    case '\n': escape = "\\n";  break;
    case '\t': escape = "\\t";  break;
    case '\v': escape = "\\v";  break;
    case '\b': escape = "\\b";  break;
    case '\r': escape = "\\r";  break;
    case '\f': escape = "\\f";  break;
    case '\a': escape = "\\a";  break;
    case '\\': escape = "\\\\"; break;
    default:   break;
    }

  if (escape != NULL)
    {
      o << escape;
      return;
    }

  if ((code < 256) && isprint(code))
    {
      o << (char) code;
      return;
    }

  // Octal fallback; octCh() keeps only the low three bits of each argument.
  o << '\\' << octCh(code / 64) << octCh(code / 8) << octCh(code);
}
// Write character C to O in escaped form (see prtCh).
//
// C is converted through unsigned char first.  Passing a plain char
// straight to prtCh's unsigned parameter would sign-extend bytes
// >= 0x80 on platforms where char is signed; prtCh would then hand a
// negative value to isprint() and compute meaningless octal digits.
void print_escaped(std::ostream& o, char c)
{
  prtCh(o, static_cast<unsigned char>(c));
}
// Construct a cursor that is not attached to any input string.
// Both positions are set to ~0 and no lookahead character is buffered.
// do_unescape is given a definite value (false) so that copying a
// default-constructed cursor never reads an indeterminate bool.
cursor::cursor() : input(NULL), do_unescape(false), pos(~0),
  last_pos(~0), finished(false), next_c(0), last_c(0) {}
// Construct a cursor positioned at the start of *INPUT.  DO_UNESCAPE
// selects whether get_unescaped() translates backslash escapes.
// The cursor starts out finished when the input string is empty.
cursor::cursor(const std::string *input, bool do_unescape)
  : input(input), do_unescape(do_unescape), pos(0), last_pos(0), finished(false)
{
  // No lookahead is buffered and nothing has been returned yet.
  last_c = 0;
  next_c = 0;

  finished = !( pos < input->length() );
}
// Consume and return the next character.  A buffered lookahead
// character is used if there is one; otherwise a fresh character is
// fetched with get_unescaped().  Throws regex_error when nothing is
// buffered and the input is exhausted.
char
cursor::next ()
{
  if (next_c == 0)
    {
      if (finished)
        throw regex_error(_("unexpected end of regex"), pos);
      get_unescaped();
    }

  last_c = next_c;
  next_c = 0;   // clearing the lookahead is what advances the cursor
  return last_c;
}
// Return the next character without consuming it.  A character is
// fetched into the lookahead slot only when none is buffered and the
// input is not exhausted; otherwise the current lookahead value
// (possibly 0) is returned.  last_c is updated to the returned value.
char
cursor::peek ()
{
  const bool need_fetch = (next_c == 0) && !finished;
  if (need_fetch)
    get_unescaped();

  // The lookahead stays in place so a later next() returns it again.
  last_c = next_c;
  return last_c;
}
// Return true when at least N characters of the raw input lie at or
// after the current position, i.e. pos + n <= input->length().
//
// The length is compared against N first: the subtraction below is
// unsigned and would wrap around (making the test spuriously true)
// whenever N exceeds the length of the input.
bool
cursor::has (unsigned n)
{
  if (n > input->length())
    return false;
  return ( pos <= input->length() - n );
}
// Fetch one logical character starting at pos into next_c, advance pos
// past it, record the starting position in last_pos, and update
// finished.
//
// When do_unescape is false, or the character is not a backslash, the
// raw character is taken as-is.  Otherwise the escape is decoded:
//   \a \b \t \n \v \f \r   the usual control characters
//   \xHH                   exactly two hex digits
//   \OOO                   exactly three octal digits, first one 0..3
//   \<anything else>       that character itself
// Throws regex_error for a trailing backslash, missing or invalid
// digits, or an octal escape whose first digit is 4..7.
void
cursor::get_unescaped ()
{
  static const char *hex = "0123456789abcdef";
  static const char *oct = "01234567";

  last_pos = pos;
  char c = (*input)[pos];

  if (c != '\\' || !do_unescape)
    {
      next_c = c;
      pos++;
      finished = ( pos >= input->length() );
      return;
    }

  pos++;

  /* Check for improper string end: */
  if (pos >= input->length())
    throw regex_error(_("unexpected end of regex"), pos);

  c = (*input)[pos];

  switch (c)
    {
    case 'a': c = '\a'; break;
    case 'b': c = '\b'; break;
    case 't': c = '\t'; break;
    case 'n': c = '\n'; break;
    case 'v': c = '\v'; break;
    case 'f': c = '\f'; break;
    case 'r': c = '\r'; break;

    case 'x':
      {
        // pos < length holds here, so this subtraction cannot wrap.
        if (input->length() - pos <= 2)
          throw regex_error(_("two hex digits required in escape sequence"), pos);

        const char h1 = (*input)[pos+1];
        const char h2 = (*input)[pos+2];
        // strchr() also matches the terminating NUL of its haystack, so
        // an embedded NUL must be rejected explicitly.  tolower() needs
        // a value representable as unsigned char.
        const char *d1 = h1 ? strchr(hex, tolower((unsigned char) h1)) : NULL;
        const char *d2 = h2 ? strchr(hex, tolower((unsigned char) h2)) : NULL;

        if (!d1 || !d2)
          throw regex_error(_("two hex digits required in escape sequence"), pos + (d1 ? 1 : 2));

        c = (char)((d1-hex) << 4) + (char)(d2-hex);
        pos += 2; // skip two chars more than usual
        break;
      }

    case '4': case '5': case '6': case '7':
      // XXX: no error recovery is attempted here (e.g. skipping the
      // remaining octal digits); the escape is simply rejected.
      throw regex_error(_("octal escape sequence out of range"), pos);

    case '0': case '1': case '2': case '3':
      {
        // pos < length holds here, so this subtraction cannot wrap.
        if (input->length() - pos <= 2)
          throw regex_error(_("three octal digits required in escape sequence"), pos);

        const char o1 = (*input)[pos+1];
        const char o2 = (*input)[pos+2];
        const char *d0 = strchr(oct, (*input)[pos]);   // c is '0'..'3' here
        const char *d1 = o1 ? strchr(oct, o1) : NULL;  // reject embedded NUL
        const char *d2 = o2 ? strchr(oct, o2) : NULL;

        if (!d0 || !d1 || !d2)
          throw regex_error(_("three octal digits required in escape sequence"), pos + (d1 ? 1 : 2));

        c = (char)((d0-oct) << 6) + (char)((d1-oct) << 3) + (char)(d2-oct);
        pos += 2; // skip two chars more than usual
        break;
      }

    default:
      // Any other escaped character stands for itself.
      ;
    }

  next_c = c;
  pos++;
  finished = ( pos >= input->length() );
}
// Parse the whole input string and return the resulting regexp tree.
//
// DO_TAG: when true, the top-level expression is bracketed by a pair
// of tag_op nodes (parse_factor does the same for each parenthesised
// group).  The total number of tags allocated is stored in the
// returned tree's num_tags.
//
// Throws regex_error (via parse_error) if anything is left over after
// the top-level expression, which in practice means an unbalanced ')'.
regexp *
regex_parser::parse (bool do_tag)
{
  cur = cursor(&input, do_unescape);
  num_tags = 0; this->do_tag = do_tag;

  regexp *result = parse_expr ();

  if (do_tag) {
    result = new cat_op(new tag_op(num_tags++), result);
    result = new cat_op(result, new tag_op(num_tags++));
  }

  // Leftover input comes in two forms: raw characters not yet fetched
  // (cur.finished is false), or a lookahead character that was peeked
  // but never consumed.  The latter matters when the leftover ')' is
  // the last character of the input: peeking it already set
  // cur.finished, so checking cur.finished alone would silently accept
  // a trailing unbalanced ')'.
  const bool unread_input = ! cur.finished;
  const char c = cur.peek ();
  if (unread_input || c != 0)
    {
      if (c == ')')
        parse_error (_("unbalanced ')'"), cur.pos);
      else
        // This should not be possible:
        parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur.pos);
    }

  result->num_tags = num_tags;
  return result;
}
// Return true when C is one of the regex metacharacters
//   . [ { ( ) \ * + ? | ^ $
bool
regex_parser::isspecial (char c)
{
  switch (c)
    {
    case '.': case '[': case '{': case '(': case ')':
    case '\\': case '*': case '+': case '?': case '|':
    case '^': case '$':
      return true;
    default:
      return false;
    }
}
void
regex_parser::expect (char expected)
{
char c = 0;
try {
c = cur.next ();
} catch (const regex_error &e) {
parse_error (_F("expected %c, found end of regex", expected));
}
if (c != expected)
parse_error (_F("expected %c, found %c", expected, c));
}
void
regex_parser::parse_error (const string& msg, unsigned pos)
{
throw regex_error(msg, pos);
}
void
regex_parser::parse_error (const string& msg)
{
parse_error (msg, cur.last_pos);
}
// expr ::= term ( '|' term )*
// Parse one or more terms separated by '|' and fold them together,
// left to right, with make_alt().
regexp *
regex_parser::parse_expr ()
{
  regexp *alternatives = parse_term ();

  for (char look = cur.peek (); look == '|'; look = cur.peek ())
    {
      cur.next ();   // consume the '|'
      regexp *rhs = parse_term ();
      alternatives = make_alt (alternatives, rhs);
    }

  return alternatives;
}
// term ::= factor factor*
// Parse factors until end of input, '|' or ')' is seen, chaining them
// left to right with cat_op.
regexp *
regex_parser::parse_term ()
{
  regexp *sequence = parse_factor ();

  for (;;)
    {
      const char look = cur.peek ();
      if (look == 0 || look == '|' || look == ')')
        break;

      regexp *rhs = parse_factor ();
      sequence = new cat_op(sequence, rhs);
    }

  return sequence;
}
// Parse one factor: a base element followed by any number of postfix
// closure operators ('*', '+', '?', '{m}', '{m,}', '{m,n}').
//
// The base element is one of:
//   - nothing (end of input, '|' or ')'): yields a null_op;
//   - '.'            : make_dot();
//   - '[' ... ']'    : a character class (parse_char_range);
//   - '(' expr ')'   : a group, wrapped in tag_ops when do_tag is set;
//   - '^' or '$'     : an anchor_op;
//   - a run of literal characters: str_to_re().
//
// Throws regex_error (via parse_error/expect) on a closure operator
// with nothing to apply to, a closure applied to an anchor, or a
// malformed '{...}' count.
regexp *
regex_parser::parse_factor ()
{
  regexp *result = NULL;
  regexp *old_result = NULL;

  char c = cur.peek ();
  if (! c || c == '|' || c == ')')
    {
      result = new null_op;
      return result;
    }
  else if (c == '*' || c == '+' || c == '?' || c == '{')
    {
      parse_error(_F("unexpected '%c'", c));
    }

  // Consume the metacharacter that introduces the element; a backslash
  // is left for the literal-run branch below to deal with.
  if (isspecial (c) && c != '\\')
    cur.next ();

  if (c == '.')
    {
      result = make_dot ();
    }
  else if (c == '[')
    {
      result = parse_char_range ();
      expect (']');
    }
  else if (c == '(')
    {
      result = parse_expr ();

      if (do_tag) {
        result = new cat_op(new tag_op(num_tags++), result);
        result = new cat_op(result, new tag_op(num_tags++));
      } else {
        // XXX: the group is wrapped in a cat_op here as well --
        // presumably so that checks on the node type (such as the
        // anchor_op test in the closure loop below) behave the same
        // with and without tagging.  TODO confirm.
        result = new cat_op(result, new null_op);
      }

      expect (')');
    }
  else if (c == '^' || c == '$')
    {
      result = new anchor_op(c);
    }
  else // escaped or ordinary character -- not yet swallowed
    {
      string accumulate;
      char d = 0;

      while (c && ( ! isspecial (c) || c == '\\' ))
        {
          if (c == '\\')
            {
              cur.next ();
              c = cur.peek ();
            }

          cur.next ();

          // A closure operator binds only to the character right
          // before it, so that character is split off from the run.
          d = cur.peek ();
          if (d == '*' || d == '+' || d == '?' || d == '{')
            {
              // d now holds the character the closure applies to.
              d = c; break;
            }

          accumulate.push_back (c);
          c = d; d = 0;
        }

      result = str_to_re (accumulate);

      // Set the run aside and make the split-off character the operand
      // of the closure loop below; the two are re-joined at the end.
      if (d != 0) {
        old_result = result; result = str_to_re (string(1,d));
      }
    }

  /* parse closures or other postfix operators */
  c = cur.peek ();
  while (c == '*' || c == '+' || c == '?' || c == '{')
    {
      cur.next ();

      /* closure-type operators applied to $^ are definitely not kosher */
      if (result->type_of() == "anchor_op")
        {
          parse_error(_F("postfix closure '%c' applied to anchoring operator", c));
        }

      if (c == '*')
        {
          result = make_alt (new close_op(result), new null_op);
        }
      else if (c == '+')
        {
          result = new close_op(result);
        }
      else if (c == '?')
        {
          result = make_alt (result, new null_op);
        }
      else if (c == '{')
        {
          int minsize = parse_number ();
          int maxsize = -1;   // -1 stands for "no upper bound"

          c = cur.next ();
          if (c == ',')
            {
              c = cur.peek ();
              if (c == '}')
                {
                  cur.next ();
                  maxsize = -1;
                }
              else if (isdigit (c))
                {
                  maxsize = parse_number ();
                  expect ('}');
                }
              else
                parse_error(_("expected '}' or number"), cur.pos);
            }
          else if (c == '}')
            {
              maxsize = minsize;
            }
          else
            parse_error(_("expected ',' or '}'"));

          /* optimize {0,0}, {0,} and {1,} */
          if (!do_tag && minsize == 0 && maxsize == 0)
            {
              // XXX: this shortcut is only taken when tagging is
              // disabled; the operand is discarded outright.
              delete result;
              result = new null_op;
            }
          else if (minsize == 0 && maxsize == -1)
            {
              result = make_alt (new close_op(result), new null_op);
            }
          else if (minsize == 1 && maxsize == -1)
            {
              result = new close_op(result);
            }
          else
            {
              result = new closev_op(result, minsize, maxsize);
            }
        }

      c = cur.peek ();
    }

  // Re-attach the literal run that preceded the closure operand.
  if (old_result)
    result = new cat_op(old_result, result);

  return result;
}
// Parse the inside of a bracket expression; the opening '[' has
// already been consumed and the closing ']' is left for the caller.
// Pieces returned by stapregex_getrange() are merged with
// range_union(); a leading '^' inverts the final set.  Reports a parse
// error if the input runs out first.
regexp *
regex_parser::parse_char_range ()
{
  range *acc = NULL;
  bool negated = false;

  if (cur.peek () == '^')
    {
      negated = true;
      cur.next ();
      // Fetch the lookahead now: this updates cur.finished, which the
      // loop below tests before reading anything else.
      (void) cur.peek ();
    }

  do
    {
      if (cur.finished) parse_error(_("unclosed character class"));

      range *piece = stapregex_getrange (cur);
      range *merged = piece;
      if (acc != NULL)
        merged = range_union(acc, piece);

      // Release whatever did not end up being the merged result.
      delete acc;
      if (merged != piece)
        delete piece;
      acc = merged;
    }
  while (cur.peek () != ']');

  if (negated)
    {
      range *flipped = range_invert(acc);
      delete acc;
      acc = flipped;
    }

  if (acc != NULL)
    return new match_op(acc);
  return new null_op;
}
// Parse a run of decimal digits at the cursor and return its value.
//
// Reports a parse error (throws regex_error) when no digit is present,
// when the value is out of range for strtol(), or when it is
// MAX_DFA_REPETITIONS or larger.
unsigned
regex_parser::parse_number ()
{
  string digits;

  char c = cur.peek ();
  // isdigit() needs a value representable as unsigned char.
  while (c && isdigit ((unsigned char) c))
    {
      cur.next ();
      digits.push_back (c);
      c = cur.peek ();
    }

  if (digits == "") parse_error(_("expected number"), cur.pos);

  char *endptr = NULL;
  // strtol() only sets errno on failure, so clear it first; otherwise
  // a stale ERANGE from earlier code would be mistaken for overflow.
  errno = 0;
  // Keep the full long result: narrowing to int before the limit check
  // could wrap a huge count into a small, seemingly valid one.
  const long val = strtol (digits.c_str (), &endptr, 10);

  if (*endptr != '\0' || errno == ERANGE) // paranoid error checking
    parse_error(_F("could not parse number %s", digits.c_str()), cur.pos);
#define MAX_DFA_REPETITIONS 12345
  // XXX: the limit above is an arbitrary cap on repetition counts.
  if (val >= MAX_DFA_REPETITIONS)
    parse_error(_F("%s is too large", digits.c_str()), cur.pos);

  // 0 <= val < MAX_DFA_REPETITIONS here, so the conversion is exact.
  return static_cast<unsigned>(val);
}
// Table of named character classes, filled in on the first call to
// named_char_class().  Some entries are stored under two names (a
// one-letter alias and the full name) that share a single range object.
std::map<std::string, range *> named_char_classes;

// Return a newly allocated copy of the range registered under NAME.
// Throws regex_error when NAME is not a known character class.
range *
named_char_class (const string& name)
{
  if (named_char_classes.empty())
    {
      // original source for these is http://www.regular-expressions.info/posixbrackets.html
      // also checked against (intended to match) the c stdlib isFOO() chr class functions
      named_char_classes["alpha"] = new range("A-Za-z");
      named_char_classes["alnum"] = new range("A-Za-z0-9");
      named_char_classes["blank"] = new range(" \t");
      named_char_classes["cntrl"] = new range("\x01-\x1F\x7F");
      // XXX: NUL (0x00) is not part of [:cntrl:] as defined above.
      named_char_classes["d"] = named_char_classes["digit"] = new range("0-9");
      named_char_classes["xdigit"] = new range("0-9a-fA-F");
      named_char_classes["graph"] = new range("\x21-\x7E");
      named_char_classes["l"] = named_char_classes["lower"] = new range("a-z");
      named_char_classes["print"] = new range("\x20-\x7E");
      named_char_classes["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~-");
      named_char_classes["s"] = named_char_classes["space"] = new range(" \t\r\n\v\f");
      named_char_classes["u"] = named_char_classes["upper"] = new range("A-Z");
    }

  // One lookup serves both the existence check and the fetch.
  std::map<std::string, range *>::const_iterator entry
    = named_char_classes.find(name);
  if (entry == named_char_classes.end())
    {
      // XXX: no input position is available here, so none is reported.
      throw regex_error (_F("unknown character class '%s'", name.c_str()));
    }

  return new range(*entry->second);
}
// Read one element of a bracket expression from CUR and return it as
// a newly allocated range:
//   - "[:name:]"  : a copy of the named class (named_char_class);
//   - "\c"        : the character following the backslash;
//   - "a-b"       : the span from a to b (error if b < a);
//   - otherwise   : the single character.
// A '-' directly followed by ']' is not treated as a span operator.
// Throws regex_error on an unterminated "[:name" or an inverted span.
range *
stapregex_getrange (cursor& cur)
{
  char lb = cur.peek ();

  if (lb == '[')
    {
      cur.next ();
      if (cur.peek () == ':')
        {
          // Named class: collect everything up to the closing ":]".
          cur.next ();
          string charclass;
          for (char ch = cur.peek (); ; ch = cur.peek ())
            {
              if (cur.finished)
                throw regex_error (_F("unclosed character class '[:%s'", charclass.c_str()), cur.pos);

              if (cur.has(2) && ch == ':' && (*cur.input)[cur.pos] == ']')
                {
                  cur.next ();
                  cur.next ();
                  return named_char_class(charclass);
                }

              charclass.push_back(ch);
              cur.next();
            }
        }
      // Not a named class: lb stays '[' and is handled as a plain
      // character below.
    }
  else if (lb == '\\')
    {
      // Take the character after the backslash as the lower bound.
      cur.next ();
      lb = cur.peek ();
      cur.next ();
    }
  else
    {
      cur.next ();
    }

  char ub = lb;

  // The raw-input check on cur.pos looks one character past the
  // peeked '-' to see whether the bracket expression ends there.
  const bool is_span = cur.has(2)
                       && cur.peek () == '-'
                       && (*cur.input)[cur.pos] != ']';
  if (is_span)
    {
      cur.next ();   // consume the '-'
      ub = cur.peek ();

      if (ub < lb)
        throw regex_error (_F("Inverted character range %c-%c", lb, ub), cur.pos);

      cur.next ();
    }

  return new range(lb, ub);
}
};