ovn/lib/lex.c

   1 /*
   2  * Copyright (c) 2015 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "lex.h"
  19 #include <ctype.h>
  20 #include <errno.h>
  21 #include <stdarg.h>
  22 #include "dynamic-string.h"
  23 #include "json.h"
  24 #include "util.h"
  25 \f
  26 /* Returns a string that represents 'format'. */
  27 const char *
  28 lex_format_to_string(enum lex_format format)
  29 {
  30     switch (format) {
  31     case LEX_F_DECIMAL:
  32         return "decimal";
  33     case LEX_F_HEXADECIMAL:
  34         return "hexadecimal";
  35     case LEX_F_IPV4:
  36         return "IPv4";
  37     case LEX_F_IPV6:
  38         return "IPv6";
  39     case LEX_F_ETHERNET:
  40         return "Ethernet";
  41     default:
  42         abort();
  43     }
  44 }
  45 \f
  46 /* Initializes 'token'. */
  47 void
  48 lex_token_init(struct lex_token *token)
  49 {
  50     token->type = LEX_T_END;
  51     token->s = NULL;
  52 }
  53
  54 /* Frees memory owned by 'token'. */
  55 void
  56 lex_token_destroy(struct lex_token *token)
  57 {
  58     free(token->s);
  59 }
  60
  61 /* Exchanges 'a' and 'b'. */
  62 void
  63 lex_token_swap(struct lex_token *a, struct lex_token *b)
  64 {
  65     struct lex_token tmp = *a;
  66     *a = *b;
  67     *b = tmp;
  68 }
  69 \f
  70 /* lex_token_format(). */
  71
  72 static size_t
  73 lex_token_n_zeros(enum lex_format format)
  74 {
  75     switch (format) {
  76     case LEX_F_DECIMAL:     return offsetof(union mf_subvalue, integer);
  77     case LEX_F_HEXADECIMAL: return 0;
  78     case LEX_F_IPV4:        return offsetof(union mf_subvalue, ipv4);
  79     case LEX_F_IPV6:        return offsetof(union mf_subvalue, ipv6);
  80     case LEX_F_ETHERNET:    return offsetof(union mf_subvalue, mac);
  81     default: OVS_NOT_REACHED();
  82     }
  83 }
  84
  85 /* Returns the effective format for 'token', that is, the format in which it
  86  * should actually be printed.  This is ordinarily the same as 'token->format',
  87  * but it's always possible that someone sets up a token with a format that
  88  * won't work for a value, e.g. 'token->value' is wider than 32 bits but the
  89  * format is LEX_F_IPV4.  (The lexer itself won't do that; this is an attempt
  90  * to avoid confusion in the future.) */
  91 static enum lex_format
  92 lex_token_get_format(const struct lex_token *token)
  93 {
  94     size_t n_zeros = lex_token_n_zeros(token->format);
  95     return (is_all_zeros(&token->value, n_zeros)
  96             && (token->type != LEX_T_MASKED_INTEGER
  97                 || is_all_zeros(&token->mask, n_zeros))
  98             ? token->format
  99             : LEX_F_HEXADECIMAL);
 100 }
 101
 102 static void
 103 lex_token_format_value(const union mf_subvalue *value,
 104                        enum lex_format format, struct ds *s)
 105 {
 106     switch (format) {
 107     case LEX_F_DECIMAL:
 108         ds_put_format(s, "%"PRIu64, ntohll(value->integer));
 109         break;
 110
 111     case LEX_F_HEXADECIMAL:
 112         mf_format_subvalue(value, s);
 113         break;
 114
 115     case LEX_F_IPV4:
 116         ds_put_format(s, IP_FMT, IP_ARGS(value->ipv4));
 117         break;
 118
 119     case LEX_F_IPV6:
 120         print_ipv6_addr(s, &value->ipv6);
 121         break;
 122
 123     case LEX_F_ETHERNET:
 124         ds_put_format(s, ETH_ADDR_FMT, ETH_ADDR_ARGS(value->mac));
 125         break;
 126
 127     default:
 128         OVS_NOT_REACHED();
 129     }
 130
 131 }
 132
 133 static void
 134 lex_token_format_masked_integer(const struct lex_token *token, struct ds *s)
 135 {
 136     enum lex_format format = lex_token_get_format(token);
 137
 138     lex_token_format_value(&token->value, format, s);
 139     ds_put_char(s, '/');
 140
 141     const union mf_subvalue *mask = &token->mask;
 142     if (format == LEX_F_IPV4 && ip_is_cidr(mask->ipv4)) {
 143         ds_put_format(s, "%d", ip_count_cidr_bits(mask->ipv4));
 144     } else if (token->format == LEX_F_IPV6 && ipv6_is_cidr(&mask->ipv6)) {
 145         ds_put_format(s, "%d", ipv6_count_cidr_bits(&mask->ipv6));
 146     } else {
 147         lex_token_format_value(&token->mask, format, s);
 148     }
 149 }
 150
 151 /* Appends a string representation of 'token' to 's', in a format that can be
 152  * losslessly parsed back by the lexer.  (LEX_T_END and LEX_T_ERROR can't be
 153  * parsed back.) */
 154 void
 155 lex_token_format(const struct lex_token *token, struct ds *s)
 156 {
 157     switch (token->type) {
 158     case LEX_T_END:
 159         ds_put_cstr(s, "$");
 160         break;
 161
 162     case LEX_T_ID:
 163         ds_put_cstr(s, token->s);
 164         break;
 165
 166     case LEX_T_ERROR:
 167         ds_put_cstr(s, "error(");
 168         json_string_escape(token->s, s);
 169         ds_put_char(s, ')');
 170         break;
 171
 172     case LEX_T_STRING:
 173         json_string_escape(token->s, s);
 174         break;
 175
 176     case LEX_T_INTEGER:
 177         lex_token_format_value(&token->value, lex_token_get_format(token), s);
 178         break;
 179
 180     case LEX_T_MASKED_INTEGER:
 181         lex_token_format_masked_integer(token, s);
 182         break;
 183
 184     case LEX_T_LPAREN:
 185         ds_put_cstr(s, "(");
 186         break;
 187     case LEX_T_RPAREN:
 188         ds_put_cstr(s, ")");
 189         break;
 190     case LEX_T_LCURLY:
 191         ds_put_cstr(s, "{");
 192         break;
 193     case LEX_T_RCURLY:
 194         ds_put_cstr(s, "}");
 195         break;
 196     case LEX_T_LSQUARE:
 197         ds_put_cstr(s, "[");
 198         break;
 199     case LEX_T_RSQUARE:
 200         ds_put_cstr(s, "]");
 201         break;
 202     case LEX_T_EQ:
 203         ds_put_cstr(s, "==");
 204         break;
 205     case LEX_T_NE:
 206         ds_put_cstr(s, "!=");
 207         break;
 208     case LEX_T_LT:
 209         ds_put_cstr(s, "<");
 210         break;
 211     case LEX_T_LE:
 212         ds_put_cstr(s, "<=");
 213         break;
 214     case LEX_T_GT:
 215         ds_put_cstr(s, ">");
 216         break;
 217     case LEX_T_GE:
 218         ds_put_cstr(s, ">=");
 219         break;
 220     case LEX_T_LOG_NOT:
 221         ds_put_cstr(s, "!");
 222         break;
 223     case LEX_T_LOG_AND:
 224         ds_put_cstr(s, "&&");
 225         break;
 226     case LEX_T_LOG_OR:
 227         ds_put_cstr(s, "||");
 228         break;
 229     case LEX_T_ELLIPSIS:
 230         ds_put_cstr(s, "..");
 231         break;
 232     case LEX_T_COMMA:
 233         ds_put_cstr(s, ",");
 234         break;
 235     case LEX_T_SEMICOLON:
 236         ds_put_cstr(s, ";");
 237         break;
 238     case LEX_T_EQUALS:
 239         ds_put_cstr(s, "=");
 240         break;
 241     case LEX_T_EXCHANGE:
 242         ds_put_cstr(s, "<->");
 243         break;
 244     default:
 245         OVS_NOT_REACHED();
 246     }
 247
 248 }
 249 \f
 250 /* lex_token_parse(). */
 251
 252 static void OVS_PRINTF_FORMAT(2, 3)
 253 lex_error(struct lex_token *token, const char *message, ...)
 254 {
 255     ovs_assert(!token->s);
 256     token->type = LEX_T_ERROR;
 257
 258     va_list args;
 259     va_start(args, message);
 260     token->s = xvasprintf(message, args);
 261     va_end(args);
 262 }
 263
 264 static void
 265 lex_parse_hex_integer(const char *start, size_t len, struct lex_token *token)
 266 {
 267     const char *in = start + (len - 1);
 268     uint8_t *out = token->value.u8 + (sizeof token->value.u8 - 1);
 269
 270     for (int i = 0; i < len; i++) {
 271         int hexit = hexit_value(in[-i]);
 272         if (hexit < 0) {
 273             lex_error(token, "Invalid syntax in hexadecimal constant.");
 274             return;
 275         }
 276         if (hexit && i / 2 >= sizeof token->value.u8) {
 277             lex_error(token, "Hexadecimal constant requires more than "
 278                       "%"PRIuSIZE" bits.", 8 * sizeof token->value.u8);
 279             return;
 280         }
 281         out[-(i / 2)] |= i % 2 ? hexit << 4 : hexit;
 282     }
 283     token->format = LEX_F_HEXADECIMAL;
 284 }
 285
 286 static const char *
 287 lex_parse_integer__(const char *p, struct lex_token *token)
 288 {
 289     lex_token_init(token);
 290     token->type = LEX_T_INTEGER;
 291     memset(&token->value, 0, sizeof token->value);
 292     const char *start = p;
 293     const char *end = start;
 294     while (isalnum((unsigned char) *end) || *end == ':'
 295            || (*end == '.' && end[1] != '.')) {
 296         end++;
 297     }
 298     size_t len = end - start;
 299
 300     int n;
 301     struct eth_addr mac;
 302
 303     if (!len) {
 304         lex_error(token, "Integer constant expected.");
 305     } else if (len == 17
 306                && ovs_scan(start, ETH_ADDR_SCAN_FMT"%n",
 307                            ETH_ADDR_SCAN_ARGS(mac), &n)
 308                && n == len) {
 309         token->value.mac = mac;
 310         token->format = LEX_F_ETHERNET;
 311     } else if (start + strspn(start, "0123456789") == end) {
 312         if (p[0] == '0' && len > 1) {
 313             lex_error(token, "Decimal constants must not have leading zeros.");
 314         } else {
 315             unsigned long long int integer;
 316             char *tail;
 317
 318             errno = 0;
 319             integer = strtoull(p, &tail, 10);
 320             if (tail != end || errno == ERANGE) {
 321                 lex_error(token, "Decimal constants must be less than 2**64.");
 322             } else {
 323                 token->value.integer = htonll(integer);
 324                 token->format = LEX_F_DECIMAL;
 325             }
 326         }
 327     } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
 328         if (len > 2) {
 329             lex_parse_hex_integer(start + 2, len - 2, token);
 330         } else {
 331             lex_error(token, "Hex digits expected following 0%c.", p[1]);
 332         }
 333     } else if (len < INET6_ADDRSTRLEN) {
 334         char copy[INET6_ADDRSTRLEN];
 335         memcpy(copy, p, len);
 336         copy[len] = '\0';
 337
 338         struct in_addr ipv4;
 339         struct in6_addr ipv6;
 340         if (inet_pton(AF_INET, copy, &ipv4) == 1) {
 341             token->value.ipv4 = ipv4.s_addr;
 342             token->format = LEX_F_IPV4;
 343         } else if (inet_pton(AF_INET6, copy, &ipv6) == 1) {
 344             token->value.ipv6 = ipv6;
 345             token->format = LEX_F_IPV6;
 346         } else {
 347             lex_error(token, "Invalid numeric constant.");
 348         }
 349     } else {
 350         lex_error(token, "Invalid numeric constant.");
 351     }
 352
 353     ovs_assert(token->type == LEX_T_INTEGER || token->type == LEX_T_ERROR);
 354     return end;
 355 }
 356
 357 static const char *
 358 lex_parse_mask(const char *p, struct lex_token *token)
 359 {
 360     struct lex_token mask;
 361
 362     /* Parse just past the '/' as a second integer.  Handle errors. */
 363     p = lex_parse_integer__(p + 1, &mask);
 364     if (mask.type == LEX_T_ERROR) {
 365         lex_token_swap(&mask, token);
 366         lex_token_destroy(&mask);
 367         return p;
 368     }
 369     ovs_assert(mask.type == LEX_T_INTEGER);
 370
 371     /* Now convert the value and mask into a masked integer token.
 372      * We have a few special cases. */
 373     token->type = LEX_T_MASKED_INTEGER;
 374     memset(&token->mask, 0, sizeof token->mask);
 375     uint32_t prefix_bits = ntohll(mask.value.integer);
 376     if (token->format == mask.format) {
 377         /* Same format value and mask is always OK. */
 378         token->mask = mask.value;
 379     } else if (token->format == LEX_F_IPV4
 380                && mask.format == LEX_F_DECIMAL
 381                && prefix_bits <= 32) {
 382         /* IPv4 address with decimal mask is a CIDR prefix. */
 383         token->mask.integer = htonll(ntohl(be32_prefix_mask(prefix_bits)));
 384     } else if (token->format == LEX_F_IPV6
 385                && mask.format == LEX_F_DECIMAL
 386                && prefix_bits <= 128) {
 387         /* IPv6 address with decimal mask is a CIDR prefix. */
 388         token->mask.ipv6 = ipv6_create_mask(prefix_bits);
 389     } else if (token->format == LEX_F_DECIMAL
 390                && mask.format == LEX_F_HEXADECIMAL
 391                && token->value.integer == 0) {
 392         /* Special case for e.g. 0/0x1234. */
 393         token->format = LEX_F_HEXADECIMAL;
 394         token->mask = mask.value;
 395     } else {
 396         lex_error(token, "Value and mask have incompatible formats.");
 397         return p;
 398     }
 399
 400     /* Check invariant that a 1-bit in the value corresponds to a 1-bit in the
 401      * mask. */
 402     for (int i = 0; i < ARRAY_SIZE(token->mask.be32); i++) {
 403         ovs_be32 v = token->value.be32[i];
 404         ovs_be32 m = token->mask.be32[i];
 405
 406         if (v & ~m) {
 407             lex_error(token, "Value contains unmasked 1-bits.");
 408             break;
 409         }
 410     }
 411
 412     /* Done! */
 413     lex_token_destroy(&mask);
 414     return p;
 415 }
 416
 417 static const char *
 418 lex_parse_integer(const char *p, struct lex_token *token)
 419 {
 420     p = lex_parse_integer__(p, token);
 421     if (token->type == LEX_T_INTEGER && *p == '/') {
 422         p = lex_parse_mask(p, token);
 423     }
 424     return p;
 425 }
 426
 427 static const char *
 428 lex_parse_string(const char *p, struct lex_token *token)
 429 {
 430     const char *start = ++p;
 431     for (;;) {
 432         switch (*p) {
 433         case '\0':
 434             lex_error(token, "Input ends inside quoted string.");
 435             return p;
 436
 437         case '"':
 438             token->type = (json_string_unescape(start, p - start, &token->s)
 439                            ? LEX_T_STRING : LEX_T_ERROR);
 440             return p + 1;
 441
 442         case '\\':
 443             p++;
 444             if (*p) {
 445                 p++;
 446             }
 447             break;
 448
 449         default:
 450             p++;
 451             break;
 452         }
 453     }
 454 }
 455
 456 static bool
 457 lex_is_id1(unsigned char c)
 458 {
 459     return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
 460             || c == '_' || c == '.');
 461 }
 462
 463 static bool
 464 lex_is_idn(unsigned char c)
 465 {
 466     return lex_is_id1(c) || (c >= '0' && c <= '9');
 467 }
 468
 469 static const char *
 470 lex_parse_id(const char *p, struct lex_token *token)
 471 {
 472     const char *start = p;
 473
 474     do {
 475         p++;
 476     } while (lex_is_idn(*p));
 477
 478     token->type = LEX_T_ID;
 479     token->s = xmemdup0(start, p - start);
 480     return p;
 481 }
 482
 483 /* Initializes 'token' and parses the first token from the beginning of
 484  * null-terminated string 'p' into 'token'.  Stores a pointer to the start of
 485  * the token (after skipping white space and comments, if any) into '*startp'.
 486  * Returns the character position at which to begin parsing the next token. */
 487 const char *
 488 lex_token_parse(struct lex_token *token, const char *p, const char **startp)
 489 {
 490     lex_token_init(token);
 491
 492 next:
 493     *startp = p;
 494     switch (*p) {
 495     case '\0':
 496         token->type = LEX_T_END;
 497         return p;
 498
 499     case ' ': case '\t': case '\n': case '\r':
 500         p++;
 501         goto next;
 502
 503     case '/':
 504         p++;
 505         if (*p == '/') {
 506             do {
 507                 p++;
 508             } while (*p != '\0' && *p != '\n');
 509             goto next;
 510         } else if (*p == '*') {
 511             p++;
 512             for (;;) {
 513                 if (*p == '*' && p[1] == '/') {
 514                     p += 2;
 515                     goto next;
 516                 } else if (*p == '\0' || *p == '\n') {
 517                     lex_error(token, "`/*' without matching `*/'.");
 518                     return p;
 519                 } else {
 520                     p++;
 521                 }
 522             }
 523             goto next;
 524         } else {
 525             lex_error(token,
 526                       "`/' is only valid as part of `//' or `/*'.");
 527         }
 528         break;
 529
 530     case '(':
 531         token->type = LEX_T_LPAREN;
 532         p++;
 533         break;
 534
 535     case ')':
 536         token->type = LEX_T_RPAREN;
 537         p++;
 538         break;
 539
 540     case '{':
 541         token->type = LEX_T_LCURLY;
 542         p++;
 543         break;
 544
 545     case '}':
 546         token->type = LEX_T_RCURLY;
 547         p++;
 548         break;
 549
 550     case '[':
 551         token->type = LEX_T_LSQUARE;
 552         p++;
 553         break;
 554
 555     case ']':
 556         token->type = LEX_T_RSQUARE;
 557         p++;
 558         break;
 559
 560     case '=':
 561         p++;
 562         if (*p == '=') {
 563             token->type = LEX_T_EQ;
 564             p++;
 565         } else {
 566             token->type = LEX_T_EQUALS;
 567         }
 568         break;
 569
 570     case '!':
 571         p++;
 572         if (*p == '=') {
 573             token->type = LEX_T_NE;
 574             p++;
 575         } else {
 576             token->type = LEX_T_LOG_NOT;
 577         }
 578         break;
 579
 580     case '&':
 581         p++;
 582         if (*p == '&') {
 583             token->type = LEX_T_LOG_AND;
 584             p++;
 585         } else {
 586             lex_error(token, "`&' is only valid as part of `&&'.");
 587         }
 588         break;
 589
 590     case '|':
 591         p++;
 592         if (*p == '|') {
 593             token->type = LEX_T_LOG_OR;
 594             p++;
 595         } else {
 596             lex_error(token, "`|' is only valid as part of `||'.");
 597         }
 598         break;
 599
 600     case '<':
 601         p++;
 602         if (*p == '=') {
 603             token->type = LEX_T_LE;
 604             p++;
 605         } else if (*p == '-' && p[1] == '>') {
 606             token->type = LEX_T_EXCHANGE;
 607             p += 2;
 608         } else {
 609             token->type = LEX_T_LT;
 610         }
 611         break;
 612
 613     case '>':
 614         p++;
 615         if (*p == '=') {
 616             token->type = LEX_T_GE;
 617             p++;
 618         } else {
 619             token->type = LEX_T_GT;
 620         }
 621         break;
 622
 623     case '.':
 624         p++;
 625         if (*p == '.') {
 626             token->type = LEX_T_ELLIPSIS;
 627             p++;
 628         } else {
 629             lex_error(token, "`.' is only valid as part of `..' or a number.");
 630         }
 631         break;
 632
 633     case ',':
 634         p++;
 635         token->type = LEX_T_COMMA;
 636         break;
 637
 638     case ';':
 639         p++;
 640         token->type = LEX_T_SEMICOLON;
 641         break;
 642
 643     case '0': case '1': case '2': case '3': case '4':
 644     case '5': case '6': case '7': case '8': case '9':
 645     case ':':
 646         p = lex_parse_integer(p, token);
 647         break;
 648
 649     case '"':
 650         p = lex_parse_string(p, token);
 651         break;
 652
 653     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 654     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 655         /* We need to distinguish an Ethernet address or IPv6 address from an
 656          * identifier.  Fortunately, Ethernet addresses and IPv6 addresses that
 657          * are ambiguous based on the first character, always start with hex
 658          * digits followed by a colon, but identifiers never do. */
 659         p = (p[strspn(p, "0123456789abcdefABCDEF")] == ':'
 660              ? lex_parse_integer(p, token)
 661              : lex_parse_id(p, token));
 662         break;
 663
 664     default:
 665         if (lex_is_id1(*p)) {
 666             p = lex_parse_id(p, token);
 667         } else {
 668             if (isprint((unsigned char) *p)) {
 669                 lex_error(token, "Invalid character `%c' in input.", *p);
 670             } else {
 671                 lex_error(token, "Invalid byte 0x%d in input.", *p);
 672             }
 673             p++;
 674         }
 675         break;
 676     }
 677
 678     return p;
 679 }
 680 \f
 681 /* Initializes 'lexer' for parsing 'input'.
 682  *
 683  * While the lexer is in use, 'input' must remain available, but the caller
 684  * otherwise retains ownership of 'input'.
 685  *
 686  * The caller must call lexer_get() to obtain the first token. */
 687 void
 688 lexer_init(struct lexer *lexer, const char *input)
 689 {
 690     lexer->input = input;
 691     lexer->start = NULL;
 692     lex_token_init(&lexer->token);
 693 }
 694
 695 /* Frees storage associated with 'lexer'. */
 696 void
 697 lexer_destroy(struct lexer *lexer)
 698 {
 699     lex_token_destroy(&lexer->token);
 700 }
 701
 702 /* Obtains the next token from 'lexer' into 'lexer->token', and returns the
 703  * token's type.  The caller may examine 'lexer->token' directly to obtain full
 704  * information about the token. */
 705 enum lex_type
 706 lexer_get(struct lexer *lexer)
 707 {
 708     lex_token_destroy(&lexer->token);
 709     lexer->input = lex_token_parse(&lexer->token, lexer->input, &lexer->start);
 710     return lexer->token.type;
 711 }
 712
 713 /* Returns the type of the next token that will be fetched by lexer_get(),
 714  * without advancing 'lexer->token' to that token. */
 715 enum lex_type
 716 lexer_lookahead(const struct lexer *lexer)
 717 {
 718     struct lex_token next;
 719     enum lex_type type;
 720     const char *start;
 721
 722     lex_token_parse(&next, lexer->input, &start);
 723     type = next.type;
 724     lex_token_destroy(&next);
 725     return type;
 726 }
 727
 728 /* If 'lexer''s current token has the given 'type', advances 'lexer' to the
 729  * next token and returns true.  Otherwise returns false. */
 730 bool
 731 lexer_match(struct lexer *lexer, enum lex_type type)
 732 {
 733     if (lexer->token.type == type) {
 734         lexer_get(lexer);
 735         return true;
 736     } else {
 737         return false;
 738     }
 739 }
 740
 741 /* If 'lexer''s current token is the identifier given in 'id', advances 'lexer'
 742  * to the next token and returns true.  Otherwise returns false.  */
 743 bool
 744 lexer_match_id(struct lexer *lexer, const char *id)
 745 {
 746     if (lexer->token.type == LEX_T_ID && !strcmp(lexer->token.s, id)) {
 747         lexer_get(lexer);
 748         return true;
 749     } else {
 750         return false;
 751     }
 752 }