lib/classifier.c

   1 /*
   2  * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "classifier.h"
  19 #include "classifier-private.h"
  20 #include <errno.h>
  21 #include <netinet/in.h>
  22 #include "byte-order.h"
  23 #include "dynamic-string.h"
  24 #include "odp-util.h"
  25 #include "ofp-util.h"
  26 #include "packets.h"
  27 #include "util.h"
  28 #include "openvswitch/vlog.h"
  29
  30 VLOG_DEFINE_THIS_MODULE(classifier);
  31
  32 struct trie_ctx;
  33
  34 /* A collection of "struct cls_conjunction"s currently embedded into a
  35  * cls_match. */
  36 struct cls_conjunction_set {
  37     /* Link back to the cls_match.
  38      *
  39      * cls_conjunction_set is mostly used during classifier lookup, and, in
  40      * turn, during classifier lookup the most used member of
  41      * cls_conjunction_set is the rule's priority, so we cache it here for fast
  42      * access. */
  43     struct cls_match *match;
  44     int priority;               /* Cached copy of match->priority. */
  45
  46     /* Conjunction information.
  47      *
  48      * 'min_n_clauses' allows some optimization during classifier lookup. */
  49     unsigned int n;             /* Number of elements in 'conj'. */
  50     unsigned int min_n_clauses; /* Smallest 'n' among elements of 'conj'. */
  51     struct cls_conjunction conj[];
  52 };
  53
  54 /* Ports trie depends on both ports sharing the same ovs_be32. */
  55 #define TP_PORTS_OFS32 (offsetof(struct flow, tp_src) / 4)
  56 BUILD_ASSERT_DECL(TP_PORTS_OFS32 == offsetof(struct flow, tp_dst) / 4);
  57 BUILD_ASSERT_DECL(TP_PORTS_OFS32 % 2 == 0);
  58 #define TP_PORTS_OFS64 (TP_PORTS_OFS32 / 2)
  59
  60 static size_t
  61 cls_conjunction_set_size(size_t n)
  62 {
  63     return (sizeof(struct cls_conjunction_set)
  64             + n * sizeof(struct cls_conjunction));
  65 }
  66
  67 static struct cls_conjunction_set *
  68 cls_conjunction_set_alloc(struct cls_match *match,
  69                           const struct cls_conjunction conj[], size_t n)
  70 {
  71     if (n) {
  72         size_t min_n_clauses = conj[0].n_clauses;
  73         for (size_t i = 1; i < n; i++) {
  74             min_n_clauses = MIN(min_n_clauses, conj[i].n_clauses);
  75         }
  76
  77         struct cls_conjunction_set *set = xmalloc(cls_conjunction_set_size(n));
  78         set->match = match;
  79         set->priority = match->priority;
  80         set->n = n;
  81         set->min_n_clauses = min_n_clauses;
  82         memcpy(set->conj, conj, n * sizeof *conj);
  83         return set;
  84     } else {
  85         return NULL;
  86     }
  87 }
  88
  89 static struct cls_match *
  90 cls_match_alloc(const struct cls_rule *rule,
  91                 const struct cls_conjunction conj[], size_t n)
  92 {
  93     int count = count_1bits(rule->match.flow.map);
  94
  95     struct cls_match *cls_match
  96         = xmalloc(sizeof *cls_match - sizeof cls_match->flow.inline_values
  97                   + MINIFLOW_VALUES_SIZE(count));
  98
  99     rculist_init(&cls_match->list);
 100     *CONST_CAST(const struct cls_rule **, &cls_match->cls_rule) = rule;
 101     *CONST_CAST(int *, &cls_match->priority) = rule->priority;
 102     cls_match->visible = false;
 103     miniflow_clone_inline(CONST_CAST(struct miniflow *, &cls_match->flow),
 104                           &rule->match.flow, count);
 105     ovsrcu_set_hidden(&cls_match->conj_set,
 106                       cls_conjunction_set_alloc(cls_match, conj, n));
 107
 108     return cls_match;
 109 }
 110
 111 static struct cls_subtable *find_subtable(const struct classifier *cls,
 112                                           const struct minimask *);
 113 static struct cls_subtable *insert_subtable(struct classifier *cls,
 114                                             const struct minimask *);
 115 static void destroy_subtable(struct classifier *cls, struct cls_subtable *);
 116
 117 static const struct cls_match *find_match_wc(const struct cls_subtable *,
 118                                              const struct flow *,
 119                                              struct trie_ctx *,
 120                                              unsigned int n_tries,
 121                                              struct flow_wildcards *);
 122 static struct cls_match *find_equal(const struct cls_subtable *,
 123                                     const struct miniflow *, uint32_t hash);
 124
 125 static inline const struct cls_match *
 126 next_rule_in_list__(const struct cls_match *rule)
 127 {
 128     const struct cls_match *next = NULL;
 129     next = OBJECT_CONTAINING(rculist_next(&rule->list), next, list);
 130     return next;
 131 }
 132
 133 static inline const struct cls_match *
 134 next_rule_in_list(const struct cls_match *rule, const struct cls_match *head)
 135 {
 136     const struct cls_match *next = next_rule_in_list__(rule);
 137     return next != head ? next : NULL;
 138 }
 139
 140 /* Return the next lower-priority rule in the list that is visible.  Multiple
 141  * identical rules with the same priority may exist transitionally.  In that
 142  * case the first rule of a given priority has been marked as 'to_be_removed',
 143  * and the later rules are marked as '!visible'.  This gets a bit complex if
 144  * there are two rules of the same priority in the list, as in that case the
 145  * head and tail of the list will have the same priority. */
 146 static inline const struct cls_match *
 147 next_visible_rule_in_list(const struct cls_match *rule)
 148 {
 149     const struct cls_match *next = rule;
 150
 151     do {
 152         next = next_rule_in_list__(next);
 153         if (next->priority > rule->priority || next == rule) {
 154             /* We have reached the head of the list, stop. */
 155             return NULL;
 156         }
 157     } while (!next->visible);
 158
 159     return next;
 160 }
 161
 162 static inline struct cls_match *
 163 next_rule_in_list_protected__(struct cls_match *rule)
 164 {
 165     struct cls_match *next = NULL;
 166     next = OBJECT_CONTAINING(rculist_next_protected(&rule->list), next, list);
 167     return next;
 168 }
 169
 170 static inline struct cls_match *
 171 next_rule_in_list_protected(struct cls_match *rule, struct cls_match *head)
 172 {
 173     struct cls_match *next = next_rule_in_list_protected__(rule);
 174     return next != head ? next : NULL;
 175 }
 176
 177 /* Iterates RULE over HEAD and all of the cls_rules on HEAD->list. */
 178 #define FOR_EACH_RULE_IN_LIST(RULE, HEAD)           \
 179     for ((RULE) = (HEAD); (RULE) != NULL;           \
 180          (RULE) = next_rule_in_list(RULE, HEAD))
 181 #define FOR_EACH_RULE_IN_LIST_PROTECTED(RULE, HEAD)         \
 182     for ((RULE) = (HEAD); (RULE) != NULL;                   \
 183          (RULE) = next_rule_in_list_protected(RULE, HEAD))
 184
 185 static unsigned int minimask_get_prefix_len(const struct minimask *,
 186                                             const struct mf_field *);
 187 static void trie_init(struct classifier *cls, int trie_idx,
 188                       const struct mf_field *);
 189 static unsigned int trie_lookup(const struct cls_trie *, const struct flow *,
 190                                 union mf_value *plens);
 191 static unsigned int trie_lookup_value(const rcu_trie_ptr *,
 192                                       const ovs_be32 value[], ovs_be32 plens[],
 193                                       unsigned int value_bits);
 194 static void trie_destroy(rcu_trie_ptr *);
 195 static void trie_insert(struct cls_trie *, const struct cls_rule *, int mlen);
 196 static void trie_insert_prefix(rcu_trie_ptr *, const ovs_be32 *prefix,
 197                                int mlen);
 198 static void trie_remove(struct cls_trie *, const struct cls_rule *, int mlen);
 199 static void trie_remove_prefix(rcu_trie_ptr *, const ovs_be32 *prefix,
 200                                int mlen);
 201 static void mask_set_prefix_bits(struct flow_wildcards *, uint8_t be32ofs,
 202                                  unsigned int n_bits);
 203 static bool mask_prefix_bits_set(const struct flow_wildcards *,
 204                                  uint8_t be32ofs, unsigned int n_bits);
 205 \f
 206 /* cls_rule. */
 207
 208 static inline void
 209 cls_rule_init__(struct cls_rule *rule, unsigned int priority)
 210 {
 211     rculist_init(&rule->node);
 212     rule->priority = priority;
 213     rule->to_be_removed = false;
 214     rule->cls_match = NULL;
 215 }
 216
 217 /* Initializes 'rule' to match packets specified by 'match' at the given
 218  * 'priority'.  'match' must satisfy the invariant described in the comment at
 219  * the definition of struct match.
 220  *
 221  * The caller must eventually destroy 'rule' with cls_rule_destroy().
 222  *
 223  * Clients should not use priority INT_MIN.  (OpenFlow uses priorities between
 224  * 0 and UINT16_MAX, inclusive.) */
 225 void
 226 cls_rule_init(struct cls_rule *rule, const struct match *match, int priority)
 227 {
 228     cls_rule_init__(rule, priority);
 229     minimatch_init(&rule->match, match);
 230 }
 231
 232 /* Same as cls_rule_init() for initialization from a "struct minimatch". */
 233 void
 234 cls_rule_init_from_minimatch(struct cls_rule *rule,
 235                              const struct minimatch *match, int priority)
 236 {
 237     cls_rule_init__(rule, priority);
 238     minimatch_clone(&rule->match, match);
 239 }
 240
 241 /* Initializes 'dst' as a copy of 'src'.
 242  *
 243  * The caller must eventually destroy 'dst' with cls_rule_destroy(). */
 244 void
 245 cls_rule_clone(struct cls_rule *dst, const struct cls_rule *src)
 246 {
 247     cls_rule_init__(dst, src->priority);
 248     minimatch_clone(&dst->match, &src->match);
 249 }
 250
 251 /* Initializes 'dst' with the data in 'src', destroying 'src'.
 252  * 'src' must be a cls_rule NOT in a classifier.
 253  *
 254  * The caller must eventually destroy 'dst' with cls_rule_destroy(). */
 255 void
 256 cls_rule_move(struct cls_rule *dst, struct cls_rule *src)
 257 {
 258     ovs_assert(!src->cls_match);   /* Must not be in a classifier. */
 259     cls_rule_init__(dst, src->priority);
 260     minimatch_move(&dst->match, &src->match);
 261 }
 262
 263 /* Frees memory referenced by 'rule'.  Doesn't free 'rule' itself (it's
 264  * normally embedded into a larger structure).
 265  *
 266  * ('rule' must not currently be in a classifier.) */
 267 void
 268 cls_rule_destroy(struct cls_rule *rule)
 269 {
 270     ovs_assert(!rule->cls_match);   /* Must not be in a classifier. */
 271
 272     /* Check that the rule has been properly removed from the classifier and
 273      * that the destruction only happens after the RCU grace period, or that
 274      * the rule was never inserted to the classifier in the first place. */
 275     ovs_assert(rculist_next_protected(&rule->node) == RCULIST_POISON
 276                || rculist_is_empty(&rule->node));
 277
 278     minimatch_destroy(&rule->match);
 279 }
 280
 281 void
 282 cls_rule_set_conjunctions(struct cls_rule *cr,
 283                           const struct cls_conjunction *conj, size_t n)
 284 {
 285     struct cls_match *match = cr->cls_match;
 286     struct cls_conjunction_set *old
 287         = ovsrcu_get_protected(struct cls_conjunction_set *, &match->conj_set);
 288     struct cls_conjunction *old_conj = old ? old->conj : NULL;
 289     unsigned int old_n = old ? old->n : 0;
 290
 291     if (old_n != n || (n && memcmp(old_conj, conj, n * sizeof *conj))) {
 292         if (old) {
 293             ovsrcu_postpone(free, old);
 294         }
 295         ovsrcu_set(&match->conj_set,
 296                    cls_conjunction_set_alloc(match, conj, n));
 297     }
 298 }
 299
 300
 301 /* Returns true if 'a' and 'b' match the same packets at the same priority,
 302  * false if they differ in some way. */
 303 bool
 304 cls_rule_equal(const struct cls_rule *a, const struct cls_rule *b)
 305 {
 306     return a->priority == b->priority && minimatch_equal(&a->match, &b->match);
 307 }
 308
 309 /* Returns a hash value for 'rule', folding in 'basis'. */
 310 uint32_t
 311 cls_rule_hash(const struct cls_rule *rule, uint32_t basis)
 312 {
 313     return minimatch_hash(&rule->match, hash_int(rule->priority, basis));
 314 }
 315
 316 /* Appends a string describing 'rule' to 's'. */
 317 void
 318 cls_rule_format(const struct cls_rule *rule, struct ds *s)
 319 {
 320     minimatch_format(&rule->match, s, rule->priority);
 321 }
 322
 323 /* Returns true if 'rule' matches every packet, false otherwise. */
 324 bool
 325 cls_rule_is_catchall(const struct cls_rule *rule)
 326 {
 327     return minimask_is_catchall(&rule->match.mask);
 328 }
 329
 330 /* Rules inserted during classifier_defer() need to be made visible before
 331  * calling classifier_publish().
 332  *
 333  * 'rule' must be in a classifier. */
 334 void cls_rule_make_visible(const struct cls_rule *rule)
 335 {
 336     rule->cls_match->visible = true;
 337 }
 338
 339 \f
 340 /* Initializes 'cls' as a classifier that initially contains no classification
 341  * rules. */
 342 void
 343 classifier_init(struct classifier *cls, const uint8_t *flow_segments)
 344 {
 345     cls->n_rules = 0;
 346     cmap_init(&cls->subtables_map);
 347     pvector_init(&cls->subtables);
 348     cmap_init(&cls->partitions);
 349     cls->n_flow_segments = 0;
 350     if (flow_segments) {
 351         while (cls->n_flow_segments < CLS_MAX_INDICES
 352                && *flow_segments < FLOW_U64S) {
 353             cls->flow_segments[cls->n_flow_segments++] = *flow_segments++;
 354         }
 355     }
 356     cls->n_tries = 0;
 357     for (int i = 0; i < CLS_MAX_TRIES; i++) {
 358         trie_init(cls, i, NULL);
 359     }
 360     cls->publish = true;
 361 }
 362
 363 /* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
 364  * caller's responsibility.
 365  * May only be called after all the readers have been terminated. */
 366 void
 367 classifier_destroy(struct classifier *cls)
 368 {
 369     if (cls) {
 370         struct cls_partition *partition;
 371         struct cls_subtable *subtable;
 372         int i;
 373
 374         for (i = 0; i < cls->n_tries; i++) {
 375             trie_destroy(&cls->tries[i].root);
 376         }
 377
 378         CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
 379             destroy_subtable(cls, subtable);
 380         }
 381         cmap_destroy(&cls->subtables_map);
 382
 383         CMAP_FOR_EACH (partition, cmap_node, &cls->partitions) {
 384             ovsrcu_postpone(free, partition);
 385         }
 386         cmap_destroy(&cls->partitions);
 387
 388         pvector_destroy(&cls->subtables);
 389     }
 390 }
 391
 392 /* Set the fields for which prefix lookup should be performed. */
 393 bool
 394 classifier_set_prefix_fields(struct classifier *cls,
 395                              const enum mf_field_id *trie_fields,
 396                              unsigned int n_fields)
 397 {
 398     const struct mf_field * new_fields[CLS_MAX_TRIES];
 399     struct mf_bitmap fields = MF_BITMAP_INITIALIZER;
 400     int i, n_tries = 0;
 401     bool changed = false;
 402
 403     for (i = 0; i < n_fields && n_tries < CLS_MAX_TRIES; i++) {
 404         const struct mf_field *field = mf_from_id(trie_fields[i]);
 405         if (field->flow_be32ofs < 0 || field->n_bits % 32) {
 406             /* Incompatible field.  This is the only place where we
 407              * enforce these requirements, but the rest of the trie code
 408              * depends on the flow_be32ofs to be non-negative and the
 409              * field length to be a multiple of 32 bits. */
 410             continue;
 411         }
 412
 413         if (bitmap_is_set(fields.bm, trie_fields[i])) {
 414             /* Duplicate field, there is no need to build more than
 415              * one index for any one field. */
 416             continue;
 417         }
 418         bitmap_set1(fields.bm, trie_fields[i]);
 419
 420         new_fields[n_tries] = NULL;
 421         if (n_tries >= cls->n_tries || field != cls->tries[n_tries].field) {
 422             new_fields[n_tries] = field;
 423             changed = true;
 424         }
 425         n_tries++;
 426     }
 427
 428     if (changed || n_tries < cls->n_tries) {
 429         struct cls_subtable *subtable;
 430
 431         /* Trie configuration needs to change.  Disable trie lookups
 432          * for the tries that are changing and wait all the current readers
 433          * with the old configuration to be done. */
 434         changed = false;
 435         CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
 436             for (i = 0; i < cls->n_tries; i++) {
 437                 if ((i < n_tries && new_fields[i]) || i >= n_tries) {
 438                     if (subtable->trie_plen[i]) {
 439                         subtable->trie_plen[i] = 0;
 440                         changed = true;
 441                     }
 442                 }
 443             }
 444         }
 445         /* Synchronize if any readers were using tries.  The readers may
 446          * temporarily function without the trie lookup based optimizations. */
 447         if (changed) {
 448             /* ovsrcu_synchronize() functions as a memory barrier, so it does
 449              * not matter that subtable->trie_plen is not atomic. */
 450             ovsrcu_synchronize();
 451         }
 452
 453         /* Now set up the tries. */
 454         for (i = 0; i < n_tries; i++) {
 455             if (new_fields[i]) {
 456                 trie_init(cls, i, new_fields[i]);
 457             }
 458         }
 459         /* Destroy the rest, if any. */
 460         for (; i < cls->n_tries; i++) {
 461             trie_init(cls, i, NULL);
 462         }
 463
 464         cls->n_tries = n_tries;
 465         return true;
 466     }
 467
 468     return false; /* No change. */
 469 }
 470
 471 static void
 472 trie_init(struct classifier *cls, int trie_idx, const struct mf_field *field)
 473 {
 474     struct cls_trie *trie = &cls->tries[trie_idx];
 475     struct cls_subtable *subtable;
 476
 477     if (trie_idx < cls->n_tries) {
 478         trie_destroy(&trie->root);
 479     } else {
 480         ovsrcu_set_hidden(&trie->root, NULL);
 481     }
 482     trie->field = field;
 483
 484     /* Add existing rules to the new trie. */
 485     CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
 486         unsigned int plen;
 487
 488         plen = field ? minimask_get_prefix_len(&subtable->mask, field) : 0;
 489         if (plen) {
 490             struct cls_match *head;
 491
 492             CMAP_FOR_EACH (head, cmap_node, &subtable->rules) {
 493                 trie_insert(trie, head->cls_rule, plen);
 494             }
 495         }
 496         /* Initialize subtable's prefix length on this field.  This will
 497          * allow readers to use the trie. */
 498         atomic_thread_fence(memory_order_release);
 499         subtable->trie_plen[trie_idx] = plen;
 500     }
 501 }
 502
 503 /* Returns true if 'cls' contains no classification rules, false otherwise.
 504  * Checking the cmap requires no locking. */
 505 bool
 506 classifier_is_empty(const struct classifier *cls)
 507 {
 508     return cmap_is_empty(&cls->subtables_map);
 509 }
 510
 511 /* Returns the number of rules in 'cls'. */
 512 int
 513 classifier_count(const struct classifier *cls)
 514 {
 515     /* n_rules is an int, so in the presence of concurrent writers this will
 516      * return either the old or a new value. */
 517     return cls->n_rules;
 518 }
 519
 520 static uint32_t
 521 hash_metadata(ovs_be64 metadata)
 522 {
 523     return hash_uint64((OVS_FORCE uint64_t) metadata);
 524 }
 525
 526 static struct cls_partition *
 527 find_partition(const struct classifier *cls, ovs_be64 metadata, uint32_t hash)
 528 {
 529     struct cls_partition *partition;
 530
 531     CMAP_FOR_EACH_WITH_HASH (partition, cmap_node, hash, &cls->partitions) {
 532         if (partition->metadata == metadata) {
 533             return partition;
 534         }
 535     }
 536
 537     return NULL;
 538 }
 539
 540 static struct cls_partition *
 541 create_partition(struct classifier *cls, struct cls_subtable *subtable,
 542                  ovs_be64 metadata)
 543 {
 544     uint32_t hash = hash_metadata(metadata);
 545     struct cls_partition *partition = find_partition(cls, metadata, hash);
 546     if (!partition) {
 547         partition = xmalloc(sizeof *partition);
 548         partition->metadata = metadata;
 549         partition->tags = 0;
 550         tag_tracker_init(&partition->tracker);
 551         cmap_insert(&cls->partitions, &partition->cmap_node, hash);
 552     }
 553     tag_tracker_add(&partition->tracker, &partition->tags, subtable->tag);
 554     return partition;
 555 }
 556
 557 static inline ovs_be32 minimatch_get_ports(const struct minimatch *match)
 558 {
 559     /* Could optimize to use the same map if needed for fast path. */
 560     return MINIFLOW_GET_BE32(&match->flow, tp_src)
 561         & MINIFLOW_GET_BE32(&match->mask.masks, tp_src);
 562 }
 563
 564 static void
 565 subtable_replace_head_rule(struct classifier *cls OVS_UNUSED,
 566                            struct cls_subtable *subtable,
 567                            struct cls_match *head, struct cls_match *new,
 568                            uint32_t hash, uint32_t ihash[CLS_MAX_INDICES])
 569 {
 570     /* Rule's data is already in the tries. */
 571
 572     new->partition = head->partition; /* Steal partition, if any. */
 573     head->partition = NULL;
 574
 575     for (int i = 0; i < subtable->n_indices; i++) {
 576         cmap_replace(&subtable->indices[i], &head->index_nodes[i],
 577                      &new->index_nodes[i], ihash[i]);
 578     }
 579     cmap_replace(&subtable->rules, &head->cmap_node, &new->cmap_node, hash);
 580 }
 581
 582 /* Inserts 'rule' into 'cls'.  Until 'rule' is removed from 'cls', the caller
 583  * must not modify or free it.
 584  *
 585  * If 'cls' already contains an identical rule (including wildcards, values of
 586  * fixed fields, and priority), replaces the old rule by 'rule' and returns the
 587  * rule that was replaced.  The caller takes ownership of the returned rule and
 588  * is thus responsible for destroying it with cls_rule_destroy(), after RCU
 589  * grace period has passed (see ovsrcu_postpone()).
 590  *
 591  * Returns NULL if 'cls' does not contain a rule with an identical key, after
 592  * inserting the new rule.  In this case, no rules are displaced by the new
 593  * rule, even rules that cannot have any effect because the new rule matches a
 594  * superset of their flows and has higher priority.
 595  */
 596 const struct cls_rule *
 597 classifier_replace(struct classifier *cls, const struct cls_rule *rule,
 598                    const struct cls_conjunction *conjs, size_t n_conjs)
 599 {
 600     struct cls_match *new = cls_match_alloc(rule, conjs, n_conjs);
 601     struct cls_subtable *subtable;
 602     uint32_t ihash[CLS_MAX_INDICES];
 603     uint8_t prev_be64ofs = 0;
 604     struct cls_match *head;
 605     size_t n_rules = 0;
 606     uint32_t basis;
 607     uint32_t hash;
 608     int i;
 609
 610     CONST_CAST(struct cls_rule *, rule)->cls_match = new;
 611
 612     subtable = find_subtable(cls, &rule->match.mask);
 613     if (!subtable) {
 614         subtable = insert_subtable(cls, &rule->match.mask);
 615     }
 616
 617     /* Compute hashes in segments. */
 618     basis = 0;
 619     for (i = 0; i < subtable->n_indices; i++) {
 620         ihash[i] = minimatch_hash_range(&rule->match, prev_be64ofs,
 621                                         subtable->index_ofs[i], &basis);
 622         prev_be64ofs = subtable->index_ofs[i];
 623     }
 624     hash = minimatch_hash_range(&rule->match, prev_be64ofs, FLOW_U64S, &basis);
 625
 626     head = find_equal(subtable, &rule->match.flow, hash);
 627     if (!head) {
 628         /* Add rule to tries.
 629          *
 630          * Concurrent readers might miss seeing the rule until this update,
 631          * which might require being fixed up by revalidation later. */
 632         for (i = 0; i < cls->n_tries; i++) {
 633             if (subtable->trie_plen[i]) {
 634                 trie_insert(&cls->tries[i], rule, subtable->trie_plen[i]);
 635             }
 636         }
 637
 638         /* Add rule to ports trie. */
 639         if (subtable->ports_mask_len) {
 640             /* We mask the value to be inserted to always have the wildcarded
 641              * bits in known (zero) state, so we can include them in comparison
 642              * and they will always match (== their original value does not
 643              * matter). */
 644             ovs_be32 masked_ports = minimatch_get_ports(&rule->match);
 645
 646             trie_insert_prefix(&subtable->ports_trie, &masked_ports,
 647                                subtable->ports_mask_len);
 648         }
 649
 650         /* Add rule to partitions.
 651          *
 652          * Concurrent readers might miss seeing the rule until this update,
 653          * which might require being fixed up by revalidation later. */
 654         new->partition = NULL;
 655         if (minimask_get_metadata_mask(&rule->match.mask) == OVS_BE64_MAX) {
 656             ovs_be64 metadata = miniflow_get_metadata(&rule->match.flow);
 657
 658             new->partition = create_partition(cls, subtable, metadata);
 659         }
 660
 661         /* Add new node to segment indices.
 662          *
 663          * Readers may find the rule in the indices before the rule is visible
 664          * in the subtables 'rules' map.  This may result in us losing the
 665          * opportunity to quit lookups earlier, resulting in sub-optimal
 666          * wildcarding.  This will be fixed later by revalidation (always
 667          * scheduled after flow table changes). */
 668         for (i = 0; i < subtable->n_indices; i++) {
 669             cmap_insert(&subtable->indices[i], &new->index_nodes[i], ihash[i]);
 670         }
 671         n_rules = cmap_insert(&subtable->rules, &new->cmap_node, hash);
 672     } else {   /* Equal rules exist in the classifier already. */
 673         struct cls_match *iter;
 674
 675         /* Scan the list for the insertion point that will keep the list in
 676          * order of decreasing priority.
 677          * Insert after 'to_be_removed' rules of the same priority. */
 678         FOR_EACH_RULE_IN_LIST_PROTECTED (iter, head) {
 679             if (rule->priority > iter->priority
 680                 || (rule->priority == iter->priority
 681                     && !iter->cls_rule->to_be_removed)) {
 682                 break;
 683             }
 684         }
 685
 686         /* 'iter' now at the insertion point or NULL if at end. */
 687         if (iter) {
 688             struct cls_rule *old;
 689
 690             if (rule->priority == iter->priority) {
 691                 rculist_replace(&new->list, &iter->list);
 692                 old = CONST_CAST(struct cls_rule *, iter->cls_rule);
 693             } else {
 694                 rculist_insert(&iter->list, &new->list);
 695                 old = NULL;
 696             }
 697
 698             /* Replace the existing head in data structures, if rule is the new
 699              * head. */
 700             if (iter == head) {
 701                 subtable_replace_head_rule(cls, subtable, head, new, hash,
 702                                            ihash);
 703             }
 704
 705             if (old) {
 706                 struct cls_conjunction_set *conj_set;
 707
 708                 conj_set = ovsrcu_get_protected(struct cls_conjunction_set *,
 709                                                 &iter->conj_set);
 710                 if (conj_set) {
 711                     ovsrcu_postpone(free, conj_set);
 712                 }
 713
 714                 ovsrcu_postpone(free, iter);
 715                 old->cls_match = NULL;
 716
 717                 /* No change in subtable's max priority or max count. */
 718
 719                 /* Make rule visible to lookups? */
 720                 new->visible = cls->publish;
 721
 722                 /* Make rule visible to iterators (immediately). */
 723                 rculist_replace(CONST_CAST(struct rculist *, &rule->node),
 724                                 &old->node);
 725
 726                 /* Return displaced rule.  Caller is responsible for keeping it
 727                  * around until all threads quiesce. */
 728                 return old;
 729             }
 730         } else {
 731             rculist_push_back(&head->list, &new->list);
 732         }
 733     }
 734
 735     /* Make rule visible to lookups? */
 736     new->visible = cls->publish;
 737
 738     /* Make rule visible to iterators (immediately). */
 739     rculist_push_back(&subtable->rules_list,
 740                       CONST_CAST(struct rculist *, &rule->node));
 741
 742     /* Rule was added, not replaced.  Update 'subtable's 'max_priority' and
 743      * 'max_count', if necessary.
 744      *
 745      * The rule was already inserted, but concurrent readers may not see the
 746      * rule yet as the subtables vector is not updated yet.  This will have to
 747      * be fixed by revalidation later. */
 748     if (n_rules == 1) {
 749         subtable->max_priority = rule->priority;
 750         subtable->max_count = 1;
 751         pvector_insert(&cls->subtables, subtable, rule->priority);
 752     } else if (rule->priority == subtable->max_priority) {
 753         ++subtable->max_count;
 754     } else if (rule->priority > subtable->max_priority) {
 755         subtable->max_priority = rule->priority;
 756         subtable->max_count = 1;
 757         pvector_change_priority(&cls->subtables, subtable, rule->priority);
 758     }
 759
 760     /* Nothing was replaced. */
 761     cls->n_rules++;
 762
 763     if (cls->publish) {
 764         pvector_publish(&cls->subtables);
 765     }
 766
 767     return NULL;
 768 }
 769
 770 /* Inserts 'rule' into 'cls'.  Until 'rule' is removed from 'cls', the caller
 771  * must not modify or free it.
 772  *
 773  * 'cls' must not contain an identical rule (including wildcards, values of
 774  * fixed fields, and priority).  Use classifier_find_rule_exactly() to find
 775  * such a rule. */
 776 void
 777 classifier_insert(struct classifier *cls, const struct cls_rule *rule,
 778                   const struct cls_conjunction conj[], size_t n_conj)
 779 {
 780     const struct cls_rule *displaced_rule
 781         = classifier_replace(cls, rule, conj, n_conj);
 782     ovs_assert(!displaced_rule);
 783 }
 784
 785 /* Removes 'rule' from 'cls'.  It is the caller's responsibility to destroy
 786  * 'rule' with cls_rule_destroy(), freeing the memory block in which 'rule'
 787  * resides, etc., as necessary.
 788  *
 789  * Does nothing if 'rule' has been already removed, or was never inserted.
 790  *
 791  * Returns the removed rule, or NULL, if it was already removed.
 792  */
 793 const struct cls_rule *
 794 classifier_remove(struct classifier *cls, const struct cls_rule *cls_rule)
 795 {
 796     struct cls_match *rule, *prev, *next;
 797     struct cls_partition *partition;
 798     struct cls_conjunction_set *conj_set;
 799     struct cls_subtable *subtable;
 800     int i;
 801     uint32_t basis = 0, hash, ihash[CLS_MAX_INDICES];
 802     uint8_t prev_be64ofs = 0;
 803     size_t n_rules;
 804
 805     rule = cls_rule->cls_match;
 806     if (!rule) {
 807         return NULL;
 808     }
 809     /* Mark as removed. */
 810     CONST_CAST(struct cls_rule *, cls_rule)->cls_match = NULL;
 811
 812     /* Remove 'cls_rule' from the subtable's rules list. */
 813     rculist_remove(CONST_CAST(struct rculist *, &cls_rule->node));
 814
 815     INIT_CONTAINER(prev, rculist_back_protected(&rule->list), list);
 816     INIT_CONTAINER(next, rculist_next(&rule->list), list);
 817
 818     /* Remove from the list of equal rules. */
 819     rculist_remove(&rule->list);
 820
 821     /* Cheap check for a non-head rule. */
 822     if (prev->priority > rule->priority) {
 823         /* Not the highest priority rule, no need to check subtable's
 824          * 'max_priority'. */
 825         goto free;
 826     }
 827
 828     subtable = find_subtable(cls, &cls_rule->match.mask);
 829     ovs_assert(subtable);
 830
 831     for (i = 0; i < subtable->n_indices; i++) {
 832         ihash[i] = minimatch_hash_range(&cls_rule->match, prev_be64ofs,
 833                                         subtable->index_ofs[i], &basis);
 834         prev_be64ofs = subtable->index_ofs[i];
 835     }
 836     hash = minimatch_hash_range(&cls_rule->match, prev_be64ofs, FLOW_U64S,
 837                                 &basis);
 838
 839     /* Check if the rule is not the head rule. */
 840     if (rule != prev &&
 841         rule != find_equal(subtable, &cls_rule->match.flow, hash)) {
 842         /* Not the head rule, but potentially one with the same priority. */
 843         goto check_priority;
 844     }
 845
 846     /* 'rule' is the head rule.  Check if there is another rule to
 847      * replace 'rule' in the data structures. */
 848     if (next != rule) {
 849         subtable_replace_head_rule(cls, subtable, rule, next, hash, ihash);
 850         goto check_priority;
 851     }
 852
 853     /* 'rule' is last of the kind in the classifier, must remove from all the
 854      * data structures. */
 855
 856     if (subtable->ports_mask_len) {
 857         ovs_be32 masked_ports = minimatch_get_ports(&cls_rule->match);
 858
 859         trie_remove_prefix(&subtable->ports_trie,
 860                            &masked_ports, subtable->ports_mask_len);
 861     }
 862     for (i = 0; i < cls->n_tries; i++) {
 863         if (subtable->trie_plen[i]) {
 864             trie_remove(&cls->tries[i], cls_rule, subtable->trie_plen[i]);
 865         }
 866     }
 867
 868     /* Remove rule node from indices. */
 869     for (i = 0; i < subtable->n_indices; i++) {
 870         cmap_remove(&subtable->indices[i], &rule->index_nodes[i], ihash[i]);
 871     }
 872     n_rules = cmap_remove(&subtable->rules, &rule->cmap_node, hash);
 873
 874     partition = rule->partition;
 875     if (partition) {
 876         tag_tracker_subtract(&partition->tracker, &partition->tags,
 877                              subtable->tag);
 878         if (!partition->tags) {
 879             cmap_remove(&cls->partitions, &partition->cmap_node,
 880                         hash_metadata(partition->metadata));
 881             ovsrcu_postpone(free, partition);
 882         }
 883     }
 884
 885     if (n_rules == 0) {
 886         destroy_subtable(cls, subtable);
 887     } else {
 888 check_priority:
 889         if (subtable->max_priority == rule->priority
 890             && --subtable->max_count == 0) {
 891             /* Find the new 'max_priority' and 'max_count'. */
 892             int max_priority = INT_MIN;
 893             struct cls_match *head;
 894
 895             CMAP_FOR_EACH (head, cmap_node, &subtable->rules) {
 896                 if (head->priority > max_priority) {
 897                     max_priority = head->priority;
 898                     subtable->max_count = 1;
 899                 } else if (head->priority == max_priority) {
 900                     ++subtable->max_count;
 901                 }
 902             }
 903             subtable->max_priority = max_priority;
 904             pvector_change_priority(&cls->subtables, subtable, max_priority);
 905         }
 906     }
 907
 908     if (cls->publish) {
 909         pvector_publish(&cls->subtables);
 910     }
 911
 912 free:
 913     conj_set = ovsrcu_get_protected(struct cls_conjunction_set *,
 914                                     &rule->conj_set);
 915     if (conj_set) {
 916         ovsrcu_postpone(free, conj_set);
 917     }
 918     ovsrcu_postpone(free, rule);
 919     cls->n_rules--;
 920
 921     return cls_rule;
 922 }
 923
 924 /* Prefix tree context.  Valid when 'lookup_done' is true.  Can skip all
 925  * subtables which have a prefix match on the trie field, but whose prefix
 926  * length is not indicated in 'match_plens'.  For example, a subtable that
 927  * has a 8-bit trie field prefix match can be skipped if
 928  * !be_get_bit_at(&match_plens, 8 - 1).  If skipped, 'maskbits' prefix bits
 929  * must be unwildcarded to make datapath flow only match packets it should. */
 930 struct trie_ctx {
 931     const struct cls_trie *trie;
 932     bool lookup_done;        /* Status of the lookup. */
 933     uint8_t be32ofs;         /* U32 offset of the field in question. */
 934     unsigned int maskbits;   /* Prefix length needed to avoid false matches. */
 935     union mf_value match_plens; /* Bitmask of prefix lengths with possible
 936                                  * matches. */
 937 };
 938
 939 static void
 940 trie_ctx_init(struct trie_ctx *ctx, const struct cls_trie *trie)
 941 {
 942     ctx->trie = trie;
 943     ctx->be32ofs = trie->field->flow_be32ofs;
 944     ctx->lookup_done = false;
 945 }
 946
 947 struct conjunctive_match {
 948     struct hmap_node hmap_node;
 949     uint32_t id;
 950     uint64_t clauses;
 951 };
 952
 953 static struct conjunctive_match *
 954 find_conjunctive_match__(struct hmap *matches, uint64_t id, uint32_t hash)
 955 {
 956     struct conjunctive_match *m;
 957
 958     HMAP_FOR_EACH_IN_BUCKET (m, hmap_node, hash, matches) {
 959         if (m->id == id) {
 960             return m;
 961         }
 962     }
 963     return NULL;
 964 }
 965
 966 static bool
 967 find_conjunctive_match(const struct cls_conjunction_set *set,
 968                        unsigned int max_n_clauses, struct hmap *matches,
 969                        struct conjunctive_match *cm_stubs, size_t n_cm_stubs,
 970                        uint32_t *idp)
 971 {
 972     const struct cls_conjunction *c;
 973
 974     if (max_n_clauses < set->min_n_clauses) {
 975         return false;
 976     }
 977
 978     for (c = set->conj; c < &set->conj[set->n]; c++) {
 979         struct conjunctive_match *cm;
 980         uint32_t hash;
 981
 982         if (c->n_clauses > max_n_clauses) {
 983             continue;
 984         }
 985
 986         hash = hash_int(c->id, 0);
 987         cm = find_conjunctive_match__(matches, c->id, hash);
 988         if (!cm) {
 989             size_t n = hmap_count(matches);
 990
 991             cm = n < n_cm_stubs ? &cm_stubs[n] : xmalloc(sizeof *cm);
 992             hmap_insert(matches, &cm->hmap_node, hash);
 993             cm->id = c->id;
 994             cm->clauses = UINT64_MAX << (c->n_clauses & 63);
 995         }
 996         cm->clauses |= UINT64_C(1) << c->clause;
 997         if (cm->clauses == UINT64_MAX) {
 998             *idp = cm->id;
 999             return true;
1000         }
1001     }
1002     return false;
1003 }
1004
1005 static void
1006 free_conjunctive_matches(struct hmap *matches,
1007                          struct conjunctive_match *cm_stubs, size_t n_cm_stubs)
1008 {
1009     if (hmap_count(matches) > n_cm_stubs) {
1010         struct conjunctive_match *cm, *next;
1011
1012         HMAP_FOR_EACH_SAFE (cm, next, hmap_node, matches) {
1013             if (!(cm >= cm_stubs && cm < &cm_stubs[n_cm_stubs])) {
1014                 free(cm);
1015             }
1016         }
1017     }
1018     hmap_destroy(matches);
1019 }
1020
1021 /* Like classifier_lookup(), except that support for conjunctive matches can be
1022  * configured with 'allow_conjunctive_matches'.  That feature is not exposed
1023  * externally because turning off conjunctive matches is only useful to avoid
1024  * recursion within this function itself.
1025  *
1026  * 'flow' is non-const to allow for temporary modifications during the lookup.
1027  * Any changes are restored before returning. */
1028 static const struct cls_rule *
1029 classifier_lookup__(const struct classifier *cls, struct flow *flow,
1030                     struct flow_wildcards *wc, bool allow_conjunctive_matches)
1031 {
1032     const struct cls_partition *partition;
1033     struct trie_ctx trie_ctx[CLS_MAX_TRIES];
1034     const struct cls_match *match;
1035     tag_type tags;
1036
1037     /* Highest-priority flow in 'cls' that certainly matches 'flow'. */
1038     const struct cls_match *hard = NULL;
1039     int hard_pri = INT_MIN;     /* hard ? hard->priority : INT_MIN. */
1040
1041     /* Highest-priority conjunctive flows in 'cls' matching 'flow'.  Since
1042      * these are (components of) conjunctive flows, we can only know whether
1043      * the full conjunctive flow matches after seeing multiple of them.  Thus,
1044      * we refer to these as "soft matches". */
1045     struct cls_conjunction_set *soft_stub[64];
1046     struct cls_conjunction_set **soft = soft_stub;
1047     size_t n_soft = 0, allocated_soft = ARRAY_SIZE(soft_stub);
1048     int soft_pri = INT_MIN;    /* n_soft ? MAX(soft[*]->priority) : INT_MIN. */
1049
1050     /* Synchronize for cls->n_tries and subtable->trie_plen.  They can change
1051      * when table configuration changes, which happens typically only on
1052      * startup. */
1053     atomic_thread_fence(memory_order_acquire);
1054
1055     /* Determine 'tags' such that, if 'subtable->tag' doesn't intersect them,
1056      * then 'flow' cannot possibly match in 'subtable':
1057      *
1058      *     - If flow->metadata maps to a given 'partition', then we can use
1059      *       'tags' for 'partition->tags'.
1060      *
1061      *     - If flow->metadata has no partition, then no rule in 'cls' has an
1062      *       exact-match for flow->metadata.  That means that we don't need to
1063      *       search any subtable that includes flow->metadata in its mask.
1064      *
1065      * In either case, we always need to search any cls_subtables that do not
1066      * include flow->metadata in its mask.  One way to do that would be to
1067      * check the "cls_subtable"s explicitly for that, but that would require an
1068      * extra branch per subtable.  Instead, we mark such a cls_subtable's
1069      * 'tags' as TAG_ALL and make sure that 'tags' is never empty.  This means
1070      * that 'tags' always intersects such a cls_subtable's 'tags', so we don't
1071      * need a special case.
1072      */
1073     partition = (cmap_is_empty(&cls->partitions)
1074                  ? NULL
1075                  : find_partition(cls, flow->metadata,
1076                                   hash_metadata(flow->metadata)));
1077     tags = partition ? partition->tags : TAG_ARBITRARY;
1078
1079     /* Initialize trie contexts for find_match_wc(). */
1080     for (int i = 0; i < cls->n_tries; i++) {
1081         trie_ctx_init(&trie_ctx[i], &cls->tries[i]);
1082     }
1083
1084     /* Main loop. */
1085     struct cls_subtable *subtable;
1086     PVECTOR_FOR_EACH_PRIORITY (subtable, hard_pri, 2, sizeof *subtable,
1087                                &cls->subtables) {
1088         struct cls_conjunction_set *conj_set;
1089
1090         /* Skip subtables not in our partition. */
1091         if (!tag_intersects(tags, subtable->tag)) {
1092             continue;
1093         }
1094
1095         /* Skip subtables with no match, or where the match is lower-priority
1096          * than some certain match we've already found. */
1097         match = find_match_wc(subtable, flow, trie_ctx, cls->n_tries, wc);
1098         if (!match || match->priority <= hard_pri) {
1099             continue;
1100         }
1101
1102         conj_set = ovsrcu_get(struct cls_conjunction_set *, &match->conj_set);
1103         if (!conj_set) {
1104             /* 'match' isn't part of a conjunctive match.  It's the best
1105              * certain match we've got so far, since we know that it's
1106              * higher-priority than hard_pri.
1107              *
1108              * (There might be a higher-priority conjunctive match.  We can't
1109              * tell yet.) */
1110             hard = match;
1111             hard_pri = hard->priority;
1112         } else if (allow_conjunctive_matches) {
1113             /* 'match' is part of a conjunctive match.  Add it to the list. */
1114             if (OVS_UNLIKELY(n_soft >= allocated_soft)) {
1115                 struct cls_conjunction_set **old_soft = soft;
1116
1117                 allocated_soft *= 2;
1118                 soft = xmalloc(allocated_soft * sizeof *soft);
1119                 memcpy(soft, old_soft, n_soft * sizeof *soft);
1120                 if (old_soft != soft_stub) {
1121                     free(old_soft);
1122                 }
1123             }
1124             soft[n_soft++] = conj_set;
1125
1126             /* Keep track of the highest-priority soft match. */
1127             if (soft_pri < match->priority) {
1128                 soft_pri = match->priority;
1129             }
1130         }
1131     }
1132
1133     /* In the common case, at this point we have no soft matches and we can
1134      * return immediately.  (We do the same thing if we have potential soft
1135      * matches but none of them are higher-priority than our hard match.) */
1136     if (hard_pri >= soft_pri) {
1137         if (soft != soft_stub) {
1138             free(soft);
1139         }
1140         return hard ? hard->cls_rule : NULL;
1141     }
1142
1143     /* At this point, we have some soft matches.  We might also have a hard
1144      * match; if so, its priority is lower than the highest-priority soft
1145      * match. */
1146
1147     /* Soft match loop.
1148      *
1149      * Check whether soft matches are real matches. */
1150     for (;;) {
1151         /* Delete soft matches that are null.  This only happens in second and
1152          * subsequent iterations of the soft match loop, when we drop back from
1153          * a high-priority soft match to a lower-priority one.
1154          *
1155          * Also, delete soft matches whose priority is less than or equal to
1156          * the hard match's priority.  In the first iteration of the soft
1157          * match, these can be in 'soft' because the earlier main loop found
1158          * the soft match before the hard match.  In second and later iteration
1159          * of the soft match loop, these can be in 'soft' because we dropped
1160          * back from a high-priority soft match to a lower-priority soft match.
1161          *
1162          * It is tempting to delete soft matches that cannot be satisfied
1163          * because there are fewer soft matches than required to satisfy any of
1164          * their conjunctions, but we cannot do that because there might be
1165          * lower priority soft or hard matches with otherwise identical
1166          * matches.  (We could special case those here, but there's no
1167          * need--we'll do so at the bottom of the soft match loop anyway and
1168          * this duplicates less code.)
1169          *
1170          * It's also tempting to break out of the soft match loop if 'n_soft ==
1171          * 1' but that would also miss lower-priority hard matches.  We could
1172          * special case that also but again there's no need. */
1173         for (int i = 0; i < n_soft; ) {
1174             if (!soft[i] || soft[i]->priority <= hard_pri) {
1175                 soft[i] = soft[--n_soft];
1176             } else {
1177                 i++;
1178             }
1179         }
1180         if (!n_soft) {
1181             break;
1182         }
1183
1184         /* Find the highest priority among the soft matches.  (We know this
1185          * must be higher than the hard match's priority; otherwise we would
1186          * have deleted all of the soft matches in the previous loop.)  Count
1187          * the number of soft matches that have that priority. */
1188         soft_pri = INT_MIN;
1189         int n_soft_pri = 0;
1190         for (int i = 0; i < n_soft; i++) {
1191             if (soft[i]->priority > soft_pri) {
1192                 soft_pri = soft[i]->priority;
1193                 n_soft_pri = 1;
1194             } else if (soft[i]->priority == soft_pri) {
1195                 n_soft_pri++;
1196             }
1197         }
1198         ovs_assert(soft_pri > hard_pri);
1199
1200         /* Look for a real match among the highest-priority soft matches.
1201          *
1202          * It's unusual to have many conjunctive matches, so we use stubs to
1203          * avoid calling malloc() in the common case.  An hmap has a built-in
1204          * stub for up to 2 hmap_nodes; possibly, we would benefit a variant
1205          * with a bigger stub. */
1206         struct conjunctive_match cm_stubs[16];
1207         struct hmap matches;
1208
1209         hmap_init(&matches);
1210         for (int i = 0; i < n_soft; i++) {
1211             uint32_t id;
1212
1213             if (soft[i]->priority == soft_pri
1214                 && find_conjunctive_match(soft[i], n_soft_pri, &matches,
1215                                           cm_stubs, ARRAY_SIZE(cm_stubs),
1216                                           &id)) {
1217                 uint32_t saved_conj_id = flow->conj_id;
1218                 const struct cls_rule *rule;
1219
1220                 flow->conj_id = id;
1221                 rule = classifier_lookup__(cls, flow, wc, false);
1222                 flow->conj_id = saved_conj_id;
1223
1224                 if (rule) {
1225                     free_conjunctive_matches(&matches,
1226                                              cm_stubs, ARRAY_SIZE(cm_stubs));
1227                     if (soft != soft_stub) {
1228                         free(soft);
1229                     }
1230                     return rule;
1231                 }
1232             }
1233         }
1234         free_conjunctive_matches(&matches, cm_stubs, ARRAY_SIZE(cm_stubs));
1235
1236         /* There's no real match among the highest-priority soft matches.
1237          * However, if any of those soft matches has a lower-priority but
1238          * otherwise identical flow match, then we need to consider those for
1239          * soft or hard matches.
1240          *
1241          * The next iteration of the soft match loop will delete any null
1242          * pointers we put into 'soft' (and some others too). */
1243         for (int i = 0; i < n_soft; i++) {
1244             if (soft[i]->priority != soft_pri) {
1245                 continue;
1246             }
1247
1248             /* Find next-lower-priority flow with identical flow match. */
1249             match = next_visible_rule_in_list(soft[i]->match);
1250             if (match) {
1251                 soft[i] = ovsrcu_get(struct cls_conjunction_set *,
1252                                      &match->conj_set);
1253                 if (!soft[i]) {
1254                     /* The flow is a hard match; don't treat as a soft
1255                      * match. */
1256                     if (match->priority > hard_pri) {
1257                         hard = match;
1258                         hard_pri = hard->priority;
1259                     }
1260                 }
1261             } else {
1262                 /* No such lower-priority flow (probably the common case). */
1263                 soft[i] = NULL;
1264             }
1265         }
1266     }
1267
1268     if (soft != soft_stub) {
1269         free(soft);
1270     }
1271     return hard ? hard->cls_rule : NULL;
1272 }
1273
1274 /* Finds and returns the highest-priority rule in 'cls' that matches 'flow'.
1275  * Returns a null pointer if no rules in 'cls' match 'flow'.  If multiple rules
1276  * of equal priority match 'flow', returns one arbitrarily.
1277  *
1278  * If a rule is found and 'wc' is non-null, bitwise-OR's 'wc' with the
1279  * set of bits that were significant in the lookup.  At some point
1280  * earlier, 'wc' should have been initialized (e.g., by
1281  * flow_wildcards_init_catchall()).
1282  *
1283  * 'flow' is non-const to allow for temporary modifications during the lookup.
1284  * Any changes are restored before returning. */
1285 const struct cls_rule *
1286 classifier_lookup(const struct classifier *cls, struct flow *flow,
1287                   struct flow_wildcards *wc)
1288 {
1289     return classifier_lookup__(cls, flow, wc, true);
1290 }
1291
1292 /* Finds and returns a rule in 'cls' with exactly the same priority and
1293  * matching criteria as 'target'.  Returns a null pointer if 'cls' doesn't
1294  * contain an exact match.
1295  *
1296  * Returns the first matching rule that is not 'to_be_removed'.  Only one such
1297  * rule may exist. */
1298 const struct cls_rule *
1299 classifier_find_rule_exactly(const struct classifier *cls,
1300                              const struct cls_rule *target)
1301 {
1302     const struct cls_match *head, *rule;
1303     const struct cls_subtable *subtable;
1304
1305     subtable = find_subtable(cls, &target->match.mask);
1306     if (!subtable) {
1307         return NULL;
1308     }
1309
1310     head = find_equal(subtable, &target->match.flow,
1311                       miniflow_hash_in_minimask(&target->match.flow,
1312                                                 &target->match.mask, 0));
1313     if (!head) {
1314         return NULL;
1315     }
1316     FOR_EACH_RULE_IN_LIST (rule, head) {
1317         if (rule->priority < target->priority) {
1318             break; /* Not found. */
1319         }
1320         if (rule->priority == target->priority
1321             && !rule->cls_rule->to_be_removed) {
1322             return rule->cls_rule;
1323         }
1324     }
1325     return NULL;
1326 }
1327
1328 /* Finds and returns a rule in 'cls' with priority 'priority' and exactly the
1329  * same matching criteria as 'target'.  Returns a null pointer if 'cls' doesn't
1330  * contain an exact match. */
1331 const struct cls_rule *
1332 classifier_find_match_exactly(const struct classifier *cls,
1333                               const struct match *target, int priority)
1334 {
1335     const struct cls_rule *retval;
1336     struct cls_rule cr;
1337
1338     cls_rule_init(&cr, target, priority);
1339     retval = classifier_find_rule_exactly(cls, &cr);
1340     cls_rule_destroy(&cr);
1341
1342     return retval;
1343 }
1344
1345 /* Checks if 'target' would overlap any other rule in 'cls'.  Two rules are
1346  * considered to overlap if both rules have the same priority and a packet
1347  * could match both.
1348  *
1349  * A trivial example of overlapping rules is two rules matching disjoint sets
1350  * of fields. E.g., if one rule matches only on port number, while another only
1351  * on dl_type, any packet from that specific port and with that specific
1352  * dl_type could match both, if the rules also have the same priority.
1353  *
1354  * 'target' is not considered to overlap with a rule that has been marked
1355  * as 'to_be_removed'.
1356  */
1357 bool
1358 classifier_rule_overlaps(const struct classifier *cls,
1359                          const struct cls_rule *target)
1360 {
1361     struct cls_subtable *subtable;
1362
1363     /* Iterate subtables in the descending max priority order. */
1364     PVECTOR_FOR_EACH_PRIORITY (subtable, target->priority - 1, 2,
1365                                sizeof(struct cls_subtable), &cls->subtables) {
1366         uint64_t storage[FLOW_U64S];
1367         struct minimask mask;
1368         const struct cls_rule *rule;
1369
1370         minimask_combine(&mask, &target->match.mask, &subtable->mask, storage);
1371
1372         RCULIST_FOR_EACH (rule, node, &subtable->rules_list) {
1373             if (rule->priority == target->priority
1374                 && !rule->to_be_removed
1375                 && miniflow_equal_in_minimask(&target->match.flow,
1376                                               &rule->match.flow, &mask)) {
1377                 return true;
1378             }
1379         }
1380     }
1381     return false;
1382 }
1383
1384 /* Returns true if 'rule' exactly matches 'criteria' or if 'rule' is more
1385  * specific than 'criteria'.  That is, 'rule' matches 'criteria' and this
1386  * function returns true if, for every field:
1387  *
1388  *   - 'criteria' and 'rule' specify the same (non-wildcarded) value for the
1389  *     field, or
1390  *
1391  *   - 'criteria' wildcards the field,
1392  *
1393  * Conversely, 'rule' does not match 'criteria' and this function returns false
1394  * if, for at least one field:
1395  *
1396  *   - 'criteria' and 'rule' specify different values for the field, or
1397  *
1398  *   - 'criteria' specifies a value for the field but 'rule' wildcards it.
1399  *
1400  * Equivalently, the truth table for whether a field matches is:
1401  *
1402  *                                     rule
1403  *
1404  *                   c         wildcard    exact
1405  *                   r        +---------+---------+
1406  *                   i   wild |   yes   |   yes   |
1407  *                   t   card |         |         |
1408  *                   e        +---------+---------+
1409  *                   r  exact |    no   |if values|
1410  *                   i        |         |are equal|
1411  *                   a        +---------+---------+
1412  *
1413  * This is the matching rule used by OpenFlow 1.0 non-strict OFPT_FLOW_MOD
1414  * commands and by OpenFlow 1.0 aggregate and flow stats.
1415  *
1416  * Ignores rule->priority. */
1417 bool
1418 cls_rule_is_loose_match(const struct cls_rule *rule,
1419                         const struct minimatch *criteria)
1420 {
1421     return (!minimask_has_extra(&rule->match.mask, &criteria->mask)
1422             && miniflow_equal_in_minimask(&rule->match.flow, &criteria->flow,
1423                                           &criteria->mask));
1424 }
1425 \f
1426 /* Iteration. */
1427
1428 static bool
1429 rule_matches(const struct cls_rule *rule, const struct cls_rule *target)
1430 {
1431     /* Iterators never see rules that have been marked for removal.
1432      * This allows them to be oblivious of duplicate rules. */
1433     return (!rule->to_be_removed &&
1434             (!target
1435              || miniflow_equal_in_minimask(&rule->match.flow,
1436                                            &target->match.flow,
1437                                            &target->match.mask)));
1438 }
1439
1440 static const struct cls_rule *
1441 search_subtable(const struct cls_subtable *subtable,
1442                 struct cls_cursor *cursor)
1443 {
1444     if (!cursor->target
1445         || !minimask_has_extra(&subtable->mask, &cursor->target->match.mask)) {
1446         const struct cls_rule *rule;
1447
1448         RCULIST_FOR_EACH (rule, node, &subtable->rules_list) {
1449             if (rule_matches(rule, cursor->target)) {
1450                 return rule;
1451             }
1452         }
1453     }
1454     return NULL;
1455 }
1456
1457 /* Initializes 'cursor' for iterating through rules in 'cls', and returns the
1458  * first matching cls_rule via '*pnode', or NULL if there are no matches.
1459  *
1460  *     - If 'target' is null, the cursor will visit every rule in 'cls'.
1461  *
1462  *     - If 'target' is nonnull, the cursor will visit each 'rule' in 'cls'
1463  *       such that cls_rule_is_loose_match(rule, target) returns true.
1464  *
1465  * Ignores target->priority. */
1466 struct cls_cursor
1467 cls_cursor_start(const struct classifier *cls, const struct cls_rule *target)
1468 {
1469     struct cls_cursor cursor;
1470     struct cls_subtable *subtable;
1471
1472     cursor.cls = cls;
1473     cursor.target = target && !cls_rule_is_catchall(target) ? target : NULL;
1474     cursor.rule = NULL;
1475
1476     /* Find first rule. */
1477     PVECTOR_CURSOR_FOR_EACH (subtable, &cursor.subtables,
1478                              &cursor.cls->subtables) {
1479         const struct cls_rule *rule = search_subtable(subtable, &cursor);
1480
1481         if (rule) {
1482             cursor.subtable = subtable;
1483             cursor.rule = rule;
1484             break;
1485         }
1486     }
1487
1488     return cursor;
1489 }
1490
1491 static const struct cls_rule *
1492 cls_cursor_next(struct cls_cursor *cursor)
1493 {
1494     const struct cls_rule *rule;
1495     const struct cls_subtable *subtable;
1496
1497     rule = cursor->rule;
1498     subtable = cursor->subtable;
1499     RCULIST_FOR_EACH_CONTINUE (rule, node, &subtable->rules_list) {
1500         if (rule_matches(rule, cursor->target)) {
1501             return rule;
1502         }
1503     }
1504
1505     PVECTOR_CURSOR_FOR_EACH_CONTINUE (subtable, &cursor->subtables) {
1506         rule = search_subtable(subtable, cursor);
1507         if (rule) {
1508             cursor->subtable = subtable;
1509             return rule;
1510         }
1511     }
1512
1513     return NULL;
1514 }
1515
1516 /* Sets 'cursor->rule' to the next matching cls_rule in 'cursor''s iteration,
1517  * or to null if all matching rules have been visited. */
1518 void
1519 cls_cursor_advance(struct cls_cursor *cursor)
1520 {
1521     cursor->rule = cls_cursor_next(cursor);
1522 }
1523 \f
1524 static struct cls_subtable *
1525 find_subtable(const struct classifier *cls, const struct minimask *mask)
1526 {
1527     struct cls_subtable *subtable;
1528
1529     CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, minimask_hash(mask, 0),
1530                              &cls->subtables_map) {
1531         if (minimask_equal(mask, &subtable->mask)) {
1532             return subtable;
1533         }
1534     }
1535     return NULL;
1536 }
1537
1538 /* The new subtable will be visible to the readers only after this. */
1539 static struct cls_subtable *
1540 insert_subtable(struct classifier *cls, const struct minimask *mask)
1541 {
1542     uint32_t hash = minimask_hash(mask, 0);
1543     struct cls_subtable *subtable;
1544     int i, index = 0;
1545     struct flow_wildcards old, new;
1546     uint8_t prev;
1547     int count = count_1bits(mask->masks.map);
1548
1549     subtable = xzalloc(sizeof *subtable - sizeof mask->masks.inline_values
1550                        + MINIFLOW_VALUES_SIZE(count));
1551     cmap_init(&subtable->rules);
1552     miniflow_clone_inline(CONST_CAST(struct miniflow *, &subtable->mask.masks),
1553                           &mask->masks, count);
1554
1555     /* Init indices for segmented lookup, if any. */
1556     flow_wildcards_init_catchall(&new);
1557     old = new;
1558     prev = 0;
1559     for (i = 0; i < cls->n_flow_segments; i++) {
1560         flow_wildcards_fold_minimask_range(&new, mask, prev,
1561                                            cls->flow_segments[i]);
1562         /* Add an index if it adds mask bits. */
1563         if (!flow_wildcards_equal(&new, &old)) {
1564             cmap_init(&subtable->indices[index]);
1565             *CONST_CAST(uint8_t *, &subtable->index_ofs[index])
1566                 = cls->flow_segments[i];
1567             index++;
1568             old = new;
1569         }
1570         prev = cls->flow_segments[i];
1571     }
1572     /* Check if the rest of the subtable's mask adds any bits,
1573      * and remove the last index if it doesn't. */
1574     if (index > 0) {
1575         flow_wildcards_fold_minimask_range(&new, mask, prev, FLOW_U64S);
1576         if (flow_wildcards_equal(&new, &old)) {
1577             --index;
1578             *CONST_CAST(uint8_t *, &subtable->index_ofs[index]) = 0;
1579             cmap_destroy(&subtable->indices[index]);
1580         }
1581     }
1582     *CONST_CAST(uint8_t *, &subtable->n_indices) = index;
1583
1584     *CONST_CAST(tag_type *, &subtable->tag) =
1585         (minimask_get_metadata_mask(mask) == OVS_BE64_MAX
1586          ? tag_create_deterministic(hash)
1587          : TAG_ALL);
1588
1589     for (i = 0; i < cls->n_tries; i++) {
1590         subtable->trie_plen[i] = minimask_get_prefix_len(mask,
1591                                                          cls->tries[i].field);
1592     }
1593
1594     /* Ports trie. */
1595     ovsrcu_set_hidden(&subtable->ports_trie, NULL);
1596     *CONST_CAST(int *, &subtable->ports_mask_len)
1597         = 32 - ctz32(ntohl(MINIFLOW_GET_BE32(&mask->masks, tp_src)));
1598
1599     /* List of rules. */
1600     rculist_init(&subtable->rules_list);
1601
1602     cmap_insert(&cls->subtables_map, &subtable->cmap_node, hash);
1603
1604     return subtable;
1605 }
1606
1607 /* RCU readers may still access the subtable before it is actually freed. */
1608 static void
1609 destroy_subtable(struct classifier *cls, struct cls_subtable *subtable)
1610 {
1611     int i;
1612
1613     pvector_remove(&cls->subtables, subtable);
1614     cmap_remove(&cls->subtables_map, &subtable->cmap_node,
1615                 minimask_hash(&subtable->mask, 0));
1616
1617     ovs_assert(ovsrcu_get_protected(struct trie_node *, &subtable->ports_trie)
1618                == NULL);
1619     ovs_assert(cmap_is_empty(&subtable->rules));
1620     ovs_assert(rculist_is_empty(&subtable->rules_list));
1621
1622     for (i = 0; i < subtable->n_indices; i++) {
1623         cmap_destroy(&subtable->indices[i]);
1624     }
1625     cmap_destroy(&subtable->rules);
1626     ovsrcu_postpone(free, subtable);
1627 }
1628
1629 struct range {
1630     uint8_t start;
1631     uint8_t end;
1632 };
1633
1634 static unsigned int be_get_bit_at(const ovs_be32 value[], unsigned int ofs);
1635
1636 /* Return 'true' if can skip rest of the subtable based on the prefix trie
1637  * lookup results. */
1638 static inline bool
1639 check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries,
1640             const unsigned int field_plen[CLS_MAX_TRIES],
1641             const struct range ofs, const struct flow *flow,
1642             struct flow_wildcards *wc)
1643 {
1644     int j;
1645
1646     /* Check if we could avoid fully unwildcarding the next level of
1647      * fields using the prefix tries.  The trie checks are done only as
1648      * needed to avoid folding in additional bits to the wildcards mask. */
1649     for (j = 0; j < n_tries; j++) {
1650         /* Is the trie field relevant for this subtable? */
1651         if (field_plen[j]) {
1652             struct trie_ctx *ctx = &trie_ctx[j];
1653             uint8_t be32ofs = ctx->be32ofs;
1654             uint8_t be64ofs = be32ofs / 2;
1655
1656             /* Is the trie field within the current range of fields? */
1657             if (be64ofs >= ofs.start && be64ofs < ofs.end) {
1658                 /* On-demand trie lookup. */
1659                 if (!ctx->lookup_done) {
1660                     memset(&ctx->match_plens, 0, sizeof ctx->match_plens);
1661                     ctx->maskbits = trie_lookup(ctx->trie, flow,
1662                                                 &ctx->match_plens);
1663                     ctx->lookup_done = true;
1664                 }
1665                 /* Possible to skip the rest of the subtable if subtable's
1666                  * prefix on the field is not included in the lookup result. */
1667                 if (!be_get_bit_at(&ctx->match_plens.be32, field_plen[j] - 1)) {
1668                     /* We want the trie lookup to never result in unwildcarding
1669                      * any bits that would not be unwildcarded otherwise.
1670                      * Since the trie is shared by the whole classifier, it is
1671                      * possible that the 'maskbits' contain bits that are
1672                      * irrelevant for the partition relevant for the current
1673                      * packet.  Hence the checks below. */
1674
1675                     /* Check that the trie result will not unwildcard more bits
1676                      * than this subtable would otherwise. */
1677                     if (ctx->maskbits <= field_plen[j]) {
1678                         /* Unwildcard the bits and skip the rest. */
1679                         mask_set_prefix_bits(wc, be32ofs, ctx->maskbits);
1680                         /* Note: Prerequisite already unwildcarded, as the only
1681                          * prerequisite of the supported trie lookup fields is
1682                          * the ethertype, which is always unwildcarded. */
1683                         return true;
1684                     }
1685                     /* Can skip if the field is already unwildcarded. */
1686                     if (mask_prefix_bits_set(wc, be32ofs, ctx->maskbits)) {
1687                         return true;
1688                     }
1689                 }
1690             }
1691         }
1692     }
1693     return false;
1694 }
1695
1696 /* Returns true if 'target' satisifies 'flow'/'mask', that is, if each bit
1697  * for which 'flow', for which 'mask' has a bit set, specifies a particular
1698  * value has the correct value in 'target'.
1699  *
1700  * This function is equivalent to miniflow_equal_flow_in_minimask(flow,
1701  * target, mask) but this is faster because of the invariant that
1702  * flow->map and mask->masks.map are the same, and that this version
1703  * takes the 'wc'. */
1704 static inline bool
1705 miniflow_and_mask_matches_flow(const struct miniflow *flow,
1706                                const struct minimask *mask,
1707                                const struct flow *target)
1708 {
1709     const uint64_t *flowp = miniflow_get_values(flow);
1710     const uint64_t *maskp = miniflow_get_values(&mask->masks);
1711     int idx;
1712
1713     MAP_FOR_EACH_INDEX(idx, mask->masks.map) {
1714         uint64_t diff = (*flowp++ ^ flow_u64_value(target, idx)) & *maskp++;
1715
1716         if (diff) {
1717             return false;
1718         }
1719     }
1720
1721     return true;
1722 }
1723
1724 static inline const struct cls_match *
1725 find_match(const struct cls_subtable *subtable, const struct flow *flow,
1726            uint32_t hash)
1727 {
1728     const struct cls_match *head, *rule;
1729
1730     CMAP_FOR_EACH_WITH_HASH (head, cmap_node, hash, &subtable->rules) {
1731         if (OVS_LIKELY(miniflow_and_mask_matches_flow(&head->flow,
1732                                                       &subtable->mask,
1733                                                       flow))) {
1734             /* Return highest priority rule that is visible. */
1735             FOR_EACH_RULE_IN_LIST(rule, head) {
1736                 if (OVS_LIKELY(rule->visible)) {
1737                     return rule;
1738                 }
1739             }
1740         }
1741     }
1742
1743     return NULL;
1744 }
1745
1746 /* Returns true if 'target' satisifies 'flow'/'mask', that is, if each bit
1747  * for which 'flow', for which 'mask' has a bit set, specifies a particular
1748  * value has the correct value in 'target'.
1749  *
1750  * This function is equivalent to miniflow_and_mask_matches_flow() but this
1751  * version fills in the mask bits in 'wc'. */
1752 static inline bool
1753 miniflow_and_mask_matches_flow_wc(const struct miniflow *flow,
1754                                   const struct minimask *mask,
1755                                   const struct flow *target,
1756                                   struct flow_wildcards *wc)
1757 {
1758     const uint64_t *flowp = miniflow_get_values(flow);
1759     const uint64_t *maskp = miniflow_get_values(&mask->masks);
1760     int idx;
1761
1762     MAP_FOR_EACH_INDEX(idx, mask->masks.map) {
1763         uint64_t mask = *maskp++;
1764         uint64_t diff = (*flowp++ ^ flow_u64_value(target, idx)) & mask;
1765
1766         if (diff) {
1767             /* Only unwildcard if none of the differing bits is already
1768              * exact-matched. */
1769             if (!(flow_u64_value(&wc->masks, idx) & diff)) {
1770                 /* Keep one bit of the difference.  The selected bit may be
1771                  * different in big-endian v.s. little-endian systems. */
1772                 *flow_u64_lvalue(&wc->masks, idx) |= rightmost_1bit(diff);
1773             }
1774             return false;
1775         }
1776         /* Fill in the bits that were looked at. */
1777         *flow_u64_lvalue(&wc->masks, idx) |= mask;
1778     }
1779
1780     return true;
1781 }
1782
1783 /* Unwildcard the fields looked up so far, if any. */
1784 static void
1785 fill_range_wc(const struct cls_subtable *subtable, struct flow_wildcards *wc,
1786               uint8_t to)
1787 {
1788     if (to) {
1789         flow_wildcards_fold_minimask_range(wc, &subtable->mask, 0, to);
1790     }
1791 }
1792
1793 static const struct cls_match *
1794 find_match_wc(const struct cls_subtable *subtable, const struct flow *flow,
1795               struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries,
1796               struct flow_wildcards *wc)
1797 {
1798     uint32_t basis = 0, hash;
1799     const struct cls_match *rule = NULL;
1800     int i;
1801     struct range ofs;
1802
1803     if (OVS_UNLIKELY(!wc)) {
1804         return find_match(subtable, flow,
1805                           flow_hash_in_minimask(flow, &subtable->mask, 0));
1806     }
1807
1808     ofs.start = 0;
1809     /* Try to finish early by checking fields in segments. */
1810     for (i = 0; i < subtable->n_indices; i++) {
1811         const struct cmap_node *inode;
1812
1813         ofs.end = subtable->index_ofs[i];
1814
1815         if (check_tries(trie_ctx, n_tries, subtable->trie_plen, ofs, flow,
1816                         wc)) {
1817             /* 'wc' bits for the trie field set, now unwildcard the preceding
1818              * bits used so far. */
1819             fill_range_wc(subtable, wc, ofs.start);
1820             return NULL;
1821         }
1822         hash = flow_hash_in_minimask_range(flow, &subtable->mask, ofs.start,
1823                                            ofs.end, &basis);
1824         inode = cmap_find(&subtable->indices[i], hash);
1825         if (!inode) {
1826             /* No match, can stop immediately, but must fold in the bits
1827              * used in lookup so far. */
1828             fill_range_wc(subtable, wc, ofs.end);
1829             return NULL;
1830         }
1831
1832         /* If we have narrowed down to a single rule already, check whether
1833          * that rule matches.  Either way, we're done.
1834          *
1835          * (Rare) hash collisions may cause us to miss the opportunity for this
1836          * optimization. */
1837         if (!cmap_node_next(inode)) {
1838             const struct cls_match *head;
1839
1840             ASSIGN_CONTAINER(head, inode - i, index_nodes);
1841             if (miniflow_and_mask_matches_flow_wc(&head->flow, &subtable->mask,
1842                                                   flow, wc)) {
1843                 /* Return highest priority rule that is visible. */
1844                 FOR_EACH_RULE_IN_LIST(rule, head) {
1845                     if (OVS_LIKELY(rule->visible)) {
1846                         return rule;
1847                     }
1848                 }
1849             }
1850             return NULL;
1851         }
1852         ofs.start = ofs.end;
1853     }
1854     ofs.end = FLOW_U64S;
1855     /* Trie check for the final range. */
1856     if (check_tries(trie_ctx, n_tries, subtable->trie_plen, ofs, flow, wc)) {
1857         fill_range_wc(subtable, wc, ofs.start);
1858         return NULL;
1859     }
1860     hash = flow_hash_in_minimask_range(flow, &subtable->mask, ofs.start,
1861                                        ofs.end, &basis);
1862     rule = find_match(subtable, flow, hash);
1863     if (!rule && subtable->ports_mask_len) {
1864         /* Ports are always part of the final range, if any.
1865          * No match was found for the ports.  Use the ports trie to figure out
1866          * which ports bits to unwildcard. */
1867         unsigned int mbits;
1868         ovs_be32 value, plens, mask;
1869
1870         mask = MINIFLOW_GET_BE32(&subtable->mask.masks, tp_src);
1871         value = ((OVS_FORCE ovs_be32 *)flow)[TP_PORTS_OFS32] & mask;
1872         mbits = trie_lookup_value(&subtable->ports_trie, &value, &plens, 32);
1873
1874         ((OVS_FORCE ovs_be32 *)&wc->masks)[TP_PORTS_OFS32] |=
1875             mask & be32_prefix_mask(mbits);
1876
1877         /* Unwildcard all bits in the mask upto the ports, as they were used
1878          * to determine there is no match. */
1879         fill_range_wc(subtable, wc, TP_PORTS_OFS64);
1880         return NULL;
1881     }
1882
1883     /* Must unwildcard all the fields, as they were looked at. */
1884     flow_wildcards_fold_minimask(wc, &subtable->mask);
1885     return rule;
1886 }
1887
1888 static struct cls_match *
1889 find_equal(const struct cls_subtable *subtable, const struct miniflow *flow,
1890            uint32_t hash)
1891 {
1892     struct cls_match *head;
1893
1894     CMAP_FOR_EACH_WITH_HASH (head, cmap_node, hash, &subtable->rules) {
1895         if (miniflow_equal(&head->flow, flow)) {
1896             return head;
1897         }
1898     }
1899     return NULL;
1900 }
1901 \f
1902 /* A longest-prefix match tree. */
1903
1904 /* Return at least 'plen' bits of the 'prefix', starting at bit offset 'ofs'.
1905  * Prefixes are in the network byte order, and the offset 0 corresponds to
1906  * the most significant bit of the first byte.  The offset can be read as
1907  * "how many bits to skip from the start of the prefix starting at 'pr'". */
1908 static uint32_t
1909 raw_get_prefix(const ovs_be32 pr[], unsigned int ofs, unsigned int plen)
1910 {
1911     uint32_t prefix;
1912
1913     pr += ofs / 32; /* Where to start. */
1914     ofs %= 32;      /* How many bits to skip at 'pr'. */
1915
1916     prefix = ntohl(*pr) << ofs; /* Get the first 32 - ofs bits. */
1917     if (plen > 32 - ofs) {      /* Need more than we have already? */
1918         prefix |= ntohl(*++pr) >> (32 - ofs);
1919     }
1920     /* Return with possible unwanted bits at the end. */
1921     return prefix;
1922 }
1923
1924 /* Return min(TRIE_PREFIX_BITS, plen) bits of the 'prefix', starting at bit
1925  * offset 'ofs'.  Prefixes are in the network byte order, and the offset 0
1926  * corresponds to the most significant bit of the first byte.  The offset can
1927  * be read as "how many bits to skip from the start of the prefix starting at
1928  * 'pr'". */
1929 static uint32_t
1930 trie_get_prefix(const ovs_be32 pr[], unsigned int ofs, unsigned int plen)
1931 {
1932     if (!plen) {
1933         return 0;
1934     }
1935     if (plen > TRIE_PREFIX_BITS) {
1936         plen = TRIE_PREFIX_BITS; /* Get at most TRIE_PREFIX_BITS. */
1937     }
1938     /* Return with unwanted bits cleared. */
1939     return raw_get_prefix(pr, ofs, plen) & ~0u << (32 - plen);
1940 }
1941
1942 /* Return the number of equal bits in 'n_bits' of 'prefix's MSBs and a 'value'
1943  * starting at "MSB 0"-based offset 'ofs'. */
1944 static unsigned int
1945 prefix_equal_bits(uint32_t prefix, unsigned int n_bits, const ovs_be32 value[],
1946                   unsigned int ofs)
1947 {
1948     uint64_t diff = prefix ^ raw_get_prefix(value, ofs, n_bits);
1949     /* Set the bit after the relevant bits to limit the result. */
1950     return raw_clz64(diff << 32 | UINT64_C(1) << (63 - n_bits));
1951 }
1952
1953 /* Return the number of equal bits in 'node' prefix and a 'prefix' of length
1954  * 'plen', starting at "MSB 0"-based offset 'ofs'. */
1955 static unsigned int
1956 trie_prefix_equal_bits(const struct trie_node *node, const ovs_be32 prefix[],
1957                        unsigned int ofs, unsigned int plen)
1958 {
1959     return prefix_equal_bits(node->prefix, MIN(node->n_bits, plen - ofs),
1960                              prefix, ofs);
1961 }
1962
1963 /* Return the bit at ("MSB 0"-based) offset 'ofs' as an int.  'ofs' can
1964  * be greater than 31. */
1965 static unsigned int
1966 be_get_bit_at(const ovs_be32 value[], unsigned int ofs)
1967 {
1968     return (((const uint8_t *)value)[ofs / 8] >> (7 - ofs % 8)) & 1u;
1969 }
1970
1971 /* Return the bit at ("MSB 0"-based) offset 'ofs' as an int.  'ofs' must
1972  * be between 0 and 31, inclusive. */
1973 static unsigned int
1974 get_bit_at(const uint32_t prefix, unsigned int ofs)
1975 {
1976     return (prefix >> (31 - ofs)) & 1u;
1977 }
1978
1979 /* Create new branch. */
1980 static struct trie_node *
1981 trie_branch_create(const ovs_be32 *prefix, unsigned int ofs, unsigned int plen,
1982                    unsigned int n_rules)
1983 {
1984     struct trie_node *node = xmalloc(sizeof *node);
1985
1986     node->prefix = trie_get_prefix(prefix, ofs, plen);
1987
1988     if (plen <= TRIE_PREFIX_BITS) {
1989         node->n_bits = plen;
1990         ovsrcu_set_hidden(&node->edges[0], NULL);
1991         ovsrcu_set_hidden(&node->edges[1], NULL);
1992         node->n_rules = n_rules;
1993     } else { /* Need intermediate nodes. */
1994         struct trie_node *subnode = trie_branch_create(prefix,
1995                                                        ofs + TRIE_PREFIX_BITS,
1996                                                        plen - TRIE_PREFIX_BITS,
1997                                                        n_rules);
1998         int bit = get_bit_at(subnode->prefix, 0);
1999         node->n_bits = TRIE_PREFIX_BITS;
2000         ovsrcu_set_hidden(&node->edges[bit], subnode);
2001         ovsrcu_set_hidden(&node->edges[!bit], NULL);
2002         node->n_rules = 0;
2003     }
2004     return node;
2005 }
2006
2007 static void
2008 trie_node_destroy(const struct trie_node *node)
2009 {
2010     ovsrcu_postpone(free, CONST_CAST(struct trie_node *, node));
2011 }
2012
2013 /* Copy a trie node for modification and postpone delete the old one. */
2014 static struct trie_node *
2015 trie_node_rcu_realloc(const struct trie_node *node)
2016 {
2017     struct trie_node *new_node = xmalloc(sizeof *node);
2018
2019     *new_node = *node;
2020     trie_node_destroy(node);
2021
2022     return new_node;
2023 }
2024
2025 static void
2026 trie_destroy(rcu_trie_ptr *trie)
2027 {
2028     struct trie_node *node = ovsrcu_get_protected(struct trie_node *, trie);
2029
2030     if (node) {
2031         ovsrcu_set_hidden(trie, NULL);
2032         trie_destroy(&node->edges[0]);
2033         trie_destroy(&node->edges[1]);
2034         trie_node_destroy(node);
2035     }
2036 }
2037
2038 static bool
2039 trie_is_leaf(const struct trie_node *trie)
2040 {
2041     /* No children? */
2042     return !ovsrcu_get(struct trie_node *, &trie->edges[0])
2043         && !ovsrcu_get(struct trie_node *, &trie->edges[1]);
2044 }
2045
2046 static void
2047 mask_set_prefix_bits(struct flow_wildcards *wc, uint8_t be32ofs,
2048                      unsigned int n_bits)
2049 {
2050     ovs_be32 *mask = &((ovs_be32 *)&wc->masks)[be32ofs];
2051     unsigned int i;
2052
2053     for (i = 0; i < n_bits / 32; i++) {
2054         mask[i] = OVS_BE32_MAX;
2055     }
2056     if (n_bits % 32) {
2057         mask[i] |= htonl(~0u << (32 - n_bits % 32));
2058     }
2059 }
2060
2061 static bool
2062 mask_prefix_bits_set(const struct flow_wildcards *wc, uint8_t be32ofs,
2063                      unsigned int n_bits)
2064 {
2065     ovs_be32 *mask = &((ovs_be32 *)&wc->masks)[be32ofs];
2066     unsigned int i;
2067     ovs_be32 zeroes = 0;
2068
2069     for (i = 0; i < n_bits / 32; i++) {
2070         zeroes |= ~mask[i];
2071     }
2072     if (n_bits % 32) {
2073         zeroes |= ~mask[i] & htonl(~0u << (32 - n_bits % 32));
2074     }
2075
2076     return !zeroes; /* All 'n_bits' bits set. */
2077 }
2078
2079 static rcu_trie_ptr *
2080 trie_next_edge(struct trie_node *node, const ovs_be32 value[],
2081                unsigned int ofs)
2082 {
2083     return node->edges + be_get_bit_at(value, ofs);
2084 }
2085
2086 static const struct trie_node *
2087 trie_next_node(const struct trie_node *node, const ovs_be32 value[],
2088                unsigned int ofs)
2089 {
2090     return ovsrcu_get(struct trie_node *,
2091                       &node->edges[be_get_bit_at(value, ofs)]);
2092 }
2093
2094 /* Set the bit at ("MSB 0"-based) offset 'ofs'.  'ofs' can be greater than 31.
2095  */
2096 static void
2097 be_set_bit_at(ovs_be32 value[], unsigned int ofs)
2098 {
2099     ((uint8_t *)value)[ofs / 8] |= 1u << (7 - ofs % 8);
2100 }
2101
2102 /* Returns the number of bits in the prefix mask necessary to determine a
2103  * mismatch, in case there are longer prefixes in the tree below the one that
2104  * matched.
2105  * '*plens' will have a bit set for each prefix length that may have matching
2106  * rules.  The caller is responsible for clearing the '*plens' prior to
2107  * calling this.
2108  */
2109 static unsigned int
2110 trie_lookup_value(const rcu_trie_ptr *trie, const ovs_be32 value[],
2111                   ovs_be32 plens[], unsigned int n_bits)
2112 {
2113     const struct trie_node *prev = NULL;
2114     const struct trie_node *node = ovsrcu_get(struct trie_node *, trie);
2115     unsigned int match_len = 0; /* Number of matching bits. */
2116
2117     for (; node; prev = node, node = trie_next_node(node, value, match_len)) {
2118         unsigned int eqbits;
2119         /* Check if this edge can be followed. */
2120         eqbits = prefix_equal_bits(node->prefix, node->n_bits, value,
2121                                    match_len);
2122         match_len += eqbits;
2123         if (eqbits < node->n_bits) { /* Mismatch, nothing more to be found. */
2124             /* Bit at offset 'match_len' differed. */
2125             return match_len + 1; /* Includes the first mismatching bit. */
2126         }
2127         /* Full match, check if rules exist at this prefix length. */
2128         if (node->n_rules > 0) {
2129             be_set_bit_at(plens, match_len - 1);
2130         }
2131         if (match_len >= n_bits) {
2132             return n_bits; /* Full prefix. */
2133         }
2134     }
2135     /* node == NULL.  Full match so far, but we tried to follow an
2136      * non-existing branch.  Need to exclude the other branch if it exists
2137      * (it does not if we were called on an empty trie or 'prev' is a leaf
2138      * node). */
2139     return !prev || trie_is_leaf(prev) ? match_len : match_len + 1;
2140 }
2141
2142 static unsigned int
2143 trie_lookup(const struct cls_trie *trie, const struct flow *flow,
2144             union mf_value *plens)
2145 {
2146     const struct mf_field *mf = trie->field;
2147
2148     /* Check that current flow matches the prerequisites for the trie
2149      * field.  Some match fields are used for multiple purposes, so we
2150      * must check that the trie is relevant for this flow. */
2151     if (mf_are_prereqs_ok(mf, flow)) {
2152         return trie_lookup_value(&trie->root,
2153                                  &((ovs_be32 *)flow)[mf->flow_be32ofs],
2154                                  &plens->be32, mf->n_bits);
2155     }
2156     memset(plens, 0xff, sizeof *plens); /* All prefixes, no skipping. */
2157     return 0; /* Value not used in this case. */
2158 }
2159
2160 /* Returns the length of a prefix match mask for the field 'mf' in 'minimask'.
2161  * Returns the u32 offset to the miniflow data in '*miniflow_index', if
2162  * 'miniflow_index' is not NULL. */
2163 static unsigned int
2164 minimask_get_prefix_len(const struct minimask *minimask,
2165                         const struct mf_field *mf)
2166 {
2167     unsigned int n_bits = 0, mask_tz = 0; /* Non-zero when end of mask seen. */
2168     uint8_t be32_ofs = mf->flow_be32ofs;
2169     uint8_t be32_end = be32_ofs + mf->n_bytes / 4;
2170
2171     for (; be32_ofs < be32_end; ++be32_ofs) {
2172         uint32_t mask = ntohl(minimask_get_be32(minimask, be32_ofs));
2173
2174         /* Validate mask, count the mask length. */
2175         if (mask_tz) {
2176             if (mask) {
2177                 return 0; /* No bits allowed after mask ended. */
2178             }
2179         } else {
2180             if (~mask & (~mask + 1)) {
2181                 return 0; /* Mask not contiguous. */
2182             }
2183             mask_tz = ctz32(mask);
2184             n_bits += 32 - mask_tz;
2185         }
2186     }
2187
2188     return n_bits;
2189 }
2190
2191 /*
2192  * This is called only when mask prefix is known to be CIDR and non-zero.
2193  * Relies on the fact that the flow and mask have the same map, and since
2194  * the mask is CIDR, the storage for the flow field exists even if it
2195  * happened to be zeros.
2196  */
2197 static const ovs_be32 *
2198 minimatch_get_prefix(const struct minimatch *match, const struct mf_field *mf)
2199 {
2200     return (OVS_FORCE const ovs_be32 *)
2201         (miniflow_get_values(&match->flow)
2202          + count_1bits(match->flow.map &
2203                        ((UINT64_C(1) << mf->flow_be32ofs / 2) - 1)))
2204         + (mf->flow_be32ofs & 1);
2205 }
2206
2207 /* Insert rule in to the prefix tree.
2208  * 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
2209  * in 'rule'. */
2210 static void
2211 trie_insert(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
2212 {
2213     trie_insert_prefix(&trie->root,
2214                        minimatch_get_prefix(&rule->match, trie->field), mlen);
2215 }
2216
2217 static void
2218 trie_insert_prefix(rcu_trie_ptr *edge, const ovs_be32 *prefix, int mlen)
2219 {
2220     struct trie_node *node;
2221     int ofs = 0;
2222
2223     /* Walk the tree. */
2224     for (; (node = ovsrcu_get_protected(struct trie_node *, edge));
2225          edge = trie_next_edge(node, prefix, ofs)) {
2226         unsigned int eqbits = trie_prefix_equal_bits(node, prefix, ofs, mlen);
2227         ofs += eqbits;
2228         if (eqbits < node->n_bits) {
2229             /* Mismatch, new node needs to be inserted above. */
2230             int old_branch = get_bit_at(node->prefix, eqbits);
2231             struct trie_node *new_parent;
2232
2233             new_parent = trie_branch_create(prefix, ofs - eqbits, eqbits,
2234                                             ofs == mlen ? 1 : 0);
2235             /* Copy the node to modify it. */
2236             node = trie_node_rcu_realloc(node);
2237             /* Adjust the new node for its new position in the tree. */
2238             node->prefix <<= eqbits;
2239             node->n_bits -= eqbits;
2240             ovsrcu_set_hidden(&new_parent->edges[old_branch], node);
2241
2242             /* Check if need a new branch for the new rule. */
2243             if (ofs < mlen) {
2244                 ovsrcu_set_hidden(&new_parent->edges[!old_branch],
2245                                   trie_branch_create(prefix, ofs, mlen - ofs,
2246                                                      1));
2247             }
2248             ovsrcu_set(edge, new_parent); /* Publish changes. */
2249             return;
2250         }
2251         /* Full match so far. */
2252
2253         if (ofs == mlen) {
2254             /* Full match at the current node, rule needs to be added here. */
2255             node->n_rules++;
2256             return;
2257         }
2258     }
2259     /* Must insert a new tree branch for the new rule. */
2260     ovsrcu_set(edge, trie_branch_create(prefix, ofs, mlen - ofs, 1));
2261 }
2262
2263 /* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
2264  * in 'rule'. */
2265 static void
2266 trie_remove(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
2267 {
2268     trie_remove_prefix(&trie->root,
2269                        minimatch_get_prefix(&rule->match, trie->field), mlen);
2270 }
2271
2272 /* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
2273  * in 'rule'. */
2274 static void
2275 trie_remove_prefix(rcu_trie_ptr *root, const ovs_be32 *prefix, int mlen)
2276 {
2277     struct trie_node *node;
2278     rcu_trie_ptr *edges[sizeof(union mf_value) * 8];
2279     int depth = 0, ofs = 0;
2280
2281     /* Walk the tree. */
2282     for (edges[0] = root;
2283          (node = ovsrcu_get_protected(struct trie_node *, edges[depth]));
2284          edges[++depth] = trie_next_edge(node, prefix, ofs)) {
2285         unsigned int eqbits = trie_prefix_equal_bits(node, prefix, ofs, mlen);
2286
2287         if (eqbits < node->n_bits) {
2288             /* Mismatch, nothing to be removed.  This should never happen, as
2289              * only rules in the classifier are ever removed. */
2290             break; /* Log a warning. */
2291         }
2292         /* Full match so far. */
2293         ofs += eqbits;
2294
2295         if (ofs == mlen) {
2296             /* Full prefix match at the current node, remove rule here. */
2297             if (!node->n_rules) {
2298                 break; /* Log a warning. */
2299             }
2300             node->n_rules--;
2301
2302             /* Check if can prune the tree. */
2303             while (!node->n_rules) {
2304                 struct trie_node *next,
2305                     *edge0 = ovsrcu_get_protected(struct trie_node *,
2306                                                   &node->edges[0]),
2307                     *edge1 = ovsrcu_get_protected(struct trie_node *,
2308                                                   &node->edges[1]);
2309
2310                 if (edge0 && edge1) {
2311                     break; /* A branching point, cannot prune. */
2312                 }
2313
2314                 /* Else have at most one child node, remove this node. */
2315                 next = edge0 ? edge0 : edge1;
2316
2317                 if (next) {
2318                     if (node->n_bits + next->n_bits > TRIE_PREFIX_BITS) {
2319                         break;   /* Cannot combine. */
2320                     }
2321                     next = trie_node_rcu_realloc(next); /* Modify. */
2322
2323                     /* Combine node with next. */
2324                     next->prefix = node->prefix | next->prefix >> node->n_bits;
2325                     next->n_bits += node->n_bits;
2326                 }
2327                 /* Update the parent's edge. */
2328                 ovsrcu_set(edges[depth], next); /* Publish changes. */
2329                 trie_node_destroy(node);
2330
2331                 if (next || !depth) {
2332                     /* Branch not pruned or at root, nothing more to do. */
2333                     break;
2334                 }
2335                 node = ovsrcu_get_protected(struct trie_node *,
2336                                             edges[--depth]);
2337             }
2338             return;
2339         }
2340     }
2341     /* Cannot go deeper. This should never happen, since only rules
2342      * that actually exist in the classifier are ever removed. */
2343     VLOG_WARN("Trying to remove non-existing rule from a prefix trie.");
2344 }