lib/cmap.c

   1 /*
   2  * Copyright (c) 2014 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "cmap.h"
  19 #include "hash.h"
  20 #include "ovs-rcu.h"
  21 #include "random.h"
  22 #include "util.h"
  23
  24 /* Optimistic Concurrent Cuckoo Hash
  25  * =================================
  26  *
  27  * A "cuckoo hash" is an open addressing hash table schema, designed such that
  28  * a given element can be in one of only a small number of buckets 'd', each of
  29  * which holds up to a small number 'k' elements.  Thus, the expected and
  30  * worst-case lookup times are O(1) because they require comparing no more than
  31  * a fixed number of elements (k * d).  Inserting a new element can require
  32  * moving around existing elements, but it is also O(1) amortized expected
  33  * time.
  34  *
  35  * An optimistic concurrent hash table goes one step further, making it
  36  * possible for a single writer to execute concurrently with any number of
  37  * readers without requiring the readers to take any locks.
  38  *
  39  * This cuckoo hash implementation uses:
  40  *
  41  *    - Two hash functions (d=2).  More hash functions allow for a higher load
  42  *      factor, but increasing 'k' is easier and the benefits of increasing 'd'
  43  *      quickly fall off with the 'k' values used here.  Also, the method of
  44  *      generating hashes used in this implementation is hard to reasonably
  45  *      extend beyond d=2.  Finally, each additional hash function means that a
  46  *      lookup has to look at least one extra cache line.
  47  *
  48  *    - 5 or 7 elements per bucket (k=5 or k=7), chosen to make buckets
  49  *      exactly one cache line in size.
  50  *
  51  * According to Erlingsson [4], these parameters suggest a maximum load factor
  52  * of about 93%.  The current implementation is conservative, expanding the
  53  * hash table when it is over 85% full.
  54  *
  55  *
  56  * Hash Functions
  57  * ==============
  58  *
  59  * A cuckoo hash requires multiple hash functions.  When reorganizing the hash
  60  * becomes too difficult, it also requires the ability to change the hash
  61  * functions.  Requiring the client to provide multiple hashes and to be able
  62  * to change them to new hashes upon insertion is inconvenient.
  63  *
  64  * This implementation takes another approach.  The client provides a single,
  65  * fixed hash.  The cuckoo hash internally "rehashes" this hash against a
  66  * randomly selected basis value (see rehash()).  This rehashed value is one of
  67  * the two hashes.  The other hash is computed by 16-bit circular rotation of
  68  * the rehashed value.  Updating the basis changes the hash functions.
  69  *
  70  * To work properly, the hash functions used by a cuckoo hash must be
  71  * independent.  If one hash function is a function of the other (e.g. h2(x) =
  72  * h1(x) + 1, or h2(x) = hash(h1(x))), then insertion will eventually fail
  73  * catastrophically (loop forever) because of collisions.  With this rehashing
  74  * technique, the two hashes are completely independent for masks up to 16 bits
  75  * wide.  For masks wider than 16 bits, only 32-n bits are independent between
  76  * the two hashes.  Thus, it becomes risky to grow a cuckoo hash table beyond
  77  * about 2**24 buckets (about 71 million elements with k=5 and maximum load
  78  * 85%).  Fortunately, Open vSwitch does not normally deal with hash tables
  79  * this large.
  80  *
  81  *
  82  * Handling Duplicates
  83  * ===================
  84  *
  85  * This cuckoo hash table implementation deals with duplicate client-provided
  86  * hash values by chaining: the second and subsequent cmap_nodes with a given
  87  * hash are chained off the initially inserted node's 'next' member.  The hash
  88  * table maintains the invariant that a single client-provided hash value
  89  * exists in only a single chain in a single bucket (even though that hash
  90  * could be stored in two buckets).
  91  *
  92  *
  93  * References
  94  * ==========
  95  *
  96  * [1] D. Zhou, B. Fan, H. Lim, M. Kaminsky, D. G. Andersen, "Scalable, High
  97  *     Performance Ethernet Forwarding with CuckooSwitch".  In Proc. 9th
  98  *     CoNEXT, Dec. 2013.
  99  *
 100  * [2] B. Fan, D. G. Andersen, and M. Kaminsky. "MemC3: Compact and concurrent
 101  *     memcache with dumber caching and smarter hashing".  In Proc. 10th USENIX
 102  *     NSDI, Apr. 2013
 103  *
 104  * [3] R. Pagh and F. Rodler. "Cuckoo hashing". Journal of Algorithms, 51(2):
 105  *     122-144, May 2004.
 106  *
 107  * [4] U. Erlingsson, M. Manasse, F. McSherry, "A Cool and Practical
 108  *     Alternative to Traditional Hash Tables".  In Proc. 7th Workshop on
 109  *     Distributed Data and Structures (WDAS'06), 2006.
 110  */
 111 /* An entry is an int and a pointer: 8 bytes on 32-bit, 12 bytes on 64-bit. */
 112 #define CMAP_ENTRY_SIZE (4 + (UINTPTR_MAX == UINT32_MAX ? 4 : 8))
 113
 114 /* Number of entries per bucket: 7 on 32-bit, 5 on 64-bit. */
 115 #define CMAP_K ((CACHE_LINE_SIZE - 4) / CMAP_ENTRY_SIZE)
 116
 117 /* Pad to make a bucket a full cache line in size: 4 on 32-bit, 0 on 64-bit. */
 118 #define CMAP_PADDING ((CACHE_LINE_SIZE - 4) - (CMAP_K * CMAP_ENTRY_SIZE))
 119
 120 /* A cuckoo hash bucket.  Designed to be cache-aligned and exactly one cache
 121  * line long. */
 122 struct cmap_bucket {
 123     /* Allows readers to track in-progress changes.  Initially zero, each
 124      * writer increments this value just before and just after each change (see
 125      * cmap_set_bucket()).  Thus, a reader can ensure that it gets a consistent
 126      * snapshot by waiting for the counter to become even (see
 127      * read_even_counter()), then checking that its value does not change while
 128      * examining the bucket (see cmap_find()). */
 129     atomic_uint32_t counter;
 130
 131     /* (hash, node) slots.  They are parallel arrays instead of an array of
 132      * structs to reduce the amount of space lost to padding.
 133      *
 134      * The slots are in no particular order.  A null pointer indicates that a
 135      * pair is unused.  In-use slots are not necessarily in the earliest
 136      * slots. */
 137     atomic_uint32_t hashes[CMAP_K];
 138     struct cmap_node nodes[CMAP_K];
 139
 140     /* Padding to make cmap_bucket exactly one cache line long. */
 141 #if CMAP_PADDING > 0
 142     uint8_t pad[CMAP_PADDING];
 143 #endif
 144 };
 145 BUILD_ASSERT_DECL(sizeof(struct cmap_bucket) == CACHE_LINE_SIZE);
 146
 147 /* Default maximum load factor (as a fraction of UINT32_MAX + 1) before
 148  * enlarging a cmap.  Reasonable values lie between about 75% and 93%.  Smaller
 149  * values waste memory; larger values increase the average insertion time. */
 150 #define CMAP_MAX_LOAD ((uint32_t) (UINT32_MAX * .85))
 151
 152 /* The implementation of a concurrent hash map. */
 153 struct cmap_impl {
 154     unsigned int n;             /* Number of in-use elements. */
 155     unsigned int max_n;         /* Max elements before enlarging. */
 156     uint32_t mask;              /* Number of 'buckets', minus one. */
 157     uint32_t basis;             /* Basis for rehashing client's hash values. */
 158
 159     /* Padding to make cmap_impl exactly one cache line long. */
 160     uint8_t pad[CACHE_LINE_SIZE - sizeof(unsigned int) * 4];
 161
 162     struct cmap_bucket buckets[];
 163 };
 164 BUILD_ASSERT_DECL(sizeof(struct cmap_impl) == CACHE_LINE_SIZE);
 165
 166 static uint32_t cmap_get_hash__(const atomic_uint32_t *hash,
 167                                 memory_order order)
 168 {
 169     uint32_t hash__;
 170
 171     atomic_read_explicit(CONST_CAST(ATOMIC(uint32_t) *, hash), &hash__, order);
 172     return hash__;
 173 }
 174
 175 #define cmap_get_hash(HASH) \
 176     cmap_get_hash__(HASH, memory_order_acquire)
 177 #define cmap_get_hash_protected(HASH) \
 178     cmap_get_hash__(HASH, memory_order_relaxed)
 179
 180 static struct cmap_impl *cmap_rehash(struct cmap *, uint32_t mask);
 181
 182 /* Given a rehashed value 'hash', returns the other hash for that rehashed
 183  * value.  This is symmetric: other_hash(other_hash(x)) == x.  (See also "Hash
 184  * Functions" at the top of this file.) */
 185 static uint32_t
 186 other_hash(uint32_t hash)
 187 {
 188     return (hash << 16) | (hash >> 16);
 189 }
 190
 191 /* Returns the rehashed value for 'hash' within 'impl'.  (See also "Hash
 192  * Functions" at the top of this file.) */
 193 static uint32_t
 194 rehash(const struct cmap_impl *impl, uint32_t hash)
 195 {
 196     return hash_finish(impl->basis, hash);
 197 }
 198
 199 static struct cmap_impl *
 200 cmap_get_impl(const struct cmap *cmap)
 201 {
 202     return ovsrcu_get(struct cmap_impl *, &cmap->impl);
 203 }
 204
 205 static uint32_t
 206 calc_max_n(uint32_t mask)
 207 {
 208     return ((uint64_t) (mask + 1) * CMAP_K * CMAP_MAX_LOAD) >> 32;
 209 }
 210
 211 static struct cmap_impl *
 212 cmap_impl_create(uint32_t mask)
 213 {
 214     struct cmap_impl *impl;
 215
 216     ovs_assert(is_pow2(mask + 1));
 217
 218     impl = xzalloc_cacheline(sizeof *impl
 219                              + (mask + 1) * sizeof *impl->buckets);
 220     impl->n = 0;
 221     impl->max_n = calc_max_n(mask);
 222     impl->mask = mask;
 223     impl->basis = random_uint32();
 224
 225     return impl;
 226 }
 227
 228 /* Initializes 'cmap' as an empty concurrent hash map. */
 229 void
 230 cmap_init(struct cmap *cmap)
 231 {
 232     ovsrcu_set(&cmap->impl, cmap_impl_create(0));
 233 }
 234
 235 /* Destroys 'cmap'.
 236  *
 237  * The client is responsible for destroying any data previously held in
 238  * 'cmap'. */
 239 void
 240 cmap_destroy(struct cmap *cmap)
 241 {
 242     if (cmap) {
 243         ovsrcu_postpone(free_cacheline, cmap_get_impl(cmap));
 244     }
 245 }
 246
 247 /* Returns the number of elements in 'cmap'. */
 248 size_t
 249 cmap_count(const struct cmap *cmap)
 250 {
 251     return cmap_get_impl(cmap)->n;
 252 }
 253
 254 /* Returns true if 'cmap' is empty, false otherwise. */
 255 bool
 256 cmap_is_empty(const struct cmap *cmap)
 257 {
 258     return cmap_count(cmap) == 0;
 259 }
 260
 261 static uint32_t
 262 read_counter(struct cmap_bucket *bucket)
 263 {
 264     uint32_t counter;
 265
 266     atomic_read_explicit(&bucket->counter, &counter, memory_order_acquire);
 267     return counter;
 268 }
 269
 270 static uint32_t
 271 read_even_counter(struct cmap_bucket *bucket)
 272 {
 273     uint32_t counter;
 274
 275     do {
 276         counter = read_counter(bucket);
 277     } while (OVS_UNLIKELY(counter & 1));
 278
 279     return counter;
 280 }
 281
 282 static bool
 283 counter_changed(struct cmap_bucket *b, uint32_t c)
 284 {
 285     return OVS_UNLIKELY(read_counter(b) != c);
 286 }
 287
 288 /* Searches 'cmap' for an element with the specified 'hash'.  If one or more is
 289  * found, returns a pointer to the first one, otherwise a null pointer.  All of
 290  * the nodes on the returned list are guaranteed to have exactly the given
 291  * 'hash'.
 292  *
 293  * This function works even if 'cmap' is changing concurrently.  If 'cmap' is
 294  * not changing, then cmap_find_protected() is slightly faster.
 295  *
 296  * CMAP_FOR_EACH_WITH_HASH is usually more convenient. */
 297 struct cmap_node *
 298 cmap_find(const struct cmap *cmap, uint32_t hash)
 299 {
 300     struct cmap_impl *impl = cmap_get_impl(cmap);
 301     uint32_t h1 = rehash(impl, hash);
 302     uint32_t h2 = other_hash(h1);
 303     struct cmap_bucket *b1;
 304     struct cmap_bucket *b2;
 305     uint32_t c1, c2;
 306     int i;
 307     struct cmap_node *node;
 308
 309     b1 = &impl->buckets[h1 & impl->mask];
 310     b2 = &impl->buckets[h2 & impl->mask];
 311 retry:
 312     node = NULL;
 313     c1 = read_even_counter(b1);
 314     for (i = 0; i < CMAP_K; i++) {
 315         if (cmap_get_hash(&b1->hashes[i]) == hash) {
 316             node = cmap_node_next(&b1->nodes[i]);
 317             break;
 318         }
 319     }
 320     if (OVS_UNLIKELY(counter_changed(b1, c1))) {
 321         goto retry;
 322     }
 323     if (node) {
 324         return node;
 325     }
 326
 327 retry2:
 328     node = NULL;
 329     c2 = read_even_counter(b2);
 330     for (i = 0; i < CMAP_K; i++) {
 331         if (cmap_get_hash(&b2->hashes[i]) == hash) {
 332             node = cmap_node_next(&b2->nodes[i]);
 333             break;
 334         }
 335     }
 336     if (OVS_UNLIKELY(counter_changed(b2, c2))) {
 337         goto retry2;
 338     }
 339     if (node) {
 340         return node;
 341     }
 342
 343     /* We just got a stable reading on 'b2', but a node could have been moved
 344      * to 'b1', so we need to chack the 'c1' again. */
 345     if (counter_changed(b1, c1)) {
 346         goto retry;
 347     }
 348     return NULL;
 349 }
 350
 351 static int
 352 cmap_find_slot_protected(struct cmap_bucket *b, uint32_t hash)
 353 {
 354     int i;
 355
 356     for (i = 0; i < CMAP_K; i++) {
 357         struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]);
 358
 359         if (node && cmap_get_hash_protected(&b->hashes[i]) == hash) {
 360             return i;
 361         }
 362     }
 363     return -1;
 364 }
 365
 366 static struct cmap_node *
 367 cmap_find_bucket_protected(struct cmap_impl *impl, uint32_t hash, uint32_t h)
 368 {
 369     struct cmap_bucket *b = &impl->buckets[h & impl->mask];
 370     int i;
 371
 372     for (i = 0; i < CMAP_K; i++) {
 373         struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]);
 374
 375         if (node && cmap_get_hash_protected(&b->hashes[i]) == hash) {
 376             return node;
 377         }
 378     }
 379     return NULL;
 380 }
 381
 382 /* Like cmap_find(), but only for use if 'cmap' cannot change concurrently.
 383  *
 384  * CMAP_FOR_EACH_WITH_HASH_PROTECTED is usually more convenient. */
 385 struct cmap_node *
 386 cmap_find_protected(const struct cmap *cmap, uint32_t hash)
 387 {
 388     struct cmap_impl *impl = cmap_get_impl(cmap);
 389     uint32_t h1 = rehash(impl, hash);
 390     uint32_t h2 = other_hash(hash);
 391     struct cmap_node *node;
 392
 393     node = cmap_find_bucket_protected(impl, hash, h1);
 394     if (node) {
 395         return node;
 396     }
 397     return cmap_find_bucket_protected(impl, hash, h2);
 398 }
 399
 400 static int
 401 cmap_find_empty_slot_protected(const struct cmap_bucket *b)
 402 {
 403     int i;
 404
 405     for (i = 0; i < CMAP_K; i++) {
 406         if (!cmap_node_next_protected(&b->nodes[i])) {
 407             return i;
 408         }
 409     }
 410     return -1;
 411 }
 412
 413 static void
 414 cmap_set_bucket(struct cmap_bucket *b, int i,
 415                 struct cmap_node *node, uint32_t hash)
 416 {
 417     uint32_t c;
 418
 419     atomic_read_explicit(&b->counter, &c, memory_order_acquire);
 420     atomic_store_explicit(&b->counter, c + 1, memory_order_release);
 421     ovsrcu_set(&b->nodes[i].next, node); /* Also atomic. */
 422     atomic_store_explicit(&b->hashes[i], hash, memory_order_release);
 423     atomic_store_explicit(&b->counter, c + 2, memory_order_release);
 424 }
 425
 426 /* Searches 'b' for a node with the given 'hash'.  If it finds one, adds
 427  * 'new_node' to the node's linked list and returns true.  If it does not find
 428  * one, returns false. */
 429 static bool
 430 cmap_insert_dup(struct cmap_node *new_node, uint32_t hash,
 431                 struct cmap_bucket *b)
 432 {
 433     int i;
 434
 435     for (i = 0; i < CMAP_K; i++) {
 436         struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]);
 437
 438         if (cmap_get_hash_protected(&b->hashes[i]) == hash) {
 439             if (node) {
 440                 struct cmap_node *p;
 441
 442                 /* The common case is that 'new_node' is a singleton,
 443                  * with a null 'next' pointer.  Rehashing can add a
 444                  * longer chain, but due to our invariant of always
 445                  * having all nodes with the same (user) hash value at
 446                  * a single chain, rehashing will always insert the
 447                  * chain to an empty node.  The only way we can end up
 448                  * here is by the user inserting a chain of nodes at
 449                  * once.  Find the end of the chain starting at
 450                  * 'new_node', then splice 'node' to the end of that
 451                  * chain. */
 452                 p = new_node;
 453                 for (;;) {
 454                     struct cmap_node *next = cmap_node_next_protected(p);
 455
 456                     if (!next) {
 457                         break;
 458                     }
 459                     p = next;
 460                 }
 461                 ovsrcu_set_hidden(&p->next, node);
 462             } else {
 463                 /* The hash value is there from some previous insertion, but
 464                  * the associated node has been removed.  We're not really
 465                  * inserting a duplicate, but we can still reuse the slot.
 466                  * Carry on. */
 467             }
 468
 469             /* Change the bucket to point to 'new_node'.  This is a degenerate
 470              * form of cmap_set_bucket() that doesn't update the counter since
 471              * we're only touching one field and in a way that doesn't change
 472              * the bucket's meaning for readers. */
 473             ovsrcu_set(&b->nodes[i].next, new_node);
 474
 475             return true;
 476         }
 477     }
 478     return false;
 479 }
 480
 481 /* Searches 'b' for an empty slot.  If successful, stores 'node' and 'hash' in
 482  * the slot and returns true.  Otherwise, returns false. */
 483 static bool
 484 cmap_insert_bucket(struct cmap_node *node, uint32_t hash,
 485                    struct cmap_bucket *b)
 486 {
 487     int i;
 488
 489     for (i = 0; i < CMAP_K; i++) {
 490         if (!cmap_node_next_protected(&b->nodes[i])) {
 491             cmap_set_bucket(b, i, node, hash);
 492             return true;
 493         }
 494     }
 495     return false;
 496 }
 497
 498 /* Returns the other bucket that b->nodes[slot] could occupy in 'impl'.  (This
 499  * might be the same as 'b'.) */
 500 static struct cmap_bucket *
 501 other_bucket_protected(struct cmap_impl *impl, struct cmap_bucket *b, int slot)
 502 {
 503     uint32_t h1 = rehash(impl, cmap_get_hash_protected(&b->hashes[slot]));
 504     uint32_t h2 = other_hash(h1);
 505     uint32_t b_idx = b - impl->buckets;
 506     uint32_t other_h = (h1 & impl->mask) == b_idx ? h2 : h1;
 507
 508     return &impl->buckets[other_h & impl->mask];
 509 }
 510
 511 /* 'new_node' is to be inserted into 'impl', but both candidate buckets 'b1'
 512  * and 'b2' are full.  This function attempts to rearrange buckets within
 513  * 'impl' to make room for 'new_node'.
 514  *
 515  * The implementation is a general-purpose breadth-first search.  At first
 516  * glance, this is more complex than a random walk through 'impl' (suggested by
 517  * some references), but random walks have a tendency to loop back through a
 518  * single bucket.  We have to move nodes backward along the path that we find,
 519  * so that no node actually disappears from the hash table, which means a
 520  * random walk would have to be careful to deal with loops.  By contrast, a
 521  * successful breadth-first search always finds a *shortest* path through the
 522  * hash table, and a shortest path will never contain loops, so it avoids that
 523  * problem entirely.
 524  */
 525 static bool
 526 cmap_insert_bfs(struct cmap_impl *impl, struct cmap_node *new_node,
 527                 uint32_t hash, struct cmap_bucket *b1, struct cmap_bucket *b2)
 528 {
 529     enum { MAX_DEPTH = 4 };
 530
 531     /* A path from 'start' to 'end' via the 'n' steps in 'slots[]'.
 532      *
 533      * One can follow the path via:
 534      *
 535      *     struct cmap_bucket *b;
 536      *     int i;
 537      *
 538      *     b = path->start;
 539      *     for (i = 0; i < path->n; i++) {
 540      *         b = other_bucket_protected(impl, b, path->slots[i]);
 541      *     }
 542      *     ovs_assert(b == path->end);
 543      */
 544     struct cmap_path {
 545         struct cmap_bucket *start; /* First bucket along the path. */
 546         struct cmap_bucket *end;   /* Last bucket on the path. */
 547         uint8_t slots[MAX_DEPTH];  /* Slots used for each hop. */
 548         int n;                     /* Number of slots[]. */
 549     };
 550
 551     /* We need to limit the amount of work we do trying to find a path.  It
 552      * might actually be impossible to rearrange the cmap, and after some time
 553      * it is likely to be easier to rehash the entire cmap.
 554      *
 555      * This value of MAX_QUEUE is an arbitrary limit suggested by one of the
 556      * references.  Empirically, it seems to work OK. */
 557     enum { MAX_QUEUE = 500 };
 558     struct cmap_path queue[MAX_QUEUE];
 559     int head = 0;
 560     int tail = 0;
 561
 562     /* Add 'b1' and 'b2' as starting points for the search. */
 563     queue[head].start = b1;
 564     queue[head].end = b1;
 565     queue[head].n = 0;
 566     head++;
 567     if (b1 != b2) {
 568         queue[head].start = b2;
 569         queue[head].end = b2;
 570         queue[head].n = 0;
 571         head++;
 572     }
 573
 574     while (tail < head) {
 575         const struct cmap_path *path = &queue[tail++];
 576         struct cmap_bucket *this = path->end;
 577         int i;
 578
 579         for (i = 0; i < CMAP_K; i++) {
 580             struct cmap_bucket *next = other_bucket_protected(impl, this, i);
 581             int j;
 582
 583             if (this == next) {
 584                 continue;
 585             }
 586
 587             j = cmap_find_empty_slot_protected(next);
 588             if (j >= 0) {
 589                 /* We've found a path along which we can rearrange the hash
 590                  * table:  Start at path->start, follow all the slots in
 591                  * path->slots[], then follow slot 'i', then the bucket you
 592                  * arrive at has slot 'j' empty. */
 593                 struct cmap_bucket *buckets[MAX_DEPTH + 2];
 594                 int slots[MAX_DEPTH + 2];
 595                 int k;
 596
 597                 /* Figure out the full sequence of slots. */
 598                 for (k = 0; k < path->n; k++) {
 599                     slots[k] = path->slots[k];
 600                 }
 601                 slots[path->n] = i;
 602                 slots[path->n + 1] = j;
 603
 604                 /* Figure out the full sequence of buckets. */
 605                 buckets[0] = path->start;
 606                 for (k = 0; k <= path->n; k++) {
 607                     buckets[k + 1] = other_bucket_protected(impl, buckets[k], slots[k]);
 608                 }
 609
 610                 /* Now the path is fully expressed.  One can start from
 611                  * buckets[0], go via slots[0] to buckets[1], via slots[1] to
 612                  * buckets[2], and so on.
 613                  *
 614                  * Move all the nodes across the path "backward".  After each
 615                  * step some node appears in two buckets.  Thus, every node is
 616                  * always visible to a concurrent search. */
 617                 for (k = path->n + 1; k > 0; k--) {
 618                     int slot = slots[k - 1];
 619
 620                     cmap_set_bucket(buckets[k], slots[k],
 621                                     cmap_node_next_protected(&buckets[k - 1]->nodes[slot]),
 622                                     cmap_get_hash_protected(&buckets[k - 1]->hashes[slot]));
 623                 }
 624
 625                 /* Finally, replace the first node on the path by
 626                  * 'new_node'. */
 627                 cmap_set_bucket(buckets[0], slots[0], new_node, hash);
 628
 629                 return true;
 630             }
 631
 632             if (path->n < MAX_DEPTH && head < MAX_QUEUE) {
 633                 struct cmap_path *new_path = &queue[head++];
 634
 635                 *new_path = *path;
 636                 new_path->end = next;
 637                 new_path->slots[new_path->n++] = i;
 638             }
 639         }
 640     }
 641
 642     return false;
 643 }
 644
 645 /* Adds 'node', with the given 'hash', to 'impl'.
 646  *
 647  * 'node' is ordinarily a single node, with a null 'next' pointer.  When
 648  * rehashing, however, it may be a longer chain of nodes. */
 649 static bool
 650 cmap_try_insert(struct cmap_impl *impl, struct cmap_node *node, uint32_t hash)
 651 {
 652     uint32_t h1 = rehash(impl, hash);
 653     uint32_t h2 = other_hash(h1);
 654     struct cmap_bucket *b1 = &impl->buckets[h1 & impl->mask];
 655     struct cmap_bucket *b2 = &impl->buckets[h2 & impl->mask];
 656
 657     return (OVS_UNLIKELY(cmap_insert_dup(node, hash, b1) ||
 658                          cmap_insert_dup(node, hash, b2)) ||
 659             OVS_LIKELY(cmap_insert_bucket(node, hash, b1) ||
 660                        cmap_insert_bucket(node, hash, b2)) ||
 661             cmap_insert_bfs(impl, node, hash, b1, b2));
 662 }
 663
 664 /* Inserts 'node', with the given 'hash', into 'cmap'.  The caller must ensure
 665  * that 'cmap' cannot change concurrently (from another thread).  If duplicates
 666  * are undesirable, the caller must have already verified that 'cmap' does not
 667  * contain a duplicate of 'node'.
 668  *
 669  * Returns the current number of nodes in the cmap after the insertion. */
 670 size_t
 671 cmap_insert(struct cmap *cmap, struct cmap_node *node, uint32_t hash)
 672 {
 673     struct cmap_impl *impl = cmap_get_impl(cmap);
 674
 675     ovsrcu_set_hidden(&node->next, NULL);
 676
 677     if (OVS_UNLIKELY(impl->n >= impl->max_n)) {
 678         impl = cmap_rehash(cmap, (impl->mask << 1) | 1);
 679     }
 680
 681     while (OVS_UNLIKELY(!cmap_try_insert(impl, node, hash))) {
 682         impl = cmap_rehash(cmap, impl->mask);
 683     }
 684     return ++impl->n;
 685 }
 686
 687 static bool
 688 cmap_replace__(struct cmap_impl *impl, struct cmap_node *node,
 689                struct cmap_node *replacement, uint32_t hash, uint32_t h)
 690 {
 691     struct cmap_bucket *b = &impl->buckets[h & impl->mask];
 692     int slot;
 693
 694     slot = cmap_find_slot_protected(b, hash);
 695     if (slot < 0) {
 696         return false;
 697     }
 698
 699     /* The pointer to 'node' is changed to point to 'replacement',
 700      * which is the next node if no replacement node is given. */
 701     if (!replacement) {
 702         replacement = cmap_node_next_protected(node);
 703     } else {
 704         /* 'replacement' takes the position of 'node' in the list. */
 705         ovsrcu_set_hidden(&replacement->next, cmap_node_next_protected(node));
 706     }
 707
 708     struct cmap_node *iter = &b->nodes[slot];
 709     for (;;) {
 710         struct cmap_node *next = cmap_node_next_protected(iter);
 711
 712         if (next == node) {
 713             ovsrcu_set(&iter->next, replacement);
 714             return true;
 715         }
 716         iter = next;
 717     }
 718 }
 719
 720 /* Replaces 'old_node' in 'cmap' with 'new_node'.  The caller must
 721  * ensure that 'cmap' cannot change concurrently (from another thread).
 722  *
 723  * 'old_node' must not be destroyed or modified or inserted back into 'cmap' or
 724  * into any other concurrent hash map while any other thread might be accessing
 725  * it.  One correct way to do this is to free it from an RCU callback with
 726  * ovsrcu_postpone().
 727  *
 728  * Returns the current number of nodes in the cmap after the replacement.  The
 729  * number of nodes decreases by one if 'new_node' is NULL. */
 730 size_t
 731 cmap_replace(struct cmap *cmap, struct cmap_node *old_node,
 732              struct cmap_node *new_node, uint32_t hash)
 733 {
 734     struct cmap_impl *impl = cmap_get_impl(cmap);
 735     uint32_t h1 = rehash(impl, hash);
 736     uint32_t h2 = other_hash(h1);
 737     bool ok;
 738
 739     ok = cmap_replace__(impl, old_node, new_node, hash, h1)
 740         || cmap_replace__(impl, old_node, new_node, hash, h2);
 741     ovs_assert(ok);
 742
 743     if (!new_node) {
 744         impl->n--;
 745     }
 746     return impl->n;
 747 }
 748
 749 static bool
 750 cmap_try_rehash(const struct cmap_impl *old, struct cmap_impl *new)
 751 {
 752     const struct cmap_bucket *b;
 753
 754     for (b = old->buckets; b <= &old->buckets[old->mask]; b++) {
 755         int i;
 756
 757         for (i = 0; i < CMAP_K; i++) {
 758             /* possible optimization here because we know the hashes are
 759              * unique */
 760             struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]);
 761
 762             if (node &&
 763                 !cmap_try_insert(new, node,
 764                                  cmap_get_hash_protected(&b->hashes[i]))) {
 765                 return false;
 766             }
 767         }
 768     }
 769     return true;
 770 }
 771
 772 static struct cmap_impl *
 773 cmap_rehash(struct cmap *cmap, uint32_t mask)
 774 {
 775     struct cmap_impl *old = cmap_get_impl(cmap);
 776     struct cmap_impl *new;
 777
 778     new = cmap_impl_create(mask);
 779     ovs_assert(old->n < new->max_n);
 780
 781     while (!cmap_try_rehash(old, new)) {
 782         memset(new->buckets, 0, (mask + 1) * sizeof *new->buckets);
 783         new->basis = random_uint32();
 784     }
 785
 786     new->n = old->n;
 787     ovsrcu_set(&cmap->impl, new);
 788     ovsrcu_postpone(free_cacheline, old);
 789
 790     return new;
 791 }
 792
 793 struct cmap_cursor
 794 cmap_cursor_start(const struct cmap *cmap)
 795 {
 796     struct cmap_cursor cursor;
 797
 798     cursor.impl = cmap_get_impl(cmap);
 799     cursor.bucket_idx = 0;
 800     cursor.entry_idx = 0;
 801     cursor.node = NULL;
 802     cmap_cursor_advance(&cursor);
 803
 804     return cursor;
 805 }
 806
 807 void
 808 cmap_cursor_advance(struct cmap_cursor *cursor)
 809 {
 810     const struct cmap_impl *impl = cursor->impl;
 811
 812     if (cursor->node) {
 813         cursor->node = cmap_node_next(cursor->node);
 814         if (cursor->node) {
 815             return;
 816         }
 817     }
 818
 819     while (cursor->bucket_idx <= impl->mask) {
 820         const struct cmap_bucket *b = &impl->buckets[cursor->bucket_idx];
 821
 822         while (cursor->entry_idx < CMAP_K) {
 823             cursor->node = cmap_node_next(&b->nodes[cursor->entry_idx++]);
 824             if (cursor->node) {
 825                 return;
 826             }
 827         }
 828
 829         cursor->bucket_idx++;
 830         cursor->entry_idx = 0;
 831     }
 832 }
 833
 834 /* Returns the next node in 'cmap' in hash order, or NULL if no nodes remain in
 835  * 'cmap'.  Uses '*pos' to determine where to begin iteration, and updates
 836  * '*pos' to pass on the next iteration into them before returning.
 837  *
 838  * It's better to use plain CMAP_FOR_EACH and related functions, since they are
 839  * faster and better at dealing with cmaps that change during iteration.
 840  *
 841  * Before beginning iteration, set '*pos' to all zeros. */
 842 struct cmap_node *
 843 cmap_next_position(const struct cmap *cmap,
 844                    struct cmap_position *pos)
 845 {
 846     struct cmap_impl *impl = cmap_get_impl(cmap);
 847     unsigned int bucket = pos->bucket;
 848     unsigned int entry = pos->entry;
 849     unsigned int offset = pos->offset;
 850
 851     while (bucket <= impl->mask) {
 852         const struct cmap_bucket *b = &impl->buckets[bucket];
 853
 854         while (entry < CMAP_K) {
 855             const struct cmap_node *node = cmap_node_next(&b->nodes[entry]);
 856             unsigned int i;
 857
 858             for (i = 0; node; i++, node = cmap_node_next(node)) {
 859                 if (i == offset) {
 860                     if (cmap_node_next(node)) {
 861                         offset++;
 862                     } else {
 863                         entry++;
 864                         offset = 0;
 865                     }
 866                     pos->bucket = bucket;
 867                     pos->entry = entry;
 868                     pos->offset = offset;
 869                     return CONST_CAST(struct cmap_node *, node);
 870                 }
 871             }
 872
 873             entry++;
 874             offset = 0;
 875         }
 876
 877         bucket++;
 878         entry = offset = 0;
 879     }
 880
 881     pos->bucket = pos->entry = pos->offset = 0;
 882     return NULL;
 883 }