lib/ovs-atomic-i586.h

   1 /*
   2  * Copyright (c) 2014 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
/* This header implements atomic operation primitives on 32-bit 586+ with GCC.
 *
 * The implementation relies on GCC extensions (extended inline assembly,
 * 'typeof' and statement expressions). */

/* Refuse direct inclusion: the includer must already have defined
 * IN_OVS_ATOMIC_H. */
#ifndef IN_OVS_ATOMIC_H
#error "This header should only be included indirectly via ovs-atomic.h."
#endif

/* Identifies this implementation.  NOTE(review): presumably tested by
 * ovs-atomic.h or its users to learn which backend was selected -- confirm
 * against the includer. */
#define OVS_ATOMIC_I586_IMPL 1
  24
  25 /*
  26  * These assumptions have been adopted from the x86_64 Memory model:
  27  *
  28  * - 1, 2, and 4 byte loads and stores are atomic on aligned memory.
  29  * - Loads are not reordered with other loads.
  30  * - Stores are not reordered with OLDER loads.
  31  *   - Loads may be reordered with OLDER stores to a different memory location,
  32  *     but not with OLDER stores to the same memory location.
  33  * - Stores are not reordered with other stores, except maybe for special
  34  *   instructions not emitted by compilers, or by the stores performed by
  35  *   a single fast string operation (e.g., "stos").  As long as the atomic
  36  *   stores are not combined with any other stores, even the allowed reordering
  37  *   of the stores by a single fast string operation is not a problem.
  38  * - Neither loads nor stores are reordered with locked instructions.
  39  * - Stores by a single processor are observed in the same order by all
  40  *   processors.
  41  * - (Unlocked) Stores from different processors are NOT ordered.
  42  * - Memory ordering obeys causality (memory ordering respects transitive
  43  *   visibility).
 * - Any two stores are seen in a consistent order by processors other than
 *   those performing the stores.
  46  * - Locked instructions have total order.
  47  *
  48  * These rules imply that:
  49  *
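 * - Locked instructions are not needed for aligned loads or stores to make
 *   them atomic for sizes up to 4 bytes.  8 byte objects need locked
 *   instructions (or, when compiled with SSE support, a single 64-bit SSE
 *   move; see atomic_store_8__() and atomic_read_8__() below).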
  53  * - All stores have release semantics; none of the preceding stores or loads
  54  *   can be reordered with following stores.  Following loads could still be
  55  *   reordered to happen before the store, but that is not a violation of the
  56  *   release semantics.
  57  * - All loads from a given memory location have acquire semantics with
  58  *   respect to the stores on the same memory location; none of the following
  59  *   loads or stores can be reordered with the load.  Preceding stores to a
  60  *   different memory location MAY be reordered with the load, but that is not
  61  *   a violation of the acquire semantics (i.e., the loads and stores of two
  62  *   critical sections guarded by a different memory location can overlap).
  63  * - Locked instructions serve as CPU memory barriers by themselves.
  64  * - Locked stores implement the sequential consistency memory order.  Using
  65  *   locked instructions when seq_cst memory order is requested allows normal
  66  *   loads to observe the stores in the same (total) order without using CPU
  67  *   memory barrier after the loads.
  68  *
  69  * NOTE: Some older AMD Opteron processors have a bug that violates the
  70  * acquire semantics described above.  The bug manifests as an unlocked
  71  * read-modify-write operation following a "semaphore operation" operating
  72  * on data that existed before entering the critical section; i.e., the
  73  * preceding "semaphore operation" fails to function as an acquire barrier.
  74  * The affected CPUs are AMD family 15, models 32 to 63.
  75  *
  76  * Ref. http://support.amd.com/TechDocs/25759.pdf errata #147.
  77  */
  78
/* Barriers. */

/* Compiler-only barrier: an asm statement with an empty template and a
 * "memory" clobber.  No instruction is emitted; the clobber tells the
 * compiler that memory may have changed, so it may not keep memory values
 * cached in registers across this point or move memory accesses over it. */
#define compiler_barrier()  asm volatile(" " : : : "memory")
/* CPU and compiler barrier: a locked add of zero to the word at the top of
 * the stack.  The stored value does not change; the locked instruction
 * itself provides the CPU memory barrier.  The add modifies the flags, hence
 * the additional "cc" clobber. */
#define cpu_barrier()  asm volatile("lock; addl $0,(%%esp)" ::: "memory", "cc")
  83
/*
 * Declares an atomic object of type TYPE.
 *
 * The 'volatile' keyword prevents the compiler from keeping the atomic
 * value in a register, and generates a new memory access for each atomic
 * operation.  This allows the implementations of memory_order_relaxed and
 * memory_order_consume to avoid issuing a compiler memory barrier, allowing
 * full optimization of all surrounding non-atomic variables.
 *
 * The placement of the 'volatile' keyword after the 'TYPE' below is highly
 * significant when the TYPE is a pointer type.  In that case we want the
 * pointer to be declared volatile, not the data type that is being pointed
 * at!  For example, ATOMIC(char *) yields 'char * volatile', a volatile
 * pointer to plain char.
 *
 * Attribute aligned aligns every atomic object to its own size, which in
 * particular tells the compiler to align 64-bit data on an 8-byte boundary.
 * This allows more efficient atomic access, as the CPU guarantees such
 * memory accesses to be atomic. */
#define ATOMIC(TYPE) TYPE volatile __attribute__((aligned(sizeof(TYPE))))
 100
 101 /* Memory ordering.  Must be passed in as a constant. */
 102 typedef enum {
 103     memory_order_relaxed,
 104     memory_order_consume,
 105     memory_order_acquire,
 106     memory_order_release,
 107     memory_order_acq_rel,
 108     memory_order_seq_cst
 109 } memory_order;
 110 \f
/* Lock-free property macros, named after their C11 <stdatomic.h>
 * counterparts.  Every type class reports 2, the same value that
 * atomic_is_lock_free() in this file yields for objects it can handle
 * without a lock.  This includes 'long long': 8-byte objects are covered by
 * the cmpxchg8b and SSE based primitives defined further down. */
#define ATOMIC_BOOL_LOCK_FREE 2
#define ATOMIC_CHAR_LOCK_FREE 2
#define ATOMIC_SHORT_LOCK_FREE 2
#define ATOMIC_INT_LOCK_FREE 2
#define ATOMIC_LONG_LOCK_FREE 2
#define ATOMIC_LLONG_LOCK_FREE 2
#define ATOMIC_POINTER_LOCK_FREE 2
 118
 119 #define IS_LOCKLESS_ATOMIC(OBJECT)                      \
 120     (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT)))
 121 \f
 122 #define ATOMIC_VAR_INIT(VALUE) VALUE
 123 #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0)
 124
 125 /*
 126  * The memory_model_relaxed does not need a compiler barrier, if the
 127  * atomic operation can otherwise be guaranteed to not be moved with
 128  * respect to other atomic operations on the same memory location.  Using
 129  * the 'volatile' keyword in the definition of the atomic types
 130  * accomplishes this, as memory accesses to volatile data may not be
 131  * optimized away, or be reordered with other volatile accesses.
 132  *
 133  * On x86 also memory_order_consume is automatic, and data dependency on a
 134  * volatile atomic variable means that the compiler optimizations should not
 135  * cause problems.  That is, the compiler should not speculate the value of
 136  * the atomic_read, as it is going to read it from the memory anyway.
 137  * This allows omiting the compiler memory barrier on atomic_reads with
 138  * memory_order_consume.  This matches the definition of
 139  * smp_read_barrier_depends() in Linux kernel as a nop for x86, and its usage
 140  * in rcu_dereference().
 141  *
 142  * We use this same logic below to choose inline assembly statements with or
 143  * without a compiler memory barrier.
 144  */
 145 static inline void
 146 atomic_compiler_barrier(memory_order order)
 147 {
 148     if (order > memory_order_consume) {
 149         compiler_barrier();
 150     }
 151 }
 152
 153 static inline void
 154 atomic_thread_fence(memory_order order)
 155 {
 156     if (order == memory_order_seq_cst) {
 157         cpu_barrier();
 158     } else {
 159         atomic_compiler_barrier(order);
 160     }
 161 }
 162
 163 static inline void
 164 atomic_signal_fence(memory_order order)
 165 {
 166     atomic_compiler_barrier(order);
 167 }
 168
/* Evaluates to 2 if '*OBJ' has a size accepted by IS_LOCKLESS_ATOMIC(), and
 * to 0 otherwise.
 *
 * NOTE(review): the '(void) *(OBJ)' operand presumably exists so that OBJ
 * must be a dereferenceable pointer or the expression fails to compile;
 * confirm the intent.  OBJ appears twice in the expansion, the second time
 * only as a sizeof operand. */
#define atomic_is_lock_free(OBJ)                \
    ((void) *(OBJ),                             \
     IS_LOCKLESS_ATOMIC(*(OBJ)) ? 2 : 0)
 172
/* 8-byte atomic exchange, built as a cmpxchg8b retry loop.
 *
 * SRC, the new value, is bound to edx:eax (the "A" constraint).  It is first
 * copied to ecx:ebx, the register pair that cmpxchg8b stores into '*DST' on
 * success, and is also left in edx:eax, where cmpxchg8b expects the compare
 * value.  In the likely case that '*DST' does not match, cmpxchg8b replaces
 * edx:eax with the current value of '*DST', after which we keep trying until
 * the swap succeeds.  When the loop exits, edx:eax, and therefore SRC, holds
 * the value '*DST' had immediately before the exchange.
 *
 * SRC must be a 64-bit lvalue, since it is a read-write asm operand.  CLOB is
 * an extra clobber string supplied by the caller: "memory" to make the
 * statement a compiler barrier as well, or "cc" for no additional clobber. */

#if defined(__PIC__)
/* ebx may not be clobbered when compiled with -fPIC, must save and
 * restore it.  Furthermore, 'DST' may be addressed via ebx, so the
 * address must be passed via a register so that it remains valid also
 * after changing ebx.
 *
 * This variant expands to a declaration ('temp____', the save slot for ebx)
 * followed by the asm statement, so it has to be used inside a block of its
 * own, and DST must be an lvalue of pointer type. */
#define atomic_exchange_8__(DST, SRC, CLOB)       \
    uint32_t temp____;                            \
                                                  \
    asm volatile("      movl %%ebx,%2 ;    "      \
                 "      movl %%eax,%%ebx ; "      \
                 "      movl %%edx,%%ecx ; "      \
                 "1:                       "      \
                 "lock; cmpxchg8b (%0);    "      \
                 "      jne 1b ;           "      \
                 "      movl %2,%%ebx ;    "      \
                 " # atomic_exchange_8__   "      \
                 : "+r" (DST),       /* 0 */      \
                   "+A" (SRC),       /* 1 */      \
                   "=mr" (temp____)  /* 2 */      \
                 :: "ecx", CLOB, "cc")

#else
/* Without -fPIC ebx can simply be listed as clobbered, and '*DST' can be
 * passed as a memory operand. */
#define atomic_exchange_8__(DST, SRC, CLOB)       \
    asm volatile("      movl %%eax,%%ebx ; "      \
                 "      movl %%edx,%%ecx ; "      \
                 "1:                       "      \
                 "lock; cmpxchg8b %0 ;     "      \
                 "      jne 1b ;           "      \
                 " # atomic_exchange_8__   "      \
                 : "+m" (*DST),      /* 0 */      \
                   "+A" (SRC)        /* 1 */      \
                 :: "ebx", "ecx", CLOB, "cc")
#endif
 211
 212 #define atomic_exchange__(DST, SRC, ORDER)        \
 213     ({                                            \
 214         typeof(DST) dst___ = (DST);               \
 215         typeof(*(DST)) src___ = (SRC);            \
 216                                                   \
 217         if ((ORDER) > memory_order_consume) {                  \
 218             if (sizeof(*(DST)) == 8) {                         \
 219                 atomic_exchange_8__(dst___, src___, "memory"); \
 220             } else {                                           \
 221                 asm volatile("xchg %1,%0 ;       "             \
 222                              "# atomic_exchange__"             \
 223                              : "+r" (src___),   /* 0 */        \
 224                                "+m" (*dst___)   /* 1 */        \
 225                              :: "memory");                     \
 226             }                                                  \
 227         } else {                                               \
 228             if (sizeof(*(DST)) == 8) {                         \
 229                 atomic_exchange_8__(dst___, src___, "cc");     \
 230             } else {                                           \
 231                 asm volatile("xchg %1,%0 ;       "             \
 232                              "# atomic_exchange__"             \
 233                              : "+r" (src___),    /* 0 */       \
 234                                "+m" (*dst___));  /* 1 */       \
 235             }                                                  \
 236         }                                                      \
 237         src___;                                                \
 238     })
 239
/* atomic_store_8__(DST, SRC): stores the 8-byte value SRC into '*DST' without
 * any compiler barrier of its own. */
#if defined(__SSE__)
/* SSE registers are 128-bit wide, and moving the lowest 64-bits of an SSE
 * register to properly aligned memory is atomic.  See ATOMIC(TYPE) above.
 *
 * SRC is an input operand placed in an SSE register ("x" constraint). */
#define atomic_store_8__(DST, SRC)                 \
    asm volatile("movq %1,%0 ; # atomic_store_8__" \
                 : "=m" (*DST)   /* 0 */           \
                 : "x" (SRC))    /* 1, SSE */
#else
/* Locked 64-bit exchange is available on all i586 CPUs.
 *
 * Without SSE the store is performed as an exchange whose result is simply
 * not used.  Unlike the SSE variant this modifies SRC (it ends up holding
 * the previous value of '*DST'), so SRC must be a modifiable lvalue. */
#define atomic_store_8__(DST, SRC)          \
    atomic_exchange_8__(DST, SRC, "cc")
#endif
 252
 253 #define atomic_store_explicit(DST, SRC, ORDER)          \
 254     ({                                                  \
 255         typeof(DST) dst__ = (DST);                      \
 256         typeof(*(DST)) src__ = (SRC);                   \
 257                                                         \
 258         if ((ORDER) != memory_order_seq_cst) {          \
 259             atomic_compiler_barrier(ORDER);             \
 260             if (sizeof(*(DST)) == 8) {                  \
 261                 atomic_store_8__(dst__, src__);         \
 262             } else {                                    \
 263                 *dst__ = src__;                         \
 264             }                                           \
 265         } else {                                        \
 266             atomic_exchange__(dst__, src__, ORDER);     \
 267         }                                               \
 268         (void) 0;                                       \
 269     })
 270 #define atomic_store(DST, SRC)                              \
 271     atomic_store_explicit(DST, SRC, memory_order_seq_cst)
 272
/* atomic_read_8__(SRC, DST): atomically reads the 8-byte object '*SRC' and
 * stores the result in '*DST'.  No compiler barrier is implied. */
#if defined(__SSE__)
/* SSE registers are 128-bit wide, and moving 64-bits from properly aligned
 * memory to an SSE register is atomic.  See ATOMIC(TYPE) above. */
#define atomic_read_8__(SRC, DST)               \
    ({                                          \
        typeof(*(DST)) res__;                   \
                                                \
        asm ("movq %1,%0 ; # atomic_read_8__"   \
             : "=x" (res__)   /* 0, SSE. */     \
             : "m" (*SRC));   /* 1 */           \
        *(DST) = res__;                         \
    })
#else
/* Must use locked cmpxchg8b (available on all i586 CPUs) if compiled w/o sse
 * support.  Compares '*SRC' to whatever value happens to be in bx:cx and
 * returns the actual value in ax:dx.  The registers bx and cx are only read,
 * so they are not clobbered.
 *
 * Because ax:dx is loaded from bx:cx first, the compare value and the
 * replacement value are the same: if the comparison happens to succeed,
 * '*SRC' is rewritten with the value it already holds, otherwise the current
 * value is loaded into ax:dx.  Either way ax:dx ends up equal to '*SRC'.
 * '*SRC' is a read-write memory operand for this reason. */
#define atomic_read_8__(SRC, DST)               \
    ({                                          \
        typeof(*(DST)) res__;                   \
                                                \
        asm ("      movl %%ebx,%%eax ; "        \
             "      movl %%ecx,%%edx ; "        \
             "lock; cmpxchg8b %1 ;     "        \
             "# atomic_read_8__        "        \
             : "=&A" (res__), /* 0 */           \
               "+m"  (*SRC)   /* 1 */           \
             : : "cc");                         \
        *(DST) = res__;                         \
    })
#endif
 304
 305 #define atomic_read_explicit(SRC, DST, ORDER)   \
 306     ({                                          \
 307         typeof(DST) dst__ = (DST);              \
 308         typeof(SRC) src__ = (SRC);              \
 309                                                 \
 310         if (sizeof(*(DST)) <= 4) {              \
 311             *dst__ = *src__;                    \
 312         } else {                                \
 313             atomic_read_8__(SRC, DST);          \
 314         }                                       \
 315         atomic_compiler_barrier(ORDER);         \
 316         (void) 0;                               \
 317     })
 318 #define atomic_read(SRC, DST)                               \
 319     atomic_read_explicit(SRC, DST, memory_order_seq_cst)
 320
/* atomic_compare_exchange_8__(DST, EXP, SRC, RES, CLOB): 8-byte compare and
 * exchange using a locked cmpxchg8b.
 *
 * EXP is bound to edx:eax and the two halves of SRC to ecx:ebx.  If '*DST'
 * equals EXP, SRC is stored into '*DST'; otherwise the current value of
 * '*DST' is left in EXP.  RES receives the outcome through 'sete': 1 if the
 * exchange took place, 0 if not.  EXP must be a 64-bit lvalue and RES a
 * byte-sized lvalue.  CLOB is an extra clobber string supplied by the caller
 * ("memory" to add a compiler barrier, "cc" for none). */
#if defined(__PIC__)
/* ebx may not be used as an input when compiled with -fPIC, must save
 * and restore it.  Furthermore, 'DST' may be addressed via ebx, so
 * the address must be passed via a register so that it remains valid
 * also after changing ebx.
 *
 * The first 'xchgl' swaps ebx with the register that holds the low half of
 * SRC, which both loads ebx for cmpxchg8b and parks the original ebx in that
 * register; the second 'xchgl' undoes the swap.  DST must be an lvalue of
 * pointer type here, as it is a read-write register operand. */
#define atomic_compare_exchange_8__(DST, EXP, SRC, RES, CLOB)         \
    asm volatile("      xchgl %%ebx,%3 ;    "                         \
                 "lock; cmpxchg8b (%1) ;    "                         \
                 "      xchgl %3,%%ebx ;    "                         \
                 "      sete %0             "                         \
                 "# atomic_compare_exchange_8__"                      \
                 : "=q" (RES),                 /* 0 */                \
                   "+r" (DST),                 /* 1 */                \
                   "+A" (EXP)                  /* 2 */                \
                 : "r" ((uint32_t)SRC),        /* 3 */                \
                   "c" ((uint32_t)((uint64_t)SRC >> 32)) /* 4 */      \
                 : CLOB, "cc")
#else
/* Without -fPIC the low half of SRC can be placed in ebx directly, and
 * '*DST' can be passed as a memory operand. */
#define atomic_compare_exchange_8__(DST, EXP, SRC, RES, CLOB)         \
    asm volatile("lock; cmpxchg8b %1 ; "                              \
                 "      sete %0        "                              \
                 "# atomic_compare_exchange_8__"                      \
                 : "=q" (RES),                 /* 0 */                \
                   "+m" (*DST),                /* 1 */                \
                   "+A" (EXP)                  /* 2 */                \
                 : "b" ((uint32_t)SRC),        /* 3 */                \
                   "c" ((uint32_t)((uint64_t)SRC >> 32)) /* 4 */      \
                 : CLOB, "cc")
#endif
 350
/* Compare and exchange for objects of up to 4 bytes, using a locked cmpxchg.
 *
 * EXP is bound to the accumulator ("a" constraint).  If '*DST' equals EXP,
 * SRC is stored into '*DST'; otherwise the current value of '*DST' is left in
 * EXP.  RES receives the outcome through 'sete': 1 if the exchange took
 * place, 0 if not.  EXP and RES must be lvalues.  CLOB is an extra clobber
 * string supplied by the caller ("memory" to add a compiler barrier, "cc"
 * for none). */
#define atomic_compare_exchange__(DST, EXP, SRC, RES, CLOB)           \
    asm volatile("lock; cmpxchg %3,%1 ; "                             \
                 "      sete    %0      "                             \
                 "# atomic_compare_exchange__"                        \
                 : "=q" (RES),           /* 0 */                      \
                   "+m" (*DST),          /* 1 */                      \
                   "+a" (EXP)            /* 2 */                      \
                 : "r" (SRC)             /* 3 */                      \
                 : CLOB, "cc")
 360
 361 /* ORD_FAIL is ignored, as atomic_compare_exchange__ already implements
 362  * at least as strong a barrier as allowed for ORD_FAIL in all cases. */
 363 #define atomic_compare_exchange_strong_explicit(DST, EXP, SRC, ORDER, ORD_FAIL) \
 364     ({                                                                  \
 365         typeof(DST) dst__ = (DST);                                      \
 366         typeof(DST) expp__ = (EXP);                                     \
 367         typeof(*(DST)) src__ = (SRC);                                   \
 368         typeof(*(DST)) exp__ = *expp__;                                 \
 369         uint8_t res__;                                                  \
 370         (void)ORD_FAIL;                                                 \
 371                                                                         \
 372         if ((ORDER) > memory_order_consume) {                           \
 373             if (sizeof(*(DST)) <= 4) {                                  \
 374                 atomic_compare_exchange__(dst__, exp__, src__, res__,   \
 375                                           "memory");                    \
 376             } else {                                                    \
 377                 atomic_compare_exchange_8__(dst__, exp__, src__, res__, \
 378                                             "memory");                  \
 379             }                                                           \
 380         } else {                                                        \
 381             if (sizeof(*(DST)) <= 4) {                                  \
 382                 atomic_compare_exchange__(dst__, exp__, src__, res__,   \
 383                                           "cc");                        \
 384             } else {                                                    \
 385                 atomic_compare_exchange_8__(dst__, exp__, src__, res__, \
 386                                             "cc");                      \
 387             }                                                           \
 388         }                                                               \
 389         if (!res__) {                                                   \
 390             *expp__ = exp__;                                            \
 391         }                                                               \
 392         (bool)res__;                                                    \
 393     })
 394 #define atomic_compare_exchange_strong(DST, EXP, SRC)             \
 395     atomic_compare_exchange_strong_explicit(DST, EXP, SRC,        \
 396                                             memory_order_seq_cst, \
 397                                             memory_order_seq_cst)
 398 #define atomic_compare_exchange_weak            \
 399     atomic_compare_exchange_strong
 400 #define atomic_compare_exchange_weak_explicit   \
 401     atomic_compare_exchange_strong_explicit
 402
/* Locked exchange-and-add: adds ARG to '*RMW' and leaves the previous value
 * of '*RMW' in ARG (atomic_add_32__() reads the original value back from its
 * copy of ARG after this statement).  ARG must therefore be a modifiable
 * lvalue that fits in a register.  CLOB is an extra clobber string supplied
 * by the caller ("memory" to add a compiler barrier, "cc" for none). */
#define atomic_add__(RMW, ARG, CLOB)            \
    asm volatile("lock; xadd %0,%1 ; "          \
                 "# atomic_add__     "          \
                 : "+r" (ARG),       /* 0 */    \
                   "+m" (*RMW)       /* 1 */    \
                 :: CLOB, "cc")
 409
 410 #define atomic_add_32__(RMW, ARG, ORIG, ORDER)     \
 411     ({                                             \
 412         typeof(RMW) rmw__ = (RMW);                 \
 413         typeof(*(RMW)) arg__ = (ARG);              \
 414                                                    \
 415         if ((ORDER) > memory_order_consume) {      \
 416             atomic_add__(rmw__, arg__, "memory");  \
 417         } else {                                   \
 418             atomic_add__(rmw__, arg__, "cc");      \
 419         }                                          \
 420         *(ORIG) = arg__;                           \
 421     })
 422
 423 /* We could use simple locked instructions if the original value was not
 424  * needed. */
 425 #define atomic_op__(RMW, OP, ARG, ORIG, ORDER)              \
 426     ({                                                      \
 427         typeof(RMW) rmw__ = (RMW);                          \
 428         typeof(ARG) arg__ = (ARG);                                      \
 429                                                                         \
 430         typeof(*(RMW)) val__;                                           \
 431                                                                         \
 432         atomic_read_explicit(rmw__, &val__, memory_order_relaxed);      \
 433         do {                                                            \
 434         } while (!atomic_compare_exchange_weak_explicit(rmw__, &val__,  \
 435                                                         val__ OP arg__, \
 436                                                         ORDER,          \
 437                                                         memory_order_relaxed)); \
 438         *(ORIG) = val__;                                                \
 439     })
 440
/* Atomic arithmetic and bitwise operations.
 *
 * Each atomic_<op>_explicit(RMW, ARG, ORIG, ORDER) updates '*RMW' with
 * '*RMW <op> ARG' and stores the previous value of '*RMW' in '*ORIG'.  The
 * variants without the '_explicit' suffix use memory_order_seq_cst. */

/* Addition: objects of up to 4 bytes use the locked xadd in
 * atomic_add_32__(); larger objects use the compare-and-exchange loop in
 * atomic_op__(). */
#define atomic_add_explicit(RMW, ARG, ORIG, ORDER)              \
    (sizeof(*(RMW)) <= 4                                        \
     ? atomic_add_32__(RMW, ARG, ORIG, ORDER)                   \
     : atomic_op__(RMW, +, ARG, ORIG, ORDER))
#define atomic_add(RMW, ARG, ORIG)                              \
    atomic_add_explicit(RMW, ARG, ORIG, memory_order_seq_cst)

/* Subtraction: for objects of up to 4 bytes this is an addition of the
 * negated argument; larger objects use atomic_op__(). */
#define atomic_sub_explicit(RMW, ARG, ORIG, ORDER)              \
    (sizeof(*(RMW)) <= 4                                        \
     ? atomic_add_32__(RMW, -(ARG), ORIG, ORDER)                \
     : atomic_op__(RMW, -, ARG, ORIG, ORDER))
#define atomic_sub(RMW, ARG, ORIG)                              \
    atomic_sub_explicit(RMW, ARG, ORIG, memory_order_seq_cst)

/* Bitwise OR, XOR and AND always use the compare-and-exchange loop in
 * atomic_op__(), for every object size. */
#define atomic_or_explicit(RMW, ARG, ORIG, ORDER)       \
    atomic_op__(RMW, |, ARG, ORIG, ORDER)
#define atomic_or(RMW, ARG, ORIG)                              \
    atomic_or_explicit(RMW, ARG, ORIG, memory_order_seq_cst)

#define atomic_xor_explicit(RMW, ARG, ORIG, ORDER)      \
    atomic_op__(RMW, ^, ARG, ORIG, ORDER)
#define atomic_xor(RMW, ARG, ORIG)                              \
    atomic_xor_explicit(RMW, ARG, ORIG, memory_order_seq_cst)

#define atomic_and_explicit(RMW, ARG, ORIG, ORDER)      \
    atomic_op__(RMW, &, ARG, ORIG, ORDER)
#define atomic_and(RMW, ARG, ORIG)                              \
    atomic_and_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
 469
 470 \f
/* atomic_flag */

/* The flag is an atomic int: nonzero means set, zero means clear. */
typedef ATOMIC(int) atomic_flag;
/* Initializer for a clear flag. */
#define ATOMIC_FLAG_INIT { false }

/* Atomically sets '*FLAG' to 1 with an exchange and evaluates to true if the
 * flag was already set, false if this call set it. */
#define atomic_flag_test_and_set_explicit(FLAG, ORDER)  \
    ((bool)atomic_exchange__(FLAG, 1, ORDER))
#define atomic_flag_test_and_set(FLAG)                                  \
    atomic_flag_test_and_set_explicit(FLAG, memory_order_seq_cst)

/* Clears '*FLAG' by atomically storing 0 with the given memory order. */
#define atomic_flag_clear_explicit(FLAG, ORDER) \
    atomic_store_explicit(FLAG, 0, ORDER)
#define atomic_flag_clear(FLAG)                                 \
    atomic_flag_clear_explicit(FLAG, memory_order_seq_cst)