lib/ovs-atomic-x86_64.h

   1 /*
   2  * Copyright (c) 2014 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /* This header implements atomic operation primitives on x86_64 with GCC. */
  18 #ifndef IN_OVS_ATOMIC_H
  19 #error "This header should only be included indirectly via ovs-atomic.h."
  20 #endif
  21
  22 #define OVS_ATOMIC_X86_64_IMPL 1
  23
  24 /*
  25  * x86_64 Memory model (http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-3a-part-1-manual.html):
  26  *
  27  * - 1, 2, 4, and 8 byte loads and stores are atomic on aligned memory.
  28  * - Loads are not reordered with other loads.
  29  * - Stores are not reordered with OLDER loads.
  30  *   - Loads may be reordered with OLDER stores to a different memory location,
  31  *     but not with OLDER stores to the same memory location.
  32  * - Stores are not reordered with other stores, except for special
  33  *   instructions (CLFLUSH, streaming stores, fast string operations).
  34  *   Most of these are not emitted by compilers, and as long as the
  35  *   atomic stores are not combined with any other stores, even the allowed
  36  *   reordering of the stores by a single fast string operation (e.g., "stos")
  37  *   is not a problem.
  38  * - Neither loads nor stores are reordered with locked instructions.
  39  * - Loads cannot pass earlier LFENCE or MFENCE instructions.
  40  * - Stores cannot pass earlier LFENCE, SFENCE, or MFENCE instructions.
  41  * - LFENCE instruction cannot pass earlier loads.
  42  * - SFENCE instruction cannot pass earlier stores.
  43  * - MFENCE instruction cannot pass earlier loads or stores.
  44  * - Stores by a single processor are observed in the same order by all
  45  *   processors.
  46  * - (Unlocked) Stores from different processors are NOT ordered.
  47  * - Memory ordering obeys causality (memory ordering respects transitive
  48  *   visibility).
  49  * - Any two stores are seen in a consistent order by processors other than
  50  *   the those performing the stores.
  51  * - Locked instructions have total order.
  52  *
  53  * These rules imply that:
  54  *
  55  * - Locked instructions are not needed for aligned loads or stores to make
  56  *   them atomic.
  57  * - All stores have release semantics; none of the preceding stores or loads
  58  *   can be reordered with following stores.  Following loads could still be
  59  *   reordered to happen before the store, but that is not a violation of the
  60  *   release semantics.
  61  * - All loads from a given memory location have acquire semantics with
  62  *   respect to the stores on the same memory location; none of the following
  63  *   loads or stores can be reordered with the load.  Preceding stores to a
  64  *   different memory location MAY be reordered with the load, but that is not
  65  *   a violation of the acquire semantics (i.e., the loads and stores of two
  66  *   critical sections guarded by a different memory location can overlap).
  67  * - Locked instructions serve as CPU memory barriers by themselves.
  68  * - Locked stores implement the sequential consistency memory order.  Using
  69  *   locked instructions when seq_cst memory order is requested allows normal
  70  *   loads to observe the stores in the same (total) order without using CPU
  71  *   memory barrier after the loads.
  72  *
  73  * NOTE: Some older AMD Opteron processors have a bug that violates the
  74  * acquire semantics described above.  The bug manifests as an unlocked
  75  * read-modify-write operation following a "semaphore operation" operating
  76  * on data that existed before entering the critical section; i.e., the
  77  * preceding "semaphore operation" fails to function as an acquire barrier.
  78  * The affected CPUs are AMD family 15, models 32 to 63.
  79  *
  80  * Ref. http://support.amd.com/TechDocs/25759.pdf errata #147.
  81  */
  82
  83 /* Barriers. */
  84
  85 #define compiler_barrier()      asm volatile(" " : : : "memory")
  86 #define cpu_barrier()           asm volatile("mfence;" : : : "memory")
  87
  88 /*
  89  * The 'volatile' keyword prevents the compiler from keeping the atomic
  90  * value in a register, and generates a new memory access for each atomic
  91  * operation.  This allows the implementations of memory_order_relaxed and
  92  * memory_order_consume to avoid issuing a compiler memory barrier, allowing
  93  * full optimization of all surrounding non-atomic variables.
  94  *
  95  * The placement of the 'volatile' keyword after the 'TYPE' below is highly
  96  * significant when the TYPE is a pointer type.  In that case we want the
  97  * pointer to be declared volatile, not the data type that is being pointed
  98  * at!
  99  */
 100 #define ATOMIC(TYPE) TYPE volatile
 101
 102 /* Memory ordering.  Must be passed in as a constant. */
 103 typedef enum {
 104     memory_order_relaxed,
 105     memory_order_consume,
 106     memory_order_acquire,
 107     memory_order_release,
 108     memory_order_acq_rel,
 109     memory_order_seq_cst
 110 } memory_order;
 111 \f
 112 #define ATOMIC_BOOL_LOCK_FREE 2
 113 #define ATOMIC_CHAR_LOCK_FREE 2
 114 #define ATOMIC_SHORT_LOCK_FREE 2
 115 #define ATOMIC_INT_LOCK_FREE 2
 116 #define ATOMIC_LONG_LOCK_FREE 2
 117 #define ATOMIC_LLONG_LOCK_FREE 2
 118 #define ATOMIC_POINTER_LOCK_FREE 2
 119
 120 #define IS_LOCKLESS_ATOMIC(OBJECT)                      \
 121     (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT)))
 122 \f
 123 #define ATOMIC_VAR_INIT(VALUE) VALUE
 124 #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0)
 125
 126 /*
 127  * The memory_model_relaxed does not need a compiler barrier, if the
 128  * atomic operation can otherwise be guaranteed to not be moved with
 129  * respect to other atomic operations on the same memory location.  Using
 130  * the 'volatile' keyword in the definition of the atomic types
 131  * accomplishes this, as memory accesses to volatile data may not be
 132  * optimized away, or be reordered with other volatile accesses.
 133  *
 134  * On x86 also memory_order_consume is automatic, and data dependency on a
 135  * volatile atomic variable means that the compiler optimizations should not
 136  * cause problems.  That is, the compiler should not speculate the value of
 137  * the atomic_read, as it is going to read it from the memory anyway.
 138  * This allows omiting the compiler memory barrier on atomic_reads with
 139  * memory_order_consume.  This matches the definition of
 140  * smp_read_barrier_depends() in Linux kernel as a nop for x86, and its usage
 141  * in rcu_dereference().
 142  *
 143  * We use this same logic below to choose inline assembly statements with or
 144  * without a compiler memory barrier.
 145  */
 146 static inline void
 147 atomic_compiler_barrier(memory_order order)
 148 {
 149     if (order > memory_order_consume) {
 150         compiler_barrier();
 151     }
 152 }
 153
 154 static inline void
 155 atomic_thread_fence(memory_order order)
 156 {
 157     if (order == memory_order_seq_cst) {
 158         cpu_barrier();
 159     } else {
 160         atomic_compiler_barrier(order);
 161     }
 162 }
 163
 164 static inline void
 165 atomic_signal_fence(memory_order order)
 166 {
 167     atomic_compiler_barrier(order);
 168 }
 169
 170 #define atomic_is_lock_free(OBJ)                \
 171     ((void) *(OBJ),                             \
 172      IS_LOCKLESS_ATOMIC(*(OBJ)) ? 2 : 0)
 173
 174 #define atomic_exchange__(DST, SRC, ORDER)        \
 175     ({                                            \
 176         typeof(DST) dst___ = (DST);               \
 177         typeof(*(DST)) src___ = (SRC);            \
 178                                                   \
 179         if ((ORDER) > memory_order_consume) {           \
 180             asm volatile("xchg %1,%0 ; "                \
 181                          "# atomic_exchange__"          \
 182                          : "+r" (src___),    /* 0 */    \
 183                            "+m" (*dst___)    /* 1 */    \
 184                          :: "memory");                  \
 185         } else {                                        \
 186             asm volatile("xchg %1,%0 ; "                \
 187                          "# atomic_exchange__"          \
 188                          : "+r" (src___),    /* 0 */    \
 189                            "+m" (*dst___));  /* 1 */    \
 190         }                                               \
 191         src___;                                         \
 192     })
 193
 194 /* Atomic store: Valid memory models are:
 195  *
 196  * memory_order_relaxed, memory_order_release, and
 197  * memory_order_seq_cst. */
 198 #define atomic_store_explicit(DST, SRC, ORDER)          \
 199     ({                                                  \
 200         typeof(DST) dst__ = (DST);                      \
 201         typeof(*(DST)) src__ = (SRC);                   \
 202                                                         \
 203         if ((ORDER) != memory_order_seq_cst) {          \
 204             atomic_compiler_barrier(ORDER);             \
 205             *dst__ = src__;                             \
 206         } else {                                        \
 207             atomic_exchange__(dst__, src__, ORDER);     \
 208         }                                               \
 209         (void) 0;                                       \
 210     })
 211 #define atomic_store(DST, SRC)                                  \
 212     atomic_store_explicit(DST, SRC, memory_order_seq_cst)
 213
 214 /* Atomic read: Valid memory models are:
 215  *
 216  * memory_order_relaxed, memory_order_consume, memory_model_acquire,
 217  * and memory_order_seq_cst. */
 218 #define atomic_read_explicit(SRC, DST, ORDER)           \
 219     ({                                                  \
 220         typeof(DST) dst__ = (DST);                      \
 221         typeof(SRC) src__ = (SRC);                      \
 222                                                         \
 223         *dst__ = *src__;                                \
 224         atomic_compiler_barrier(ORDER);                 \
 225         (void) 0;                                       \
 226     })
 227 #define atomic_read(SRC, DST)                                   \
 228     atomic_read_explicit(SRC, DST, memory_order_seq_cst)
 229
 230 #define atomic_compare_exchange__(DST, EXP, SRC, RES, CLOB)           \
 231     asm volatile("lock; cmpxchg %3,%1 ; "                             \
 232                  "      sete    %0      "                             \
 233                  "# atomic_compare_exchange__"                        \
 234                  : "=q" (RES),           /* 0 */                      \
 235                    "+m" (*DST),          /* 1 */                      \
 236                    "+a" (EXP)            /* 2 */                      \
 237                  : "r" (SRC)             /* 3 */                      \
 238                  : CLOB, "cc")
 239
 240 /* All memory models are valid for read-modify-write operations.
 241  *
 242  * Valid memory models for the read operation of the current value in
 243  * the failure case are the same as for atomic read, but can not be
 244  * stronger than the success memory model.
 245  * ORD_FAIL is ignored, as atomic_compare_exchange__ already implements
 246  * at least as strong a barrier as allowed for ORD_FAIL in all cases. */
 247 #define atomic_compare_exchange_strong_explicit(DST, EXP, SRC, ORDER, ORD_FAIL) \
 248     ({                                                              \
 249         typeof(DST) dst__ = (DST);                                  \
 250         typeof(DST) expp__ = (EXP);                                 \
 251         typeof(*(DST)) src__ = (SRC);                               \
 252         typeof(*(DST)) exp__ = *expp__;                             \
 253         uint8_t res__;                                              \
 254         (void)ORD_FAIL;                                             \
 255                                                                     \
 256         if ((ORDER) > memory_order_consume) {                       \
 257             atomic_compare_exchange__(dst__, exp__, src__, res__,   \
 258                                       "memory");                    \
 259         } else {                                                    \
 260             atomic_compare_exchange__(dst__, exp__, src__, res__,   \
 261                                       "cc");                        \
 262         }                                                           \
 263         if (!res__) {                                               \
 264             *expp__ = exp__;                                        \
 265         }                                                           \
 266         (bool)res__;                                                \
 267     })
 268 #define atomic_compare_exchange_strong(DST, EXP, SRC)             \
 269     atomic_compare_exchange_strong_explicit(DST, EXP, SRC,        \
 270                                             memory_order_seq_cst, \
 271                                             memory_order_seq_cst)
 272 #define atomic_compare_exchange_weak            \
 273     atomic_compare_exchange_strong
 274 #define atomic_compare_exchange_weak_explicit   \
 275     atomic_compare_exchange_strong_explicit
 276
 277 #define atomic_add__(RMW, ARG, CLOB)            \
 278     asm volatile("lock; xadd %0,%1 ; "          \
 279                  "# atomic_add__     "          \
 280                  : "+r" (ARG),       /* 0 */    \
 281                    "+m" (*RMW)       /* 1 */    \
 282                  :: CLOB, "cc")
 283
 284 #define atomic_add_explicit(RMW, ARG, ORIG, ORDER)  \
 285     ({                                              \
 286         typeof(RMW) rmw__ = (RMW);                  \
 287         typeof(*(RMW)) arg__ = (ARG);               \
 288                                                     \
 289         if ((ORDER) > memory_order_consume) {       \
 290             atomic_add__(rmw__, arg__, "memory");   \
 291         } else {                                    \
 292             atomic_add__(rmw__, arg__, "cc");       \
 293         }                                           \
 294         *(ORIG) = arg__;                            \
 295     })
 296 #define atomic_add(RMW, ARG, ORIG)                              \
 297     atomic_add_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
 298
 299 #define atomic_sub_explicit(RMW, ARG, ORIG, ORDER)      \
 300     atomic_add_explicit(RMW, -(ARG), ORIG, ORDER)
 301 #define atomic_sub(RMW, ARG, ORIG)                              \
 302     atomic_sub_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
 303
 304 /* We could use simple locked instructions if the original value was not
 305  * needed. */
 306 #define atomic_op__(RMW, OP, ARG, ORIG, ORDER)              \
 307     ({                                                      \
 308         typeof(RMW) rmw__ = (RMW);                          \
 309         typeof(ARG) arg__ = (ARG);                                      \
 310                                                                         \
 311         typeof(*(RMW)) val__;                                           \
 312                                                                         \
 313         atomic_read_explicit(rmw__, &val__, memory_order_relaxed);      \
 314         do {                                                            \
 315         } while (!atomic_compare_exchange_weak_explicit(rmw__, &val__,  \
 316                                                         val__ OP arg__, \
 317                                                         ORDER,          \
 318                                                         memory_order_relaxed)); \
 319         *(ORIG) = val__;                                                \
 320     })
 321
 322 #define atomic_or_explicit(RMW, ARG, ORIG, ORDER)       \
 323     atomic_op__(RMW, |, ARG, ORIG, ORDER)
 324 #define atomic_or(RMW, ARG, ORIG)                              \
 325     atomic_or_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
 326
 327 #define atomic_xor_explicit(RMW, ARG, ORIG, ORDER)      \
 328     atomic_op__(RMW, ^, ARG, ORIG, ORDER)
 329 #define atomic_xor(RMW, ARG, ORIG)                              \
 330     atomic_xor_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
 331
 332 #define atomic_and_explicit(RMW, ARG, ORIG, ORDER)      \
 333     atomic_op__(RMW, &, ARG, ORIG, ORDER)
 334 #define atomic_and(RMW, ARG, ORIG)                              \
 335     atomic_and_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
 336
 337 \f
 338 /* atomic_flag */
 339
 340 typedef ATOMIC(int) atomic_flag;
 341 #define ATOMIC_FLAG_INIT { false }
 342
 343 #define atomic_flag_test_and_set_explicit(FLAG, ORDER)  \
 344     ((bool)atomic_exchange__(FLAG, 1, ORDER))
 345 #define atomic_flag_test_and_set(FLAG)                                  \
 346     atomic_flag_test_and_set_explicit(FLAG, memory_order_seq_cst)
 347
 348 #define atomic_flag_clear_explicit(FLAG, ORDER) \
 349     atomic_store_explicit(FLAG, 0, ORDER)
 350 #define atomic_flag_clear(FLAG)                                 \
 351     atomic_flag_clear_explicit(FLAG, memory_order_seq_cst)