/*
 * Copyright (c) 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* This header implements atomic operation primitives on 32-bit 586+ with GCC.
 */
19 #ifndef IN_OVS_ATOMIC_H
20 #error "This header should only be included indirectly via ovs-atomic.h."
23 #define OVS_ATOMIC_I586_IMPL 1
/* These assumptions have been adopted from the x86_64 Memory model:
 *
 * - 1, 2, and 4 byte loads and stores are atomic on aligned memory.
 * - Loads are not reordered with other loads.
 * - Stores are not reordered with OLDER loads.
 * - Loads may be reordered with OLDER stores to a different memory location,
 *   but not with OLDER stores to the same memory location.
 * - Stores are not reordered with other stores, except maybe for special
 *   instructions not emitted by compilers, or by the stores performed by
 *   a single fast string operation (e.g., "stos").  As long as the atomic
 *   stores are not combined with any other stores, even the allowed reordering
 *   of the stores by a single fast string operation is not a problem.
 * - Neither loads nor stores are reordered with locked instructions.
 * - Stores by a single processor are observed in the same order by all
 *   processors.
 * - (Unlocked) Stores from different processors are NOT ordered.
 * - Memory ordering obeys causality (memory ordering respects transitive
 *   visibility).
 * - Any two stores are seen in a consistent order by processors other than
 *   those performing the stores.
 * - Locked instructions have total order.
 *
 * These rules imply that:
 *
 * - Locked instructions are not needed for aligned loads or stores to make
 *   them atomic for sizes up to 4 bytes.  8 byte objects need locked
 *   instructions.
 * - All stores have release semantics; none of the preceding stores or loads
 *   can be reordered with following stores.  Following loads could still be
 *   reordered to happen before the store, but that is not a violation of the
 *   release semantics.
 * - All loads from a given memory location have acquire semantics with
 *   respect to the stores on the same memory location; none of the following
 *   loads or stores can be reordered with the load.  Preceding stores to a
 *   different memory location MAY be reordered with the load, but that is not
 *   a violation of the acquire semantics (i.e., the loads and stores of two
 *   critical sections guarded by a different memory location can overlap).
 * - Locked instructions serve as CPU memory barriers by themselves.
 * - Locked stores implement the sequential consistency memory order.  Using
 *   locked instructions when seq_cst memory order is requested allows normal
 *   loads to observe the stores in the same (total) order without using CPU
 *   memory barrier after the loads.
 *
 * NOTE: Some older AMD Opteron processors have a bug that violates the
 * acquire semantics described above.  The bug manifests as an unlocked
 * read-modify-write operation following a "semaphore operation" operating
 * on data that existed before entering the critical section; i.e., the
 * preceding "semaphore operation" fails to function as an acquire barrier.
 * The affected CPUs are AMD family 15, models 32 to 63.
 *
 * Ref. http://support.amd.com/TechDocs/25759.pdf errata #147.
 */
/* A compiler-only barrier: prevents the compiler from moving memory accesses
 * across it, but emits no machine instruction. */
#define compiler_barrier() __asm__ volatile(" " : : : "memory")
/* A full CPU memory barrier, implemented with a locked instruction on the
 * stack (locked instructions serve as CPU memory barriers, see above). */
#define cpu_barrier() __asm__ volatile("lock; addl $0,(%%esp)" ::: "memory", "cc")
/* Note:
 * The 'volatile' keyword prevents the compiler from keeping the atomic
 * value in a register, and generates a new memory access for each atomic
 * operation.  This allows the implementations of memory_order_relaxed and
 * memory_order_consume to avoid issuing a compiler memory barrier, allowing
 * full optimization of all surrounding non-atomic variables.
 *
 * The placement of the 'volatile' keyword after the 'TYPE' below is highly
 * significant when the TYPE is a pointer type.  In that case we want the
 * pointer itself to be declared volatile, not the data type that is being
 * pointed to. */
#define ATOMIC(TYPE) TYPE volatile
/* Memory ordering.  Must be passed in as a constant. */
typedef enum {
    memory_order_relaxed,
    memory_order_consume,
    memory_order_acquire,
    memory_order_release,
    memory_order_acq_rel,
    memory_order_seq_cst
} memory_order;
/* C11 *_LOCK_FREE macros: the value 2 means "always lock-free".  Even
 * 8-byte objects are lock-free here, via the cmpxchg8b instruction. */
#define ATOMIC_BOOL_LOCK_FREE    2
#define ATOMIC_CHAR_LOCK_FREE    2
#define ATOMIC_SHORT_LOCK_FREE   2
#define ATOMIC_INT_LOCK_FREE     2
#define ATOMIC_LONG_LOCK_FREE    2
#define ATOMIC_LLONG_LOCK_FREE   2
#define ATOMIC_POINTER_LOCK_FREE 2
/* True if an object of this size can be accessed atomically: a power-of-two
 * size of at most 8 bytes.  (IS_POW2 is defined elsewhere in the project.) */
#define IS_LOCKLESS_ATOMIC(OBJECT) \
    (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT)))
#define ATOMIC_VAR_INIT(VALUE) VALUE
/* Non-atomic initialization, evaluating to void like C11 atomic_init(). */
#define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0)
123 * The memory_model_relaxed does not need a compiler barrier, if the
124 * atomic operation can otherwise be guaranteed to not be moved with
125 * respect to other atomic operations on the same memory location. Using
126 * the 'volatile' keyword in the definition of the atomic types
127 * accomplishes this, as memory accesses to volatile data may not be
128 * optimized away, or be reordered with other volatile accesses.
130 * On x86 also memory_order_consume is automatic, and data dependency on a
131 * volatile atomic variable means that the compiler optimizations should not
132 * cause problems. That is, the compiler should not speculate the value of
133 * the atomic_read, as it is going to read it from the memory anyway.
134 * This allows omiting the compiler memory barrier on atomic_reads with
135 * memory_order_consume. This matches the definition of
136 * smp_read_barrier_depends() in Linux kernel as a nop for x86, and its usage
137 * in rcu_dereference().
139 * We use this same logic below to choose inline assembly statements with or
140 * without a compiler memory barrier.
143 atomic_compiler_barrier(memory_order order)
145 if (order > memory_order_consume) {
151 atomic_thread_fence(memory_order order)
153 if (order == memory_order_seq_cst) {
156 atomic_compiler_barrier(order);
161 atomic_signal_fence(memory_order order)
163 atomic_compiler_barrier(order);
/* Evaluates OBJ (like the C11 generic function would), then yields 2
 * ("always lock-free") for the sizes this implementation handles. */
#define atomic_is_lock_free(OBJ)                \
    ((void) *(OBJ),                             \
     IS_LOCKLESS_ATOMIC(*(OBJ)) ? 2 : 0)
/* The 8-byte atomic exchange uses cmpxchg8b with the SRC (ax:dx) as
 * the expected value (bx:cx), which will get replaced by the current
 * value in the likely case it did not match, after which we keep
 * trying until the swap succeeds. */

#if defined(__PIC__)
/* ebx may not be clobbered when compiled with -fPIC, must save and
 * restore it.  Furthermore, 'DST' may be addressed via ebx, so the
 * address must be passed via a register so that it remains valid also
 * after changing ebx. */
#define atomic_exchange_8__(DST, SRC, CLOB)           \
    ({                                                \
        uint32_t temp____;                            \
                                                      \
        __asm__ volatile("      movl %%ebx,%2 ;     " \
                         "      movl %%eax,%%ebx ;  " \
                         "      movl %%edx,%%ecx ;  " \
                         "1:                        " \
                         "lock; cmpxchg8b (%0);     " \
                         "      jne 1b ;            " \
                         "      movl %2,%%ebx ;     " \
                         "      # atomic_exchange_8__ " \
                         : "+r" (DST),      /* 0 */   \
                           "+A" (SRC),      /* 1 */   \
                           "=mr" (temp____) /* 2 */   \
                         :: "ecx", CLOB, "cc");       \
    })
#else
#define atomic_exchange_8__(DST, SRC, CLOB)           \
    __asm__ volatile("      movl %%eax,%%ebx ;  "     \
                     "      movl %%edx,%%ecx ;  "     \
                     "1:                        "     \
                     "lock; cmpxchg8b %0 ;      "     \
                     "      jne 1b ;            "     \
                     "      # atomic_exchange_8__ "   \
                     : "+m" (*DST), /* 0 */           \
                       "+A" (SRC)   /* 1 */           \
                     :: "ebx", "ecx", CLOB, "cc")
#endif
/* Atomically replaces '*DST' with 'SRC'; the whole statement expression
 * evaluates to the old value of '*DST'.  Objects up to 4 bytes use the
 * xchg instruction (implicitly locked); 8-byte objects use the cmpxchg8b
 * loop above.  A "memory" clobber is only issued for memory orders
 * stronger than memory_order_consume. */
#define atomic_exchange__(DST, SRC, ORDER)                     \
    ({                                                         \
        __typeof__(DST) dst___ = (DST);                        \
        __typeof__(*(DST)) src___ = (SRC);                     \
                                                               \
        if ((ORDER) > memory_order_consume) {                  \
            if (sizeof(*(DST)) == 8) {                         \
                atomic_exchange_8__(dst___, src___, "memory"); \
            } else {                                           \
                __asm__ volatile("xchg %1,%0 ;       "         \
                                 "# atomic_exchange__"         \
                                 : "+r" (src___), /* 0 */      \
                                   "+m" (*dst___) /* 1 */      \
                                 :: "memory");                 \
            }                                                  \
        } else {                                               \
            if (sizeof(*(DST)) == 8) {                         \
                atomic_exchange_8__(dst___, src___, "cc");     \
            } else {                                           \
                __asm__ volatile("xchg %1,%0 ;       "         \
                                 "# atomic_exchange__"         \
                                 : "+r" (src___),  /* 0 */     \
                                   "+m" (*dst___)); /* 1 */    \
            }                                                  \
        }                                                      \
        src___;                                                \
    })
/* Atomic store.  A non-seq_cst store on an object of up to 4 bytes is a
 * plain store behind the appropriate compiler barrier: all x86 stores have
 * release semantics (see notes at the top of this file).  seq_cst stores
 * and all 8-byte stores go through an atomic exchange, as locked
 * instructions provide the required total order. */
#define atomic_store_explicit(DST, SRC, ORDER)          \
    ({                                                  \
        __typeof__(DST) dst__ = (DST);                  \
        __typeof__(*(DST)) src__ = (SRC);               \
                                                        \
        if ((ORDER) != memory_order_seq_cst             \
            && sizeof(*(DST)) <= 4) {                   \
            atomic_compiler_barrier(ORDER);             \
            *dst__ = src__;                             \
        } else {                                        \
            atomic_exchange__(dst__, src__, ORDER);     \
        }                                               \
        (void) 0;                                       \
    })
#define atomic_store(DST, SRC) \
    atomic_store_explicit(DST, SRC, memory_order_seq_cst)
/* Atomic read into '*DST'.  Reads of up to 4 bytes are plain loads, which
 * are atomic on aligned memory; the compiler barrier afterwards provides
 * the acquire semantics where the order requires them.
 *
 * The 8-byte variant compares '*SRC' to a random value in bx:cx and
 * returns the actual value in ax:dx.  The registers bx and cx are
 * only read, so they are not clobbered. */
#define atomic_read_explicit(SRC, DST, ORDER)                \
    ({                                                       \
        __typeof__(DST) dst__ = (DST);                       \
        __typeof__(SRC) src__ = (SRC);                       \
                                                             \
        if (sizeof(*(DST)) <= 4) {                           \
            *dst__ = *src__;                                 \
        } else {                                             \
            __typeof__(*(DST)) res__;                        \
                                                             \
            __asm__ volatile("      movl %%ebx,%%eax ; "     \
                             "      movl %%ecx,%%edx ; "     \
                             "lock; cmpxchg8b %1 ;     "     \
                             "      # atomic_read_explicit " \
                             : "=&A" (res__), /* 0 */        \
                               "+m"  (*src__) /* 1 */        \
                             : : "cc");                      \
            *dst__ = res__;                                  \
        }                                                    \
        atomic_compiler_barrier(ORDER);                      \
        (void) 0;                                            \
    })
#define atomic_read(SRC, DST) \
    atomic_read_explicit(SRC, DST, memory_order_seq_cst)
#if defined(__PIC__)
/* ebx may not be used as an input when compiled with -fPIC, must save
 * and restore it.  Furthermore, 'DST' may be addressed via ebx, so
 * the address must be passed via a register so that it remains valid
 * also after changing ebx. */
#define atomic_compare_exchange_8__(DST, EXP, SRC, RES, CLOB) \
    __asm__ volatile("      xchgl %%ebx,%3 ;  "               \
                     "lock; cmpxchg8b (%1) ;  "               \
                     "      xchgl %3,%%ebx ;  "               \
                     "      sete %0           "               \
                     "      # atomic_compare_exchange_8__ "   \
                     : "=q" (RES),           /* 0 */          \
                       "+r" (DST),           /* 1 */          \
                       "+A" (EXP)            /* 2 */          \
                     : "r" ((uint32_t)SRC),  /* 3 */          \
                       "c" ((uint32_t)((uint64_t)SRC >> 32)) /* 4 */ \
                     : CLOB, "cc")
#else
/* The expected value lives in ax:dx and is replaced by the actual value of
 * '*DST' when the comparison fails; 'RES' records the ZF result of the
 * comparison via sete. */
#define atomic_compare_exchange_8__(DST, EXP, SRC, RES, CLOB) \
    __asm__ volatile("lock; cmpxchg8b %1 ;    "               \
                     "      sete %0           "               \
                     "      # atomic_compare_exchange_8__ "   \
                     : "=q" (RES),           /* 0 */          \
                       "+m" (*DST),          /* 1 */          \
                       "+A" (EXP)            /* 2 */          \
                     : "b" ((uint32_t)SRC),  /* 3 */          \
                       "c" ((uint32_t)((uint64_t)SRC >> 32)) /* 4 */ \
                     : CLOB, "cc")
#endif
/* Compare-and-swap for objects of up to 4 bytes.  'RES' becomes 1 when the
 * swap succeeded and 0 when it did not; cmpxchg leaves the actual value of
 * '*DST' in the "a" register ('EXP') on failure. */
#define atomic_compare_exchange__(DST, EXP, SRC, RES, CLOB) \
    __asm__ volatile("lock; cmpxchg %3,%1 ; "               \
                     "      sete    %0      "               \
                     "# atomic_compare_exchange__"          \
                     : "=q" (RES),  /* 0 */                 \
                       "+m" (*DST), /* 1 */                 \
                       "+a" (EXP)   /* 2 */                 \
                     : "r" (SRC)    /* 3 */                 \
                     : CLOB, "cc")
/* ORD_FAIL is ignored, as atomic_compare_exchange__ already implements
 * at least as strong a barrier as allowed for ORD_FAIL in all cases.
 * Evaluates to a bool: true if the swap took place.  On failure the
 * actual value of '*DST' is stored back into '*EXP', per C11. */
#define atomic_compare_exchange_strong_explicit(DST, EXP, SRC, ORDER, ORD_FAIL) \
    ({                                                                  \
        __typeof__(DST) dst__ = (DST);                                  \
        __typeof__(DST) expp__ = (EXP);                                 \
        __typeof__(*(DST)) src__ = (SRC);                               \
        __typeof__(*(DST)) exp__ = *expp__;                             \
        uint8_t res__;                                                  \
                                                                        \
        if ((ORDER) > memory_order_consume) {                           \
            if (sizeof(*(DST)) <= 4) {                                  \
                atomic_compare_exchange__(dst__, exp__, src__, res__,   \
                                          "memory");                    \
            } else {                                                    \
                atomic_compare_exchange_8__(dst__, exp__, src__, res__, \
                                            "memory");                  \
            }                                                           \
        } else {                                                        \
            if (sizeof(*(DST)) <= 4) {                                  \
                atomic_compare_exchange__(dst__, exp__, src__, res__,   \
                                          "cc");                        \
            } else {                                                    \
                atomic_compare_exchange_8__(dst__, exp__, src__, res__, \
                                            "cc");                      \
            }                                                           \
        }                                                               \
        if (!res__) {                                                   \
            *expp__ = exp__;                                            \
        }                                                               \
        (bool) res__;                                                   \
    })

#define atomic_compare_exchange_strong(DST, EXP, SRC)      \
    atomic_compare_exchange_strong_explicit(DST, EXP, SRC, \
                                            memory_order_seq_cst, \
                                            memory_order_seq_cst)
/* x86 compare-exchange never fails spuriously, so "weak" is the same as
 * "strong". */
#define atomic_compare_exchange_weak \
    atomic_compare_exchange_strong
#define atomic_compare_exchange_weak_explicit \
    atomic_compare_exchange_strong_explicit
/* Atomic exchange-and-add: adds 'ARG' to '*RMW' and replaces 'ARG' with
 * the old value of '*RMW'. */
#define atomic_add__(RMW, ARG, CLOB)            \
    __asm__ volatile("lock; xadd %0,%1 ; "      \
                     "      # atomic_add__ "    \
                     : "+r" (ARG),  /* 0 */     \
                       "+m" (*RMW)  /* 1 */     \
                     :: CLOB, "cc")
/* Atomic add for objects of up to 4 bytes, storing the original value of
 * '*RMW' in '*ORIG'.  A "memory" clobber is only needed for memory orders
 * stronger than memory_order_consume. */
#define atomic_add_32__(RMW, ARG, ORIG, ORDER)     \
    ({                                             \
        __typeof__(RMW) rmw__ = (RMW);             \
        __typeof__(*(RMW)) arg__ = (ARG);          \
                                                   \
        if ((ORDER) > memory_order_consume) {      \
            atomic_add__(rmw__, arg__, "memory");  \
        } else {                                   \
            atomic_add__(rmw__, arg__, "cc");      \
        }                                          \
        *(ORIG) = arg__;                           \
    })
/* We could use simple locked instructions if the original value was not
 * needed.  Since it is, implement the read-modify-write as a
 * compare-exchange loop: read the current value, then keep attempting the
 * swap with 'val__ OP arg__' until it succeeds; 'val__' ends up holding
 * the pre-operation value. */
#define atomic_op__(RMW, OP, ARG, ORIG, ORDER)                          \
    ({                                                                  \
        __typeof__(RMW) rmw__ = (RMW);                                  \
        __typeof__(ARG) arg__ = (ARG);                                  \
                                                                        \
        __typeof__(*(RMW)) val__;                                       \
                                                                        \
        atomic_read_explicit(rmw__, &val__, memory_order_relaxed);      \
        do {                                                            \
        } while (!atomic_compare_exchange_weak_explicit(rmw__, &val__,  \
                                                        val__ OP arg__, \
                                                        ORDER,          \
                                                        memory_order_relaxed)); \
        *(ORIG) = val__;                                                \
    })
/* Atomic add, returning the original value in '*ORIG'.  8-byte objects
 * fall back to the compare-exchange loop of atomic_op__(). */
#define atomic_add_explicit(RMW, ARG, ORIG, ORDER)   \
    (sizeof(*(RMW)) <= 4                             \
     ? atomic_add_32__(RMW, ARG, ORIG, ORDER)        \
     : atomic_op__(RMW, +, ARG, ORIG, ORDER))
#define atomic_add(RMW, ARG, ORIG) \
    atomic_add_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
/* Atomic subtract, implemented for objects of up to 4 bytes as an atomic
 * add of the negated argument. */
#define atomic_sub_explicit(RMW, ARG, ORIG, ORDER)   \
    (sizeof(*(RMW)) <= 4                             \
     ? atomic_add_32__(RMW, -(ARG), ORIG, ORDER)     \
     : atomic_op__(RMW, -, ARG, ORIG, ORDER))
#define atomic_sub(RMW, ARG, ORIG) \
    atomic_sub_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
/* Bitwise read-modify-write operations.  These need the original value
 * (see the atomic_op__() comment above), so they all go through the
 * compare-exchange loop. */
#define atomic_or_explicit(RMW, ARG, ORIG, ORDER) \
    atomic_op__(RMW, |, ARG, ORIG, ORDER)
#define atomic_or(RMW, ARG, ORIG) \
    atomic_or_explicit(RMW, ARG, ORIG, memory_order_seq_cst)

#define atomic_xor_explicit(RMW, ARG, ORIG, ORDER) \
    atomic_op__(RMW, ^, ARG, ORIG, ORDER)
#define atomic_xor(RMW, ARG, ORIG) \
    atomic_xor_explicit(RMW, ARG, ORIG, memory_order_seq_cst)

#define atomic_and_explicit(RMW, ARG, ORIG, ORDER) \
    atomic_op__(RMW, &, ARG, ORIG, ORDER)
#define atomic_and(RMW, ARG, ORIG) \
    atomic_and_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
434 typedef ATOMIC(int) atomic_flag;
435 #define ATOMIC_FLAG_INIT { false }
437 #define atomic_flag_test_and_set_explicit(FLAG, ORDER) \
438 ((bool)atomic_exchange__(FLAG, 1, ORDER))
439 #define atomic_flag_test_and_set(FLAG) \
440 atomic_flag_test_and_set_explicit(FLAG, memory_order_seq_cst)
442 #define atomic_flag_clear_explicit(FLAG, ORDER) \
443 atomic_store_explicit(FLAG, 0, ORDER)
444 #define atomic_flag_clear(FLAG) \
445 atomic_flag_clear_explicit(FLAG, memory_order_seq_cst)