2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
26 #include "connectivity.h"
28 #include "dp-packet.h"
34 #include "ofproto/ofproto-dpif.h"
35 #include "ofproto/ofproto-dpif-rid.h"
36 #include "ofproto/ofproto-provider.h"
37 #include "openvswitch/dynamic-string.h"
38 #include "openvswitch/list.h"
39 #include "openvswitch/match.h"
40 #include "openvswitch/ofp-actions.h"
41 #include "openvswitch/ofp-util.h"
42 #include "openvswitch/ofpbuf.h"
43 #include "openvswitch/vlog.h"
45 #include "poll-loop.h"
/* Logging module for this file; the VLOG_* calls below are attributed to it. */
51 VLOG_DEFINE_THIS_MODULE(bond);
/* 'rwlock' protects 'all_bonds' (see the OVS_GUARDED_BY annotation) and is the
 * lock named by the OVS_REQ_RDLOCK/OVS_REQ_WRLOCK annotations throughout this
 * file.  'all_bonds' is a constant pointer to the 'all_bonds__' hmap. */
53 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
54 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
55 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
57 /* Bit-mask for hashing a flow down to a bucket. */
58 #define BOND_MASK 0xff
/* Number of hash buckets per bond: BOND_MASK + 1, i.e. 256. */
59 #define BOND_BUCKETS (BOND_MASK + 1)
61 /* A hash bucket for mapping a flow to a slave.
62 * "struct bond" has an array of BOND_BUCKETS of these (its 'hash' member).
 * update_recirc_rules() installs post-recirculation rules keyed by bucket
 * index, and bond_rebalance() may move a bucket to a less loaded slave. */
64 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
65 uint64_t tx_bytes /* Count of bytes recently transmitted. */
66 OVS_GUARDED_BY(rwlock);
67 struct ovs_list list_node; /* In bond_slave's 'entries' list. */
71 * 'pr_rule' is the post-recirculation rule for this entry.
72 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
73 * is used to determine delta (applied to 'tx_bytes' above.) */
75 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
78 /* A bond slave, that is, one of the links comprising a bond. */
80 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
81 struct ovs_list list_node; /* In struct bond's enabled_slaves list. */
82 struct bond *bond; /* The bond that contains this slave. */
83 void *aux; /* Client-provided handle for this slave. */
85 struct netdev *netdev; /* Network device, owned by the client. */
/* 'change_seq' is zeroed by bond_slave_set_netdev__() when the device changes
 * and refreshed from the connectivity sequence number in bond_run(). */
86 uint64_t change_seq; /* Tracks changes in 'netdev'. */
87 ofp_port_t ofp_port; /* OpenFlow port number. */
88 char *name; /* Name (a copy of netdev_get_name(netdev)). */
/* 'delay_expires' is LLONG_MAX when no timer is pending: bond_wait() only
 * arms a timer for other values. */
91 long long delay_expires; /* Time after which 'enabled' may change. */
92 bool enabled; /* May be chosen for flows? */
93 bool may_enable; /* Client considers this slave bondable. */
95 /* Rebalancing info. Used only by bond_rebalance(). */
96 struct ovs_list bal_node; /* In bond_rebalance()'s 'bals' list. */
97 struct ovs_list entries; /* 'struct bond_entry's assigned here. */
/* Recomputed from scratch at the start of every bond_rebalance() run. */
98 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
101 /* A bond, that is, a set of network devices grouped to improve performance or
 * robustness. */
104 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
105 char *name; /* Name provided by client. */
106 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
113 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
114 * (To prevent the bond_slave from disappearing they must also hold
116 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
117 struct ovs_list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
120 enum bond_mode balance; /* Balancing mode, one of BM_*. */
121 struct bond_slave *active_slave; /* May be NULL; bond_run() picks a new one
                                     * when it is NULL or not enabled. */
122 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
123 enum lacp_status lacp_status; /* Status of LACP negotiations. */
124 bool bond_revalidate; /* True if flows need revalidation. */
125 uint32_t basis; /* Basis for flow hash function. */
127 /* SLB specific bonding info. */
128 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
129 int rebalance_interval; /* Interval between rebalances, in ms. */
130 long long int next_rebalance; /* Next rebalancing time. */
131 bool send_learning_packets; /* Cleared by
                                * bond_should_send_learning_packets(). */
132 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
133 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
135 /* Store active slave to OVSDB. */
136 bool active_slave_changed; /* Set to true whenever the bond changes
137 active slave. It will be reset to false
138 after it is stored into OVSDB */
140 /* Interface name may not be persistent across an OS reboot, use
141 * MAC address for identifying the active slave */
142 struct eth_addr active_slave_mac;
143 /* The MAC address of the active interface. */
144 /* Legacy compatibility. */
145 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
147 struct ovs_refcount ref_cnt; /* Initialized in bond_create(); taken by
                                 * bond_ref(), dropped by bond_unref(). */
150 /* What to do with a post-recirculation rule (see struct bond_pr_rule_op). */
152 ADD, /* Add the rule to ofproto's flow table. */
153 DEL, /* Delete the rule from the ofproto's flow table. */
156 /* A rule to add to or delete from ofproto's internal flow table.
 * Instances live in a bond's 'pr_rule_ops' hmap, hashed by their match (see
 * add_pr_rule()), and are applied by update_recirc_rules(). */
157 struct bond_pr_rule_op {
158 struct hmap_node hmap_node;
/* Port used for the OUTPUT action when the rule is installed. */
160 ofp_port_t out_ofport;
/* Location that receives the installed rule; update_recirc_rules() stores
 * NULL through it when the op is removed. */
162 struct rule **pr_rule;
/* Forward declarations of file-local helpers.  The OVS_REQ_RDLOCK and
 * OVS_REQ_WRLOCK annotations give the mode in which the caller must already
 * hold 'rwlock'. */
165 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
166 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
167 OVS_REQ_RDLOCK(rwlock);
168 static void bond_enable_slave(struct bond_slave *, bool enable)
169 OVS_REQ_WRLOCK(rwlock);
170 static void bond_link_status_update(struct bond_slave *)
171 OVS_REQ_WRLOCK(rwlock);
172 static void bond_choose_active_slave(struct bond *)
173 OVS_REQ_WRLOCK(rwlock);
174 static unsigned int bond_hash_src(const struct eth_addr mac,
175 uint16_t vlan, uint32_t basis);
176 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
178 static struct bond_entry *lookup_bond_entry(const struct bond *,
181 OVS_REQ_RDLOCK(rwlock);
182 static struct bond_slave *get_enabled_slave(struct bond *)
183 OVS_REQ_RDLOCK(rwlock);
184 static struct bond_slave *choose_output_slave(const struct bond *,
186 struct flow_wildcards *,
188 OVS_REQ_RDLOCK(rwlock);
190 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
191 * stores the mode in '*balance' and returns true. Otherwise returns false
192 * without modifying '*balance'.
 *
 * 's' is compared exactly (strcmp) against the names produced by
 * bond_mode_to_string(), so the two functions cannot drift apart. */
194 bond_mode_from_string(enum bond_mode *balance, const char *s)
196 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
198 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
200 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
208 /* Returns a string representing 'balance': one of "balance-tcp",
 * "balance-slb" or "active-backup".  The results are string literals, so the
 * caller must not modify or free them. */
210 bond_mode_to_string(enum bond_mode balance) {
213 return "balance-tcp";
215 return "balance-slb";
217 return "active-backup";
223 /* Creates and returns a new bond whose configuration is initially taken from
 * 's' (applied through bond_reconfigure()).  The bond is zero-allocated, is
 * attached to 'ofproto', and has its reference count initialized here.
 *
226 * The caller should register each slave on the new bond by calling
227 * bond_slave_register(). */
229 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
233 bond = xzalloc(sizeof *bond);
234 bond->ofproto = ofproto;
235 hmap_init(&bond->slaves);
236 ovs_list_init(&bond->enabled_slaves);
237 ovs_mutex_init(&bond->mutex);
238 ovs_refcount_init(&bond->ref_cnt);
239 hmap_init(&bond->pr_rule_ops);
/* bond_reconfigure() also inserts the bond into 'all_bonds', because
 * bond->name is still NULL at this point. */
241 bond_reconfigure(bond, s);
/* Takes an additional reference on 'bond_'.  The const qualifier is cast away
 * only so that the reference count can be updated.
 * NOTE(review): any NULL check and the return value are on lines not visible
 * here -- confirm before relying on either. */
246 bond_ref(const struct bond *bond_)
248 struct bond *bond = CONST_CAST(struct bond *, bond_);
251 ovs_refcount_ref(&bond->ref_cnt);
/* Drops a reference to 'bond'.
 *
 * The teardown below applies only when 'bond' is non-NULL and
 * ovs_refcount_unref_relaxed() reports 1; otherwise the guard on the first
 * 'if' is taken (presumably an early return -- its body is not visible here).
 *
 * Teardown order: unlink from 'all_bonds' under the write lock, drain and
 * destroy the slave map, destroy the mutex, drain and destroy the pending
 * post-recirculation rule ops, then release the recirculation ID if one was
 * allocated. */
258 bond_unref(struct bond *bond)
260 struct bond_pr_rule_op *pr_op;
261 struct bond_slave *slave;
263 if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) {
267 ovs_rwlock_wrlock(&rwlock);
268 hmap_remove(all_bonds, &bond->hmap_node);
269 ovs_rwlock_unlock(&rwlock);
271 HMAP_FOR_EACH_POP (slave, hmap_node, &bond->slaves) {
272 /* Client owns 'slave->netdev'. */
276 hmap_destroy(&bond->slaves);
278 ovs_mutex_destroy(&bond->mutex);
282 HMAP_FOR_EACH_POP (pr_op, hmap_node, &bond->pr_rule_ops) {
285 hmap_destroy(&bond->pr_rule_ops);
287 if (bond->recirc_id) {
288 recirc_free_id(bond->recirc_id);
/* Records in 'bond->pr_rule_ops' that a post-recirculation rule matching
 * 'match' should output to 'out_ofport', with the resulting rule stored
 * through 'rule'.
 *
 * Ops are keyed by match_hash(match, 0).  If an op with an equal match is
 * already present, its output port and rule location are refreshed;
 * otherwise a new op is allocated with a copy of '*match' and inserted. */
295 add_pr_rule(struct bond *bond, const struct match *match,
296 ofp_port_t out_ofport, struct rule **rule)
298 uint32_t hash = match_hash(match, 0);
299 struct bond_pr_rule_op *pr_op;
301 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
302 if (match_equal(&pr_op->match, match)) {
304 pr_op->out_ofport = out_ofport;
305 pr_op->pr_rule = rule;
310 pr_op = xmalloc(sizeof *pr_op);
311 pr_op->match = *match;
313 pr_op->out_ofport = out_ofport;
314 pr_op->pr_rule = rule;
315 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
/* Brings the post-recirculation rules in 'bond->ofproto' in line with the
 * current bucket-to-slave assignment in 'bond->hash'.
 *
 * Three passes over 'bond->pr_rule_ops':
 *   1. visit every existing op (the loop body is not visible here;
 *      presumably it marks the op for deletion -- TODO confirm);
 *   2. if the bond has a hash table and a recirculation ID, call
 *      add_pr_rule() for buckets, matching on the recirc ID and on the
 *      datapath hash masked with BOND_MASK;
 *   3. apply each op: install an internal flow with a single OUTPUT action
 *      at RECIRC_RULE_PRIORITY, or delete the flow, drop the op from the map
 *      and store NULL through its 'pr_rule' pointer.
 *
 * Failures to add or remove a flow are logged with VLOG_ERR. */
319 update_recirc_rules(struct bond *bond)
320 OVS_REQ_WRLOCK(rwlock)
323 struct bond_pr_rule_op *pr_op, *next_op;
/* Small on-stack buffer for building the action list. */
324 uint64_t ofpacts_stub[128 / 8];
325 struct ofpbuf ofpacts;
328 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
330 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
334 if (bond->hash && bond->recirc_id) {
335 for (i = 0; i < BOND_BUCKETS; i++) {
336 struct bond_slave *slave = bond->hash[i].slave;
339 match_init_catchall(&match);
340 match_set_recirc_id(&match, bond->recirc_id);
341 match_set_dp_hash_masked(&match, i, BOND_MASK);
343 add_pr_rule(bond, &match, slave->ofp_port,
344 &bond->hash[i].pr_rule);
/* The _SAFE variant is needed because ops may be removed while iterating. */
349 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
353 ofpbuf_clear(&ofpacts);
354 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
355 error = ofproto_dpif_add_internal_flow(bond->ofproto,
357 RECIRC_RULE_PRIORITY, 0,
358 &ofpacts, pr_op->pr_rule);
360 char *err_s = match_to_string(&pr_op->match,
361 RECIRC_RULE_PRIORITY);
363 VLOG_ERR("failed to add post recirculation flow %s", err_s);
369 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
371 RECIRC_RULE_PRIORITY);
373 char *err_s = match_to_string(&pr_op->match,
374 RECIRC_RULE_PRIORITY);
376 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
380 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
381 *pr_op->pr_rule = NULL;
387 ofpbuf_uninit(&ofpacts);
391 /* Updates 'bond''s overall configuration to 's'.
393 * The caller should register each slave on 'bond' by calling
394 * bond_slave_register(). This is optional if none of the slaves'
395 * configuration has changed. In any case it can't hurt.
397 * Returns true if the configuration has changed in such a way that requires
 * flow revalidation. */
401 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
403 bool revalidate = false;
405 ovs_rwlock_wrlock(&rwlock);
/* A new bond (name still NULL) or a renamed one must be (re)indexed in
 * 'all_bonds', which is keyed by a hash of the name. */
406 if (!bond->name || strcmp(bond->name, s->name)) {
408 hmap_remove(all_bonds, &bond->hmap_node);
411 bond->name = xstrdup(s->name);
412 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
415 bond->updelay = s->up_delay;
416 bond->downdelay = s->down_delay;
418 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
419 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
423 if (bond->rebalance_interval != s->rebalance_interval) {
424 bond->rebalance_interval = s->rebalance_interval;
428 if (bond->balance != s->balance) {
429 bond->balance = s->balance;
433 if (bond->basis != s->basis) {
434 bond->basis = s->basis;
/* Consume a revalidation request left pending by other operations. */
438 if (bond->bond_revalidate) {
440 bond->bond_revalidate = false;
/* Every mode except active-backup holds a recirculation ID; it is released
 * again when the bond is (or becomes) active-backup. */
443 if (bond->balance != BM_AB) {
444 if (!bond->recirc_id) {
445 bond->recirc_id = recirc_alloc_id(bond->ofproto);
447 } else if (bond->recirc_id) {
448 recirc_free_id(bond->recirc_id);
/* Reset the hash table for active-backup, when there is no table yet, or
 * whenever a change above requires revalidation. */
452 if (bond->balance == BM_AB || !bond->hash || revalidate) {
453 bond_entry_reset(bond);
/* Adopt the configured active-slave MAC without flagging it as a change. */
456 bond->active_slave_mac = s->active_slave_mac;
457 bond->active_slave_changed = false;
459 ovs_rwlock_unlock(&rwlock);
/* Searches 'bond' for a slave whose network device currently reports the
 * Ethernet address 'mac'.
 *
 * A slave whose address cannot be read (netdev_get_etheraddr() returns
 * nonzero) takes the first branch; presumably it is skipped.
 * NOTE(review): the return statements are not visible here -- confirm what
 * is returned when no slave matches. */
463 static struct bond_slave *
464 bond_find_slave_by_mac(const struct bond *bond, const struct eth_addr mac)
466 struct bond_slave *slave;
468 /* Find the last active slave */
469 HMAP_FOR_EACH(slave, hmap_node, &bond->slaves) {
470 struct eth_addr slave_mac;
472 if (netdev_get_etheraddr(slave->netdev, &slave_mac)) {
476 if (eth_addr_equals(slave_mac, mac)) {
485 bond_active_slave_changed(struct bond *bond)
489 netdev_get_etheraddr(bond->active_slave->netdev, &mac);
490 bond->active_slave_mac = mac;
491 bond->active_slave_changed = true;
492 seq_change(connectivity_seq_get());
496 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
497 OVS_REQ_WRLOCK(rwlock)
499 if (slave->netdev != netdev) {
500 slave->netdev = netdev;
501 slave->change_seq = 0;
505 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
506 * arbitrary client-provided pointer that uniquely identifies a slave within a
507 * bond. If 'slave_' already exists within 'bond' then this function
508 * reconfigures the existing slave.
510 * 'netdev' must be the network device that 'slave_' represents. It is owned
511 * by the client, so the client must not close it before either unregistering
512 * 'slave_' or destroying 'bond'. */
515 bond_slave_register(struct bond *bond, void *slave_,
516 ofp_port_t ofport, struct netdev *netdev)
518 struct bond_slave *slave;
520 ovs_rwlock_wrlock(&rwlock);
521 slave = bond_slave_lookup(bond, slave_);
/* First registration of this handle: create the slave, index it in
 * 'bond->slaves' by a hash of the client pointer, and request revalidation. */
523 slave = xzalloc(sizeof *slave);
525 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
528 slave->ofp_port = ofport;
/* LLONG_MAX: no pending enable/disable timer (see bond_wait()). */
529 slave->delay_expires = LLONG_MAX;
530 slave->name = xstrdup(netdev_get_name(netdev));
531 bond->bond_revalidate = true;
/* Start disabled, then let bond_enable_slave() apply the current carrier. */
533 slave->enabled = false;
534 bond_enable_slave(slave, netdev_get_carrier(netdev));
537 bond_slave_set_netdev__(slave, netdev);
/* NOTE(review): for a new slave the name was already duplicated above; the
 * line between these two statements is not visible here and presumably frees
 * the previous copy -- confirm, otherwise this assignment leaks it. */
540 slave->name = xstrdup(netdev_get_name(netdev));
541 ovs_rwlock_unlock(&rwlock);
544 /* Updates the network device to be used with 'slave_' to 'netdev'.
546 * This is useful if the caller closes and re-opens the network device
547 * registered with bond_slave_register() but doesn't need to change anything
 * else. */
550 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
552 struct bond_slave *slave;
/* The write lock is needed because bond_slave_set_netdev__() modifies the
 * slave. */
554 ovs_rwlock_wrlock(&rwlock);
555 slave = bond_slave_lookup(bond, slave_);
/* NOTE(review): a guard for an unknown 'slave_' (NULL lookup result) is
 * presumably on the line not visible here -- confirm. */
557 bond_slave_set_netdev__(slave, netdev);
559 ovs_rwlock_unlock(&rwlock);
562 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
563 * then this function has no effect.
565 * Unregistering a slave invalidates all flows. */
567 bond_slave_unregister(struct bond *bond, const void *slave_)
569 struct bond_slave *slave;
572 ovs_rwlock_wrlock(&rwlock);
573 slave = bond_slave_lookup(bond, slave_);
578 bond->bond_revalidate = true;
579 bond_enable_slave(slave, false);
/* Remember whether the departing slave was the active one, so that a
 * replacement can be chosen once it is gone. */
581 del_active = bond->active_slave == slave;
583 struct bond_entry *e;
/* Visit all BOND_BUCKETS buckets (the bound is inclusive of index BOND_MASK)
 * looking for entries still assigned to the departing slave. */
584 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
585 if (e->slave == slave) {
593 hmap_remove(&bond->slaves, &slave->hmap_node);
594 /* Client owns 'slave->netdev'. */
598 bond_choose_active_slave(bond);
599 bond->send_learning_packets = true;
602 ovs_rwlock_unlock(&rwlock);
605 /* Should be called on each slave in 'bond' before bond_run() to indicate
606 * whether or not 'slave_' may be enabled. This function is intended to allow
607 * other protocols to have some impact on bonding decisions. For example LACP
608 * or high level link monitoring protocols may decide that a given slave should
609 * not be able to send traffic. */
611 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
613 ovs_rwlock_wrlock(&rwlock);
614 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
615 ovs_rwlock_unlock(&rwlock);
618 /* Performs periodic maintenance on 'bond'.
 *
 * 'lacp_status' is the caller's current view of LACP negotiation; a change
 * from the stored status requests revalidation.
 *
620 * Returns true if the caller should revalidate its flows.
622 * The caller should check bond_should_send_learning_packets() afterward. */
624 bond_run(struct bond *bond, enum lacp_status lacp_status)
626 struct bond_slave *slave;
629 ovs_rwlock_wrlock(&rwlock);
630 if (bond->lacp_status != lacp_status) {
631 bond->lacp_status = lacp_status;
632 bond->bond_revalidate = true;
635 /* Enable slaves based on link status and LACP feedback. */
636 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
637 bond_link_status_update(slave);
/* Record the sequence number just processed; bond_wait() waits for the
 * connectivity sequence to move past it. */
638 slave->change_seq = seq_read(connectivity_seq_get());
640 if (!bond->active_slave || !bond->active_slave->enabled) {
641 bond_choose_active_slave(bond);
/* Hand the pending revalidation request to the caller and clear it. */
644 revalidate = bond->bond_revalidate;
645 bond->bond_revalidate = false;
646 ovs_rwlock_unlock(&rwlock);
651 /* Causes poll_block() to wake up when 'bond' needs something to be done.
 *
 * Wake-up sources: each slave's pending 'delay_expires' timer, a change in
 * the connectivity sequence number since the slave's recorded 'change_seq',
 * and an outstanding revalidation request. */
653 bond_wait(struct bond *bond)
655 struct bond_slave *slave;
657 ovs_rwlock_rdlock(&rwlock);
658 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* LLONG_MAX means no timer is pending for this slave. */
659 if (slave->delay_expires != LLONG_MAX) {
660 poll_timer_wait_until(slave->delay_expires);
663 seq_wait(connectivity_seq_get(), slave->change_seq);
666 if (bond->bond_revalidate) {
667 poll_immediate_wake();
669 ovs_rwlock_unlock(&rwlock);
671 /* We don't wait for bond->next_rebalance because rebalancing can only run
672 * at a flow account checkpoint. ofproto does checkpointing on its own
673 * schedule and bond_rebalance() gets called afterward, so we'd just be
674 * waking up for no purpose. */
677 /* MAC learning table interaction. */
680 may_send_learning_packets(const struct bond *bond)
682 return ((bond->lacp_status == LACP_DISABLED
683 && (bond->balance == BM_SLB || bond->balance == BM_AB))
684 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
685 && bond->active_slave;
688 /* Returns true if 'bond' needs the client to send out packets to assist with
689 * MAC learning on 'bond'. If this function returns true, then the client
690 * should iterate through its MAC learning table for the bridge on which 'bond'
691 * is located. For each MAC that has been learned on a port other than 'bond',
692 * it should call bond_compose_learning_packet().
694 * This function will only return true if 'bond' is in SLB or active-backup
695 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
 * necessary.
 *
698 * Calling this function resets the state that it checks. */
700 bond_should_send_learning_packets(struct bond *bond)
/* The write lock is taken because the request flag is cleared below. */
704 ovs_rwlock_wrlock(&rwlock);
705 send = bond->send_learning_packets && may_send_learning_packets(bond);
706 bond->send_learning_packets = false;
707 ovs_rwlock_unlock(&rwlock);
711 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
 *
 * More precisely, this composes a RARP packet with source 'eth_src' and
 * stores in '*port_aux' the client handle of the slave selected for a flow
 * whose only populated field is that source address.  It asserts that
 * may_send_learning_packets() holds for 'bond'.
 *
713 * See bond_should_send_learning_packets() for description of usage. The
714 * caller should send the composed packet on the port associated with
715 * port_aux and takes ownership of the returned ofpbuf. */
717 bond_compose_learning_packet(struct bond *bond, const struct eth_addr eth_src,
718 uint16_t vlan, void **port_aux)
720 struct bond_slave *slave;
721 struct dp_packet *packet;
724 ovs_rwlock_rdlock(&rwlock);
725 ovs_assert(may_send_learning_packets(bond));
/* Build a minimal flow carrying just the source MAC for slave selection. */
726 memset(&flow, 0, sizeof flow);
727 flow.dl_src = eth_src;
728 slave = choose_output_slave(bond, &flow, NULL, vlan);
730 packet = dp_packet_new(0);
731 compose_rarp(packet, eth_src);
733 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
/* NOTE(review): 'slave' is dereferenced without a visible NULL check; this
 * relies on choose_output_slave() finding a slave whenever the assertion
 * above holds -- confirm. */
736 *port_aux = slave->aux;
737 ovs_rwlock_unlock(&rwlock);
741 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
742 * Ethernet destination address of 'eth_dst', should be admitted.
744 * The return value is one of the following:
746 * - BV_ACCEPT: Admit the packet.
748 * - BV_DROP: Drop the packet.
750 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
751 * Ethernet source address and VLAN. If there is none, or if the packet
752 * is on the learned port, then admit the packet. If a different port has
753 * been learned, however, drop the packet (and do not use it for MAC
 * learning). */
757 bond_check_admissibility(struct bond *bond, const void *slave_,
758 const struct eth_addr eth_dst)
/* Default verdict; the checks below only ever relax it. */
760 enum bond_verdict verdict = BV_DROP;
761 struct bond_slave *slave;
/* A read lock is taken; the visible code only reads bond state. */
763 ovs_rwlock_rdlock(&rwlock);
764 slave = bond_slave_lookup(bond, slave_);
769 /* LACP bonds have very loose admissibility restrictions because we can
770 * assume the remote switch is aware of the bond and will "do the right
771 * thing". However, as a precaution we drop packets on disabled slaves
772 * because no correctly implemented partner switch should be sending
775 * If LACP is configured, but LACP negotiations have been unsuccessful, we
776 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
777 switch (bond->lacp_status) {
778 case LACP_NEGOTIATED:
779 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
781 case LACP_CONFIGURED:
782 if (!bond->lacp_fallback_ab) {
789 /* Drop all multicast packets on inactive slaves. */
790 if (eth_addr_is_multicast(eth_dst)) {
791 if (bond->active_slave != slave) {
796 switch (bond->balance) {
798 /* TCP balanced bonds require successful LACP negotiations. Based on the
799 * above check, LACP is off or lacp_fallback_ab is true on this bond.
800 * If lacp_fallback_ab is true fall through to BM_AB case else, we
801 * drop all incoming traffic. */
802 if (!bond->lacp_fallback_ab) {
807 /* Drop all packets which arrive on backup slaves. This is similar to
808 * how Linux bonding handles active-backup bonds. */
809 if (bond->active_slave != slave) {
810 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
812 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
813 " slave (%s) destined for " ETH_ADDR_FMT,
814 slave->name, ETH_ADDR_ARGS(eth_dst));
821 /* Drop all packets for which we have learned a different input port,
822 * because we probably sent the packet on one slave and got it back on
823 * the other. Gratuitous ARP packets are an exception to this rule:
824 * the host has moved to another switch. The exception to the
825 * exception is if we locked the learning table to avoid reflections on
827 verdict = BV_DROP_IF_MOVED;
833 ovs_rwlock_unlock(&rwlock);
838 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
839 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
840 * NULL if the packet should be dropped because no slaves are enabled.
842 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
843 * should be a VID only (i.e. excluding the PCP bits). Second,
844 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
845 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
846 * packet belongs to (so for an access port it will be the access port's VLAN).
848 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
849 * significant in the selection. At some point earlier, 'wc' should
850 * have been initialized (e.g., by flow_wildcards_init_catchall()). */
853 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
854 struct flow_wildcards *wc, uint16_t vlan)
856 struct bond_slave *slave;
/* Locked wrapper around choose_output_slave(): the client handle is copied
 * out while the read lock is held, so no bond_slave pointer is used after
 * the lock is released. */
859 ovs_rwlock_rdlock(&rwlock);
860 slave = choose_output_slave(bond, flow, wc, vlan);
861 aux = slave ? slave->aux : NULL;
862 ovs_rwlock_unlock(&rwlock);
/* Folds a fresh reading of a post-recirculation rule's byte counter into
 * 'entry'.  'rule_tx_bytes' is the rule's current total; the increase since
 * the last reading ('entry->pr_tx_bytes') is added to 'entry->tx_bytes' and
 * the reading is remembered for next time.
 * NOTE(review): lines between the signature and the arithmetic are not
 * visible here; they may guard the update -- confirm. */
869 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
870 OVS_REQ_WRLOCK(rwlock)
875 delta = rule_tx_bytes - entry->pr_tx_bytes;
876 entry->tx_bytes += delta;
877 entry->pr_tx_bytes = rule_tx_bytes;
881 /* Maintain bond stats using post recirculation rule byte counters.
 *
 * For each of the BOND_BUCKETS entries of 'bond->hash', reads the statistics
 * of the entry's post-recirculation rule through the owning ofproto class's
 * rule_get_stats() callback and passes the byte count to
 * bond_entry_account().  Packet count and last-used time are fetched but
 * not used. */
883 bond_recirculation_account(struct bond *bond)
884 OVS_REQ_WRLOCK(rwlock)
888 for (i=0; i<=BOND_MASK; i++) {
889 struct bond_entry *entry = &bond->hash[i];
890 struct rule *rule = entry->pr_rule;
893 uint64_t n_packets OVS_UNUSED;
894 long long int used OVS_UNUSED;
897 rule->ofproto->ofproto_class->rule_get_stats(
898 rule, &n_packets, &n_bytes, &used);
899 bond_entry_account(entry, n_bytes);
/* Reports whether 'bond' may use recirculation, which the visible code
 * allows only for BM_TCP bonds that hold a recirculation ID.  In that case
 * the ID and the hash basis are stored through the output parameters.
 * NOTE(review): bond_rebalance() passes NULL for both outputs, so the stores
 * are presumably guarded on lines not visible here -- confirm. */
905 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
908 if (bond->balance == BM_TCP && bond->recirc_id) {
910 *recirc_id = bond->recirc_id;
913 *hash_bias = bond->basis;
/* Ensures every hash bucket of 'bond' points at a usable slave and then
 * refreshes the post-recirculation rules.  'force' seeds 'update_rules', so
 * a caller can demand a refresh even when no bucket needed reassignment.
 *
 * A bucket with no slave, or with a disabled one, is given a randomly chosen
 * slave; if that slave is not enabled either, the active slave is used.
 * NOTE(review): hmap_random_node() is applied to 'bond->slaves' without a
 * visible emptiness check -- confirm this cannot run on a bond with no
 * slaves. */
922 bond_update_post_recirc_rules__(struct bond* bond, const bool force)
923 OVS_REQ_WRLOCK(rwlock)
925 struct bond_entry *e;
926 bool update_rules = force; /* Always update rules if caller forces it. */
928 /* Make sure all bond entries are populated */
929 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
930 if (!e->slave || !e->slave->enabled) {
932 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
933 struct bond_slave, hmap_node);
934 if (!e->slave->enabled) {
935 e->slave = bond->active_slave;
941 update_recirc_rules(bond);
/* Locked entry point for bond_update_post_recirc_rules__(): takes 'rwlock'
 * for writing around the call.  See that function for the meaning of
 * 'force'. */
946 bond_update_post_recirc_rules(struct bond* bond, const bool force)
948 ovs_rwlock_wrlock(&rwlock);
949 bond_update_post_recirc_rules__(bond, force);
950 ovs_rwlock_unlock(&rwlock);
956 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
958 return bond->rebalance_interval
959 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
962 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'.
 *
 * The bytes are added to the hash bucket that 'flow' and 'vlan' map to, and
 * only when the bond is balanced (see bond_is_balanced()); otherwise the
 * call has no effect. */
964 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
967 ovs_rwlock_wrlock(&rwlock);
968 if (bond_is_balanced(bond)) {
969 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
971 ovs_rwlock_unlock(&rwlock);
974 static struct bond_slave *
975 bond_slave_from_bal_node(struct ovs_list *bal) OVS_REQ_RDLOCK(rwlock)
977 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* Logs, at debug level, the rebalancing candidates in 'bals' for 'bond'.
 *
 * For each slave the line shows its name and load in kB (tx_bytes / 1024),
 * a "(disabled)" marker when applicable, and a parenthesized list of the
 * hash buckets assigned to it with their individual loads.  Nothing is
 * built unless debug logging is enabled. */
981 log_bals(struct bond *bond, const struct ovs_list *bals)
982 OVS_REQ_RDLOCK(rwlock)
984 if (VLOG_IS_DBG_ENABLED()) {
985 struct ds ds = DS_EMPTY_INITIALIZER;
986 const struct bond_slave *slave;
988 LIST_FOR_EACH (slave, bal_node, bals) {
990 ds_put_char(&ds, ',');
992 ds_put_format(&ds, " %s %"PRIu64"kB",
993 slave->name, slave->tx_bytes / 1024);
995 if (!slave->enabled) {
996 ds_put_cstr(&ds, " (disabled)");
998 if (!ovs_list_is_empty(&slave->entries)) {
999 struct bond_entry *e;
1001 ds_put_cstr(&ds, " (");
1002 LIST_FOR_EACH (e, list_node, &slave->entries) {
/* Separator before every entry except the first. */
1003 if (&e->list_node != ovs_list_front(&slave->entries)) {
1004 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' is the bucket index of this entry. */
1006 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
1007 e - bond->hash, e->tx_bytes / 1024);
1009 ds_put_cstr(&ds, ")");
1012 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
1017 /* Shifts 'hash' from its current slave to 'to'.
 *
 * 'hash->slave' must be non-NULL on entry.  The bucket's byte count is moved
 * from the old slave's total to the new slave's total, the move is logged at
 * info level, and the bond is flagged for flow revalidation. */
1019 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
1020 OVS_REQ_WRLOCK(rwlock)
1022 struct bond_slave *from = hash->slave;
1023 struct bond *bond = from->bond;
1024 uint64_t delta = hash->tx_bytes;
/* The logged loads are the values the slaves will carry after the move. */
1026 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1027 "from %s to %s (now carrying %"PRIu64"kB and "
1028 "%"PRIu64"kB load, respectively)",
1029 bond->name, delta / 1024, hash - bond->hash,
1030 from->name, to->name,
1031 (from->tx_bytes - delta) / 1024,
1032 (to->tx_bytes + delta) / 1024);
1034 /* Shift load away from 'from' to 'to'. */
1035 from->tx_bytes -= delta;
1036 to->tx_bytes += delta;
1038 /* Arrange for flows to be revalidated. */
1040 bond->bond_revalidate = true;
1043 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1044 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1045 * given that doing so must decrease the ratio of the load on the two slaves by
1046 * at least 0.1. Returns NULL if there is no appropriate entry.
1048 * The list of entries isn't sorted. I don't know of a reason to prefer to
1049 * shift away small hashes or large hashes. */
1050 static struct bond_entry *
1051 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1052 OVS_REQ_WRLOCK(rwlock)
1054 struct bond_entry *e;
1056 if (ovs_list_is_short(&from->entries)) {
1057 /* 'from' carries no more than one MAC hash, so shifting load away from
1058 * it would be pointless. */
1062 LIST_FOR_EACH (e, list_node, &from->entries) {
1063 uint64_t delta = e->tx_bytes; /* The amount to rebalance. */
1064 uint64_t ideal_tx_bytes = (from->tx_bytes + to_tx_bytes)/2;
1065 /* Note, the ideal traffic is the mid point
1066 * between 'from' and 'to'. This value does
1067 * not change by rebalancing. */
1068 uint64_t new_low; /* The lower bandwidth between 'to' and 'from'
1069 after rebalancing. */
1071 new_low = MIN(from->tx_bytes - delta, to_tx_bytes + delta);
1073 if ((new_low > to_tx_bytes) &&
1074 (new_low - to_tx_bytes >= (ideal_tx_bytes - to_tx_bytes) / 10)) {
1075 /* Only rebalance if the new 'low' is closer to the mid point,
1076 * and the improvement exceeds 10% of current traffic
1077 * deviation from the ideal split.
1079 * The improvement on the 'high' side is always the same as the
1080 * 'low' side. Thus consider 'low' side is sufficient. */
1088 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
 * maintained. */
1091 insert_bal(struct ovs_list *bals, struct bond_slave *slave)
1093 struct bond_slave *pos;
/* Scan for the first list member carrying less load than 'slave'. */
1095 LIST_FOR_EACH (pos, bal_node, bals) {
1096 if (slave->tx_bytes > pos->tx_bytes) {
/* Link 'slave' in at 'pos'.
 * NOTE(review): this presumably relies on the loop leaving 'pos' at the list
 * head when no lighter member exists, so that 'slave' lands at the tail --
 * confirm against the LIST_FOR_EACH and ovs_list_insert() definitions. */
1100 ovs_list_insert(&pos->bal_node, &slave->bal_node);
1103 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1104 * that descending order of 'tx_bytes' is maintained.
 *
 * 'slave' must currently be linked into a list through its 'bal_node':
 * the removal is unconditional. */
1106 reinsert_bal(struct ovs_list *bals, struct bond_slave *slave)
1108 ovs_list_remove(&slave->bal_node);
1109 insert_bal(bals, slave);
1112 /* If 'bond' needs rebalancing, does so.
1114 * The caller should have called bond_account() for each active flow, or, if
1115 * recirculation is used, have called bond_recirculation_account(bond),
1116 * to ensure that flow data is consistently accounted at this point. */
1119 bond_rebalance(struct bond *bond)
1121 struct bond_slave *slave;
1122 struct bond_entry *e;
1123 struct ovs_list bals;
1124 bool rebalanced = false;
1127 ovs_rwlock_wrlock(&rwlock);
/* Nothing to do for an unbalanced bond or before the interval has elapsed. */
1128 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1131 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* Recirculation needs both datapath support and a bond that may use it. */
1133 use_recirc = ofproto_dpif_get_support(bond->ofproto)->odp.recirc &&
1134 bond_may_recirc(bond, NULL, NULL);
1137 bond_recirculation_account(bond);
1140 /* Add each bond_entry to its slave's 'entries' list.
1141 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1142 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1143 slave->tx_bytes = 0;
1144 ovs_list_init(&slave->entries);
/* Buckets with no slave or no traffic are left out of the lists. */
1146 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1147 if (e->slave && e->tx_bytes) {
1148 e->slave->tx_bytes += e->tx_bytes;
1149 ovs_list_push_back(&e->slave->entries, &e->list_node);
1153 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1155 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1156 * with a proper list sort algorithm. */
1157 ovs_list_init(&bals);
1158 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1159 if (slave->enabled) {
1160 insert_bal(&bals, slave);
1163 log_bals(bond, &bals);
1165 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1166 while (!ovs_list_is_short(&bals)) {
1167 struct bond_slave *from = bond_slave_from_bal_node(ovs_list_front(&bals));
1168 struct bond_slave *to = bond_slave_from_bal_node(ovs_list_back(&bals));
/* 'bals' is sorted in descending order, so this difference cannot wrap. */
1171 overload = from->tx_bytes - to->tx_bytes;
/* 'to->tx_bytes >> 5' is 1/32 of the lighter slave's load. */
1172 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1173 /* The extra load on 'from' (and all less-loaded slaves), compared
1174 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1175 * it is less than ~1Mbps. No point in rebalancing. */
1179 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1180 * to move from 'from' to 'to'. */
1181 e = choose_entry_to_migrate(from, to->tx_bytes);
1183 bond_shift_load(e, to);
1185 /* Delete element from from->entries.
1187 * We don't add the element to to->hashes. That would only allow
1188 * 'e' to be migrated to another slave in this rebalancing run, and
1189 * there is no point in doing that. */
1190 ovs_list_remove(&e->list_node);
1192 /* Re-sort 'bals'. */
1193 reinsert_bal(&bals, from);
1194 reinsert_bal(&bals, to);
1197 /* Can't usefully migrate anything away from 'from'.
1198 * Don't reconsider it. */
1199 ovs_list_remove(&from->bal_node);
1203 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1204 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1205 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1206 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
/* Rules only need rewriting when recirculation is in use and at least one
 * bucket actually moved. */
1210 if (use_recirc && rebalanced) {
1211 bond_update_post_recirc_rules__(bond,true);
1215 ovs_rwlock_unlock(&rwlock);
1218 /* Bonding unixctl user interface functions. */
/* Looks up the bond named 'name' in 'all_bonds'.
 *
 * The OVS_REQ_RDLOCK annotation requires the caller to hold 'rwlock'
 * (at least for reading) around the call. */
1220 static struct bond *
1221 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
/* Only candidates whose hmap hash equals hash_string(name, 0) are visited;
 * strcmp() then confirms an exact name match. */
1225 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1227 if (!strcmp(bond->name, name)) {
/* Looks up the slave of 'bond' whose name is exactly 'slave_name'. */
1234 static struct bond_slave *
1235 bond_lookup_slave(struct bond *bond, const char *slave_name)
1237 struct bond_slave *slave;
/* Slaves are compared by name with a walk over all of bond->slaves. */
1239 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1240 if (!strcmp(slave->name, slave_name)) {
/* unixctl handler that replies on 'conn' with a tab-separated table of all
 * bonds: a header row, then one row per bond giving its name, balance mode
 * (as a string), recirculation ID and the names of its slaves.
 *
 * 'argc', 'argv' and 'aux' are unused. */
1248 bond_unixctl_list(struct unixctl_conn *conn,
1249 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1250 void *aux OVS_UNUSED)
1252 struct ds ds = DS_EMPTY_INITIALIZER;
1253 const struct bond *bond;
/* Column headers; each bond row below follows the same column order. */
1255 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
/* 'all_bonds' is guarded by 'rwlock', so hold it for reading while the
 * table is built. */
1257 ovs_rwlock_rdlock(&rwlock);
1258 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1259 const struct bond_slave *slave;
1262 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1263 bond_mode_to_string(bond->balance), bond->recirc_id);
/* Last column: the slave names, using ", " as the separator. */
1266 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1268 ds_put_cstr(&ds, ", ");
1270 ds_put_cstr(&ds, slave->name);
1272 ds_put_char(&ds, '\n');
/* The reply is sent only after the lock has been released. */
1274 ovs_rwlock_unlock(&rwlock);
1275 unixctl_command_reply(conn, ds_cstr(&ds));
/* Appends a human-readable description of 'bond' to 'ds': a banner with the
 * bond's name, its balance mode, recirculation status, hash basis, updelay
 * and downdelay, LACP status and active slave MAC, followed by one section
 * per slave.
 *
 * The OVS_REQ_RDLOCK annotation requires the caller to hold 'rwlock'. */
1280 bond_print_details(struct ds *ds, const struct bond *bond)
1281 OVS_REQ_RDLOCK(rwlock)
1283 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1284 const struct shash_node **sorted_slaves = NULL;
1285 const struct bond_slave *slave;
1290 ds_put_format(ds, "---- %s ----\n", bond->name);
1291 ds_put_format(ds, "bond_mode: %s\n",
1292 bond_mode_to_string(bond->balance));
/* bond_may_recirc() reports whether recirculation may be used and fills in
 * 'recirc_id'; -1 is printed in place of the ID when it may not. */
1294 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1295 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1296 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1298 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1300 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1301 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
/* The time remaining until the next rebalance is reported for balanced
 * bonds. */
1303 if (bond_is_balanced(bond)) {
1304 ds_put_format(ds, "next rebalance: %lld ms\n",
1305 bond->next_rebalance - time_msec());
/* Translate the LACP status into "negotiated", "configured", "off" or
 * "<unknown>". */
1308 ds_put_cstr(ds, "lacp_status: ");
1309 switch (bond->lacp_status) {
1310 case LACP_NEGOTIATED:
1311 ds_put_cstr(ds, "negotiated\n");
1313 case LACP_CONFIGURED:
1314 ds_put_cstr(ds, "configured\n");
1317 ds_put_cstr(ds, "off\n");
1320 ds_put_cstr(ds, "<unknown>\n");
/* Print the recorded active slave MAC, then in parentheses the name of the
 * slave that bond_find_slave_by_mac() maps it to, or "none". */
1324 ds_put_cstr(ds, "active slave mac: ");
1325 ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_slave_mac));
1326 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1327 ds_put_format(ds,"(%s)\n", slave ? slave->name : "none");
/* Index the slaves by name in a temporary shash so that shash_sort() can
 * hand them back as a sorted array (presumably ordered by name -- confirm
 * against shash_sort()). */
1329 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1330 shash_add(&slave_shash, slave->name, slave);
1332 sorted_slaves = shash_sort(&slave_shash);
1334 for (i = 0; i < shash_count(&slave_shash); i++) {
1335 struct bond_entry *be;
1337 slave = sorted_slaves[i]->data;
1340 ds_put_format(ds, "\nslave %s: %s\n",
1341 slave->name, slave->enabled ? "enabled" : "disabled");
1342 if (slave == bond->active_slave) {
1343 ds_put_cstr(ds, "\tactive slave\n");
/* A 'delay_expires' other than LLONG_MAX means an updelay or downdelay is
 * pending; an enabled slave is waiting out a downdelay, a disabled one an
 * updelay. */
1345 if (slave->delay_expires != LLONG_MAX) {
1346 ds_put_format(ds, "\t%s expires in %lld ms\n",
1347 slave->enabled ? "downdelay" : "updelay",
1348 slave->delay_expires - time_msec());
1351 ds_put_format(ds, "\tmay_enable: %s\n",
1352 slave->may_enable ? "true" : "false");
/* The per-hash load listing below is relevant to balanced bonds only. */
1354 if (!bond_is_balanced(bond)) {
/* Walk all BOND_BUCKETS hash entries, matching each against this slave;
 * 'hash' is the entry's index and its load is reported in units of
 * 1024 bytes. */
1359 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1360 int hash = be - bond->hash;
1363 if (be->slave != slave) {
1367 be_tx_k = be->tx_bytes / 1024;
1369 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1373 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
/* Release the temporary index and the array returned by shash_sort(). */
1376 shash_destroy(&slave_shash);
1377 free(sorted_slaves);
1378 ds_put_cstr(ds, "\n");
/* unixctl handler for "bond/show [port]" (registered with zero or one
 * argument).  Replies on 'conn' with the output of bond_print_details() for
 * the bond named by argv[1], or for every bond in 'all_bonds'; replies with
 * the error "no such bond" when the lookup of argv[1] fails. */
1382 bond_unixctl_show(struct unixctl_conn *conn,
1383 int argc, const char *argv[],
1384 void *aux OVS_UNUSED)
1386 struct ds ds = DS_EMPTY_INITIALIZER;
/* bond_find() and bond_print_details() are both annotated as requiring
 * 'rwlock', and 'all_bonds' is guarded by it. */
1388 ovs_rwlock_rdlock(&rwlock);
/* Single-bond case: the bond is looked up by the name in argv[1]. */
1390 const struct bond *bond = bond_find(argv[1]);
1393 unixctl_command_reply_error(conn, "no such bond");
1396 bond_print_details(&ds, bond);
/* All-bonds case: a separate scope with its own iterator. */
1398 const struct bond *bond;
1400 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1401 bond_print_details(&ds, bond);
1405 unixctl_command_reply(conn, ds_cstr(&ds));
1409 ovs_rwlock_unlock(&rwlock);
1413 bond_unixctl_migrate(struct unixctl_conn *conn,
1414 int argc OVS_UNUSED, const char *argv[],
1415 void *aux OVS_UNUSED)
1417 const char *bond_s = argv[1];
1418 const char *hash_s = argv[2];
1419 const char *slave_s = argv[3];
1421 struct bond_slave *slave;
1422 struct bond_entry *entry;
1425 ovs_rwlock_wrlock(&rwlock);
1426 bond = bond_find(bond_s);
1428 unixctl_command_reply_error(conn, "no such bond");
1432 if (bond->balance != BM_SLB) {
1433 unixctl_command_reply_error(conn, "not an SLB bond");
1437 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1438 hash = atoi(hash_s) & BOND_MASK;
1440 unixctl_command_reply_error(conn, "bad hash");
1444 slave = bond_lookup_slave(bond, slave_s);
1446 unixctl_command_reply_error(conn, "no such slave");
1450 if (!slave->enabled) {
1451 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1455 entry = &bond->hash[hash];
1456 bond->bond_revalidate = true;
1457 entry->slave = slave;
1458 unixctl_command_reply(conn, "migrated");
1461 ovs_rwlock_unlock(&rwlock);
/* unixctl handler for "bond/set-active-slave port slave".
 *
 * Makes the slave named by argv[2] the active slave of the bond named by
 * argv[1].  Replies "done" when the active slave changed and "no change"
 * when it was already active; replies with the error "no such bond",
 * "no such slave" or "cannot make disabled slave active" otherwise. */
1465 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1466 int argc OVS_UNUSED, const char *argv[],
1467 void *aux OVS_UNUSED)
1469 const char *bond_s = argv[1];
1470 const char *slave_s = argv[2];
1472 struct bond_slave *slave;
/* The bond is modified below, so the lock is taken for writing. */
1474 ovs_rwlock_wrlock(&rwlock);
1475 bond = bond_find(bond_s);
1477 unixctl_command_reply_error(conn, "no such bond");
1481 slave = bond_lookup_slave(bond, slave_s);
1483 unixctl_command_reply_error(conn, "no such slave");
1487 if (!slave->enabled) {
1488 unixctl_command_reply_error(conn, "cannot make disabled slave active");
/* On an actual change: flag the bond for revalidation, switch the active
 * slave, log it, request learning packets and notify via
 * bond_active_slave_changed(). */
1492 if (bond->active_slave != slave) {
1493 bond->bond_revalidate = true;
1494 bond->active_slave = slave;
1495 VLOG_INFO("bond %s: active interface is now %s",
1496 bond->name, slave->name);
1497 bond->send_learning_packets = true;
1498 unixctl_command_reply(conn, "done");
1499 bond_active_slave_changed(bond);
1501 unixctl_command_reply(conn, "no change");
1504 ovs_rwlock_unlock(&rwlock);
/* Shared implementation of the "bond/enable-slave" and "bond/disable-slave"
 * unixctl commands.
 *
 * Looks up the bond named by argv[1] and its slave named by argv[2], then
 * calls bond_enable_slave() with 'enable'.  Replies "enabled" or "disabled"
 * on success, or with the error "no such bond" / "no such slave". */
1508 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1510 const char *bond_s = argv[1];
1511 const char *slave_s = argv[2];
1513 struct bond_slave *slave;
/* Write lock: bond_enable_slave() modifies the slave and its bond. */
1515 ovs_rwlock_wrlock(&rwlock);
1516 bond = bond_find(bond_s);
1518 unixctl_command_reply_error(conn, "no such bond");
1522 slave = bond_lookup_slave(bond, slave_s);
1524 unixctl_command_reply_error(conn, "no such slave");
1528 bond_enable_slave(slave, enable);
1529 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1532 ovs_rwlock_unlock(&rwlock);
/* unixctl handler for "bond/enable-slave port slave": forwards to
 * enable_slave() with 'enable' set to true.  'argc' and 'aux' are unused. */
1536 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1537 int argc OVS_UNUSED, const char *argv[],
1538 void *aux OVS_UNUSED)
1540 enable_slave(conn, argv, true);
/* unixctl handler for "bond/disable-slave port slave": forwards to
 * enable_slave() with 'enable' set to false.  'argc' and 'aux' are unused. */
1544 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1545 int argc OVS_UNUSED, const char *argv[],
1546 void *aux OVS_UNUSED)
1548 enable_slave(conn, argv, false);
/* unixctl handler for "bond/hash mac [vlan] [basis]".
 *
 * Replies with bond_hash_src(mac, vlan, basis) & BOND_MASK formatted as an
 * unsigned decimal number.  Replies with the error "invalid vlan",
 * "invalid basis" or "invalid mac" when the corresponding argument fails to
 * scan. */
1552 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1553 void *aux OVS_UNUSED)
/* 'vlan' and 'basis' are optional; their strings are NULL when not given. */
1555 const char *mac_s = argv[1];
1556 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1557 const char *basis_s = argc > 3 ? argv[3] : NULL;
1558 struct eth_addr mac;
/* NOTE(review): the vlan is scanned with "%u" but bond_hash_src() takes a
 * uint16_t, so a larger value would be truncated on the call below --
 * confirm whether a range check is wanted. */
1565 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1566 unixctl_command_reply_error(conn, "invalid vlan");
1574 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1575 unixctl_command_reply_error(conn, "invalid basis");
/* The hash is computed only if the MAC parses; masking with BOND_MASK
 * reduces it to a bucket index. */
1582 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1583 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1585 hash_cstr = xasprintf("%u", hash);
1586 unixctl_command_reply(conn, hash_cstr);
1589 unixctl_command_reply_error(conn, "invalid mac");
/* Registers the "bond/..." unixctl commands with their handlers.  In each
 * call the usage string is followed by two integers that match the minimum
 * and maximum number of arguments the usage string describes (for example
 * "[port]" with 0 and 1, "mac [vlan] [basis]" with 1 and 3). */
1596 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1597 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1599 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1600 bond_unixctl_migrate, NULL);
1601 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1602 bond_unixctl_set_active_slave, NULL);
1603 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1604 bond_unixctl_enable_slave, NULL);
1605 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1606 bond_unixctl_disable_slave, NULL);
1607 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1608 bond_unixctl_hash, NULL);
/* Resets the hash table of 'bond'.  For every balance mode other than BM_AB
 * the table is sized for BOND_BUCKETS entries and zero-filled, and the next
 * rebalance is scheduled 'rebalance_interval' after the current time. */
1612 bond_entry_reset(struct bond *bond)
1614 if (bond->balance != BM_AB) {
1615 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1618 bond->hash = xmalloc(hash_len);
/* Zeroing leaves every entry with a NULL slave and a zero byte count. */
1620 memset(bond->hash, 0, hash_len);
1622 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* Looks up the slave of 'bond' whose 'aux' pointer equals 'slave_'. */
1629 static struct bond_slave *
1630 bond_slave_lookup(struct bond *bond, const void *slave_)
1632 struct bond_slave *slave;
/* Only the hmap bucket selected by hash_pointer(slave_, 0) is searched. */
1634 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1636 if (slave->aux == slave_) {
/* Sets the enabled state of 'slave' to 'enable'.
 *
 * Any pending updelay/downdelay is always cancelled.  When the state
 * actually changes, the bond is flagged for revalidation, the slave's
 * membership in the bond's 'enabled_slaves' list is updated, and the new
 * state is logged. */
1645 bond_enable_slave(struct bond_slave *slave, bool enable)
/* LLONG_MAX is the "no delay pending" value for 'delay_expires'. */
1647 slave->delay_expires = LLONG_MAX;
1648 if (enable != slave->enabled) {
1649 slave->bond->bond_revalidate = true;
1650 slave->enabled = enable;
/* Changes to 'enabled_slaves' are made with the bond's mutex held;
 * get_enabled_slave() takes the same mutex when it rotates the list. */
1652 ovs_mutex_lock(&slave->bond->mutex);
1654 ovs_list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1656 ovs_list_remove(&slave->list_node);
1658 ovs_mutex_unlock(&slave->bond->mutex);
1660 VLOG_INFO("interface %s: %s", slave->name,
1661 slave->enabled ? "enabled" : "disabled");
/* Re-evaluates the link state of 'slave', starts or cancels its
 * updelay/downdelay timer accordingly, and enables or disables the slave
 * once its timer has expired. */
1666 bond_link_status_update(struct bond_slave *slave)
1668 struct bond *bond = slave->bond;
/* A slave counts as up only if its netdev reports carrier and
 * 'may_enable' is set. */
1671 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
/* True when the timer state disagrees with the link state: either the link
 * state matches 'enabled' while a delay is still pending, or it differs
 * from 'enabled' while no delay has been scheduled. */
1672 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1673 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1674 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1675 slave->name, up ? "up" : "down");
/* The link returned to the state matching 'enabled' before the delay ran
 * out: cancel the pending change. */
1676 if (up == slave->enabled) {
1677 slave->delay_expires = LLONG_MAX;
1678 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1679 slave->name, up ? "disabled" : "enabled");
/* Unless LACP is disabled the delay is zero; otherwise 'updelay' applies
 * when the link came up and 'downdelay' when it went down. */
1681 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1682 : up ? bond->updelay : bond->downdelay);
1683 slave->delay_expires = time_msec() + delay;
1685 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1688 up ? "enabled" : "disabled",
/* Apply the state change once the delay has run out. */
1695 if (time_msec() >= slave->delay_expires) {
1696 bond_enable_slave(slave, up);
/* Returns the hash of source MAC 'mac' and 'vlan' with seed 'basis', as
 * computed by hash_mac(). */
1701 bond_hash_src(const struct eth_addr mac, uint16_t vlan, uint32_t basis)
1703 return hash_mac(mac, vlan, basis);
/* Returns flow_hash_symmetric_l4() of 'flow' with seed 'basis', computed on
 * a copy of the flow whose 'vlan_tci' is replaced by htons(vlan).  'flow'
 * itself is not modified. */
1707 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1709 struct flow hash_flow = *flow;
1710 hash_flow.vlan_tci = htons(vlan);
1712 /* The symmetric quality of this hash function is not required, but
1713 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1714 * purposes, so we use it out of convenience. */
1715 return flow_hash_symmetric_l4(&hash_flow, basis);
/* Returns the hash of 'flow' on 'vlan' according to the balance mode of
 * 'bond', seeded with bond->basis: bond_hash_tcp() for BM_TCP, otherwise
 * bond_hash_src() on the flow's source MAC.
 *
 * Asserts that the bond's mode is BM_TCP or BM_SLB. */
1719 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1721 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1723 return (bond->balance == BM_TCP
1724 ? bond_hash_tcp(flow, vlan, bond->basis)
1725 : bond_hash_src(flow->dl_src, vlan, bond->basis));
/* Returns the hash bucket of 'bond' that 'flow' on 'vlan' maps to: the
 * entry of bond->hash indexed by bond_hash() masked with BOND_MASK. */
1728 static struct bond_entry *
1729 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1732 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1735 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1736 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1738 static struct bond_slave *
1739 get_enabled_slave(struct bond *bond)
1741 struct ovs_list *node;
/* 'enabled_slaves' is only touched with the bond's mutex held; the mutex
 * is released again on the empty-list path. */
1743 ovs_mutex_lock(&bond->mutex);
1744 if (ovs_list_is_empty(&bond->enabled_slaves)) {
1745 ovs_mutex_unlock(&bond->mutex);
/* Rotate the list: the front node moves to the back, so the next call
 * selects a different slave. */
1749 node = ovs_list_pop_front(&bond->enabled_slaves);
1750 ovs_list_push_back(&bond->enabled_slaves, node);
1751 ovs_mutex_unlock(&bond->mutex);
/* 'node' is the 'list_node' member embedded in the selected slave. */
1753 return CONTAINER_OF(node, struct bond_slave, list_node);
/* Chooses the slave of 'bond' on which 'flow' (on 'vlan') should be output,
 * taking the bond's balance mode and LACP status into account.  'wc' is
 * passed to flow_mask_hash_fields() together with the set of fields used by
 * the hash (presumably so that those fields are un-wildcarded -- confirm
 * against flow_mask_hash_fields()). */
1756 static struct bond_slave *
1757 choose_output_slave(const struct bond *bond, const struct flow *flow,
1758 struct flow_wildcards *wc, uint16_t vlan)
1760 struct bond_entry *e;
1763 balance = bond->balance;
1764 if (bond->lacp_status == LACP_CONFIGURED) {
1765 /* LACP has been configured on this bond but negotiations were
1766 * unsuccessful. If lacp_fallback_ab is enabled use active-
1767 * backup mode else drop all traffic. */
1768 if (!bond->lacp_fallback_ab) {
1776 return bond->active_slave;
1779 if (bond->lacp_status != LACP_NEGOTIATED) {
1780 /* Must have LACP negotiations for TCP balanced bonds. */
1784 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1789 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
/* Map the flow to its hash bucket.  If the bucket has no slave, or its
 * slave is disabled, assign it the next slave from get_enabled_slave().
 * CONST_CAST is needed because 'bond' is const here while
 * get_enabled_slave() takes a non-const bond. */
1791 e = lookup_bond_entry(bond, flow, vlan);
1792 if (!e->slave || !e->slave->enabled) {
1793 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
/* Picks a candidate active slave for 'bond'.  The candidates are considered
 * in this order: the slave whose MAC matches bond->active_slave_mac, if it
 * is enabled; then any enabled slave; then, among slaves that have a delay
 * pending and 'may_enable' set, the one whose delay expires soonest. */
1802 static struct bond_slave *
1803 bond_choose_slave(const struct bond *bond)
1805 struct bond_slave *slave, *best;
1807 /* Find the last active slave. */
1808 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1809 if (slave && slave->enabled) {
1813 /* Find an enabled slave. */
1814 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1815 if (slave->enabled) {
1820 /* All interfaces are disabled. Find an interface that will be enabled
1821 * after its updelay expires. */
/* 'best' tracks the qualifying slave with the smallest 'delay_expires'
 * seen so far. */
1823 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1824 if (slave->delay_expires != LLONG_MAX
1825 && slave->may_enable
1826 && (!best || slave->delay_expires < best->delay_expires)) {
/* Re-selects the active slave of 'bond' using bond_choose_slave().
 *
 * If the chosen slave is not yet enabled it is enabled immediately, cutting
 * short the rest of its updelay.  Learning packets are requested, and
 * bond_active_slave_changed() is called when the choice differs from the
 * previous active slave.  If no slave can be chosen but one was active
 * before, "all interfaces disabled" is logged. */
1834 bond_choose_active_slave(struct bond *bond)
1836 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* Remember the previous choice so that a change can be detected below. */
1837 struct bond_slave *old_active_slave = bond->active_slave;
1839 bond->active_slave = bond_choose_slave(bond);
1840 if (bond->active_slave) {
1841 if (bond->active_slave->enabled) {
1842 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1843 bond->name, bond->active_slave->name);
/* The chosen slave is still waiting out its updelay: enable it right away,
 * since no other interface is enabled. */
1845 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1846 "remaining %lld ms updelay (since no interface was "
1847 "enabled)", bond->name, bond->active_slave->name,
1848 bond->active_slave->delay_expires - time_msec());
1849 bond_enable_slave(bond->active_slave, true);
1852 bond->send_learning_packets = true;
1854 if (bond->active_slave != old_active_slave) {
1855 bond_active_slave_changed(bond);
/* No slave could be chosen; log only on the transition from having an
 * active slave to having none. */
1857 } else if (old_active_slave) {
1858 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1863 * Return true if bond has unstored active slave change.
1864 * If return true, 'mac' will store the bond's current active slave's
1867 bond_get_changed_active_slave(const char *name, struct eth_addr *mac,
1872 ovs_rwlock_wrlock(&rwlock);
1873 bond = bond_find(name);
1875 if (bond->active_slave_changed || force) {
1876 *mac = bond->active_slave_mac;
1877 bond->active_slave_changed = false;
1878 ovs_rwlock_unlock(&rwlock);
1882 ovs_rwlock_unlock(&rwlock);