2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "ofp-actions.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
33 #include "dynamic-string.h"
42 #include "poll-loop.h"
48 #include "openvswitch/vlog.h"
50 VLOG_DEFINE_THIS_MODULE(bond);
52 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
53 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
56 /* Bit-mask for hashing a flow down to a bucket. */
57 #define BOND_MASK 0xff
58 #define BOND_BUCKETS (BOND_MASK + 1)
60 /* A hash bucket for mapping a flow to a slave.
61 * "struct bond" has an array of BOND_BUCKETS of these. */
63 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
64 uint64_t tx_bytes /* Count of bytes recently transmitted. */
65 OVS_GUARDED_BY(rwlock);
66 struct ovs_list list_node; /* In bond_slave's 'entries' list. */
70 * 'pr_rule' is the post-recirculation rule for this entry.
71 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
72 * is used to determine delta (applied to 'tx_bytes' above.) */
74 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
77 /* A bond slave, that is, one of the links comprising a bond. */
79 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
80 struct ovs_list list_node; /* In struct bond's enabled_slaves list. */
81 struct bond *bond; /* The bond that contains this slave. */
82 void *aux; /* Client-provided handle for this slave. */
84 struct netdev *netdev; /* Network device, owned by the client. */
85 unsigned int change_seq; /* Tracks changes in 'netdev'. */
86 ofp_port_t ofp_port; /* OpenFlow port number. */
87 char *name; /* Name (a copy of netdev_get_name(netdev)). */
90 long long delay_expires; /* Time after which 'enabled' may change. */
91 bool enabled; /* May be chosen for flows? */
92 bool may_enable; /* Client considers this slave bondable. */
94 /* Rebalancing info. Used only by bond_rebalance(). */
95 struct ovs_list bal_node; /* In bond_rebalance()'s 'bals' list. */
96 struct ovs_list entries; /* 'struct bond_entry's assigned here. */
97 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
100 /* A bond, that is, a set of network devices grouped to improve performance or
103 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
104 char *name; /* Name provided by client. */
105 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
112 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
113 * (To prevent the bond_slave from disappearing they must also hold
115 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
116 struct ovs_list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
119 enum bond_mode balance; /* Balancing mode, one of BM_*. */
120 struct bond_slave *active_slave;
121 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
122 enum lacp_status lacp_status; /* Status of LACP negotiations. */
123 bool bond_revalidate; /* True if flows need revalidation. */
124 uint32_t basis; /* Basis for flow hash function. */
126 /* SLB specific bonding info. */
127 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
128 int rebalance_interval; /* Interval between rebalances, in ms. */
129 long long int next_rebalance; /* Next rebalancing time. */
130 bool send_learning_packets;
131 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
132 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
134 /* Store active slave to OVSDB. */
135 bool active_slave_changed; /* Set to true whenever the bond changes
136 active slave. It will be reset to false
137 after it is stored into OVSDB */
139 /* Interface name may not be persistent across an OS reboot, use
140 * MAC address for identifying the active slave */
141 uint8_t active_slave_mac[ETH_ADDR_LEN];
142 /* The MAC address of the active interface. */
143 /* Legacy compatibility. */
144 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
146 struct ovs_refcount ref_cnt;
149 /* What to do with a bond_pr_rule_op. */
151 ADD, /* Add the rule to ofproto's flow table. */
152 DEL, /* Delete the rule from the ofproto's flow table. */
155 /* A rule to add to or delete from ofproto's internal flow table. */
156 struct bond_pr_rule_op {
157 struct hmap_node hmap_node;
159 ofp_port_t out_ofport;
161 struct rule **pr_rule;
164 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
165 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
166 OVS_REQ_RDLOCK(rwlock);
167 static void bond_enable_slave(struct bond_slave *, bool enable)
168 OVS_REQ_WRLOCK(rwlock);
169 static void bond_link_status_update(struct bond_slave *)
170 OVS_REQ_WRLOCK(rwlock);
171 static void bond_choose_active_slave(struct bond *)
172 OVS_REQ_WRLOCK(rwlock);
173 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
174 uint16_t vlan, uint32_t basis);
175 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
177 static struct bond_entry *lookup_bond_entry(const struct bond *,
180 OVS_REQ_RDLOCK(rwlock);
181 static struct bond_slave *get_enabled_slave(struct bond *)
182 OVS_REQ_RDLOCK(rwlock);
183 static struct bond_slave *choose_output_slave(const struct bond *,
185 struct flow_wildcards *,
187 OVS_REQ_RDLOCK(rwlock);
189 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
190 * stores the mode in '*balance' and returns true. Otherwise returns false
191 * without modifying '*balance'. */
193 bond_mode_from_string(enum bond_mode *balance, const char *s)
195 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
197 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
199 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
207 /* Returns a string representing 'balance'. */
209 bond_mode_to_string(enum bond_mode balance) {
212 return "balance-tcp";
214 return "balance-slb";
216 return "active-backup";
222 /* Creates and returns a new bond whose configuration is initially taken from
225 * The caller should register each slave on the new bond by calling
226 * bond_slave_register(). */
228 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
232 bond = xzalloc(sizeof *bond);
233 bond->ofproto = ofproto;
234 hmap_init(&bond->slaves);
235 list_init(&bond->enabled_slaves);
236 ovs_mutex_init(&bond->mutex);
237 ovs_refcount_init(&bond->ref_cnt);
240 hmap_init(&bond->pr_rule_ops);
242 bond_reconfigure(bond, s);
247 bond_ref(const struct bond *bond_)
249 struct bond *bond = CONST_CAST(struct bond *, bond_);
252 ovs_refcount_ref(&bond->ref_cnt);
259 bond_unref(struct bond *bond)
261 struct bond_slave *slave, *next_slave;
262 struct bond_pr_rule_op *pr_op, *next_op;
264 if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) {
268 ovs_rwlock_wrlock(&rwlock);
269 hmap_remove(all_bonds, &bond->hmap_node);
270 ovs_rwlock_unlock(&rwlock);
272 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
273 hmap_remove(&bond->slaves, &slave->hmap_node);
274 /* Client owns 'slave->netdev'. */
278 hmap_destroy(&bond->slaves);
280 ovs_mutex_destroy(&bond->mutex);
284 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
285 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
288 hmap_destroy(&bond->pr_rule_ops);
290 if (bond->recirc_id) {
291 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
298 add_pr_rule(struct bond *bond, const struct match *match,
299 ofp_port_t out_ofport, struct rule **rule)
301 uint32_t hash = match_hash(match, 0);
302 struct bond_pr_rule_op *pr_op;
304 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
305 if (match_equal(&pr_op->match, match)) {
307 pr_op->out_ofport = out_ofport;
308 pr_op->pr_rule = rule;
313 pr_op = xmalloc(sizeof *pr_op);
314 pr_op->match = *match;
316 pr_op->out_ofport = out_ofport;
317 pr_op->pr_rule = rule;
318 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
322 update_recirc_rules(struct bond *bond)
323 OVS_REQ_WRLOCK(rwlock)
326 struct bond_pr_rule_op *pr_op, *next_op;
327 uint64_t ofpacts_stub[128 / 8];
328 struct ofpbuf ofpacts;
331 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
333 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
337 if (bond->hash && bond->recirc_id) {
338 for (i = 0; i < BOND_BUCKETS; i++) {
339 struct bond_slave *slave = bond->hash[i].slave;
342 match_init_catchall(&match);
343 match_set_recirc_id(&match, bond->recirc_id);
344 match_set_dp_hash_masked(&match, i, BOND_MASK);
346 add_pr_rule(bond, &match, slave->ofp_port,
347 &bond->hash[i].pr_rule);
352 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
356 ofpbuf_clear(&ofpacts);
357 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
358 error = ofproto_dpif_add_internal_flow(bond->ofproto,
360 RECIRC_RULE_PRIORITY, 0,
361 &ofpacts, pr_op->pr_rule);
363 char *err_s = match_to_string(&pr_op->match,
364 RECIRC_RULE_PRIORITY);
366 VLOG_ERR("failed to add post recirculation flow %s", err_s);
372 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
374 RECIRC_RULE_PRIORITY);
376 char *err_s = match_to_string(&pr_op->match,
377 RECIRC_RULE_PRIORITY);
379 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
383 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
384 *pr_op->pr_rule = NULL;
390 ofpbuf_uninit(&ofpacts);
394 /* Updates 'bond''s overall configuration to 's'.
396 * The caller should register each slave on 'bond' by calling
397 * bond_slave_register(). This is optional if none of the slaves'
398 * configuration has changed. In any case it can't hurt.
400 * Returns true if the configuration has changed in such a way that requires
404 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
406 bool revalidate = false;
408 ovs_rwlock_wrlock(&rwlock);
409 if (!bond->name || strcmp(bond->name, s->name)) {
411 hmap_remove(all_bonds, &bond->hmap_node);
414 bond->name = xstrdup(s->name);
415 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
418 bond->updelay = s->up_delay;
419 bond->downdelay = s->down_delay;
421 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
422 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
426 if (bond->rebalance_interval != s->rebalance_interval) {
427 bond->rebalance_interval = s->rebalance_interval;
431 if (bond->balance != s->balance) {
432 bond->balance = s->balance;
436 if (bond->basis != s->basis) {
437 bond->basis = s->basis;
441 if (bond->bond_revalidate) {
443 bond->bond_revalidate = false;
446 if (bond->balance != BM_AB) {
447 if (!bond->recirc_id) {
448 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
450 } else if (bond->recirc_id) {
451 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
455 if (bond->balance == BM_AB || !bond->hash || revalidate) {
456 bond_entry_reset(bond);
459 memcpy(bond->active_slave_mac, s->active_slave_mac,
460 sizeof s->active_slave_mac);
462 bond->active_slave_changed = false;
464 ovs_rwlock_unlock(&rwlock);
468 static struct bond_slave *
469 bond_find_slave_by_mac(const struct bond *bond, const uint8_t mac[ETH_ADDR_LEN])
471 struct bond_slave *slave;
473 /* Find the last active slave */
474 HMAP_FOR_EACH(slave, hmap_node, &bond->slaves) {
475 uint8_t slave_mac[ETH_ADDR_LEN];
477 if (netdev_get_etheraddr(slave->netdev, slave_mac)) {
481 if (!memcmp(slave_mac, mac, sizeof(slave_mac))) {
490 bond_active_slave_changed(struct bond *bond)
492 uint8_t mac[ETH_ADDR_LEN];
494 netdev_get_etheraddr(bond->active_slave->netdev, mac);
495 memcpy(bond->active_slave_mac, mac, sizeof bond->active_slave_mac);
496 bond->active_slave_changed = true;
497 seq_change(connectivity_seq_get());
501 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
502 OVS_REQ_WRLOCK(rwlock)
504 if (slave->netdev != netdev) {
505 slave->netdev = netdev;
506 slave->change_seq = 0;
510 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
511 * arbitrary client-provided pointer that uniquely identifies a slave within a
512 * bond. If 'slave_' already exists within 'bond' then this function
513 * reconfigures the existing slave.
515 * 'netdev' must be the network device that 'slave_' represents. It is owned
516 * by the client, so the client must not close it before either unregistering
517 * 'slave_' or destroying 'bond'.
520 bond_slave_register(struct bond *bond, void *slave_,
521 ofp_port_t ofport, struct netdev *netdev)
523 struct bond_slave *slave;
525 ovs_rwlock_wrlock(&rwlock);
526 slave = bond_slave_lookup(bond, slave_);
528 slave = xzalloc(sizeof *slave);
530 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
533 slave->ofp_port = ofport;
534 slave->delay_expires = LLONG_MAX;
535 slave->name = xstrdup(netdev_get_name(netdev));
536 bond->bond_revalidate = true;
538 slave->enabled = false;
539 bond_enable_slave(slave, netdev_get_carrier(netdev));
542 bond_slave_set_netdev__(slave, netdev);
545 slave->name = xstrdup(netdev_get_name(netdev));
546 ovs_rwlock_unlock(&rwlock);
549 /* Updates the network device to be used with 'slave_' to 'netdev'.
551 * This is useful if the caller closes and re-opens the network device
552 * registered with bond_slave_register() but doesn't need to change anything
555 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
557 struct bond_slave *slave;
559 ovs_rwlock_wrlock(&rwlock);
560 slave = bond_slave_lookup(bond, slave_);
562 bond_slave_set_netdev__(slave, netdev);
564 ovs_rwlock_unlock(&rwlock);
567 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
568 * then this function has no effect.
570 * Unregistering a slave invalidates all flows. */
572 bond_slave_unregister(struct bond *bond, const void *slave_)
574 struct bond_slave *slave;
577 ovs_rwlock_wrlock(&rwlock);
578 slave = bond_slave_lookup(bond, slave_);
583 bond->bond_revalidate = true;
584 bond_enable_slave(slave, false);
586 del_active = bond->active_slave == slave;
588 struct bond_entry *e;
589 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
590 if (e->slave == slave) {
598 hmap_remove(&bond->slaves, &slave->hmap_node);
599 /* Client owns 'slave->netdev'. */
603 bond_choose_active_slave(bond);
604 bond->send_learning_packets = true;
607 ovs_rwlock_unlock(&rwlock);
610 /* Should be called on each slave in 'bond' before bond_run() to indicate
611 * whether or not 'slave_' may be enabled. This function is intended to allow
612 * other protocols to have some impact on bonding decisions. For example, LACP
613 * or high-level link monitoring protocols may decide that a given slave should
614 * not be able to send traffic. */
616 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
618 ovs_rwlock_wrlock(&rwlock);
619 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
620 ovs_rwlock_unlock(&rwlock);
623 /* Performs periodic maintenance on 'bond'.
625 * Returns true if the caller should revalidate its flows.
627 * The caller should check bond_should_send_learning_packets() afterward. */
629 bond_run(struct bond *bond, enum lacp_status lacp_status)
631 struct bond_slave *slave;
634 ovs_rwlock_wrlock(&rwlock);
635 if (bond->lacp_status != lacp_status) {
636 bond->lacp_status = lacp_status;
637 bond->bond_revalidate = true;
640 /* Enable slaves based on link status and LACP feedback. */
641 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
642 bond_link_status_update(slave);
643 slave->change_seq = seq_read(connectivity_seq_get());
645 if (!bond->active_slave || !bond->active_slave->enabled) {
646 bond_choose_active_slave(bond);
649 revalidate = bond->bond_revalidate;
650 bond->bond_revalidate = false;
651 ovs_rwlock_unlock(&rwlock);
656 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
658 bond_wait(struct bond *bond)
660 struct bond_slave *slave;
662 ovs_rwlock_rdlock(&rwlock);
663 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
664 if (slave->delay_expires != LLONG_MAX) {
665 poll_timer_wait_until(slave->delay_expires);
668 seq_wait(connectivity_seq_get(), slave->change_seq);
671 if (bond->bond_revalidate) {
672 poll_immediate_wake();
674 ovs_rwlock_unlock(&rwlock);
676 /* We don't wait for bond->next_rebalance because rebalancing can only run
677 * at a flow account checkpoint. ofproto does checkpointing on its own
678 * schedule and bond_rebalance() gets called afterward, so we'd just be
679 * waking up for no purpose. */
682 /* MAC learning table interaction. */
685 may_send_learning_packets(const struct bond *bond)
687 return ((bond->lacp_status == LACP_DISABLED
688 && (bond->balance == BM_SLB || bond->balance == BM_AB))
689 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
690 && bond->active_slave;
693 /* Returns true if 'bond' needs the client to send out packets to assist with
694 * MAC learning on 'bond'. If this function returns true, then the client
695 * should iterate through its MAC learning table for the bridge on which 'bond'
696 * is located. For each MAC that has been learned on a port other than 'bond',
697 * it should call bond_compose_learning_packet().
699 * This function will only return true if 'bond' is in SLB or active-backup
700 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
703 * Calling this function resets the state that it checks. */
705 bond_should_send_learning_packets(struct bond *bond)
709 ovs_rwlock_wrlock(&rwlock);
710 send = bond->send_learning_packets && may_send_learning_packets(bond);
711 bond->send_learning_packets = false;
712 ovs_rwlock_unlock(&rwlock);
716 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
718 * See bond_should_send_learning_packets() for description of usage. The
719 * caller should send the composed packet on the port associated with
720 * port_aux and takes ownership of the returned ofpbuf. */
722 bond_compose_learning_packet(struct bond *bond,
723 const uint8_t eth_src[ETH_ADDR_LEN],
724 uint16_t vlan, void **port_aux)
726 struct bond_slave *slave;
727 struct ofpbuf *packet;
730 ovs_rwlock_rdlock(&rwlock);
731 ovs_assert(may_send_learning_packets(bond));
732 memset(&flow, 0, sizeof flow);
733 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
734 slave = choose_output_slave(bond, &flow, NULL, vlan);
736 packet = ofpbuf_new(0);
737 compose_rarp(packet, eth_src);
739 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
742 *port_aux = slave->aux;
743 ovs_rwlock_unlock(&rwlock);
747 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
748 * Ethernet destination address of 'eth_dst', should be admitted.
750 * The return value is one of the following:
752 * - BV_ACCEPT: Admit the packet.
754 * - BV_DROP: Drop the packet.
756 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
757 * Ethernet source address and VLAN. If there is none, or if the packet
758 * is on the learned port, then admit the packet. If a different port has
759 * been learned, however, drop the packet (and do not use it for MAC
763 bond_check_admissibility(struct bond *bond, const void *slave_,
764 const uint8_t eth_dst[ETH_ADDR_LEN])
766 enum bond_verdict verdict = BV_DROP;
767 struct bond_slave *slave;
769 ovs_rwlock_rdlock(&rwlock);
770 slave = bond_slave_lookup(bond, slave_);
775 /* LACP bonds have very loose admissibility restrictions because we can
776 * assume the remote switch is aware of the bond and will "do the right
777 * thing". However, as a precaution we drop packets on disabled slaves
778 * because no correctly implemented partner switch should be sending
781 * If LACP is configured, but LACP negotiations have been unsuccessful, we
782 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
783 switch (bond->lacp_status) {
784 case LACP_NEGOTIATED:
785 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
787 case LACP_CONFIGURED:
788 if (!bond->lacp_fallback_ab) {
795 /* Drop all multicast packets on inactive slaves. */
796 if (eth_addr_is_multicast(eth_dst)) {
797 if (bond->active_slave != slave) {
802 switch (bond->balance) {
804 /* TCP balanced bonds require successful LACP negotiations. Based on the
805 * above check, LACP is off or lacp_fallback_ab is true on this bond.
806 * If lacp_fallback_ab is true fall through to BM_AB case else, we
807 * drop all incoming traffic. */
808 if (!bond->lacp_fallback_ab) {
813 /* Drop all packets which arrive on backup slaves. This is similar to
814 * how Linux bonding handles active-backup bonds. */
815 if (bond->active_slave != slave) {
816 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
818 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
819 " slave (%s) destined for " ETH_ADDR_FMT,
820 slave->name, ETH_ADDR_ARGS(eth_dst));
827 /* Drop all packets for which we have learned a different input port,
828 * because we probably sent the packet on one slave and got it back on
829 * the other. Gratuitous ARP packets are an exception to this rule:
830 * the host has moved to another switch. The exception to the
831 * exception is if we locked the learning table to avoid reflections on
833 verdict = BV_DROP_IF_MOVED;
839 ovs_rwlock_unlock(&rwlock);
844 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
845 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
846 * NULL if the packet should be dropped because no slaves are enabled.
848 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
849 * should be a VID only (i.e. excluding the PCP bits). Second,
850 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
851 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
852 * packet belongs to (so for an access port it will be the access port's VLAN).
854 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
855 * significant in the selection. At some point earlier, 'wc' should
856 * have been initialized (e.g., by flow_wildcards_init_catchall()).
859 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
860 struct flow_wildcards *wc, uint16_t vlan)
862 struct bond_slave *slave;
865 ovs_rwlock_rdlock(&rwlock);
866 slave = choose_output_slave(bond, flow, wc, vlan);
867 aux = slave ? slave->aux : NULL;
868 ovs_rwlock_unlock(&rwlock);
875 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
876 OVS_REQ_WRLOCK(rwlock)
881 delta = rule_tx_bytes - entry->pr_tx_bytes;
882 entry->tx_bytes += delta;
883 entry->pr_tx_bytes = rule_tx_bytes;
887 /* Maintains bond stats using post-recirculation rule byte counters. */
889 bond_recirculation_account(struct bond *bond)
890 OVS_REQ_WRLOCK(rwlock)
894 for (i=0; i<=BOND_MASK; i++) {
895 struct bond_entry *entry = &bond->hash[i];
896 struct rule *rule = entry->pr_rule;
899 uint64_t n_packets OVS_UNUSED;
900 long long int used OVS_UNUSED;
903 rule->ofproto->ofproto_class->rule_get_stats(
904 rule, &n_packets, &n_bytes, &used);
905 bond_entry_account(entry, n_bytes);
911 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
914 if (bond->balance == BM_TCP && bond->recirc_id) {
916 *recirc_id = bond->recirc_id;
919 *hash_bias = bond->basis;
928 bond_update_post_recirc_rules__(struct bond* bond, const bool force)
929 OVS_REQ_WRLOCK(rwlock)
931 struct bond_entry *e;
932 bool update_rules = force; /* Always update rules if caller forces it. */
934 /* Make sure all bond entries are populated */
935 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
936 if (!e->slave || !e->slave->enabled) {
938 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
939 struct bond_slave, hmap_node);
940 if (!e->slave->enabled) {
941 e->slave = bond->active_slave;
947 update_recirc_rules(bond);
952 bond_update_post_recirc_rules(struct bond* bond, const bool force)
954 ovs_rwlock_wrlock(&rwlock);
955 bond_update_post_recirc_rules__(bond, force);
956 ovs_rwlock_unlock(&rwlock);
962 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
964 return bond->rebalance_interval
965 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
968 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
970 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
973 ovs_rwlock_wrlock(&rwlock);
974 if (bond_is_balanced(bond)) {
975 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
977 ovs_rwlock_unlock(&rwlock);
980 static struct bond_slave *
981 bond_slave_from_bal_node(struct ovs_list *bal) OVS_REQ_RDLOCK(rwlock)
983 return CONTAINER_OF(bal, struct bond_slave, bal_node);
987 log_bals(struct bond *bond, const struct ovs_list *bals)
988 OVS_REQ_RDLOCK(rwlock)
990 if (VLOG_IS_DBG_ENABLED()) {
991 struct ds ds = DS_EMPTY_INITIALIZER;
992 const struct bond_slave *slave;
994 LIST_FOR_EACH (slave, bal_node, bals) {
996 ds_put_char(&ds, ',');
998 ds_put_format(&ds, " %s %"PRIu64"kB",
999 slave->name, slave->tx_bytes / 1024);
1001 if (!slave->enabled) {
1002 ds_put_cstr(&ds, " (disabled)");
1004 if (!list_is_empty(&slave->entries)) {
1005 struct bond_entry *e;
1007 ds_put_cstr(&ds, " (");
1008 LIST_FOR_EACH (e, list_node, &slave->entries) {
1009 if (&e->list_node != list_front(&slave->entries)) {
1010 ds_put_cstr(&ds, " + ");
1012 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
1013 e - bond->hash, e->tx_bytes / 1024);
1015 ds_put_cstr(&ds, ")");
1018 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
1023 /* Shifts 'hash' from its current slave to 'to'. */
1025 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
1026 OVS_REQ_WRLOCK(rwlock)
1028 struct bond_slave *from = hash->slave;
1029 struct bond *bond = from->bond;
1030 uint64_t delta = hash->tx_bytes;
1032 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1033 "from %s to %s (now carrying %"PRIu64"kB and "
1034 "%"PRIu64"kB load, respectively)",
1035 bond->name, delta / 1024, hash - bond->hash,
1036 from->name, to->name,
1037 (from->tx_bytes - delta) / 1024,
1038 (to->tx_bytes + delta) / 1024);
1040 /* Shift load away from 'from' to 'to'. */
1041 from->tx_bytes -= delta;
1042 to->tx_bytes += delta;
1044 /* Arrange for flows to be revalidated. */
1046 bond->bond_revalidate = true;
1049 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1050 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1051 * given that doing so must decrease the ratio of the load on the two slaves by
1052 * at least 0.1. Returns NULL if there is no appropriate entry.
1054 * The list of entries isn't sorted. I don't know of a reason to prefer to
1055 * shift away small hashes or large hashes. */
1056 static struct bond_entry *
1057 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1058 OVS_REQ_WRLOCK(rwlock)
1060 struct bond_entry *e;
1062 if (list_is_short(&from->entries)) {
1063 /* 'from' carries no more than one MAC hash, so shifting load away from
1064 * it would be pointless. */
1068 LIST_FOR_EACH (e, list_node, &from->entries) {
1069 double old_ratio, new_ratio;
1072 if (to_tx_bytes == 0) {
1073 /* Nothing on the new slave, move it. */
1077 delta = e->tx_bytes;
1078 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1079 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1080 if (old_ratio - new_ratio > 0.1
1081 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1082 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1083 and 'to' slave have the same load. Therefore, we only move an
1084 entry if it decreases the load on 'from', and brings us closer
1085 to equal traffic load. */
1093 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1096 insert_bal(struct ovs_list *bals, struct bond_slave *slave)
1098 struct bond_slave *pos;
1100 LIST_FOR_EACH (pos, bal_node, bals) {
1101 if (slave->tx_bytes > pos->tx_bytes) {
1105 list_insert(&pos->bal_node, &slave->bal_node);
1108 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1109 * that descending order of 'tx_bytes' is maintained. */
1111 reinsert_bal(struct ovs_list *bals, struct bond_slave *slave)
1113 list_remove(&slave->bal_node);
1114 insert_bal(bals, slave);
1117 /* If 'bond' needs rebalancing, does so.
1119 * The caller should have called bond_account() for each active flow or, if
1120 * recirculation is used, have called bond_recirculation_account(bond),
1121 * to ensure that flow data is consistently accounted at this point.
1124 bond_rebalance(struct bond *bond)
1126 struct bond_slave *slave;
1127 struct bond_entry *e;
1128 struct ovs_list bals;
1129 bool rebalanced = false;
1132 ovs_rwlock_wrlock(&rwlock);
1133 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1136 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1138 use_recirc = ofproto_dpif_get_enable_recirc(bond->ofproto) &&
1139 bond_may_recirc(bond, NULL, NULL);
1142 bond_recirculation_account(bond);
1145 /* Add each bond_entry to its slave's 'entries' list.
1146 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1147 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1148 slave->tx_bytes = 0;
1149 list_init(&slave->entries);
1151 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1152 if (e->slave && e->tx_bytes) {
1153 e->slave->tx_bytes += e->tx_bytes;
1154 list_push_back(&e->slave->entries, &e->list_node);
1158 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1160 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1161 * with a proper list sort algorithm. */
1163 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1164 if (slave->enabled) {
1165 insert_bal(&bals, slave);
1168 log_bals(bond, &bals);
1170 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1171 while (!list_is_short(&bals)) {
1172 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1173 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1176 overload = from->tx_bytes - to->tx_bytes;
1177 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1178 /* The extra load on 'from' (and all less-loaded slaves), compared
1179 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1180 * it is less than ~1Mbps. No point in rebalancing. */
1184 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1185 * to move from 'from' to 'to'. */
1186 e = choose_entry_to_migrate(from, to->tx_bytes);
1188 bond_shift_load(e, to);
1190 /* Delete element from from->entries.
1192 * We don't add the element to to->entries. That would only allow
1193 * 'e' to be migrated to another slave in this rebalancing run, and
1194 * there is no point in doing that. */
1195 list_remove(&e->list_node);
1197 /* Re-sort 'bals'. */
1198 reinsert_bal(&bals, from);
1199 reinsert_bal(&bals, to);
1202 /* Can't usefully migrate anything away from 'from'.
1203 * Don't reconsider it. */
1204 list_remove(&from->bal_node);
1208 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1209 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1210 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1211 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1215 if (use_recirc && rebalanced) {
1216 bond_update_post_recirc_rules__(bond,true);
1220 ovs_rwlock_unlock(&rwlock);
1223 /* Bonding unixctl user interface functions. */
1225 static struct bond *
1226 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
1230 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1232 if (!strcmp(bond->name, name)) {
1239 static struct bond_slave *
1240 bond_lookup_slave(struct bond *bond, const char *slave_name)
1242 struct bond_slave *slave;
1244 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1245 if (!strcmp(slave->name, slave_name)) {
1253 bond_unixctl_list(struct unixctl_conn *conn,
1254 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1255 void *aux OVS_UNUSED)
1257 struct ds ds = DS_EMPTY_INITIALIZER;
1258 const struct bond *bond;
1260 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
1262 ovs_rwlock_rdlock(&rwlock);
1263 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1264 const struct bond_slave *slave;
1267 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1268 bond_mode_to_string(bond->balance), bond->recirc_id);
1271 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1273 ds_put_cstr(&ds, ", ");
1275 ds_put_cstr(&ds, slave->name);
1277 ds_put_char(&ds, '\n');
1279 ovs_rwlock_unlock(&rwlock);
1280 unixctl_command_reply(conn, ds_cstr(&ds));
1285 bond_print_details(struct ds *ds, const struct bond *bond)
1286 OVS_REQ_RDLOCK(rwlock)
1288 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1289 const struct shash_node **sorted_slaves = NULL;
1290 const struct bond_slave *slave;
1295 ds_put_format(ds, "---- %s ----\n", bond->name);
1296 ds_put_format(ds, "bond_mode: %s\n",
1297 bond_mode_to_string(bond->balance));
1299 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1300 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1301 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1303 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1305 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1306 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1308 if (bond_is_balanced(bond)) {
1309 ds_put_format(ds, "next rebalance: %lld ms\n",
1310 bond->next_rebalance - time_msec());
1313 ds_put_cstr(ds, "lacp_status: ");
1314 switch (bond->lacp_status) {
1315 case LACP_NEGOTIATED:
1316 ds_put_cstr(ds, "negotiated\n");
1318 case LACP_CONFIGURED:
1319 ds_put_cstr(ds, "configured\n");
1322 ds_put_cstr(ds, "off\n");
1325 ds_put_cstr(ds, "<unknown>\n");
1329 ds_put_cstr(ds, "active slave mac: ");
1330 ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_slave_mac));
1331 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1332 ds_put_format(ds,"(%s)\n", slave ? slave->name : "none");
1334 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1335 shash_add(&slave_shash, slave->name, slave);
1337 sorted_slaves = shash_sort(&slave_shash);
1339 for (i = 0; i < shash_count(&slave_shash); i++) {
1340 struct bond_entry *be;
1342 slave = sorted_slaves[i]->data;
1345 ds_put_format(ds, "\nslave %s: %s\n",
1346 slave->name, slave->enabled ? "enabled" : "disabled");
1347 if (slave == bond->active_slave) {
1348 ds_put_cstr(ds, "\tactive slave\n");
1350 if (slave->delay_expires != LLONG_MAX) {
1351 ds_put_format(ds, "\t%s expires in %lld ms\n",
1352 slave->enabled ? "downdelay" : "updelay",
1353 slave->delay_expires - time_msec());
1356 ds_put_format(ds, "\tmay_enable: %s\n",
1357 slave->may_enable ? "true" : "false");
1359 if (!bond_is_balanced(bond)) {
1364 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1365 int hash = be - bond->hash;
1368 if (be->slave != slave) {
1372 be_tx_k = be->tx_bytes / 1024;
1374 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1378 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1381 shash_destroy(&slave_shash);
1382 free(sorted_slaves);
1383 ds_put_cstr(ds, "\n");
1387 bond_unixctl_show(struct unixctl_conn *conn,
1388 int argc, const char *argv[],
1389 void *aux OVS_UNUSED)
1391 struct ds ds = DS_EMPTY_INITIALIZER;
1393 ovs_rwlock_rdlock(&rwlock);
1395 const struct bond *bond = bond_find(argv[1]);
1398 unixctl_command_reply_error(conn, "no such bond");
1401 bond_print_details(&ds, bond);
1403 const struct bond *bond;
1405 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1406 bond_print_details(&ds, bond);
1410 unixctl_command_reply(conn, ds_cstr(&ds));
1414 ovs_rwlock_unlock(&rwlock);
1418 bond_unixctl_migrate(struct unixctl_conn *conn,
1419 int argc OVS_UNUSED, const char *argv[],
1420 void *aux OVS_UNUSED)
1422 const char *bond_s = argv[1];
1423 const char *hash_s = argv[2];
1424 const char *slave_s = argv[3];
1426 struct bond_slave *slave;
1427 struct bond_entry *entry;
1430 ovs_rwlock_wrlock(&rwlock);
1431 bond = bond_find(bond_s);
1433 unixctl_command_reply_error(conn, "no such bond");
1437 if (bond->balance != BM_SLB) {
1438 unixctl_command_reply_error(conn, "not an SLB bond");
1442 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1443 hash = atoi(hash_s) & BOND_MASK;
1445 unixctl_command_reply_error(conn, "bad hash");
1449 slave = bond_lookup_slave(bond, slave_s);
1451 unixctl_command_reply_error(conn, "no such slave");
1455 if (!slave->enabled) {
1456 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1460 entry = &bond->hash[hash];
1461 bond->bond_revalidate = true;
1462 entry->slave = slave;
1463 unixctl_command_reply(conn, "migrated");
1466 ovs_rwlock_unlock(&rwlock);
1470 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1471 int argc OVS_UNUSED, const char *argv[],
1472 void *aux OVS_UNUSED)
1474 const char *bond_s = argv[1];
1475 const char *slave_s = argv[2];
1477 struct bond_slave *slave;
1479 ovs_rwlock_wrlock(&rwlock);
1480 bond = bond_find(bond_s);
1482 unixctl_command_reply_error(conn, "no such bond");
1486 slave = bond_lookup_slave(bond, slave_s);
1488 unixctl_command_reply_error(conn, "no such slave");
1492 if (!slave->enabled) {
1493 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1497 if (bond->active_slave != slave) {
1498 bond->bond_revalidate = true;
1499 bond->active_slave = slave;
1500 VLOG_INFO("bond %s: active interface is now %s",
1501 bond->name, slave->name);
1502 bond->send_learning_packets = true;
1503 unixctl_command_reply(conn, "done");
1504 bond_active_slave_changed(bond);
1506 unixctl_command_reply(conn, "no change");
1509 ovs_rwlock_unlock(&rwlock);
1513 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1515 const char *bond_s = argv[1];
1516 const char *slave_s = argv[2];
1518 struct bond_slave *slave;
1520 ovs_rwlock_wrlock(&rwlock);
1521 bond = bond_find(bond_s);
1523 unixctl_command_reply_error(conn, "no such bond");
1527 slave = bond_lookup_slave(bond, slave_s);
1529 unixctl_command_reply_error(conn, "no such slave");
1533 bond_enable_slave(slave, enable);
1534 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1537 ovs_rwlock_unlock(&rwlock);
1541 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1542 int argc OVS_UNUSED, const char *argv[],
1543 void *aux OVS_UNUSED)
1545 enable_slave(conn, argv, true);
1549 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1550 int argc OVS_UNUSED, const char *argv[],
1551 void *aux OVS_UNUSED)
1553 enable_slave(conn, argv, false);
1557 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1558 void *aux OVS_UNUSED)
1560 const char *mac_s = argv[1];
1561 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1562 const char *basis_s = argc > 3 ? argv[3] : NULL;
1563 uint8_t mac[ETH_ADDR_LEN];
1570 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1571 unixctl_command_reply_error(conn, "invalid vlan");
1579 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1580 unixctl_command_reply_error(conn, "invalid basis");
1587 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1588 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1590 hash_cstr = xasprintf("%u", hash);
1591 unixctl_command_reply(conn, hash_cstr);
1594 unixctl_command_reply_error(conn, "invalid mac");
1601 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1602 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1604 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1605 bond_unixctl_migrate, NULL);
1606 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1607 bond_unixctl_set_active_slave, NULL);
1608 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1609 bond_unixctl_enable_slave, NULL);
1610 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1611 bond_unixctl_disable_slave, NULL);
1612 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1613 bond_unixctl_hash, NULL);
1617 bond_entry_reset(struct bond *bond)
1619 if (bond->balance != BM_AB) {
1620 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1623 bond->hash = xmalloc(hash_len);
1625 memset(bond->hash, 0, hash_len);
1627 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1634 static struct bond_slave *
1635 bond_slave_lookup(struct bond *bond, const void *slave_)
1637 struct bond_slave *slave;
1639 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1641 if (slave->aux == slave_) {
1650 bond_enable_slave(struct bond_slave *slave, bool enable)
1652 slave->delay_expires = LLONG_MAX;
1653 if (enable != slave->enabled) {
1654 slave->bond->bond_revalidate = true;
1655 slave->enabled = enable;
1657 ovs_mutex_lock(&slave->bond->mutex);
1659 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1661 list_remove(&slave->list_node);
1663 ovs_mutex_unlock(&slave->bond->mutex);
1665 VLOG_INFO("interface %s: %s", slave->name,
1666 slave->enabled ? "enabled" : "disabled");
1671 bond_link_status_update(struct bond_slave *slave)
1673 struct bond *bond = slave->bond;
1676 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1677 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1678 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1679 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1680 slave->name, up ? "up" : "down");
1681 if (up == slave->enabled) {
1682 slave->delay_expires = LLONG_MAX;
1683 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1684 slave->name, up ? "disabled" : "enabled");
1686 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1687 : up ? bond->updelay : bond->downdelay);
1688 slave->delay_expires = time_msec() + delay;
1690 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1693 up ? "enabled" : "disabled",
1700 if (time_msec() >= slave->delay_expires) {
1701 bond_enable_slave(slave, up);
1706 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1708 return hash_mac(mac, vlan, basis);
1712 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1714 struct flow hash_flow = *flow;
1715 hash_flow.vlan_tci = htons(vlan);
1717 /* The symmetric quality of this hash function is not required, but
1718 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1719 * purposes, so we use it out of convenience. */
1720 return flow_hash_symmetric_l4(&hash_flow, basis);
1724 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1726 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1728 return (bond->balance == BM_TCP
1729 ? bond_hash_tcp(flow, vlan, bond->basis)
1730 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1733 static struct bond_entry *
1734 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1737 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1740 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1741 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1743 static struct bond_slave *
1744 get_enabled_slave(struct bond *bond)
1746 struct ovs_list *node;
1748 ovs_mutex_lock(&bond->mutex);
1749 if (list_is_empty(&bond->enabled_slaves)) {
1750 ovs_mutex_unlock(&bond->mutex);
1754 node = list_pop_front(&bond->enabled_slaves);
1755 list_push_back(&bond->enabled_slaves, node);
1756 ovs_mutex_unlock(&bond->mutex);
1758 return CONTAINER_OF(node, struct bond_slave, list_node);
1761 static struct bond_slave *
1762 choose_output_slave(const struct bond *bond, const struct flow *flow,
1763 struct flow_wildcards *wc, uint16_t vlan)
1765 struct bond_entry *e;
1768 balance = bond->balance;
1769 if (bond->lacp_status == LACP_CONFIGURED) {
1770 /* LACP has been configured on this bond but negotiations were
1771 * unsuccussful. If lacp_fallback_ab is enabled use active-
1772 * backup mode else drop all traffic. */
1773 if (!bond->lacp_fallback_ab) {
1781 return bond->active_slave;
1784 if (bond->lacp_status != LACP_NEGOTIATED) {
1785 /* Must have LACP negotiations for TCP balanced bonds. */
1789 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1794 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1796 e = lookup_bond_entry(bond, flow, vlan);
1797 if (!e->slave || !e->slave->enabled) {
1798 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
1807 static struct bond_slave *
1808 bond_choose_slave(const struct bond *bond)
1810 struct bond_slave *slave, *best;
1812 /* Find the last active slave. */
1813 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1814 if (slave && slave->enabled) {
1818 /* Find an enabled slave. */
1819 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1820 if (slave->enabled) {
1825 /* All interfaces are disabled. Find an interface that will be enabled
1826 * after its updelay expires. */
1828 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1829 if (slave->delay_expires != LLONG_MAX
1830 && slave->may_enable
1831 && (!best || slave->delay_expires < best->delay_expires)) {
1839 bond_choose_active_slave(struct bond *bond)
1841 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1842 struct bond_slave *old_active_slave = bond->active_slave;
1844 bond->active_slave = bond_choose_slave(bond);
1845 if (bond->active_slave) {
1846 if (bond->active_slave->enabled) {
1847 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1848 bond->name, bond->active_slave->name);
1850 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1851 "remaining %lld ms updelay (since no interface was "
1852 "enabled)", bond->name, bond->active_slave->name,
1853 bond->active_slave->delay_expires - time_msec());
1854 bond_enable_slave(bond->active_slave, true);
1857 bond->send_learning_packets = true;
1859 if (bond->active_slave != old_active_slave) {
1860 bond_active_slave_changed(bond);
1862 } else if (old_active_slave) {
1863 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1868 * Return true if bond has unstored active slave change.
1869 * If return true, 'mac' will store the bond's current active slave's
1872 bond_get_changed_active_slave(const char *name, uint8_t* mac, bool force)
1876 ovs_rwlock_wrlock(&rwlock);
1877 bond = bond_find(name);
1879 if (bond->active_slave_changed || force) {
1880 memcpy(mac, bond->active_slave_mac, ETH_ADDR_LEN);
1881 bond->active_slave_changed = false;
1882 ovs_rwlock_unlock(&rwlock);
1886 ovs_rwlock_unlock(&rwlock);