2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "ofp-actions.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "ofproto/ofproto-dpif-rid.h"
32 #include "connectivity.h"
34 #include "dynamic-string.h"
43 #include "dp-packet.h"
44 #include "poll-loop.h"
50 #include "openvswitch/vlog.h"
52 VLOG_DEFINE_THIS_MODULE(bond);
54 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
55 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
56 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
58 /* Bit-mask for hashing a flow down to a bucket. */
59 #define BOND_MASK 0xff
60 #define BOND_BUCKETS (BOND_MASK + 1)
62 /* A hash bucket for mapping a flow to a slave.
63 * "struct bond" has an array of BOND_BUCKETS of these. */
65 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
66 uint64_t tx_bytes /* Count of bytes recently transmitted. */
67 OVS_GUARDED_BY(rwlock);
68 struct ovs_list list_node; /* In bond_slave's 'entries' list. */
72 * 'pr_rule' is the post-recirculation rule for this entry.
73 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
74 * is used to determine delta (applied to 'tx_bytes' above.) */
76 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
79 /* A bond slave, that is, one of the links comprising a bond. */
81 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
82 struct ovs_list list_node; /* In struct bond's enabled_slaves list. */
83 struct bond *bond; /* The bond that contains this slave. */
84 void *aux; /* Client-provided handle for this slave. */
86 struct netdev *netdev; /* Network device, owned by the client. */
87 unsigned int change_seq; /* Tracks changes in 'netdev'. */
88 ofp_port_t ofp_port; /* OpenFlow port number. */
89 char *name; /* Name (a copy of netdev_get_name(netdev)). */
92 long long delay_expires; /* Time after which 'enabled' may change. */
93 bool enabled; /* May be chosen for flows? */
94 bool may_enable; /* Client considers this slave bondable. */
96 /* Rebalancing info. Used only by bond_rebalance(). */
97 struct ovs_list bal_node; /* In bond_rebalance()'s 'bals' list. */
98 struct ovs_list entries; /* 'struct bond_entry's assigned here. */
99 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
102 /* A bond, that is, a set of network devices grouped to improve performance or
105 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
106 char *name; /* Name provided by client. */
107 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
114 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
115 * (To prevent the bond_slave from disappearing they must also hold
117 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
118 struct ovs_list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
121 enum bond_mode balance; /* Balancing mode, one of BM_*. */
122 struct bond_slave *active_slave;
123 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
124 enum lacp_status lacp_status; /* Status of LACP negotiations. */
125 bool bond_revalidate; /* True if flows need revalidation. */
126 uint32_t basis; /* Basis for flow hash function. */
128 /* SLB specific bonding info. */
129 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
130 int rebalance_interval; /* Interval between rebalances, in ms. */
131 long long int next_rebalance; /* Next rebalancing time. */
132 bool send_learning_packets;
133 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
134 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
136 /* Store active slave to OVSDB. */
137 bool active_slave_changed; /* Set to true whenever the bond changes
138 active slave. It will be reset to false
139 after it is stored into OVSDB */
141 /* Interface names may not be persistent across an OS reboot, so use the
142 * MAC address for identifying the active slave. */
143 struct eth_addr active_slave_mac;
144 /* The MAC address of the active interface. */
145 /* Legacy compatibility. */
146 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
148 struct ovs_refcount ref_cnt;
151 /* What to do with a post-recirculation rule (see struct bond_pr_rule_op). */
153 ADD, /* Add the rule to ofproto's flow table. */
154 DEL, /* Delete the rule from the ofproto's flow table. */
157 /* A rule to add to or delete from ofproto's internal flow table. */
158 struct bond_pr_rule_op {
159 struct hmap_node hmap_node;
161 ofp_port_t out_ofport;
163 struct rule **pr_rule;
166 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
167 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
168 OVS_REQ_RDLOCK(rwlock);
169 static void bond_enable_slave(struct bond_slave *, bool enable)
170 OVS_REQ_WRLOCK(rwlock);
171 static void bond_link_status_update(struct bond_slave *)
172 OVS_REQ_WRLOCK(rwlock);
173 static void bond_choose_active_slave(struct bond *)
174 OVS_REQ_WRLOCK(rwlock);
175 static unsigned int bond_hash_src(const struct eth_addr mac,
176 uint16_t vlan, uint32_t basis);
177 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
179 static struct bond_entry *lookup_bond_entry(const struct bond *,
182 OVS_REQ_RDLOCK(rwlock);
183 static struct bond_slave *get_enabled_slave(struct bond *)
184 OVS_REQ_RDLOCK(rwlock);
185 static struct bond_slave *choose_output_slave(const struct bond *,
187 struct flow_wildcards *,
189 OVS_REQ_RDLOCK(rwlock);
191 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
192 * stores the mode in '*balance' and returns true. Otherwise returns false
193 * without modifying '*balance'. */
195 bond_mode_from_string(enum bond_mode *balance, const char *s)
197 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
199 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
201 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
209 /* Returns a string representing 'balance'. */
211 bond_mode_to_string(enum bond_mode balance) {
214 return "balance-tcp";
216 return "balance-slb";
218 return "active-backup";
224 /* Creates and returns a new bond whose configuration is initially taken from
227 * The caller should register each slave on the new bond by calling
228 * bond_slave_register(). */
230 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
234 bond = xzalloc(sizeof *bond);
235 bond->ofproto = ofproto;
236 hmap_init(&bond->slaves);
237 list_init(&bond->enabled_slaves);
238 ovs_mutex_init(&bond->mutex);
239 ovs_refcount_init(&bond->ref_cnt);
242 hmap_init(&bond->pr_rule_ops);
244 bond_reconfigure(bond, s);
/* Takes a reference on 'bond_' by bumping its 'ref_cnt'.  The parameter is
 * const-qualified, so the qualifier is cast away to reach the counter that
 * lives inside the bond. */
249 bond_ref(const struct bond *bond_)
251 struct bond *bond = CONST_CAST(struct bond *, bond_);
254 ovs_refcount_ref(&bond->ref_cnt);
/* Releases one reference to 'bond'.  The guard at the top covers a null
 * 'bond' and the case where the unref call reports a value other than 1
 * (presumably meaning other references remain -- TODO confirm against
 * ovs_refcount_unref_relaxed()).  Past the guard the bond is torn down: it is
 * unlinked from 'all_bonds', its 'slaves' and 'pr_rule_ops' maps are emptied
 * and destroyed, its mutex is destroyed, and its recirculation ID, if nonzero,
 * is handed back with recirc_free_id(). */
261 bond_unref(struct bond *bond)
263 struct bond_slave *slave, *next_slave;
264 struct bond_pr_rule_op *pr_op, *next_op;
266 if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) {
/* 'all_bonds' is guarded by 'rwlock', so unlink under the write lock. */
270 ovs_rwlock_wrlock(&rwlock);
271 hmap_remove(all_bonds, &bond->hmap_node);
272 ovs_rwlock_unlock(&rwlock);
/* The _SAFE iterator is used because each slave is removed while iterating. */
274 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
275 hmap_remove(&bond->slaves, &slave->hmap_node);
276 /* Client owns 'slave->netdev'. */
280 hmap_destroy(&bond->slaves);
282 ovs_mutex_destroy(&bond->mutex);
/* Drop any post-recirculation rule operations still recorded on the bond. */
286 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
287 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
290 hmap_destroy(&bond->pr_rule_ops);
292 if (bond->recirc_id) {
293 recirc_free_id(bond->recirc_id);
/* Records in 'bond->pr_rule_ops' a post-recirculation rule operation for
 * 'match' that outputs to 'out_ofport', remembering 'rule' as the rule pointer
 * location tied to it.
 *
 * Entries are keyed by match_hash(match, 0).  If an entry with an equal match
 * is found, its output port and rule pointer are overwritten; the allocation
 * path below creates a fresh entry, copies 'match' into it and inserts it
 * under the same hash. */
300 add_pr_rule(struct bond *bond, const struct match *match,
301 ofp_port_t out_ofport, struct rule **rule)
303 uint32_t hash = match_hash(match, 0);
304 struct bond_pr_rule_op *pr_op;
306 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
307 if (match_equal(&pr_op->match, match)) {
309 pr_op->out_ofport = out_ofport;
310 pr_op->pr_rule = rule;
315 pr_op = xmalloc(sizeof *pr_op);
316 pr_op->match = *match;
318 pr_op->out_ofport = out_ofport;
319 pr_op->pr_rule = rule;
320 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
/* Synchronizes ofproto's internal flow table with 'bond''s hash buckets.
 *
 * When the bond has both a hash table and a recirculation ID, one rule
 * operation per bucket is recorded through add_pr_rule().  The recorded
 * operations are then walked: flows are installed with
 * ofproto_dpif_add_internal_flow() or removed with
 * ofproto_dpif_delete_internal_flow(), and a failure of either is logged at
 * error level together with the textual form of the match.
 *
 * The caller must hold 'rwlock' for writing. */
324 update_recirc_rules(struct bond *bond)
325 OVS_REQ_WRLOCK(rwlock)
328 struct bond_pr_rule_op *pr_op, *next_op;
329 uint64_t ofpacts_stub[128 / 8];
330 struct ofpbuf ofpacts;
/* Build actions in the on-stack stub buffer declared above. */
333 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
335 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
/* One rule per bucket: match the bond's recirc_id plus the bucket index in
 * the BOND_MASK bits of dp_hash, and output to the bucket's slave port. */
339 if (bond->hash && bond->recirc_id) {
340 for (i = 0; i < BOND_BUCKETS; i++) {
341 struct bond_slave *slave = bond->hash[i].slave;
344 match_init_catchall(&match);
345 match_set_recirc_id(&match, bond->recirc_id);
346 match_set_dp_hash_masked(&match, i, BOND_MASK);
348 add_pr_rule(bond, &match, slave->ofp_port,
349 &bond->hash[i].pr_rule);
/* Apply the recorded operations.  The _SAFE iterator is needed because
 * entries are removed from 'pr_rule_ops' during the walk. */
354 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
358 ofpbuf_clear(&ofpacts);
359 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
360 error = ofproto_dpif_add_internal_flow(bond->ofproto,
362 RECIRC_RULE_PRIORITY, 0,
363 &ofpacts, pr_op->pr_rule);
365 char *err_s = match_to_string(&pr_op->match,
366 RECIRC_RULE_PRIORITY);
368 VLOG_ERR("failed to add post recirculation flow %s", err_s);
374 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
376 RECIRC_RULE_PRIORITY);
378 char *err_s = match_to_string(&pr_op->match,
379 RECIRC_RULE_PRIORITY);
381 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
385 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
386 *pr_op->pr_rule = NULL;
392 ofpbuf_uninit(&ofpacts);
396 /* Updates 'bond''s overall configuration to 's'.
398 * The caller should register each slave on 'bond' by calling
399 * bond_slave_register(). This is optional if none of the slaves'
400 * configuration has changed. In any case it can't hurt.
402 * Returns true if the configuration has changed in such a way that requires
406 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
408 bool revalidate = false;
410 ovs_rwlock_wrlock(&rwlock);
411 if (!bond->name || strcmp(bond->name, s->name)) {
413 hmap_remove(all_bonds, &bond->hmap_node);
416 bond->name = xstrdup(s->name);
417 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
420 bond->updelay = s->up_delay;
421 bond->downdelay = s->down_delay;
423 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
424 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
428 if (bond->rebalance_interval != s->rebalance_interval) {
429 bond->rebalance_interval = s->rebalance_interval;
433 if (bond->balance != s->balance) {
434 bond->balance = s->balance;
438 if (bond->basis != s->basis) {
439 bond->basis = s->basis;
443 if (bond->bond_revalidate) {
445 bond->bond_revalidate = false;
448 if (bond->balance != BM_AB) {
449 if (!bond->recirc_id) {
450 bond->recirc_id = recirc_alloc_id(bond->ofproto);
452 } else if (bond->recirc_id) {
453 recirc_free_id(bond->recirc_id);
457 if (bond->balance == BM_AB || !bond->hash || revalidate) {
458 bond_entry_reset(bond);
461 bond->active_slave_mac = s->active_slave_mac;
462 bond->active_slave_changed = false;
464 ovs_rwlock_unlock(&rwlock);
/* Walks every slave of 'bond', reads each slave's Ethernet address from its
 * netdev and compares it with 'mac'.  A nonzero result from
 * netdev_get_etheraddr() takes its own branch before the comparison.
 * Presumably returns the matching slave, or NULL if there is none -- the
 * return statements are not visible here, TODO confirm. */
468 static struct bond_slave *
469 bond_find_slave_by_mac(const struct bond *bond, const struct eth_addr mac)
471 struct bond_slave *slave;
473 /* Find the last active slave */
474 HMAP_FOR_EACH(slave, hmap_node, &bond->slaves) {
475 struct eth_addr slave_mac;
477 if (netdev_get_etheraddr(slave->netdev, &slave_mac)) {
481 if (eth_addr_equals(slave_mac, mac)) {
/* Records that 'bond''s active slave has changed: stores a MAC address in
 * 'active_slave_mac', raises 'active_slave_changed' and bumps the
 * connectivity sequence number so that waiters on it wake up.
 *
 * NOTE(review): the result of netdev_get_etheraddr() is not examined on the
 * line below -- confirm that 'mac' is well defined when the call fails. */
490 bond_active_slave_changed(struct bond *bond)
494 netdev_get_etheraddr(bond->active_slave->netdev, &mac);
495 bond->active_slave_mac = mac;
496 bond->active_slave_changed = true;
497 seq_change(connectivity_seq_get());
/* Points 'slave' at 'netdev' unless it already uses that device.  When the
 * device does change, 'change_seq' is reset to 0.
 *
 * The caller must hold 'rwlock' for writing. */
501 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
502 OVS_REQ_WRLOCK(rwlock)
504 if (slave->netdev != netdev) {
505 slave->netdev = netdev;
506 slave->change_seq = 0;
510 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
511 * arbitrary client-provided pointer that uniquely identifies a slave within a
512 * bond. If 'slave_' already exists within 'bond' then this function
513 * reconfigures the existing slave.
515 * 'netdev' must be the network device that 'slave_' represents. It is owned
516 * by the client, so the client must not close it before either unregistering
517 * 'slave_' or destroying 'bond'.
520 bond_slave_register(struct bond *bond, void *slave_,
521 ofp_port_t ofport, struct netdev *netdev)
523 struct bond_slave *slave;
525 ovs_rwlock_wrlock(&rwlock);
526 slave = bond_slave_lookup(bond, slave_);
528 slave = xzalloc(sizeof *slave);
530 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
533 slave->ofp_port = ofport;
534 slave->delay_expires = LLONG_MAX;
535 slave->name = xstrdup(netdev_get_name(netdev));
536 bond->bond_revalidate = true;
538 slave->enabled = false;
539 bond_enable_slave(slave, netdev_get_carrier(netdev));
542 bond_slave_set_netdev__(slave, netdev);
545 slave->name = xstrdup(netdev_get_name(netdev));
546 ovs_rwlock_unlock(&rwlock);
549 /* Updates the network device to be used with 'slave_' to 'netdev'.
551 * This is useful if the caller closes and re-opens the network device
552 * registered with bond_slave_register() but doesn't need to change anything
555 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
557 struct bond_slave *slave;
559 ovs_rwlock_wrlock(&rwlock);
560 slave = bond_slave_lookup(bond, slave_);
562 bond_slave_set_netdev__(slave, netdev);
564 ovs_rwlock_unlock(&rwlock);
567 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
568 * then this function has no effect.
570 * Unregistering a slave invalidates all flows. */
572 bond_slave_unregister(struct bond *bond, const void *slave_)
574 struct bond_slave *slave;
577 ovs_rwlock_wrlock(&rwlock);
578 slave = bond_slave_lookup(bond, slave_);
583 bond->bond_revalidate = true;
584 bond_enable_slave(slave, false);
586 del_active = bond->active_slave == slave;
588 struct bond_entry *e;
589 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
590 if (e->slave == slave) {
598 hmap_remove(&bond->slaves, &slave->hmap_node);
599 /* Client owns 'slave->netdev'. */
603 bond_choose_active_slave(bond);
604 bond->send_learning_packets = true;
607 ovs_rwlock_unlock(&rwlock);
610 /* Should be called on each slave in 'bond' before bond_run() to indicate
611 * whether or not 'slave_' may be enabled. This function is intended to allow
612 * other protocols to have some impact on bonding decisions. For example LACP
613 * or high level link monitoring protocols may decide that a given slave should
614 * not be able to send traffic. */
616 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
618 ovs_rwlock_wrlock(&rwlock);
/* NOTE(review): the lookup result is dereferenced without a check, whereas
 * bond_slave_register() treats the same lookup as one that may find nothing.
 * Presumably callers only pass a registered 'slave_' -- confirm. */
619 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
620 ovs_rwlock_unlock(&rwlock);
623 /* Performs periodic maintenance on 'bond'.
625 * Returns true if the caller should revalidate its flows.
627 * The caller should check bond_should_send_learning_packets() afterward. */
629 bond_run(struct bond *bond, enum lacp_status lacp_status)
631 struct bond_slave *slave;
634 ovs_rwlock_wrlock(&rwlock);
635 if (bond->lacp_status != lacp_status) {
636 bond->lacp_status = lacp_status;
637 bond->bond_revalidate = true;
640 /* Enable slaves based on link status and LACP feedback. */
641 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
642 bond_link_status_update(slave);
643 slave->change_seq = seq_read(connectivity_seq_get());
645 if (!bond->active_slave || !bond->active_slave->enabled) {
646 bond_choose_active_slave(bond);
649 revalidate = bond->bond_revalidate;
650 bond->bond_revalidate = false;
651 ovs_rwlock_unlock(&rwlock);
656 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
658 bond_wait(struct bond *bond)
660 struct bond_slave *slave;
662 ovs_rwlock_rdlock(&rwlock);
663 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
664 if (slave->delay_expires != LLONG_MAX) {
665 poll_timer_wait_until(slave->delay_expires);
668 seq_wait(connectivity_seq_get(), slave->change_seq);
671 if (bond->bond_revalidate) {
672 poll_immediate_wake();
674 ovs_rwlock_unlock(&rwlock);
676 /* We don't wait for bond->next_rebalance because rebalancing can only run
677 * at a flow account checkpoint. ofproto does checkpointing on its own
678 * schedule and bond_rebalance() gets called afterward, so we'd just be
679 * waking up for no purpose. */
682 /* MAC learning table interaction. */
/* Returns true if 'bond' has an active slave and either:
 *
 *   - LACP is disabled and the bond is in SLB or active-backup mode, or
 *
 *   - fallback to active-backup is enabled ('lacp_fallback_ab') and the LACP
 *     status is LACP_CONFIGURED. */
685 may_send_learning_packets(const struct bond *bond)
687 return ((bond->lacp_status == LACP_DISABLED
688 && (bond->balance == BM_SLB || bond->balance == BM_AB))
689 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
690 && bond->active_slave;
693 /* Returns true if 'bond' needs the client to send out packets to assist with
694 * MAC learning on 'bond'. If this function returns true, then the client
695 * should iterate through its MAC learning table for the bridge on which 'bond'
696 * is located. For each MAC that has been learned on a port other than 'bond',
697 * it should call bond_compose_learning_packet().
699 * This function will only return true if 'bond' is in SLB or active-backup
700 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
703 * Calling this function resets the state that it checks. */
705 bond_should_send_learning_packets(struct bond *bond)
709 ovs_rwlock_wrlock(&rwlock);
710 send = bond->send_learning_packets && may_send_learning_packets(bond);
711 bond->send_learning_packets = false;
712 ovs_rwlock_unlock(&rwlock);
716 /* Composes a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
718 * See bond_should_send_learning_packets() for description of usage. The
719 * caller should send the composed packet on the port associated with
720 * port_aux and takes ownership of the returned packet. */
722 bond_compose_learning_packet(struct bond *bond, const struct eth_addr eth_src,
723 uint16_t vlan, void **port_aux)
725 struct bond_slave *slave;
726 struct dp_packet *packet;
729 ovs_rwlock_rdlock(&rwlock);
730 ovs_assert(may_send_learning_packets(bond));
731 memset(&flow, 0, sizeof flow);
732 flow.dl_src = eth_src;
733 slave = choose_output_slave(bond, &flow, NULL, vlan);
735 packet = dp_packet_new(0);
736 compose_rarp(packet, eth_src);
738 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
741 *port_aux = slave->aux;
742 ovs_rwlock_unlock(&rwlock);
746 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
747 * Ethernet destination address of 'eth_dst', should be admitted.
749 * The return value is one of the following:
751 * - BV_ACCEPT: Admit the packet.
753 * - BV_DROP: Drop the packet.
755 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
756 * Ethernet source address and VLAN. If there is none, or if the packet
757 * is on the learned port, then admit the packet. If a different port has
758 * been learned, however, drop the packet (and do not use it for MAC
762 bond_check_admissibility(struct bond *bond, const void *slave_,
763 const struct eth_addr eth_dst)
765 enum bond_verdict verdict = BV_DROP;
766 struct bond_slave *slave;
768 ovs_rwlock_rdlock(&rwlock);
769 slave = bond_slave_lookup(bond, slave_);
774 /* LACP bonds have very loose admissibility restrictions because we can
775 * assume the remote switch is aware of the bond and will "do the right
776 * thing". However, as a precaution we drop packets on disabled slaves
777 * because no correctly implemented partner switch should be sending
780 * If LACP is configured, but LACP negotiations have been unsuccessful, we
781 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
782 switch (bond->lacp_status) {
783 case LACP_NEGOTIATED:
784 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
786 case LACP_CONFIGURED:
787 if (!bond->lacp_fallback_ab) {
794 /* Drop all multicast packets on inactive slaves. */
795 if (eth_addr_is_multicast(eth_dst)) {
796 if (bond->active_slave != slave) {
801 switch (bond->balance) {
803 /* TCP balanced bonds require successful LACP negotiations. Based on the
804 * above check, LACP is off or lacp_fallback_ab is true on this bond.
805 * If lacp_fallback_ab is true fall through to BM_AB case else, we
806 * drop all incoming traffic. */
807 if (!bond->lacp_fallback_ab) {
812 /* Drop all packets which arrive on backup slaves. This is similar to
813 * how Linux bonding handles active-backup bonds. */
814 if (bond->active_slave != slave) {
815 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
817 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
818 " slave (%s) destined for " ETH_ADDR_FMT,
819 slave->name, ETH_ADDR_ARGS(eth_dst));
826 /* Drop all packets for which we have learned a different input port,
827 * because we probably sent the packet on one slave and got it back on
828 * the other. Gratuitous ARP packets are an exception to this rule:
829 * the host has moved to another switch. The exception to the
830 * exception is if we locked the learning table to avoid reflections on
832 verdict = BV_DROP_IF_MOVED;
838 ovs_rwlock_unlock(&rwlock);
843 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
844 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
845 * NULL if the packet should be dropped because no slaves are enabled.
847 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
848 * should be a VID only (i.e. excluding the PCP bits). Second,
849 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
850 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
851 * packet belongs to (so for an access port it will be the access port's VLAN).
853 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
854 * significant in the selection. At some point earlier, 'wc' should
855 * have been initialized (e.g., by flow_wildcards_init_catchall()).
858 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
859 struct flow_wildcards *wc, uint16_t vlan)
861 struct bond_slave *slave;
864 ovs_rwlock_rdlock(&rwlock);
865 slave = choose_output_slave(bond, flow, wc, vlan);
866 aux = slave ? slave->aux : NULL;
867 ovs_rwlock_unlock(&rwlock);
/* Folds the latest byte counter of 'entry''s post-recirculation rule into
 * 'entry'.  The difference between 'rule_tx_bytes' and the previously
 * recorded 'pr_tx_bytes' is added to 'tx_bytes', and 'rule_tx_bytes' then
 * becomes the new 'pr_tx_bytes'.
 *
 * The caller must hold 'rwlock' for writing. */
874 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
875 OVS_REQ_WRLOCK(rwlock)
880 delta = rule_tx_bytes - entry->pr_tx_bytes;
881 entry->tx_bytes += delta;
882 entry->pr_tx_bytes = rule_tx_bytes;
886 /* Maintains bond stats using post-recirculation rule byte counters.
 *
 * For each hash bucket of 'bond', the bucket's post-recirculation rule is
 * asked for its statistics through its ofproto class's rule_get_stats()
 * callback, and the byte count is passed to bond_entry_account().  The packet
 * count and last-used time returned by the callback are not used.
 *
 * The caller must hold 'rwlock' for writing. */
888 bond_recirculation_account(struct bond *bond)
889 OVS_REQ_WRLOCK(rwlock)
893 for (i=0; i<=BOND_MASK; i++) {
894 struct bond_entry *entry = &bond->hash[i];
895 struct rule *rule = entry->pr_rule;
898 uint64_t n_packets OVS_UNUSED;
899 long long int used OVS_UNUSED;
902 rule->ofproto->ofproto_class->rule_get_stats(
903 rule, &n_packets, &n_bytes, &used);
904 bond_entry_account(entry, n_bytes);
/* Tests whether 'bond' may use recirculation, which requires BM_TCP balancing
 * and a nonzero recirculation ID.  In that case the bond's recirculation ID
 * and hash basis are stored through 'recirc_id' and 'hash_bias'.
 *
 * Callers in this file use the result as a boolean and also pass NULL for
 * both output pointers; presumably the stores below are guarded accordingly
 * -- the guards are not visible here, TODO confirm. */
910 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
913 if (bond->balance == BM_TCP && bond->recirc_id) {
915 *recirc_id = bond->recirc_id;
918 *hash_bias = bond->basis;
/* Reassigns every hash bucket of 'bond' whose slave is missing or disabled,
 * then refreshes the post-recirculation rules with update_recirc_rules().
 * 'force' seeds 'update_rules'; the condition that gates the refresh is not
 * visible here.
 *
 * A bucket needing a slave first gets a randomly chosen slave of the bond.
 * If that slave is disabled, the bond's active slave is used instead.
 *
 * The caller must hold 'rwlock' for writing.
 *
 * NOTE(review): the random pick assumes 'bond->slaves' is non-empty --
 * confirm what hmap_random_node() yields for an empty map and whether callers
 * can reach this function with no slaves registered. */
927 bond_update_post_recirc_rules__(struct bond* bond, const bool force)
928 OVS_REQ_WRLOCK(rwlock)
930 struct bond_entry *e;
931 bool update_rules = force; /* Always update rules if caller forces it. */
933 /* Make sure all bond entries are populated */
934 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
935 if (!e->slave || !e->slave->enabled) {
937 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
938 struct bond_slave, hmap_node);
939 if (!e->slave->enabled) {
940 e->slave = bond->active_slave;
946 update_recirc_rules(bond);
/* Locking wrapper: runs bond_update_post_recirc_rules__() on 'bond' with
 * 'rwlock' held for writing, passing 'force' through unchanged. */
951 bond_update_post_recirc_rules(struct bond* bond, const bool force)
953 ovs_rwlock_wrlock(&rwlock);
954 bond_update_post_recirc_rules__(bond, force);
955 ovs_rwlock_unlock(&rwlock);
/* Returns true if 'bond' is subject to rebalancing: its rebalance interval is
 * nonzero and its balancing mode is BM_SLB or BM_TCP.
 *
 * The caller must hold 'rwlock' (at least for reading). */
961 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
963 return bond->rebalance_interval
964 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
967 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
969 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
972 ovs_rwlock_wrlock(&rwlock);
973 if (bond_is_balanced(bond)) {
974 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
976 ovs_rwlock_unlock(&rwlock);
/* Converts 'bal', a pointer to the 'bal_node' member embedded in a
 * struct bond_slave, back into a pointer to the enclosing bond_slave. */
979 static struct bond_slave *
980 bond_slave_from_bal_node(struct ovs_list *bal) OVS_REQ_RDLOCK(rwlock)
982 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* Debug aid for rebalancing.  Does nothing unless debug-level logging is
 * enabled for this module.  Otherwise it emits one debug message for 'bond'
 * that lists each slave on the 'bals' list with its 'tx_bytes' in kB, marks
 * disabled slaves with "(disabled)", and, for slaves that have entries,
 * appends each entry's bucket index (its offset within 'bond->hash') and load
 * in kB, joined by " + ".
 *
 * The caller must hold 'rwlock' (at least for reading). */
986 log_bals(struct bond *bond, const struct ovs_list *bals)
987 OVS_REQ_RDLOCK(rwlock)
989 if (VLOG_IS_DBG_ENABLED()) {
990 struct ds ds = DS_EMPTY_INITIALIZER;
991 const struct bond_slave *slave;
993 LIST_FOR_EACH (slave, bal_node, bals) {
995 ds_put_char(&ds, ',');
997 ds_put_format(&ds, " %s %"PRIu64"kB",
998 slave->name, slave->tx_bytes / 1024);
1000 if (!slave->enabled) {
1001 ds_put_cstr(&ds, " (disabled)");
1003 if (!list_is_empty(&slave->entries)) {
1004 struct bond_entry *e;
1006 ds_put_cstr(&ds, " (");
1007 LIST_FOR_EACH (e, list_node, &slave->entries) {
1008 if (&e->list_node != list_front(&slave->entries)) {
1009 ds_put_cstr(&ds, " + ");
1011 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
1012 e - bond->hash, e->tx_bytes / 1024);
1014 ds_put_cstr(&ds, ")");
1017 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
1022 /* Shifts 'hash' from its current slave to 'to'. */
1024 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
1025 OVS_REQ_WRLOCK(rwlock)
1027 struct bond_slave *from = hash->slave;
1028 struct bond *bond = from->bond;
1029 uint64_t delta = hash->tx_bytes;
1031 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1032 "from %s to %s (now carrying %"PRIu64"kB and "
1033 "%"PRIu64"kB load, respectively)",
1034 bond->name, delta / 1024, hash - bond->hash,
1035 from->name, to->name,
1036 (from->tx_bytes - delta) / 1024,
1037 (to->tx_bytes + delta) / 1024);
1039 /* Shift load away from 'from' to 'to'. */
1040 from->tx_bytes -= delta;
1041 to->tx_bytes += delta;
1043 /* Arrange for flows to be revalidated. */
1045 bond->bond_revalidate = true;
1048 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1049 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1050 * given that doing so must decrease the ratio of the load on the two slaves by
1051 * at least 0.1. Returns NULL if there is no appropriate entry.
1053 * The list of entries isn't sorted. I don't know of a reason to prefer to
1054 * shift away small hashes or large hashes. */
1055 static struct bond_entry *
1056 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1057 OVS_REQ_WRLOCK(rwlock)
1059 struct bond_entry *e;
1061 if (list_is_short(&from->entries)) {
1062 /* 'from' carries no more than one MAC hash, so shifting load away from
1063 * it would be pointless. */
1067 LIST_FOR_EACH (e, list_node, &from->entries) {
1068 double old_ratio, new_ratio;
1071 if (to_tx_bytes == 0) {
1072 /* Nothing on the new slave, move it. */
1076 delta = e->tx_bytes;
1077 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1078 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1079 if (old_ratio - new_ratio > 0.1
1080 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1081 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1082 and 'to' slave have the same load. Therefore, we only move an
1083 entry if it decreases the load on 'from', and brings us closer
1084 to equal traffic load. */
1092 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1095 insert_bal(struct ovs_list *bals, struct bond_slave *slave)
1097 struct bond_slave *pos;
1099 LIST_FOR_EACH (pos, bal_node, bals) {
1100 if (slave->tx_bytes > pos->tx_bytes) {
1104 list_insert(&pos->bal_node, &slave->bal_node);
1107 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1108 * that descending order of 'tx_bytes' is maintained. */
1110 reinsert_bal(struct ovs_list *bals, struct bond_slave *slave)
1112 list_remove(&slave->bal_node);
1113 insert_bal(bals, slave);
1116 /* If 'bond' needs rebalancing, does so.
1118 * The caller should have called bond_account() for each active flow; when
1119 * recirculation is used, bond_recirculation_account(bond) is called below
1120 * to ensure that flow data is consistently accounted at this point.
1123 bond_rebalance(struct bond *bond)
1125 struct bond_slave *slave;
1126 struct bond_entry *e;
1127 struct ovs_list bals;
1128 bool rebalanced = false;
1131 ovs_rwlock_wrlock(&rwlock);
1132 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1135 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1137 use_recirc = ofproto_dpif_get_support(bond->ofproto)->odp.recirc &&
1138 bond_may_recirc(bond, NULL, NULL);
1141 bond_recirculation_account(bond);
1144 /* Add each bond_entry to its slave's 'entries' list.
1145 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1146 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1147 slave->tx_bytes = 0;
1148 list_init(&slave->entries);
1150 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1151 if (e->slave && e->tx_bytes) {
1152 e->slave->tx_bytes += e->tx_bytes;
1153 list_push_back(&e->slave->entries, &e->list_node);
1157 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1159 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1160 * with a proper list sort algorithm. */
1162 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1163 if (slave->enabled) {
1164 insert_bal(&bals, slave);
1167 log_bals(bond, &bals);
1169 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1170 while (!list_is_short(&bals)) {
1171 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1172 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1175 overload = from->tx_bytes - to->tx_bytes;
1176 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1177 /* The extra load on 'from' (and all less-loaded slaves), compared
1178 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1179 * it is less than ~1Mbps. No point in rebalancing. */
1183 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1184 * to move from 'from' to 'to'. */
1185 e = choose_entry_to_migrate(from, to->tx_bytes);
1187 bond_shift_load(e, to);
1189 /* Delete element from from->entries.
1191 * We don't add the element to to->entries. That would only allow
1192 * 'e' to be migrated to another slave in this rebalancing run, and
1193 * there is no point in doing that. */
1194 list_remove(&e->list_node);
1196 /* Re-sort 'bals'. */
1197 reinsert_bal(&bals, from);
1198 reinsert_bal(&bals, to);
1201 /* Can't usefully migrate anything away from 'from'.
1202 * Don't reconsider it. */
1203 list_remove(&from->bal_node);
1207 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1208 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1209 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1210 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1214 if (use_recirc && rebalanced) {
1215 bond_update_post_recirc_rules__(bond,true);
1219 ovs_rwlock_unlock(&rwlock);
1222 /* Bonding unixctl user interface functions. */
/* Looks for a bond called 'name'.  Candidates are taken from the hash bucket
 * for hash_string(name, 0) and confirmed with strcmp(), since different names
 * can share a hash.
 *
 * The caller must hold 'rwlock' (at least for reading). */
1224 static struct bond *
1225 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
1229 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1231 if (!strcmp(bond->name, name)) {
/* Searches 'bond->slaves' for the slave whose 'name' equals 'slave_name'.
 *
 * Every slave in the hmap is visited and compared with strcmp(), so the cost
 * grows linearly with the number of slaves. */
1238 static struct bond_slave *
1239 bond_lookup_slave(struct bond *bond, const char *slave_name)
1241 struct bond_slave *slave;
1243 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1244 if (!strcmp(slave->name, slave_name)) {
/* unixctl handler registered as "bond/list": replies on 'conn' with a
 * tab-separated table holding one row per bond in 'all_bonds'.
 *
 * Each row contains the bond's name, its balance mode rendered by
 * bond_mode_to_string(), its recirculation ID, and the names of its slaves
 * separated by ", ".  'argc', 'argv' and 'aux' are unused. */
1252 bond_unixctl_list(struct unixctl_conn *conn,
1253 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1254 void *aux OVS_UNUSED)
1256 struct ds ds = DS_EMPTY_INITIALIZER;
1257 const struct bond *bond;
/* Column headings. */
1259 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
/* 'rwlock' is held for reading while 'all_bonds' is walked. */
1261 ovs_rwlock_rdlock(&rwlock);
1262 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1263 const struct bond_slave *slave;
1266 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1267 bond_mode_to_string(bond->balance), bond->recirc_id);
/* Slave names form the last column, joined by ", ". */
1270 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1272 ds_put_cstr(&ds, ", ");
1274 ds_put_cstr(&ds, slave->name);
1276 ds_put_char(&ds, '\n');
1278 ovs_rwlock_unlock(&rwlock);
/* The reply is sent after the lock has been released. */
1279 unixctl_command_reply(conn, ds_cstr(&ds));
/* Appends a human-readable description of 'bond' to 'ds'.
 *
 * The output starts with bond-wide settings (mode, recirculation, hash
 * basis, updelay/downdelay, LACP status, active slave MAC) and continues
 * with one section per slave, in the order produced by shash_sort() over a
 * shash keyed by slave name.
 *
 * The caller must hold 'rwlock' (at least for reading), as required by the
 * OVS_REQ_RDLOCK annotation. */
1284 bond_print_details(struct ds *ds, const struct bond *bond)
1285 OVS_REQ_RDLOCK(rwlock)
1287 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1288 const struct shash_node **sorted_slaves = NULL;
1289 const struct bond_slave *slave;
1294 ds_put_format(ds, "---- %s ----\n", bond->name);
1295 ds_put_format(ds, "bond_mode: %s\n",
1296 bond_mode_to_string(bond->balance));
/* When recirculation may not be used, -1 is printed in place of the ID. */
1298 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1299 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1300 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1302 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1304 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1305 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
/* The time left until the next rebalance is printed for balanced bonds
 * only. */
1307 if (bond_is_balanced(bond)) {
1308 ds_put_format(ds, "next rebalance: %lld ms\n",
1309 bond->next_rebalance - time_msec());
1312 ds_put_cstr(ds, "lacp_status: ");
1313 switch (bond->lacp_status) {
1314 case LACP_NEGOTIATED:
1315 ds_put_cstr(ds, "negotiated\n");
1317 case LACP_CONFIGURED:
1318 ds_put_cstr(ds, "configured\n");
1321 ds_put_cstr(ds, "off\n");
1324 ds_put_cstr(ds, "<unknown>\n");
/* Print the recorded active slave MAC, then the name of the slave that
 * bond_find_slave_by_mac() maps it to, or "none" if there is no match. */
1328 ds_put_cstr(ds, "active slave mac: ");
1329 ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_slave_mac));
1330 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1331 ds_put_format(ds,"(%s)\n", slave ? slave->name : "none");
/* Index the slaves by name so that shash_sort() can supply the order in
 * which they are printed. */
1333 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1334 shash_add(&slave_shash, slave->name, slave);
1336 sorted_slaves = shash_sort(&slave_shash);
1338 for (i = 0; i < shash_count(&slave_shash); i++) {
1339 struct bond_entry *be;
1341 slave = sorted_slaves[i]->data;
1344 ds_put_format(ds, "\nslave %s: %s\n",
1345 slave->name, slave->enabled ? "enabled" : "disabled");
1346 if (slave == bond->active_slave) {
1347 ds_put_cstr(ds, "\tactive slave\n");
/* A 'delay_expires' other than LLONG_MAX means a delay is pending.  It is
 * labelled "downdelay" for a currently enabled slave and "updelay" for a
 * disabled one. */
1349 if (slave->delay_expires != LLONG_MAX) {
1350 ds_put_format(ds, "\t%s expires in %lld ms\n",
1351 slave->enabled ? "downdelay" : "updelay",
1352 slave->delay_expires - time_msec());
1355 ds_put_format(ds, "\tmay_enable: %s\n",
1356 slave->may_enable ? "true" : "false");
/* The per-hash details below are relevant to balanced bonds. */
1358 if (!bond_is_balanced(bond)) {
/* Walk all BOND_BUCKETS hash buckets; 'hash' is the bucket's index and each
 * bucket is matched against this slave through its 'slave' pointer. */
1363 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1364 int hash = be - bond->hash;
1367 if (be->slave != slave) {
/* The load is reported in units of 1024 bytes. */
1371 be_tx_k = be->tx_bytes / 1024;
1373 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1377 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
/* Release the temporary index and the array returned by shash_sort(). */
1380 shash_destroy(&slave_shash);
1381 free(sorted_slaves);
1382 ds_put_cstr(ds, "\n");
/* unixctl handler registered as "bond/show" with an optional "[port]"
 * argument.
 *
 * One path looks up the bond named by argv[1] and either prints its details
 * with bond_print_details() or replies with the error "no such bond"; the
 * other path prints the details of every bond in 'all_bonds'.
 * NOTE(review): the test that selects between the two paths is presumably
 * based on 'argc' -- confirm.
 *
 * 'rwlock' is held for reading while the bonds are inspected. */
1386 bond_unixctl_show(struct unixctl_conn *conn,
1387 int argc, const char *argv[],
1388 void *aux OVS_UNUSED)
1390 struct ds ds = DS_EMPTY_INITIALIZER;
1392 ovs_rwlock_rdlock(&rwlock);
/* Single-bond path. */
1394 const struct bond *bond = bond_find(argv[1]);
1397 unixctl_command_reply_error(conn, "no such bond");
1400 bond_print_details(&ds, bond);
/* All-bonds path. */
1402 const struct bond *bond;
1404 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1405 bond_print_details(&ds, bond);
1409 unixctl_command_reply(conn, ds_cstr(&ds));
1413 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered as "bond/migrate" with the arguments
 * "port hash slave": points one hash bucket of an SLB bond at the named
 * slave.
 *
 * argv[1] is the bond name, argv[2] the hash in decimal, argv[3] the slave
 * name.  The error replies are "no such bond", "not an SLB bond",
 * "bad hash", "no such slave" and "cannot migrate to disabled slave"; the
 * success reply is "migrated".
 *
 * 'rwlock' is held for writing for the duration of the command. */
1417 bond_unixctl_migrate(struct unixctl_conn *conn,
1418 int argc OVS_UNUSED, const char *argv[],
1419 void *aux OVS_UNUSED)
1421 const char *bond_s = argv[1];
1422 const char *hash_s = argv[2];
1423 const char *slave_s = argv[3];
1425 struct bond_slave *slave;
1426 struct bond_entry *entry;
1429 ovs_rwlock_wrlock(&rwlock);
1430 bond = bond_find(bond_s);
1432 unixctl_command_reply_error(conn, "no such bond");
/* Only bonds in BM_SLB mode accept manual migration. */
1436 if (bond->balance != BM_SLB) {
1437 unixctl_command_reply_error(conn, "not an SLB bond");
/* 'hash_s' must consist solely of decimal digits; the parsed value is then
 * reduced to a bucket index with BOND_MASK, so values above BOND_MASK wrap
 * around instead of being rejected.
 *
 * NOTE(review): an empty string also passes this test, because strspn() and
 * strlen() both return 0, and atoi() then yields 0.  atoi() additionally has
 * undefined behavior when the digits do not fit in an int.  Consider
 * strtoul() with explicit empty-input and range checks. */
1441 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1442 hash = atoi(hash_s) & BOND_MASK;
1444 unixctl_command_reply_error(conn, "bad hash");
1448 slave = bond_lookup_slave(bond, slave_s);
1450 unixctl_command_reply_error(conn, "no such slave");
1454 if (!slave->enabled) {
1455 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
/* Reassign the bucket and flag the bond for revalidation. */
1459 entry = &bond->hash[hash];
1460 bond->bond_revalidate = true;
1461 entry->slave = slave;
1462 unixctl_command_reply(conn, "migrated");
1465 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered as "bond/set-active-slave" with the arguments
 * "port slave": makes the named slave the bond's active slave.
 *
 * argv[1] is the bond name and argv[2] the slave name.  The error replies
 * are "no such bond", "no such slave" and
 * "cannot make disabled slave active".  The success reply is "done" when the
 * active slave changed and "no change" when the slave was already active.
 *
 * 'rwlock' is held for writing for the duration of the command. */
1469 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1470 int argc OVS_UNUSED, const char *argv[],
1471 void *aux OVS_UNUSED)
1473 const char *bond_s = argv[1];
1474 const char *slave_s = argv[2];
1476 struct bond_slave *slave;
1478 ovs_rwlock_wrlock(&rwlock);
1479 bond = bond_find(bond_s);
1481 unixctl_command_reply_error(conn, "no such bond");
1485 slave = bond_lookup_slave(bond, slave_s);
1487 unixctl_command_reply_error(conn, "no such slave");
1491 if (!slave->enabled) {
1492 unixctl_command_reply_error(conn, "cannot make disabled slave active");
/* On an actual change: flag the bond for revalidation, record the new
 * active slave, log it, request learning packets, and notify through
 * bond_active_slave_changed(). */
1496 if (bond->active_slave != slave) {
1497 bond->bond_revalidate = true;
1498 bond->active_slave = slave;
1499 VLOG_INFO("bond %s: active interface is now %s",
1500 bond->name, slave->name);
1501 bond->send_learning_packets = true;
1502 unixctl_command_reply(conn, "done");
1503 bond_active_slave_changed(bond);
1505 unixctl_command_reply(conn, "no change");
1508 ovs_rwlock_unlock(&rwlock);
/* Shared implementation of the "bond/enable-slave" and "bond/disable-slave"
 * handlers: sets the enabled state of the slave named argv[2] on the bond
 * named argv[1] to 'enable' through bond_enable_slave().
 *
 * The error replies are "no such bond" and "no such slave"; the success
 * reply is "enabled" or "disabled" according to 'enable'.
 *
 * 'rwlock' is held for writing for the duration of the command. */
1512 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1514 const char *bond_s = argv[1];
1515 const char *slave_s = argv[2];
1517 struct bond_slave *slave;
1519 ovs_rwlock_wrlock(&rwlock);
1520 bond = bond_find(bond_s);
1522 unixctl_command_reply_error(conn, "no such bond");
1526 slave = bond_lookup_slave(bond, slave_s);
1528 unixctl_command_reply_error(conn, "no such slave");
1532 bond_enable_slave(slave, enable);
1533 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1536 ovs_rwlock_unlock(&rwlock);
/* unixctl handler registered as "bond/enable-slave" with the arguments
 * "port slave".  Delegates to enable_slave() with 'enable' set to true;
 * 'argc' and 'aux' are unused. */
1540 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1541 int argc OVS_UNUSED, const char *argv[],
1542 void *aux OVS_UNUSED)
1544 enable_slave(conn, argv, true);
/* unixctl handler registered as "bond/disable-slave" with the arguments
 * "port slave".  Delegates to enable_slave() with 'enable' set to false;
 * 'argc' and 'aux' are unused. */
1548 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1549 int argc OVS_UNUSED, const char *argv[],
1550 void *aux OVS_UNUSED)
1552 enable_slave(conn, argv, false);
/* unixctl handler registered as "bond/hash" with the arguments
 * "mac [vlan] [basis]": replies with the hash bucket that bond_hash_src()
 * yields for the given source MAC, VLAN and hash basis.
 *
 * argv[1] is the MAC address; argv[2] (VLAN) and argv[3] (basis) are read
 * only when 'argc' says they are present.  The error replies are
 * "invalid vlan", "invalid basis" and "invalid mac". */
1556 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1557 void *aux OVS_UNUSED)
1559 const char *mac_s = argv[1];
1560 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1561 const char *basis_s = argc > 3 ? argv[3] : NULL;
1562 struct eth_addr mac;
/* NOTE(review): the VLAN is scanned with "%u" and later passed to
 * bond_hash_src(), whose 'vlan' parameter is uint16_t; no range check is
 * visible here, so larger values would be silently narrowed -- confirm. */
1569 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1570 unixctl_command_reply_error(conn, "invalid vlan");
1578 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1579 unixctl_command_reply_error(conn, "invalid basis");
/* The hash is reduced to a bucket index with BOND_MASK and returned to the
 * client as an unsigned decimal string. */
1586 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1587 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1589 hash_cstr = xasprintf("%u", hash);
1590 unixctl_command_reply(conn, hash_cstr);
1593 unixctl_command_reply_error(conn, "invalid mac");
/* Registers the "bond/..." unixctl commands and binds each one to its
 * handler in this file.  The two integers in each call agree with the number
 * of mandatory and total arguments named in the usage string that precedes
 * them (for example "[port]" with 0 and 1, "port hash slave" with 3 and 3). */
1600 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1601 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1603 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1604 bond_unixctl_migrate, NULL);
1605 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1606 bond_unixctl_set_active_slave, NULL);
1607 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1608 bond_unixctl_enable_slave, NULL);
1609 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1610 bond_unixctl_disable_slave, NULL);
1611 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1612 bond_unixctl_hash, NULL);
/* Resets 'bond''s hash table.
 *
 * For every mode other than BM_AB the table of BOND_BUCKETS entries is
 * zeroed, which leaves every bucket with a null 'slave' and zero byte
 * counts, and the next rebalance is scheduled 'rebalance_interval' ms from
 * now. */
1616 bond_entry_reset(struct bond *bond)
1618 if (bond->balance != BM_AB) {
1619 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
/* NOTE(review): the allocation is presumably made only when no table exists
 * yet -- confirm against the full source. */
1622 bond->hash = xmalloc(hash_len);
1624 memset(bond->hash, 0, hash_len);
1626 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* Searches for the slave whose 'aux' pointer equals 'slave_', visiting only
 * the hmap bucket selected by hash_pointer(slave_, 0) and comparing the
 * 'aux' pointers directly. */
1633 static struct bond_slave *
1634 bond_slave_lookup(struct bond *bond, const void *slave_)
1636 struct bond_slave *slave;
1638 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1640 if (slave->aux == slave_) {
/* Sets 'slave''s enabled state to 'enable'.
 *
 * Any pending updelay/downdelay is cancelled unconditionally by resetting
 * 'delay_expires' to LLONG_MAX.  When the state actually changes, the bond
 * is flagged for revalidation, the slave's membership in the bond's
 * 'enabled_slaves' list is updated under 'bond->mutex', and the new state is
 * logged. */
1649 bond_enable_slave(struct bond_slave *slave, bool enable)
1651 slave->delay_expires = LLONG_MAX;
1652 if (enable != slave->enabled) {
1653 slave->bond->bond_revalidate = true;
1654 slave->enabled = enable;
/* 'enabled_slaves' is protected by the bond's mutex.
 * NOTE(review): insertion presumably happens on enable and removal on
 * disable -- confirm. */
1656 ovs_mutex_lock(&slave->bond->mutex);
1658 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1660 list_remove(&slave->list_node);
1662 ovs_mutex_unlock(&slave->bond->mutex);
1664 VLOG_INFO("interface %s: %s", slave->name,
1665 slave->enabled ? "enabled" : "disabled");
/* Re-evaluates whether 'slave' should be enabled, based on its carrier
 * state and 'may_enable', and starts, cancels or completes an
 * updelay/downdelay accordingly. */
1670 bond_link_status_update(struct bond_slave *slave)
1672 struct bond *bond = slave->bond;
/* The slave counts as "up" only when its netdev has carrier and
 * 'may_enable' is set. */
1675 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
/* The two sides differ in exactly two situations: the desired state matches
 * the current one while a delay is pending (so the delay is cancelled), or
 * the desired state differs while no delay is pending (so one is started). */
1676 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1677 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1678 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1679 slave->name, up ? "up" : "down");
1680 if (up == slave->enabled) {
1681 slave->delay_expires = LLONG_MAX;
1682 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1683 slave->name, up ? "disabled" : "enabled");
/* The delay is zero unless LACP is disabled; with LACP disabled it is the
 * bond's updelay or downdelay, depending on the direction of the change. */
1685 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1686 : up ? bond->updelay : bond->downdelay);
1687 slave->delay_expires = time_msec() + delay;
1689 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1692 up ? "enabled" : "disabled",
/* Apply the state change once the delay has run out. */
1699 if (time_msec() >= slave->delay_expires) {
1700 bond_enable_slave(slave, up);
/* Returns the hash of Ethernet address 'mac' combined with 'vlan' and
 * 'basis', as computed by hash_mac(). */
1705 bond_hash_src(const struct eth_addr mac, uint16_t vlan, uint32_t basis)
1707 return hash_mac(mac, vlan, basis);
/* Returns the hash of 'flow' computed by flow_hash_symmetric_l4() with seed
 * 'basis'.
 *
 * The hash is taken over a local copy of 'flow' whose 'vlan_tci' has been
 * replaced by 'vlan' (converted with htons()); the caller's flow is not
 * modified. */
1711 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1713 struct flow hash_flow = *flow;
1714 hash_flow.vlan_tci = htons(vlan);
1716 /* The symmetric quality of this hash function is not required, but
1717 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1718 * purposes, so we use it out of convenience. */
1719 return flow_hash_symmetric_l4(&hash_flow, basis);
/* Returns the hash of 'flow' on 'vlan' for 'bond', seeded with the bond's
 * 'basis'.
 *
 * BM_TCP bonds use bond_hash_tcp(), which hashes the flow; BM_SLB bonds use
 * bond_hash_src(), which hashes the flow's source MAC.  Any other balance
 * mode fails the assertion. */
1723 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1725 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1727 return (bond->balance == BM_TCP
1728 ? bond_hash_tcp(flow, vlan, bond->basis)
1729 : bond_hash_src(flow->dl_src, vlan, bond->basis));
/* Returns the hash bucket of 'bond' that 'flow' maps to: the bond_hash()
 * value is reduced with BOND_MASK and used as an index into 'bond->hash'. */
1732 static struct bond_entry *
1733 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1736 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1739 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1740 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
 * returns NULL. */
1742 static struct bond_slave *
1743 get_enabled_slave(struct bond *bond)
1745 struct ovs_list *node;
/* 'bond->mutex' is held while 'enabled_slaves' is inspected and rotated. */
1747 ovs_mutex_lock(&bond->mutex);
1748 if (list_is_empty(&bond->enabled_slaves)) {
1749 ovs_mutex_unlock(&bond->mutex);
/* Rotate the list: the head is moved to the tail and is also the slave that
 * is returned, so successive calls cycle through the enabled slaves. */
1753 node = list_pop_front(&bond->enabled_slaves);
1754 list_push_back(&bond->enabled_slaves, node);
1755 ovs_mutex_unlock(&bond->mutex);
/* 'node' is the 'list_node' member embedded in a struct bond_slave. */
1757 return CONTAINER_OF(node, struct bond_slave, list_node);
/* Chooses the slave on which traffic matching 'flow' on 'vlan' should be
 * sent, taking the bond's balance mode and LACP status into account.
 *
 * Where a hash decides the outcome, flow_mask_hash_fields() is called on
 * 'wc' for the fields that feed the hash: the symmetric L4 fields on one
 * path and the Ethernet source address on the other. */
1760 static struct bond_slave *
1761 choose_output_slave(const struct bond *bond, const struct flow *flow,
1762 struct flow_wildcards *wc, uint16_t vlan)
1764 struct bond_entry *e;
1767 balance = bond->balance;
1768 if (bond->lacp_status == LACP_CONFIGURED) {
1769 /* LACP has been configured on this bond but negotiations were
1770 * unsuccessful. If lacp_fallback_ab is enabled use active-
1771 * backup mode else drop all traffic. */
1772 if (!bond->lacp_fallback_ab) {
1780 return bond->active_slave;
1783 if (bond->lacp_status != LACP_NEGOTIATED) {
1784 /* Must have LACP negotiations for TCP balanced bonds. */
1788 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1793 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
/* A bucket that has no slave, or whose slave is disabled, is reassigned to
 * the next slave handed out by get_enabled_slave().  CONST_CAST is needed
 * because that function takes a non-const bond. */
1795 e = lookup_bond_entry(bond, flow, vlan);
1796 if (!e->slave || !e->slave->enabled) {
1797 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
/* Picks a candidate active slave for 'bond', considering in order:
 *
 *   1. the slave whose MAC matches 'bond->active_slave_mac', if it is
 *      enabled;
 *   2. an enabled slave from 'bond->slaves';
 *   3. among slaves with a pending delay and 'may_enable' set, the one whose
 *      'delay_expires' is earliest. */
1806 static struct bond_slave *
1807 bond_choose_slave(const struct bond *bond)
1809 struct bond_slave *slave, *best;
1811 /* Find the last active slave. */
1812 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1813 if (slave && slave->enabled) {
1817 /* Find an enabled slave. */
1818 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1819 if (slave->enabled) {
1824 /* All interfaces are disabled. Find an interface that will be enabled
1825 * after its updelay expires. */
1827 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* A 'delay_expires' of LLONG_MAX means that no delay is pending, so such
 * slaves are not candidates; 'best' tracks the earliest expiry seen. */
1828 if (slave->delay_expires != LLONG_MAX
1829 && slave->may_enable
1830 && (!best || slave->delay_expires < best->delay_expires)) {
/* Re-selects 'bond''s active slave with bond_choose_slave() and applies the
 * side effects of the choice.
 *
 * If a slave is chosen that is not yet enabled, it is enabled immediately
 * and the remainder of its updelay is skipped.  Learning packets are
 * requested, and bond_active_slave_changed() is called when the choice
 * differs from the previous active slave.  If no slave can be chosen and
 * there used to be an active one, "all interfaces disabled" is logged.  Log
 * messages are rate-limited. */
1838 bond_choose_active_slave(struct bond *bond)
1840 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1841 struct bond_slave *old_active_slave = bond->active_slave;
1843 bond->active_slave = bond_choose_slave(bond);
1844 if (bond->active_slave) {
1845 if (bond->active_slave->enabled) {
1846 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1847 bond->name, bond->active_slave->name);
/* The chosen slave is still waiting out its updelay: log the time that was
 * left and enable it right away. */
1849 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1850 "remaining %lld ms updelay (since no interface was "
1851 "enabled)", bond->name, bond->active_slave->name,
1852 bond->active_slave->delay_expires - time_msec());
1853 bond_enable_slave(bond->active_slave, true);
1856 bond->send_learning_packets = true;
1858 if (bond->active_slave != old_active_slave) {
1859 bond_active_slave_changed(bond);
1861 } else if (old_active_slave) {
1862 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1867 * Return true if bond has unstored active slave change.
1868 * If return true, 'mac' will store the bond's current active slave's
1871 bond_get_changed_active_slave(const char *name, struct eth_addr *mac,
1876 ovs_rwlock_wrlock(&rwlock);
1877 bond = bond_find(name);
1879 if (bond->active_slave_changed || force) {
1880 *mac = bond->active_slave_mac;
1881 bond->active_slave_changed = false;
1882 ovs_rwlock_unlock(&rwlock);
1886 ovs_rwlock_unlock(&rwlock);