2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "ofp-actions.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
33 #include "dynamic-string.h"
42 #include "dp-packet.h"
43 #include "poll-loop.h"
49 #include "openvswitch/vlog.h"
51 VLOG_DEFINE_THIS_MODULE(bond);
/* Read-write lock named by the OVS_GUARDED_BY, OVS_REQ_RDLOCK and
 * OVS_REQ_WRLOCK annotations throughout this file. */
53 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
/* Storage for the map of bonds; reached through 'all_bonds' below. */
54 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
/* Map of bonds, keyed by hash_string() of the bond's name (the insertion is
 * done by bond_reconfigure()). */
55 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
57 /* Bit-mask for hashing a flow down to a bucket. */
58 #define BOND_MASK 0xff
/* Number of hash buckets: one for each possible BOND_MASK-masked value. */
59 #define BOND_BUCKETS (BOND_MASK + 1)
61 /* A hash bucket for mapping a flow to a slave.
62 * "struct bond" has an array of BOND_BUCKETS of these. */
64 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
/* Increased by bond_account() and bond_entry_account(); bond_rebalance() sums
 * it per slave. */
65 uint64_t tx_bytes /* Count of bytes recently transmitted. */
66 OVS_GUARDED_BY(rwlock);
67 struct ovs_list list_node; /* In bond_slave's 'entries' list. */
71 * 'pr_rule' is the post-recirculation rule for this entry.
72 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
73 * is used to determine delta (applied to 'tx_bytes' above.) */
/* Overwritten by bond_entry_account() with the latest byte count it was
 * given for 'pr_rule'. */
75 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
78 /* A bond slave, that is, one of the links comprising a bond. */
80 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
81 struct ovs_list list_node; /* In struct bond's enabled_slaves list. */
82 struct bond *bond; /* The bond that contains this slave. */
83 void *aux; /* Client-provided handle for this slave. */
85 struct netdev *netdev; /* Network device, owned by the client. */
/* bond_run() stores the connectivity sequence number here and bond_wait()
 * waits for that sequence to move past it; bond_slave_set_netdev__() resets
 * it to 0 when 'netdev' is replaced. */
86 unsigned int change_seq; /* Tracks changes in 'netdev'. */
87 ofp_port_t ofp_port; /* OpenFlow port number. */
88 char *name; /* Name (a copy of netdev_get_name(netdev)). */
/* bond_slave_register() initializes 'delay_expires' to LLONG_MAX, and
 * bond_wait() only arms a timer when it differs from LLONG_MAX. */
91 long long delay_expires; /* Time after which 'enabled' may change. */
92 bool enabled; /* May be chosen for flows? */
93 bool may_enable; /* Client considers this slave bondable. */
95 /* Rebalancing info. Used only by bond_rebalance(). */
96 struct ovs_list bal_node; /* In bond_rebalance()'s 'bals' list. */
97 struct ovs_list entries; /* 'struct bond_entry's assigned here. */
98 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
101 /* A bond, that is, a set of network devices grouped to improve performance or
 * robustness. */
104 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
105 char *name; /* Name provided by client. */
106 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
113 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
114 * (To prevent the bond_slave from disappearing they must also hold
116 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
117 struct ovs_list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
120 enum bond_mode balance; /* Balancing mode, one of BM_*. */
/* Slave currently treated as active, or NULL.  bond_run() picks a new one
 * when this is NULL or the slave is no longer enabled. */
121 struct bond_slave *active_slave;
122 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
123 enum lacp_status lacp_status; /* Status of LACP negotiations. */
124 bool bond_revalidate; /* True if flows need revalidation. */
125 uint32_t basis; /* Basis for flow hash function. */
127 /* SLB specific bonding info. */
128 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
129 int rebalance_interval; /* Interval between rebalances, in ms. */
130 long long int next_rebalance; /* Next rebalancing time. */
/* Set in bond_slave_unregister(); read and then cleared by
 * bond_should_send_learning_packets(). */
131 bool send_learning_packets;
132 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
133 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
135 /* Store active slave to OVSDB. */
136 bool active_slave_changed; /* Set to true whenever the bond changes
137 active slave. It will be reset to false
138 after it is stored into OVSDB */
140 /* Interface name may not be persistent across an OS reboot, use
141 * MAC address for identifying the active slave */
142 uint8_t active_slave_mac[ETH_ADDR_LEN];
143 /* The MAC address of the active interface. */
144 /* Legacy compatibility. */
145 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
/* Reference count, taken by bond_ref() and dropped by bond_unref(). */
147 struct ovs_refcount ref_cnt;
150 /* What to do with a bond_pr_rule_op. */
152 ADD, /* Add the rule to ofproto's flow table. */
153 DEL, /* Delete the rule from the ofproto's flow table. */
156 /* A rule to add to or delete from ofproto's internal flow table. */
157 struct bond_pr_rule_op {
/* In struct bond's 'pr_rule_ops' hmap, hashed by match_hash() of the match. */
158 struct hmap_node hmap_node;
/* Port that the rule's single OUTPUT action sends to. */
160 ofp_port_t out_ofport;
/* Location that receives the installed rule; update_recirc_rules() stores
 * NULL through it when the rule is deleted. */
162 struct rule **pr_rule;
/* Forward declarations of file-local helpers.  Each OVS_REQ_RDLOCK or
 * OVS_REQ_WRLOCK annotation names the mode in which the caller must already
 * hold 'rwlock'. */
165 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
166 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
167 OVS_REQ_RDLOCK(rwlock);
168 static void bond_enable_slave(struct bond_slave *, bool enable)
169 OVS_REQ_WRLOCK(rwlock);
170 static void bond_link_status_update(struct bond_slave *)
171 OVS_REQ_WRLOCK(rwlock);
172 static void bond_choose_active_slave(struct bond *)
173 OVS_REQ_WRLOCK(rwlock);
174 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
175 uint16_t vlan, uint32_t basis);
176 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
178 static struct bond_entry *lookup_bond_entry(const struct bond *,
181 OVS_REQ_RDLOCK(rwlock);
182 static struct bond_slave *get_enabled_slave(struct bond *)
183 OVS_REQ_RDLOCK(rwlock);
184 static struct bond_slave *choose_output_slave(const struct bond *,
186 struct flow_wildcards *,
188 OVS_REQ_RDLOCK(rwlock);
190 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
191 * stores the mode in '*balance' and returns true. Otherwise returns false
192 * without modifying '*balance'. */
194 bond_mode_from_string(enum bond_mode *balance, const char *s)
/* The candidate names come from bond_mode_to_string(), so the two functions
 * always agree on the spelling of each mode. */
196 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
198 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
200 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
208 /* Returns a string representing 'balance'. */
210 bond_mode_to_string(enum bond_mode balance) {
/* These literals are also the names that bond_mode_from_string() accepts. */
213 return "balance-tcp";
215 return "balance-slb";
217 return "active-backup";
223 /* Creates and returns a new bond whose configuration is initially taken from
 * 's'.
 *
226 * The caller should register each slave on the new bond by calling
227 * bond_slave_register(). */
229 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
/* xzalloc() is relied on to zero the structure, so every field not set below
 * starts out as 0, NULL or false. */
233 bond = xzalloc(sizeof *bond);
234 bond->ofproto = ofproto;
235 hmap_init(&bond->slaves);
236 list_init(&bond->enabled_slaves);
237 ovs_mutex_init(&bond->mutex);
238 ovs_refcount_init(&bond->ref_cnt);
241 hmap_init(&bond->pr_rule_ops);
/* Apply 's'.  bond_reconfigure() is also what inserts the bond into
 * 'all_bonds'. */
243 bond_reconfigure(bond, s);
/* Takes a reference on 'bond_' by incrementing its 'ref_cnt'. */
248 bond_ref(const struct bond *bond_)
/* The const qualifier is cast away because taking a reference has to modify
 * the reference count. */
250 struct bond *bond = CONST_CAST(struct bond *, bond_);
253 ovs_refcount_ref(&bond->ref_cnt);
/* Drops a reference on 'bond' and releases the resources it owns: its entry
 * in 'all_bonds', its slaves, its mutex, its pending post-recirculation rule
 * operations and its recirculation ID. */
260 bond_unref(struct bond *bond)
262 struct bond_slave *slave, *next_slave;
263 struct bond_pr_rule_op *pr_op, *next_op;
/* Guard: 'bond' is NULL, or ovs_refcount_unref_relaxed() did not return 1. */
265 if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) {
/* Unlink from the global map under the write lock. */
269 ovs_rwlock_wrlock(&rwlock);
270 hmap_remove(all_bonds, &bond->hmap_node);
271 ovs_rwlock_unlock(&rwlock);
/* The _SAFE iterator is needed because each node is removed while the map is
 * being walked. */
273 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
274 hmap_remove(&bond->slaves, &slave->hmap_node);
275 /* Client owns 'slave->netdev'. */
279 hmap_destroy(&bond->slaves);
281 ovs_mutex_destroy(&bond->mutex);
285 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
286 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
289 hmap_destroy(&bond->pr_rule_ops);
/* A zero 'recirc_id' means that none was allocated. */
291 if (bond->recirc_id) {
292 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
/* Records in 'bond->pr_rule_ops' that a rule matching 'match' should output
 * to 'out_ofport', with 'rule' as the location of the rule pointer.  An
 * existing op with an equal match has its 'out_ofport' and 'pr_rule'
 * overwritten; new ops are allocated with xmalloc() and inserted under the
 * match's hash. */
299 add_pr_rule(struct bond *bond, const struct match *match,
300 ofp_port_t out_ofport, struct rule **rule)
302 uint32_t hash = match_hash(match, 0);
303 struct bond_pr_rule_op *pr_op;
/* Only ops in the same hash bucket can have an equal match. */
305 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
306 if (match_equal(&pr_op->match, match)) {
308 pr_op->out_ofport = out_ofport;
309 pr_op->pr_rule = rule;
314 pr_op = xmalloc(sizeof *pr_op);
315 pr_op->match = *match;
317 pr_op->out_ofport = out_ofport;
318 pr_op->pr_rule = rule;
319 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
/* Brings the post-recirculation rules in 'bond->ofproto' in line with
 * 'bond->hash': records the wanted rules in 'bond->pr_rule_ops' via
 * add_pr_rule(), then walks the ops and adds or deletes internal flows.
 * Failures are logged with VLOG_ERR. */
323 update_recirc_rules(struct bond *bond)
324 OVS_REQ_WRLOCK(rwlock)
327 struct bond_pr_rule_op *pr_op, *next_op;
/* Stack storage handed to ofpbuf_use_stub() for the action list. */
328 uint64_t ofpacts_stub[128 / 8];
329 struct ofpbuf ofpacts;
332 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
334 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
/* Rules are only wanted when there is a hash table and a recirculation ID. */
338 if (bond->hash && bond->recirc_id) {
339 for (i = 0; i < BOND_BUCKETS; i++) {
340 struct bond_slave *slave = bond->hash[i].slave;
/* One rule per bucket: match this bond's recirc_id plus a dp_hash whose
 * BOND_MASK bits equal the bucket index. */
343 match_init_catchall(&match);
344 match_set_recirc_id(&match, bond->recirc_id);
345 match_set_dp_hash_masked(&match, i, BOND_MASK);
347 add_pr_rule(bond, &match, slave->ofp_port,
348 &bond->hash[i].pr_rule);
353 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
/* Each added rule carries a single OUTPUT action to the op's port. */
357 ofpbuf_clear(&ofpacts);
358 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
359 error = ofproto_dpif_add_internal_flow(bond->ofproto,
361 RECIRC_RULE_PRIORITY, 0,
362 &ofpacts, pr_op->pr_rule);
364 char *err_s = match_to_string(&pr_op->match,
365 RECIRC_RULE_PRIORITY);
367 VLOG_ERR("failed to add post recirculation flow %s", err_s);
373 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
375 RECIRC_RULE_PRIORITY);
377 char *err_s = match_to_string(&pr_op->match,
378 RECIRC_RULE_PRIORITY);
380 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
/* Drop the op and clear the owner's rule pointer. */
384 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
385 *pr_op->pr_rule = NULL;
391 ofpbuf_uninit(&ofpacts);
395 /* Updates 'bond''s overall configuration to 's'.
397 * The caller should register each slave on 'bond' by calling
398 * bond_slave_register(). This is optional if none of the slaves'
399 * configuration has changed. In any case it can't hurt.
401 * Returns true if the configuration has changed in such a way that requires
 * flow revalidation. */
405 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
407 bool revalidate = false;
409 ovs_rwlock_wrlock(&rwlock);
/* A bond with no name yet, or with a different name, is (re)inserted into
 * 'all_bonds' under the hash of the new name. */
410 if (!bond->name || strcmp(bond->name, s->name)) {
412 hmap_remove(all_bonds, &bond->hmap_node);
415 bond->name = xstrdup(s->name);
416 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
419 bond->updelay = s->up_delay;
420 bond->downdelay = s->down_delay;
422 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
423 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
427 if (bond->rebalance_interval != s->rebalance_interval) {
428 bond->rebalance_interval = s->rebalance_interval;
432 if (bond->balance != s->balance) {
433 bond->balance = s->balance;
437 if (bond->basis != s->basis) {
438 bond->basis = s->basis;
/* Consume any revalidation request that was pending on the bond. */
442 if (bond->bond_revalidate) {
444 bond->bond_revalidate = false;
/* Every mode except active-backup holds a recirculation ID; active-backup
 * gives back one it may still hold. */
447 if (bond->balance != BM_AB) {
448 if (!bond->recirc_id) {
449 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
451 } else if (bond->recirc_id) {
452 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
456 if (bond->balance == BM_AB || !bond->hash || revalidate) {
457 bond_entry_reset(bond);
/* The active slave MAC is taken from the settings, so the "changed" flag is
 * cleared along with it. */
460 memcpy(bond->active_slave_mac, s->active_slave_mac,
461 sizeof s->active_slave_mac);
463 bond->active_slave_changed = false;
465 ovs_rwlock_unlock(&rwlock);
/* Searches the slaves of 'bond' for one whose network device reports the
 * Ethernet address 'mac'. */
469 static struct bond_slave *
470 bond_find_slave_by_mac(const struct bond *bond, const uint8_t mac[ETH_ADDR_LEN])
472 struct bond_slave *slave;
474 /* Find the last active slave */
475 HMAP_FOR_EACH(slave, hmap_node, &bond->slaves) {
476 uint8_t slave_mac[ETH_ADDR_LEN];
/* A nonzero return from netdev_get_etheraddr() is handled separately from
 * the comparison below. */
478 if (netdev_get_etheraddr(slave->netdev, slave_mac)) {
/* memcmp() returns 0 when all ETH_ADDR_LEN bytes are equal. */
482 if (!memcmp(slave_mac, mac, sizeof(slave_mac))) {
491 bond_active_slave_changed(struct bond *bond)
493 uint8_t mac[ETH_ADDR_LEN];
495 netdev_get_etheraddr(bond->active_slave->netdev, mac);
496 memcpy(bond->active_slave_mac, mac, sizeof bond->active_slave_mac);
497 bond->active_slave_changed = true;
498 seq_change(connectivity_seq_get());
502 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
503 OVS_REQ_WRLOCK(rwlock)
505 if (slave->netdev != netdev) {
506 slave->netdev = netdev;
507 slave->change_seq = 0;
511 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
512 * arbitrary client-provided pointer that uniquely identifies a slave within a
513 * bond. If 'slave_' already exists within 'bond' then this function
514 * reconfigures the existing slave.
516 * 'netdev' must be the network device that 'slave_' represents. It is owned
517 * by the client, so the client must not close it before either unregistering
518 * 'slave_' or destroying 'bond'. */
521 bond_slave_register(struct bond *bond, void *slave_,
522 ofp_port_t ofport, struct netdev *netdev)
524 struct bond_slave *slave;
526 ovs_rwlock_wrlock(&rwlock);
/* Look for a slave already registered under the handle 'slave_'. */
527 slave = bond_slave_lookup(bond, slave_);
529 slave = xzalloc(sizeof *slave);
/* Slaves are hashed by the client handle's pointer value. */
531 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
534 slave->ofp_port = ofport;
/* LLONG_MAX is the value for which bond_wait() arms no timer. */
535 slave->delay_expires = LLONG_MAX;
536 slave->name = xstrdup(netdev_get_name(netdev));
537 bond->bond_revalidate = true;
/* Start out disabled and let bond_enable_slave() apply the current carrier
 * state of 'netdev'. */
539 slave->enabled = false;
540 bond_enable_slave(slave, netdev_get_carrier(netdev));
543 bond_slave_set_netdev__(slave, netdev);
/* The cached name is refreshed from 'netdev' on every registration. */
546 slave->name = xstrdup(netdev_get_name(netdev));
547 ovs_rwlock_unlock(&rwlock);
550 /* Updates the network device to be used with 'slave_' to 'netdev'.
552 * This is useful if the caller closes and re-opens the network device
553 * registered with bond_slave_register() but doesn't need to change anything
 * else. */
556 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
558 struct bond_slave *slave;
/* Write lock: bond_slave_set_netdev__() requires it and may modify the
 * slave. */
560 ovs_rwlock_wrlock(&rwlock);
561 slave = bond_slave_lookup(bond, slave_);
563 bond_slave_set_netdev__(slave, netdev);
565 ovs_rwlock_unlock(&rwlock);
568 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
569 * then this function has no effect.
571 * Unregistering a slave invalidates all flows. */
573 bond_slave_unregister(struct bond *bond, const void *slave_)
575 struct bond_slave *slave;
578 ovs_rwlock_wrlock(&rwlock);
579 slave = bond_slave_lookup(bond, slave_);
/* Request revalidation and take the slave out of service before it is
 * removed. */
584 bond->bond_revalidate = true;
585 bond_enable_slave(slave, false);
/* Remember whether the departing slave was the active one. */
587 del_active = bond->active_slave == slave;
589 struct bond_entry *e;
/* Visit every hash bucket; BOND_MASK is the index of the last one. */
590 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
591 if (e->slave == slave) {
599 hmap_remove(&bond->slaves, &slave->hmap_node);
600 /* Client owns 'slave->netdev'. */
/* Choose another active slave and ask for learning packets to be sent. */
604 bond_choose_active_slave(bond);
605 bond->send_learning_packets = true;
608 ovs_rwlock_unlock(&rwlock);
611 /* Should be called on each slave in 'bond' before bond_run() to indicate
612 * whether or not 'slave_' may be enabled. This function is intended to allow
613 * other protocols to have some impact on bonding decisions. For example LACP
614 * or high level link monitoring protocols may decide that a given slave should
615 * not be able to send traffic. */
617 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
619 ovs_rwlock_wrlock(&rwlock);
620 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
621 ovs_rwlock_unlock(&rwlock);
624 /* Performs periodic maintenance on 'bond'.
626 * Returns true if the caller should revalidate its flows.
628 * The caller should check bond_should_send_learning_packets() afterward. */
630 bond_run(struct bond *bond, enum lacp_status lacp_status)
632 struct bond_slave *slave;
635 ovs_rwlock_wrlock(&rwlock);
/* A change in LACP status requests revalidation. */
636 if (bond->lacp_status != lacp_status) {
637 bond->lacp_status = lacp_status;
638 bond->bond_revalidate = true;
641 /* Enable slaves based on link status and LACP feedback. */
642 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
643 bond_link_status_update(slave);
/* Record the connectivity sequence number that bond_wait() later waits on. */
644 slave->change_seq = seq_read(connectivity_seq_get());
646 if (!bond->active_slave || !bond->active_slave->enabled) {
647 bond_choose_active_slave(bond);
/* Hand the pending revalidation request to the caller and clear it. */
650 revalidate = bond->bond_revalidate;
651 bond->bond_revalidate = false;
652 ovs_rwlock_unlock(&rwlock);
657 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
659 bond_wait(struct bond *bond)
661 struct bond_slave *slave;
663 ovs_rwlock_rdlock(&rwlock);
664 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* LLONG_MAX is the value bond_slave_register() starts a slave with; any
 * other value arms a timer. */
665 if (slave->delay_expires != LLONG_MAX) {
666 poll_timer_wait_until(slave->delay_expires);
/* Wake when the connectivity sequence moves past the value that bond_run()
 * recorded for this slave. */
669 seq_wait(connectivity_seq_get(), slave->change_seq);
/* A pending revalidation request calls for an immediate wakeup. */
672 if (bond->bond_revalidate) {
673 poll_immediate_wake();
675 ovs_rwlock_unlock(&rwlock);
677 /* We don't wait for bond->next_rebalance because rebalancing can only run
678 * at a flow account checkpoint. ofproto does checkpointing on its own
679 * schedule and bond_rebalance() gets called afterward, so we'd just be
680 * waking up for no purpose. */
683 /* MAC learning table interaction. */
686 may_send_learning_packets(const struct bond *bond)
688 return ((bond->lacp_status == LACP_DISABLED
689 && (bond->balance == BM_SLB || bond->balance == BM_AB))
690 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
691 && bond->active_slave;
694 /* Returns true if 'bond' needs the client to send out packets to assist with
695 * MAC learning on 'bond'. If this function returns true, then the client
696 * should iterate through its MAC learning table for the bridge on which 'bond'
697 * is located. For each MAC that has been learned on a port other than 'bond',
698 * it should call bond_compose_learning_packet().
700 * This function will only return true if 'bond' is in SLB or active-backup
701 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
 * necessary.
 *
704 * Calling this function resets the state that it checks. */
706 bond_should_send_learning_packets(struct bond *bond)
/* Write lock, because the flag is cleared below. */
710 ovs_rwlock_wrlock(&rwlock);
711 send = bond->send_learning_packets && may_send_learning_packets(bond);
712 bond->send_learning_packets = false;
713 ovs_rwlock_unlock(&rwlock);
717 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
719 * See bond_should_send_learning_packets() for description of usage. The
720 * caller should send the composed packet on the port associated with
721 * port_aux and takes ownership of the returned ofpbuf. */
723 bond_compose_learning_packet(struct bond *bond,
724 const uint8_t eth_src[ETH_ADDR_LEN],
725 uint16_t vlan, void **port_aux)
727 struct bond_slave *slave;
728 struct dp_packet *packet;
731 ovs_rwlock_rdlock(&rwlock);
/* Calling this when learning packets may not be sent is a caller bug. */
732 ovs_assert(may_send_learning_packets(bond));
/* Build a flow that carries only the source MAC and let
 * choose_output_slave() pick the slave for it. */
733 memset(&flow, 0, sizeof flow);
734 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
735 slave = choose_output_slave(bond, &flow, NULL, vlan);
/* The packet is a RARP composed from 'eth_src'; eth_push_vlan() adds the
 * VLAN header for 'vlan'. */
737 packet = dp_packet_new(0);
738 compose_rarp(packet, eth_src);
740 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
/* Report the chosen slave's client handle to the caller. */
743 *port_aux = slave->aux;
744 ovs_rwlock_unlock(&rwlock);
748 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
749 * Ethernet destination address of 'eth_dst', should be admitted.
751 * The return value is one of the following:
753 * - BV_ACCEPT: Admit the packet.
755 * - BV_DROP: Drop the packet.
757 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
758 * Ethernet source address and VLAN. If there is none, or if the packet
759 * is on the learned port, then admit the packet. If a different port has
760 * been learned, however, drop the packet (and do not use it for MAC
 * learning). */
764 bond_check_admissibility(struct bond *bond, const void *slave_,
765 const uint8_t eth_dst[ETH_ADDR_LEN])
/* Verdict that stands unless one of the checks below picks another. */
767 enum bond_verdict verdict = BV_DROP;
768 struct bond_slave *slave;
770 ovs_rwlock_rdlock(&rwlock);
771 slave = bond_slave_lookup(bond, slave_);
776 /* LACP bonds have very loose admissibility restrictions because we can
777 * assume the remote switch is aware of the bond and will "do the right
778 * thing". However, as a precaution we drop packets on disabled slaves
779 * because no correctly implemented partner switch should be sending
 * traffic to them.
 *
782 * If LACP is configured, but LACP negotiations have been unsuccessful, we
783 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
784 switch (bond->lacp_status) {
785 case LACP_NEGOTIATED:
786 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
788 case LACP_CONFIGURED:
789 if (!bond->lacp_fallback_ab) {
796 /* Drop all multicast packets on inactive slaves. */
797 if (eth_addr_is_multicast(eth_dst)) {
798 if (bond->active_slave != slave) {
803 switch (bond->balance) {
805 /* TCP balanced bonds require successful LACP negotiations. Based on the
806 * above check, LACP is off or lacp_fallback_ab is true on this bond.
807 * If lacp_fallback_ab is true fall through to BM_AB case else, we
808 * drop all incoming traffic. */
809 if (!bond->lacp_fallback_ab) {
814 /* Drop all packets which arrive on backup slaves. This is similar to
815 * how Linux bonding handles active-backup bonds. */
816 if (bond->active_slave != slave) {
817 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
819 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
820 " slave (%s) destined for " ETH_ADDR_FMT,
821 slave->name, ETH_ADDR_ARGS(eth_dst));
828 /* Drop all packets for which we have learned a different input port,
829 * because we probably sent the packet on one slave and got it back on
830 * the other. Gratuitous ARP packets are an exception to this rule:
831 * the host has moved to another switch. The exception to the
832 * exception is if we locked the learning table to avoid reflections on
834 verdict = BV_DROP_IF_MOVED;
840 ovs_rwlock_unlock(&rwlock);
845 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
846 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
847 * NULL if the packet should be dropped because no slaves are enabled.
849 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
850 * should be a VID only (i.e. excluding the PCP bits). Second,
851 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
852 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
853 * packet belongs to (so for an access port it will be the access port's VLAN).
855 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
856 * significant in the selection. At some point earlier, 'wc' should
857 * have been initialized (e.g., by flow_wildcards_init_catchall()). */
860 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
861 struct flow_wildcards *wc, uint16_t vlan)
863 struct bond_slave *slave;
/* Locking wrapper: the selection itself is done by choose_output_slave()
 * under the read lock. */
866 ovs_rwlock_rdlock(&rwlock);
867 slave = choose_output_slave(bond, flow, wc, vlan);
/* Translate the slave into the client's handle; NULL when no slave was
 * chosen. */
868 aux = slave ? slave->aux : NULL;
869 ovs_rwlock_unlock(&rwlock);
/* Folds the growth of a post-recirculation rule's byte counter into
 * 'entry->tx_bytes'.  'rule_tx_bytes' is the counter's current value; the
 * previous value is kept in 'entry->pr_tx_bytes'. */
876 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
877 OVS_REQ_WRLOCK(rwlock)
/* Bytes seen since the previous call for this entry. */
882 delta = rule_tx_bytes - entry->pr_tx_bytes;
883 entry->tx_bytes += delta;
884 entry->pr_tx_bytes = rule_tx_bytes;
888 /* Maintain bond stats using post recirculation rule byte counters.*/
890 bond_recirculation_account(struct bond *bond)
891 OVS_REQ_WRLOCK(rwlock)
/* Visit every hash bucket; BOND_MASK is the index of the last one. */
895 for (i=0; i<=BOND_MASK; i++) {
896 struct bond_entry *entry = &bond->hash[i];
897 struct rule *rule = entry->pr_rule;
/* The stats callback fills in three values, but only the byte count is
 * used. */
900 uint64_t n_packets OVS_UNUSED;
901 long long int used OVS_UNUSED;
904 rule->ofproto->ofproto_class->rule_get_stats(
905 rule, &n_packets, &n_bytes, &used);
906 bond_entry_account(entry, n_bytes);
/* Checks whether 'bond' is in balance-tcp mode and holds a recirculation ID.
 * In that case the bond's recirculation ID and hash basis are reported
 * through '*recirc_id' and '*hash_bias'.  bond_rebalance() passes NULL for
 * both outputs. */
912 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
915 if (bond->balance == BM_TCP && bond->recirc_id) {
917 *recirc_id = bond->recirc_id;
920 *hash_bias = bond->basis;
/* Gives every hash bucket of 'bond' that has no slave, or a disabled slave,
 * a new slave, and calls update_recirc_rules(). */
929 bond_update_post_recirc_rules__(struct bond* bond, const bool force)
930 OVS_REQ_WRLOCK(rwlock)
932 struct bond_entry *e;
933 bool update_rules = force; /* Always update rules if caller forces it. */
935 /* Make sure all bond entries are populated */
936 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
937 if (!e->slave || !e->slave->enabled) {
/* Try a randomly chosen slave first; if that one is disabled, use the active
 * slave instead. */
939 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
940 struct bond_slave, hmap_node);
941 if (!e->slave->enabled) {
942 e->slave = bond->active_slave;
948 update_recirc_rules(bond);
/* Locking wrapper: runs bond_update_post_recirc_rules__() with 'rwlock' held
 * for writing. */
953 bond_update_post_recirc_rules(struct bond* bond, const bool force)
955 ovs_rwlock_wrlock(&rwlock);
956 bond_update_post_recirc_rules__(bond, force);
957 ovs_rwlock_unlock(&rwlock);
963 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
965 return bond->rebalance_interval
966 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
969 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
971 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
974 ovs_rwlock_wrlock(&rwlock);
/* Byte counts are only kept for bonds that bond_is_balanced() accepts. */
975 if (bond_is_balanced(bond)) {
976 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
978 ovs_rwlock_unlock(&rwlock);
/* Converts 'bal', a pointer to the 'bal_node' member of a bond_slave, back
 * into a pointer to the enclosing bond_slave. */
981 static struct bond_slave *
982 bond_slave_from_bal_node(struct ovs_list *bal) OVS_REQ_RDLOCK(rwlock)
984 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* When debug logging is enabled, logs one line for 'bond' listing each slave
 * in 'bals' with its load in kB, a "(disabled)" marker where applicable, and
 * the hash entries assigned to it. */
988 log_bals(struct bond *bond, const struct ovs_list *bals)
989 OVS_REQ_RDLOCK(rwlock)
/* Skip building the string when it would not be logged. */
991 if (VLOG_IS_DBG_ENABLED()) {
992 struct ds ds = DS_EMPTY_INITIALIZER;
993 const struct bond_slave *slave;
995 LIST_FOR_EACH (slave, bal_node, bals) {
997 ds_put_char(&ds, ',');
999 ds_put_format(&ds, " %s %"PRIu64"kB",
1000 slave->name, slave->tx_bytes / 1024);
1002 if (!slave->enabled) {
1003 ds_put_cstr(&ds, " (disabled)");
1005 if (!list_is_empty(&slave->entries)) {
1006 struct bond_entry *e;
1008 ds_put_cstr(&ds, " (");
1009 LIST_FOR_EACH (e, list_node, &slave->entries) {
/* " + " goes between entries, so it is skipped before the first one. */
1010 if (&e->list_node != list_front(&slave->entries)) {
1011 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' is the entry's index in the hash array. */
1013 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
1014 e - bond->hash, e->tx_bytes / 1024);
1016 ds_put_cstr(&ds, ")");
1019 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
1024 /* Shifts 'hash' from its current slave to 'to'. */
1026 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
1027 OVS_REQ_WRLOCK(rwlock)
1029 struct bond_slave *from = hash->slave;
1030 struct bond *bond = from->bond;
/* The whole load of the entry moves with it. */
1031 uint64_t delta = hash->tx_bytes;
/* The loads in the message are the ones that will apply after the shift. */
1033 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1034 "from %s to %s (now carrying %"PRIu64"kB and "
1035 "%"PRIu64"kB load, respectively)",
1036 bond->name, delta / 1024, hash - bond->hash,
1037 from->name, to->name,
1038 (from->tx_bytes - delta) / 1024,
1039 (to->tx_bytes + delta) / 1024);
1041 /* Shift load away from 'from' to 'to'. */
1042 from->tx_bytes -= delta;
1043 to->tx_bytes += delta;
1045 /* Arrange for flows to be revalidated. */
1047 bond->bond_revalidate = true;
1050 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1051 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1052 * given that doing so must decrease the ratio of the load on the two slaves by
1053 * at least 0.1. Returns NULL if there is no appropriate entry.
1055 * The list of entries isn't sorted. I don't know of a reason to prefer to
1056 * shift away small hashes or large hashes. */
1057 static struct bond_entry *
1058 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1059 OVS_REQ_WRLOCK(rwlock)
1061 struct bond_entry *e;
1063 if (list_is_short(&from->entries)) {
1064 /* 'from' carries no more than one MAC hash, so shifting load away from
1065 * it would be pointless. */
1069 LIST_FOR_EACH (e, list_node, &from->entries) {
1070 double old_ratio, new_ratio;
/* This check comes before the ratios are computed, which keeps
 * 'to_tx_bytes' from being used as a zero divisor below. */
1073 if (to_tx_bytes == 0) {
1074 /* Nothing on the new slave, move it. */
/* Ratio of the load on 'from' to the load on the target slave, before and
 * after moving 'e'. */
1078 delta = e->tx_bytes;
1079 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1080 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1081 if (old_ratio - new_ratio > 0.1
1082 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1083 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1084 and 'to' slave have the same load. Therefore, we only move an
1085 entry if it decreases the load on 'from', and brings us closer
1086 to equal traffic load. */
1094 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
 * maintained. */
1097 insert_bal(struct ovs_list *bals, struct bond_slave *slave)
1099 struct bond_slave *pos;
/* Scan for the first element that carries less load than 'slave'. */
1101 LIST_FOR_EACH (pos, bal_node, bals) {
1102 if (slave->tx_bytes > pos->tx_bytes) {
/* Link 'slave' into the list at the position found above. */
1106 list_insert(&pos->bal_node, &slave->bal_node);
1109 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1110 * that descending order of 'tx_bytes' is maintained. */
1112 reinsert_bal(struct ovs_list *bals, struct bond_slave *slave)
/* Unlink first, then let insert_bal() find the new sorted position. */
1114 list_remove(&slave->bal_node);
1115 insert_bal(bals, slave);
1118 /* If 'bond' needs rebalancing, does so.
1120 * The caller should have called bond_account() for each active flow or, if
1121 * recirculation is used, have called bond_recirculation_account(bond),
1122 * to ensure that flow data is consistently accounted at this point. */
1125 bond_rebalance(struct bond *bond)
1127 struct bond_slave *slave;
1128 struct bond_entry *e;
1129 struct ovs_list bals;
1130 bool rebalanced = false;
1133 ovs_rwlock_wrlock(&rwlock);
/* Applies only to bonds accepted by bond_is_balanced(), and only once the
 * scheduled rebalance time has been reached. */
1134 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1137 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* Recirculation is used only when the datapath has it enabled and the bond
 * itself may recirculate. */
1139 use_recirc = ofproto_dpif_get_enable_recirc(bond->ofproto) &&
1140 bond_may_recirc(bond, NULL, NULL);
1143 bond_recirculation_account(bond);
1146 /* Add each bond_entry to its slave's 'entries' list.
1147 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1148 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1149 slave->tx_bytes = 0;
1150 list_init(&slave->entries);
1152 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
/* Entries with no slave or no recorded traffic are left out. */
1153 if (e->slave && e->tx_bytes) {
1154 e->slave->tx_bytes += e->tx_bytes;
1155 list_push_back(&e->slave->entries, &e->list_node);
1159 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1161 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1162 * with a proper list sort algorithm. */
1164 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1165 if (slave->enabled) {
1166 insert_bal(&bals, slave);
1169 log_bals(bond, &bals);
1171 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1172 while (!list_is_short(&bals)) {
/* 'bals' is kept in descending order, so the front is the most loaded slave
 * and the back the least loaded one. */
1173 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1174 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1177 overload = from->tx_bytes - to->tx_bytes;
1178 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1179 /* The extra load on 'from' (and all less-loaded slaves), compared
1180 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1181 * it is less than ~1Mbps. No point in rebalancing. */
1185 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1186 * to move from 'from' to 'to'. */
1187 e = choose_entry_to_migrate(from, to->tx_bytes);
1189 bond_shift_load(e, to);
1191 /* Delete element from from->entries.
1193 * We don't add the element to to->hashes. That would only allow
1194 * 'e' to be migrated to another slave in this rebalancing run, and
1195 * there is no point in doing that. */
1196 list_remove(&e->list_node);
1198 /* Re-sort 'bals'. */
1199 reinsert_bal(&bals, from);
1200 reinsert_bal(&bals, to);
1203 /* Can't usefully migrate anything away from 'from'.
1204 * Don't reconsider it. */
1205 list_remove(&from->bal_node);
1209 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1210 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1211 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1212 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
/* Post-recirculation rules are refreshed (with 'force' set) only when
 * recirculation is in use and 'rebalanced' is true. */
1216 if (use_recirc && rebalanced) {
1217 bond_update_post_recirc_rules__(bond,true);
1221 ovs_rwlock_unlock(&rwlock);
1224 /* Bonding unixctl user interface functions. */
1226 static struct bond *
1227 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
1231 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1233 if (!strcmp(bond->name, name)) {
1240 static struct bond_slave *
1241 bond_lookup_slave(struct bond *bond, const char *slave_name)
1243 struct bond_slave *slave;
1245 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1246 if (!strcmp(slave->name, slave_name)) {
1254 bond_unixctl_list(struct unixctl_conn *conn,
1255 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1256 void *aux OVS_UNUSED)
1258 struct ds ds = DS_EMPTY_INITIALIZER;
1259 const struct bond *bond;
1261 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
1263 ovs_rwlock_rdlock(&rwlock);
1264 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1265 const struct bond_slave *slave;
1268 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1269 bond_mode_to_string(bond->balance), bond->recirc_id);
1272 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1274 ds_put_cstr(&ds, ", ");
1276 ds_put_cstr(&ds, slave->name);
1278 ds_put_char(&ds, '\n');
1280 ovs_rwlock_unlock(&rwlock);
1281 unixctl_command_reply(conn, ds_cstr(&ds));
1286 bond_print_details(struct ds *ds, const struct bond *bond)
1287 OVS_REQ_RDLOCK(rwlock)
1289 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1290 const struct shash_node **sorted_slaves = NULL;
1291 const struct bond_slave *slave;
1296 ds_put_format(ds, "---- %s ----\n", bond->name);
1297 ds_put_format(ds, "bond_mode: %s\n",
1298 bond_mode_to_string(bond->balance));
1300 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1301 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1302 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1304 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1306 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1307 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1309 if (bond_is_balanced(bond)) {
1310 ds_put_format(ds, "next rebalance: %lld ms\n",
1311 bond->next_rebalance - time_msec());
1314 ds_put_cstr(ds, "lacp_status: ");
1315 switch (bond->lacp_status) {
1316 case LACP_NEGOTIATED:
1317 ds_put_cstr(ds, "negotiated\n");
1319 case LACP_CONFIGURED:
1320 ds_put_cstr(ds, "configured\n");
1323 ds_put_cstr(ds, "off\n");
1326 ds_put_cstr(ds, "<unknown>\n");
1330 ds_put_cstr(ds, "active slave mac: ");
1331 ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_slave_mac));
1332 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1333 ds_put_format(ds,"(%s)\n", slave ? slave->name : "none");
1335 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1336 shash_add(&slave_shash, slave->name, slave);
1338 sorted_slaves = shash_sort(&slave_shash);
1340 for (i = 0; i < shash_count(&slave_shash); i++) {
1341 struct bond_entry *be;
1343 slave = sorted_slaves[i]->data;
1346 ds_put_format(ds, "\nslave %s: %s\n",
1347 slave->name, slave->enabled ? "enabled" : "disabled");
1348 if (slave == bond->active_slave) {
1349 ds_put_cstr(ds, "\tactive slave\n");
1351 if (slave->delay_expires != LLONG_MAX) {
1352 ds_put_format(ds, "\t%s expires in %lld ms\n",
1353 slave->enabled ? "downdelay" : "updelay",
1354 slave->delay_expires - time_msec());
1357 ds_put_format(ds, "\tmay_enable: %s\n",
1358 slave->may_enable ? "true" : "false");
1360 if (!bond_is_balanced(bond)) {
1365 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1366 int hash = be - bond->hash;
1369 if (be->slave != slave) {
1373 be_tx_k = be->tx_bytes / 1024;
1375 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1379 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1382 shash_destroy(&slave_shash);
1383 free(sorted_slaves);
1384 ds_put_cstr(ds, "\n");
1388 bond_unixctl_show(struct unixctl_conn *conn,
1389 int argc, const char *argv[],
1390 void *aux OVS_UNUSED)
1392 struct ds ds = DS_EMPTY_INITIALIZER;
1394 ovs_rwlock_rdlock(&rwlock);
1396 const struct bond *bond = bond_find(argv[1]);
1399 unixctl_command_reply_error(conn, "no such bond");
1402 bond_print_details(&ds, bond);
1404 const struct bond *bond;
1406 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1407 bond_print_details(&ds, bond);
1411 unixctl_command_reply(conn, ds_cstr(&ds));
1415 ovs_rwlock_unlock(&rwlock);
1419 bond_unixctl_migrate(struct unixctl_conn *conn,
1420 int argc OVS_UNUSED, const char *argv[],
1421 void *aux OVS_UNUSED)
1423 const char *bond_s = argv[1];
1424 const char *hash_s = argv[2];
1425 const char *slave_s = argv[3];
1427 struct bond_slave *slave;
1428 struct bond_entry *entry;
1431 ovs_rwlock_wrlock(&rwlock);
1432 bond = bond_find(bond_s);
1434 unixctl_command_reply_error(conn, "no such bond");
1438 if (bond->balance != BM_SLB) {
1439 unixctl_command_reply_error(conn, "not an SLB bond");
1443 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1444 hash = atoi(hash_s) & BOND_MASK;
1446 unixctl_command_reply_error(conn, "bad hash");
1450 slave = bond_lookup_slave(bond, slave_s);
1452 unixctl_command_reply_error(conn, "no such slave");
1456 if (!slave->enabled) {
1457 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1461 entry = &bond->hash[hash];
1462 bond->bond_revalidate = true;
1463 entry->slave = slave;
1464 unixctl_command_reply(conn, "migrated");
1467 ovs_rwlock_unlock(&rwlock);
1471 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1472 int argc OVS_UNUSED, const char *argv[],
1473 void *aux OVS_UNUSED)
1475 const char *bond_s = argv[1];
1476 const char *slave_s = argv[2];
1478 struct bond_slave *slave;
1480 ovs_rwlock_wrlock(&rwlock);
1481 bond = bond_find(bond_s);
1483 unixctl_command_reply_error(conn, "no such bond");
1487 slave = bond_lookup_slave(bond, slave_s);
1489 unixctl_command_reply_error(conn, "no such slave");
1493 if (!slave->enabled) {
1494 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1498 if (bond->active_slave != slave) {
1499 bond->bond_revalidate = true;
1500 bond->active_slave = slave;
1501 VLOG_INFO("bond %s: active interface is now %s",
1502 bond->name, slave->name);
1503 bond->send_learning_packets = true;
1504 unixctl_command_reply(conn, "done");
1505 bond_active_slave_changed(bond);
1507 unixctl_command_reply(conn, "no change");
1510 ovs_rwlock_unlock(&rwlock);
1514 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1516 const char *bond_s = argv[1];
1517 const char *slave_s = argv[2];
1519 struct bond_slave *slave;
1521 ovs_rwlock_wrlock(&rwlock);
1522 bond = bond_find(bond_s);
1524 unixctl_command_reply_error(conn, "no such bond");
1528 slave = bond_lookup_slave(bond, slave_s);
1530 unixctl_command_reply_error(conn, "no such slave");
1534 bond_enable_slave(slave, enable);
1535 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1538 ovs_rwlock_unlock(&rwlock);
1542 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1543 int argc OVS_UNUSED, const char *argv[],
1544 void *aux OVS_UNUSED)
1546 enable_slave(conn, argv, true);
1550 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1551 int argc OVS_UNUSED, const char *argv[],
1552 void *aux OVS_UNUSED)
1554 enable_slave(conn, argv, false);
1558 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1559 void *aux OVS_UNUSED)
1561 const char *mac_s = argv[1];
1562 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1563 const char *basis_s = argc > 3 ? argv[3] : NULL;
1564 uint8_t mac[ETH_ADDR_LEN];
1571 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1572 unixctl_command_reply_error(conn, "invalid vlan");
1580 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1581 unixctl_command_reply_error(conn, "invalid basis");
1588 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1589 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1591 hash_cstr = xasprintf("%u", hash);
1592 unixctl_command_reply(conn, hash_cstr);
1595 unixctl_command_reply_error(conn, "invalid mac");
1602 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1603 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1605 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1606 bond_unixctl_migrate, NULL);
1607 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1608 bond_unixctl_set_active_slave, NULL);
1609 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1610 bond_unixctl_enable_slave, NULL);
1611 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1612 bond_unixctl_disable_slave, NULL);
1613 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1614 bond_unixctl_hash, NULL);
1618 bond_entry_reset(struct bond *bond)
1620 if (bond->balance != BM_AB) {
1621 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1624 bond->hash = xmalloc(hash_len);
1626 memset(bond->hash, 0, hash_len);
1628 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1635 static struct bond_slave *
1636 bond_slave_lookup(struct bond *bond, const void *slave_)
1638 struct bond_slave *slave;
1640 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1642 if (slave->aux == slave_) {
1651 bond_enable_slave(struct bond_slave *slave, bool enable)
1653 slave->delay_expires = LLONG_MAX;
1654 if (enable != slave->enabled) {
1655 slave->bond->bond_revalidate = true;
1656 slave->enabled = enable;
1658 ovs_mutex_lock(&slave->bond->mutex);
1660 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1662 list_remove(&slave->list_node);
1664 ovs_mutex_unlock(&slave->bond->mutex);
1666 VLOG_INFO("interface %s: %s", slave->name,
1667 slave->enabled ? "enabled" : "disabled");
1672 bond_link_status_update(struct bond_slave *slave)
1674 struct bond *bond = slave->bond;
1677 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1678 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1679 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1680 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1681 slave->name, up ? "up" : "down");
1682 if (up == slave->enabled) {
1683 slave->delay_expires = LLONG_MAX;
1684 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1685 slave->name, up ? "disabled" : "enabled");
1687 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1688 : up ? bond->updelay : bond->downdelay);
1689 slave->delay_expires = time_msec() + delay;
1691 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1694 up ? "enabled" : "disabled",
1701 if (time_msec() >= slave->delay_expires) {
1702 bond_enable_slave(slave, up);
1707 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1709 return hash_mac(mac, vlan, basis);
1713 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1715 struct flow hash_flow = *flow;
1716 hash_flow.vlan_tci = htons(vlan);
1718 /* The symmetric quality of this hash function is not required, but
1719 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1720 * purposes, so we use it out of convenience. */
1721 return flow_hash_symmetric_l4(&hash_flow, basis);
1725 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1727 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1729 return (bond->balance == BM_TCP
1730 ? bond_hash_tcp(flow, vlan, bond->basis)
1731 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1734 static struct bond_entry *
1735 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1738 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1741 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1742 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1744 static struct bond_slave *
1745 get_enabled_slave(struct bond *bond)
1747 struct ovs_list *node;
1749 ovs_mutex_lock(&bond->mutex);
1750 if (list_is_empty(&bond->enabled_slaves)) {
1751 ovs_mutex_unlock(&bond->mutex);
1755 node = list_pop_front(&bond->enabled_slaves);
1756 list_push_back(&bond->enabled_slaves, node);
1757 ovs_mutex_unlock(&bond->mutex);
1759 return CONTAINER_OF(node, struct bond_slave, list_node);
1762 static struct bond_slave *
1763 choose_output_slave(const struct bond *bond, const struct flow *flow,
1764 struct flow_wildcards *wc, uint16_t vlan)
1766 struct bond_entry *e;
1769 balance = bond->balance;
1770 if (bond->lacp_status == LACP_CONFIGURED) {
1771 /* LACP has been configured on this bond but negotiations were
1772 * unsuccussful. If lacp_fallback_ab is enabled use active-
1773 * backup mode else drop all traffic. */
1774 if (!bond->lacp_fallback_ab) {
1782 return bond->active_slave;
1785 if (bond->lacp_status != LACP_NEGOTIATED) {
1786 /* Must have LACP negotiations for TCP balanced bonds. */
1790 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1795 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1797 e = lookup_bond_entry(bond, flow, vlan);
1798 if (!e->slave || !e->slave->enabled) {
1799 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
1808 static struct bond_slave *
1809 bond_choose_slave(const struct bond *bond)
1811 struct bond_slave *slave, *best;
1813 /* Find the last active slave. */
1814 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1815 if (slave && slave->enabled) {
1819 /* Find an enabled slave. */
1820 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1821 if (slave->enabled) {
1826 /* All interfaces are disabled. Find an interface that will be enabled
1827 * after its updelay expires. */
1829 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1830 if (slave->delay_expires != LLONG_MAX
1831 && slave->may_enable
1832 && (!best || slave->delay_expires < best->delay_expires)) {
1840 bond_choose_active_slave(struct bond *bond)
1842 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1843 struct bond_slave *old_active_slave = bond->active_slave;
1845 bond->active_slave = bond_choose_slave(bond);
1846 if (bond->active_slave) {
1847 if (bond->active_slave->enabled) {
1848 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1849 bond->name, bond->active_slave->name);
1851 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1852 "remaining %lld ms updelay (since no interface was "
1853 "enabled)", bond->name, bond->active_slave->name,
1854 bond->active_slave->delay_expires - time_msec());
1855 bond_enable_slave(bond->active_slave, true);
1858 bond->send_learning_packets = true;
1860 if (bond->active_slave != old_active_slave) {
1861 bond_active_slave_changed(bond);
1863 } else if (old_active_slave) {
1864 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1869 * Return true if bond has unstored active slave change.
1870 * If return true, 'mac' will store the bond's current active slave's
1873 bond_get_changed_active_slave(const char *name, uint8_t* mac, bool force)
1877 ovs_rwlock_wrlock(&rwlock);
1878 bond = bond_find(name);
1880 if (bond->active_slave_changed || force) {
1881 memcpy(mac, bond->active_slave_mac, ETH_ADDR_LEN);
1882 bond->active_slave_changed = false;
1883 ovs_rwlock_unlock(&rwlock);
1887 ovs_rwlock_unlock(&rwlock);