2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "ofp-actions.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
33 #include "dynamic-string.h"
42 #include "poll-loop.h"
50 VLOG_DEFINE_THIS_MODULE(bond);
/* Global read/write lock.  It guards 'all_bonds' and everything else in this
 * file that is annotated with OVS_GUARDED_BY(rwlock), OVS_REQ_RDLOCK(rwlock)
 * or OVS_REQ_WRLOCK(rwlock). */
52 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
/* Storage for the map of all bonds.  The rest of the file reaches it through
 * the lock-annotated 'all_bonds' pointer below. */
53 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
56 /* Bit-mask for hashing a flow down to a bucket. */
57 #define BOND_MASK 0xff
/* Number of hash buckets per bond: one for each value of the masked hash. */
58 #define BOND_BUCKETS (BOND_MASK + 1)
60 /* A hash bucket for mapping a flow to a slave.
61 * "struct bond" has an array of BOND_BUCKETS of these. */
63 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
/* 'tx_bytes' is increased by bond_account() and by bond_entry_account(). */
64 uint64_t tx_bytes /* Count of bytes recently transmitted. */
65 OVS_GUARDED_BY(rwlock);
66 struct list list_node; /* In bond_slave's 'entries' list. */
70 * 'pr_rule' is the post-recirculation rule for this entry.
71 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
72 * is used to determine delta (applied to 'tx_bytes' above.) */
/* Baseline byte count: bond_entry_account() adds the difference between a new
 * reading and this value to 'tx_bytes', then stores the new reading here. */
74 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
77 /* A bond slave, that is, one of the links comprising a bond. */
79 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
80 struct list list_node; /* In struct bond's enabled_slaves list. */
81 struct bond *bond; /* The bond that contains this slave. */
82 void *aux; /* Client-provided handle for this slave. */
84 struct netdev *netdev; /* Network device, owned by the client. */
/* 'change_seq' holds the connectivity sequence number last read by
 * bond_run(); bond_slave_set_netdev__() zeroes it when the netdev changes. */
85 unsigned int change_seq; /* Tracks changes in 'netdev'. */
86 ofp_port_t ofp_port; /* OpenFlow port number. */
87 char *name; /* Name (a copy of netdev_get_name(netdev)). */
/* 'delay_expires' is LLONG_MAX while no timer is pending; bond_wait() only
 * arms a poll timer for other values. */
90 long long delay_expires; /* Time after which 'enabled' may change. */
91 bool enabled; /* May be chosen for flows? */
92 bool may_enable; /* Client considers this slave bondable. */
94 /* Rebalancing info. Used only by bond_rebalance(). */
95 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
96 struct list entries; /* 'struct bond_entry's assigned here. */
97 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
100 /* A bond, that is, a set of network devices grouped to improve performance or
103 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
104 char *name; /* Name provided by client. */
105 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
112 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
113 * (To prevent the bond_slave from disappearing they must also hold
115 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
116 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
119 enum bond_mode balance; /* Balancing mode, one of BM_*. */
/* Slave currently selected as active, or NULL if there is none.  bond_run()
 * picks a new one when this is NULL or the slave is no longer enabled. */
120 struct bond_slave *active_slave;
121 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
122 enum lacp_status lacp_status; /* Status of LACP negotiations. */
123 bool bond_revalidate; /* True if flows need revalidation. */
124 uint32_t basis; /* Basis for flow hash function. */
126 /* SLB specific bonding info. */
127 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
128 int rebalance_interval; /* Interval between rebalances, in ms. */
129 long long int next_rebalance; /* Next rebalancing time. */
/* Request flag for learning packets.  It is set, for example, by
 * bond_slave_unregister() and is read and cleared by
 * bond_should_send_learning_packets(). */
130 bool send_learning_packets;
131 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
132 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
134 /* Legacy compatibility. */
135 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
/* Reference count, manipulated by bond_ref() and bond_unref(). */
137 struct ovs_refcount ref_cnt;
140 /* What to do with a bond_recirc_rule.
 *
 * NOTE(review): no 'bond_recirc_rule' type is visible in this file; the
 * operation presumably applies to struct bond_pr_rule_op -- confirm. */
142 ADD, /* Add the rule to ofproto's flow table. */
143 DEL, /* Delete the rule from ofproto's flow table. */
146 /* A rule to add to or delete from ofproto's internal flow table. */
147 struct bond_pr_rule_op {
/* Node in struct bond's 'pr_rule_ops' hmap, hashed by match_hash() of the
 * rule's match (see add_pr_rule()). */
148 struct hmap_node hmap_node;
/* Port that the rule's single OUTPUT action sends to. */
150 ofp_port_t out_ofport;
/* Location that receives the installed rule; update_recirc_rules() stores
 * NULL through it once the op has been removed.  add_pr_rule() callers pass
 * the address of a bond_entry's 'pr_rule'. */
152 struct rule **pr_rule;
/* Forward declarations of file-local helpers.  The OVS_REQ_RDLOCK(rwlock) and
 * OVS_REQ_WRLOCK(rwlock) annotations name the mode in which the caller is
 * expected to hold 'rwlock'. */
155 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
156 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
157 OVS_REQ_RDLOCK(rwlock);
158 static void bond_enable_slave(struct bond_slave *, bool enable)
159 OVS_REQ_WRLOCK(rwlock);
160 static void bond_link_status_update(struct bond_slave *)
161 OVS_REQ_WRLOCK(rwlock);
162 static void bond_choose_active_slave(struct bond *)
163 OVS_REQ_WRLOCK(rwlock);
164 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
165 uint16_t vlan, uint32_t basis);
166 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
168 static struct bond_entry *lookup_bond_entry(const struct bond *,
171 OVS_REQ_RDLOCK(rwlock);
172 static struct bond_slave *get_enabled_slave(struct bond *)
173 OVS_REQ_RDLOCK(rwlock);
174 static struct bond_slave *choose_output_slave(const struct bond *,
176 struct flow_wildcards *,
178 OVS_REQ_RDLOCK(rwlock);
180 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
181 * stores the mode in '*balance' and returns true. Otherwise returns false
182 * without modifying '*balance'. */
184 bond_mode_from_string(enum bond_mode *balance, const char *s)
/* Each candidate name is taken from bond_mode_to_string(), so parsing accepts
 * the same spellings that printing produces. */
186 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
188 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
190 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
198 /* Returns a string representing 'balance'. */
/* The results are string literals, so the caller must neither modify nor free
 * them. */
200 bond_mode_to_string(enum bond_mode balance) {
203 return "balance-tcp";
205 return "balance-slb";
207 return "active-backup";
213 /* Creates and returns a new bond whose configuration is initially taken from
216 * The caller should register each slave on the new bond by calling
217 * bond_slave_register(). */
219 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
223 bond = xzalloc(sizeof *bond);
224 bond->ofproto = ofproto;
/* Initialize the containers, the mutex and the reference count before the
 * bond becomes reachable from 'all_bonds'. */
225 hmap_init(&bond->slaves);
226 list_init(&bond->enabled_slaves);
227 ovs_mutex_init(&bond->mutex);
228 ovs_refcount_init(&bond->ref_cnt);
231 hmap_init(&bond->pr_rule_ops);
/* Apply the initial settings.  bond_reconfigure() is also what inserts the
 * bond into 'all_bonds' (presumably because no name has been set yet, so its
 * name-change path runs). */
233 bond_reconfigure(bond, s);
/* Takes an additional reference to 'bond_'. */
238 bond_ref(const struct bond *bond_)
/* The const is cast away because taking a reference modifies 'ref_cnt'. */
240 struct bond *bond = CONST_CAST(struct bond *, bond_);
243 ovs_refcount_ref(&bond->ref_cnt);
/* Drops one reference to 'bond' and tears the bond down when the teardown
 * condition below is met.  'bond' may be NULL. */
250 bond_unref(struct bond *bond)
252 struct bond_slave *slave, *next_slave;
253 struct bond_pr_rule_op *pr_op, *next_op;
/* NOTE(review): the body of this check is not visible here; presumably it
 * returns early, so teardown proceeds only for a non-NULL bond whose unref
 * call returned 1. */
255 if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) {
/* Take the bond out of the global map while holding the write lock. */
259 ovs_rwlock_wrlock(&rwlock);
260 hmap_remove(all_bonds, &bond->hmap_node);
261 ovs_rwlock_unlock(&rwlock);
/* Detach every slave.  The _SAFE iterator is needed because the current node
 * is removed inside the loop. */
263 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
264 hmap_remove(&bond->slaves, &slave->hmap_node);
265 /* Client owns 'slave->netdev'. */
269 hmap_destroy(&bond->slaves);
271 ovs_mutex_destroy(&bond->mutex);
/* Discard any pending post-recirculation rule operations. */
275 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
276 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
279 hmap_destroy(&bond->pr_rule_ops);
/* Give back the recirculation ID if one was allocated. */
281 if (bond->recirc_id) {
282 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
/* Records in bond->pr_rule_ops that a rule matching 'match' should output to
 * 'out_ofport', with 'rule' as the location tied to the installed rule.
 *
 * An op whose match equals 'match' has its 'out_ofport' and 'pr_rule'
 * overwritten; a new op is allocated and inserted under match_hash(match, 0).
 * NOTE(review): the early return after the update is not visible here. */
289 add_pr_rule(struct bond *bond, const struct match *match,
290 ofp_port_t out_ofport, struct rule **rule)
292 uint32_t hash = match_hash(match, 0);
293 struct bond_pr_rule_op *pr_op;
/* The hash only narrows the search; match_equal() confirms the hit. */
295 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
296 if (match_equal(&pr_op->match, match)) {
298 pr_op->out_ofport = out_ofport;
299 pr_op->pr_rule = rule;
304 pr_op = xmalloc(sizeof *pr_op);
305 pr_op->match = *match;
307 pr_op->out_ofport = out_ofport;
308 pr_op->pr_rule = rule;
309 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
/* Reprograms ofproto's internal flow table with the post-recirculation rules
 * of 'bond'.
 *
 * Works in passes over bond->pr_rule_ops: the ops left from earlier calls are
 * visited first (presumably to mark them for deletion -- that line is not
 * visible here), then add_pr_rule() records one op per hash bucket, and
 * finally every op is applied with ofproto_dpif_add_internal_flow() or
 * ofproto_dpif_delete_internal_flow().  Failures are logged with VLOG_ERR. */
313 update_recirc_rules(struct bond *bond)
316 struct bond_pr_rule_op *pr_op, *next_op;
317 uint64_t ofpacts_stub[128 / 8];
318 struct ofpbuf ofpacts;
321 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
323 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
/* Rules are only wanted while the bond has both a hash table and a
 * recirculation ID. */
327 if (bond->hash && bond->recirc_id) {
328 for (i = 0; i < BOND_BUCKETS; i++) {
329 struct bond_slave *slave = bond->hash[i].slave;
/* Match packets carrying this bond's recirculation ID whose dp_hash, masked
 * with BOND_MASK, equals bucket index 'i'. */
332 match_init_catchall(&match);
333 match_set_recirc_id(&match, bond->recirc_id);
334 match_set_dp_hash_masked(&match, i, BOND_MASK);
336 add_pr_rule(bond, &match, slave->ofp_port,
337 &bond->hash[i].pr_rule);
342 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
/* Each added rule carries a single OUTPUT action to the op's port. */
346 ofpbuf_clear(&ofpacts);
347 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
348 error = ofproto_dpif_add_internal_flow(bond->ofproto,
350 RECIRC_RULE_PRIORITY, 0,
351 &ofpacts, pr_op->pr_rule);
353 char *err_s = match_to_string(&pr_op->match,
354 RECIRC_RULE_PRIORITY);
356 VLOG_ERR("failed to add post recirculation flow %s", err_s);
362 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
364 RECIRC_RULE_PRIORITY);
366 char *err_s = match_to_string(&pr_op->match,
367 RECIRC_RULE_PRIORITY);
369 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
/* The op leaves the map and the rule pointer it referred to is cleared. */
373 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
374 *pr_op->pr_rule = NULL;
380 ofpbuf_uninit(&ofpacts);
384 /* Updates 'bond''s overall configuration to 's'.
386 * The caller should register each slave on 'bond' by calling
387 * bond_slave_register(). This is optional if none of the slaves'
388 * configuration has changed. In any case it can't hurt.
390 * Returns true if the configuration has changed in such a way that requires
394 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
396 bool revalidate = false;
398 ovs_rwlock_wrlock(&rwlock);
/* The name is the bond's key in 'all_bonds', so a new or changed name means
 * re-inserting the bond under the hash of the new name. */
399 if (!bond->name || strcmp(bond->name, s->name)) {
401 hmap_remove(all_bonds, &bond->hmap_node);
404 bond->name = xstrdup(s->name);
405 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
408 bond->updelay = s->up_delay;
409 bond->downdelay = s->down_delay;
/* The settings below are copied only when they differ from the current
 * values.  NOTE(review): what else each branch does (for example setting
 * 'revalidate') is not visible in this excerpt. */
411 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
412 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
416 if (bond->rebalance_interval != s->rebalance_interval) {
417 bond->rebalance_interval = s->rebalance_interval;
421 if (bond->balance != s->balance) {
422 bond->balance = s->balance;
426 if (bond->basis != s->basis) {
427 bond->basis = s->basis;
/* Consume a revalidation request left pending on the bond. */
431 if (bond->bond_revalidate) {
433 bond->bond_revalidate = false;
/* Bonds in any mode other than active-backup hold a recirculation ID;
 * active-backup bonds release theirs. */
436 if (bond->balance != BM_AB) {
437 if (!bond->recirc_id) {
438 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
440 } else if (bond->recirc_id) {
441 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
/* Reset the hash entries when the bond is active-backup, has no hash table
 * yet, or 'revalidate' was set above. */
445 if (bond->balance == BM_AB || !bond->hash || revalidate) {
446 bond_entry_reset(bond);
449 ovs_rwlock_unlock(&rwlock);
/* Points 'slave' at 'netdev'.  Does nothing if 'slave' already uses 'netdev'.
 * The caller must hold 'rwlock' for writing. */
454 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
455 OVS_REQ_WRLOCK(rwlock)
457 if (slave->netdev != netdev) {
458 slave->netdev = netdev;
/* NOTE(review): zeroing presumably makes the seq_wait() in bond_wait() fire
 * promptly so that the new device's link state is examined -- confirm. */
459 slave->change_seq = 0;
463 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
464 * arbitrary client-provided pointer that uniquely identifies a slave within a
465 * bond. If 'slave_' already exists within 'bond' then this function
466 * reconfigures the existing slave.
468 * 'netdev' must be the network device that 'slave_' represents. It is owned
469 * by the client, so the client must not close it before either unregistering
470 * 'slave_' or destroying 'bond'.
473 bond_slave_register(struct bond *bond, void *slave_,
474 ofp_port_t ofport, struct netdev *netdev)
476 struct bond_slave *slave;
478 ovs_rwlock_wrlock(&rwlock);
479 slave = bond_slave_lookup(bond, slave_);
/* New-slave path: allocate the slave and index it in bond->slaves by the
 * hash of the client's handle.  NOTE(review): the guard that restricts this
 * path to the case where the lookup failed is not visible here. */
481 slave = xzalloc(sizeof *slave);
483 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
486 slave->ofp_port = ofport;
/* LLONG_MAX: no updelay/downdelay timer is pending yet. */
487 slave->delay_expires = LLONG_MAX;
488 slave->name = xstrdup(netdev_get_name(netdev));
489 bond->bond_revalidate = true;
/* Start disabled, then enable according to the netdev's carrier state. */
491 slave->enabled = false;
492 bond_enable_slave(slave, netdev_get_carrier(netdev));
495 bond_slave_set_netdev__(slave, netdev);
/* Refresh the cached name from the netdev.  NOTE(review): the previous copy
 * must be freed before this assignment; that line is not visible here. */
498 slave->name = xstrdup(netdev_get_name(netdev));
499 ovs_rwlock_unlock(&rwlock);
502 /* Updates the network device to be used with 'slave_' to 'netdev'.
504 * This is useful if the caller closes and re-opens the network device
505 * registered with bond_slave_register() but doesn't need to change anything
508 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
510 struct bond_slave *slave;
/* The write lock is held across lookup and update because
 * bond_slave_set_netdev__() is annotated OVS_REQ_WRLOCK(rwlock). */
512 ovs_rwlock_wrlock(&rwlock);
513 slave = bond_slave_lookup(bond, slave_);
515 bond_slave_set_netdev__(slave, netdev);
517 ovs_rwlock_unlock(&rwlock);
520 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
521 * then this function has no effect.
523 * Unregistering a slave invalidates all flows. */
525 bond_slave_unregister(struct bond *bond, const void *slave_)
527 struct bond_slave *slave;
530 ovs_rwlock_wrlock(&rwlock);
531 slave = bond_slave_lookup(bond, slave_);
/* Removing a slave changes where flows may go, so request revalidation and
 * disable the slave before unlinking it. */
536 bond->bond_revalidate = true;
537 bond_enable_slave(slave, false);
/* Remember whether the departing slave was the active one. */
539 del_active = bond->active_slave == slave;
541 struct bond_entry *e;
/* Visit every hash bucket that still refers to the departing slave.
 * NOTE(review): the action taken on a hit is not visible here; presumably
 * the bucket is unassigned. */
542 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
543 if (e->slave == slave) {
551 hmap_remove(&bond->slaves, &slave->hmap_node);
552 /* Client owns 'slave->netdev'. */
/* Choose a replacement active slave and ask the client to send learning
 * packets.  NOTE(review): presumably conditional on 'del_active'; the guard
 * is not visible here. */
556 bond_choose_active_slave(bond);
557 bond->send_learning_packets = true;
560 ovs_rwlock_unlock(&rwlock);
563 /* Should be called on each slave in 'bond' before bond_run() to indicate
564 * whether or not 'slave_' may be enabled. This function is intended to allow
565 * other protocols to have some impact on bonding decisions. For example LACP
566 * or high level link monitoring protocols may decide that a given slave should
567 * not be able to send traffic. */
569 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
571 ovs_rwlock_wrlock(&rwlock);
/* NOTE(review): the lookup result is dereferenced without a check, so
 * 'slave_' must already be registered on 'bond'.  Other callers of
 * bond_slave_lookup() in this file allow for a failed lookup. */
572 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
573 ovs_rwlock_unlock(&rwlock);
576 /* Performs periodic maintenance on 'bond'.
578 * Returns true if the caller should revalidate its flows.
580 * The caller should check bond_should_send_learning_packets() afterward. */
582 bond_run(struct bond *bond, enum lacp_status lacp_status)
584 struct bond_slave *slave;
587 ovs_rwlock_wrlock(&rwlock);
/* A change in LACP status is recorded and flags the flows for
 * revalidation. */
588 if (bond->lacp_status != lacp_status) {
589 bond->lacp_status = lacp_status;
590 bond->bond_revalidate = true;
593 /* Enable slaves based on link status and LACP feedback. */
594 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
595 bond_link_status_update(slave);
/* Record the connectivity sequence number seen by this pass; bond_wait()
 * waits for it to change. */
596 slave->change_seq = seq_read(connectivity_seq_get());
/* Pick a new active slave when there is none or it is no longer enabled. */
598 if (!bond->active_slave || !bond->active_slave->enabled) {
599 bond_choose_active_slave(bond);
/* Read and clear the pending revalidation request. */
602 revalidate = bond->bond_revalidate;
603 bond->bond_revalidate = false;
604 ovs_rwlock_unlock(&rwlock);
609 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
611 bond_wait(struct bond *bond)
613 struct bond_slave *slave;
615 ovs_rwlock_rdlock(&rwlock);
616 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* Wake when this slave's pending delay timer expires; LLONG_MAX means that
 * no timer is pending. */
617 if (slave->delay_expires != LLONG_MAX) {
618 poll_timer_wait_until(slave->delay_expires);
/* Wake when the connectivity sequence number moves past the value that
 * bond_run() recorded for this slave. */
621 seq_wait(connectivity_seq_get(), slave->change_seq);
/* A pending revalidation request must be handled without sleeping. */
624 if (bond->bond_revalidate) {
625 poll_immediate_wake();
627 ovs_rwlock_unlock(&rwlock);
629 /* We don't wait for bond->next_rebalance because rebalancing can only run
630 * at a flow account checkpoint. ofproto does checkpointing on its own
631 * schedule and bond_rebalance() gets called afterward, so we'd just be
632 * waking up for no purpose. */
635 /* MAC learning table interaction. */
/* Returns true if sending learning packets is appropriate for 'bond': the
 * bond has an active slave and either LACP is disabled on an SLB or
 * active-backup bond, or LACP is in the LACP_CONFIGURED state with
 * active-backup fallback enabled. */
638 may_send_learning_packets(const struct bond *bond)
640 return ((bond->lacp_status == LACP_DISABLED
641 && (bond->balance == BM_SLB || bond->balance == BM_AB))
642 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
643 && bond->active_slave;
646 /* Returns true if 'bond' needs the client to send out packets to assist with
647 * MAC learning on 'bond'. If this function returns true, then the client
648 * should iterate through its MAC learning table for the bridge on which 'bond'
649 * is located. For each MAC that has been learned on a port other than 'bond',
650 * it should call bond_compose_learning_packet().
652 * This function will only return true if 'bond' is in SLB or active-backup
653 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
656 * Calling this function resets the state that it checks. */
658 bond_should_send_learning_packets(struct bond *bond)
/* The write lock is needed because the request flag is cleared below. */
662 ovs_rwlock_wrlock(&rwlock);
/* Read-and-clear: the request is dropped even when
 * may_send_learning_packets() turns it down. */
663 send = bond->send_learning_packets && may_send_learning_packets(bond);
664 bond->send_learning_packets = false;
665 ovs_rwlock_unlock(&rwlock);
669 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
671 * See bond_should_send_learning_packets() for description of usage. The
672 * caller should send the composed packet on the port associated with
673 * port_aux and takes ownership of the returned ofpbuf. */
675 bond_compose_learning_packet(struct bond *bond,
676 const uint8_t eth_src[ETH_ADDR_LEN],
677 uint16_t vlan, void **port_aux)
679 struct bond_slave *slave;
680 struct ofpbuf *packet;
683 ovs_rwlock_rdlock(&rwlock);
/* Callers are expected to have checked
 * bond_should_send_learning_packets() first. */
684 ovs_assert(may_send_learning_packets(bond));
/* Build a flow that is zero apart from the source MAC and let the normal
 * output selection choose the slave for it. */
685 memset(&flow, 0, sizeof flow);
686 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
687 slave = choose_output_slave(bond, &flow, NULL, vlan);
/* The learning packet is a RARP frame sourced from 'eth_src'. */
689 packet = ofpbuf_new(0);
690 compose_rarp(packet, eth_src);
/* NOTE(review): the VLAN tag is presumably pushed only for a nonzero 'vlan';
 * the guard is not visible here. */
692 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
/* NOTE(review): 'slave' is dereferenced without a visible NULL check --
 * confirm that choose_output_slave() cannot return NULL once the assertion
 * above has passed. */
695 *port_aux = slave->aux;
696 ovs_rwlock_unlock(&rwlock);
700 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
701 * Ethernet destination address of 'eth_dst', should be admitted.
703 * The return value is one of the following:
705 * - BV_ACCEPT: Admit the packet.
707 * - BV_DROP: Drop the packet.
709 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
710 * Ethernet source address and VLAN. If there is none, or if the packet
711 * is on the learned port, then admit the packet. If a different port has
712 * been learned, however, drop the packet (and do not use it for MAC
716 bond_check_admissibility(struct bond *bond, const void *slave_,
717 const uint8_t eth_dst[ETH_ADDR_LEN])
/* The verdict starts out as BV_DROP; only the checks below can change it. */
719 enum bond_verdict verdict = BV_DROP;
720 struct bond_slave *slave;
722 ovs_rwlock_rdlock(&rwlock);
723 slave = bond_slave_lookup(bond, slave_);
728 /* LACP bonds have very loose admissibility restrictions because we can
729 * assume the remote switch is aware of the bond and will "do the right
730 * thing". However, as a precaution we drop packets on disabled slaves
731 * because no correctly implemented partner switch should be sending
734 * If LACP is configured, but LACP negotiations have been unsuccessful, we
735 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
736 switch (bond->lacp_status) {
737 case LACP_NEGOTIATED:
738 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
740 case LACP_CONFIGURED:
741 if (!bond->lacp_fallback_ab) {
748 /* Drop all multicast packets on inactive slaves. */
749 if (eth_addr_is_multicast(eth_dst)) {
750 if (bond->active_slave != slave) {
/* What remains is decided by the balancing mode. */
755 switch (bond->balance) {
757 /* TCP balanced bonds require successful LACP negotiations. Based on the
758 * above check, LACP is off or lacp_fallback_ab is true on this bond.
759 * If lacp_fallback_ab is true fall through to BM_AB case else, we
760 * drop all incoming traffic. */
761 if (!bond->lacp_fallback_ab) {
766 /* Drop all packets which arrive on backup slaves. This is similar to
767 * how Linux bonding handles active-backup bonds. */
768 if (bond->active_slave != slave) {
/* Rate-limited so that a stream of such packets cannot flood the log. */
769 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
771 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
772 " slave (%s) destined for " ETH_ADDR_FMT,
773 slave->name, ETH_ADDR_ARGS(eth_dst));
780 /* Drop all packets for which we have learned a different input port,
781 * because we probably sent the packet on one slave and got it back on
782 * the other. Gratuitous ARP packets are an exception to this rule:
783 * the host has moved to another switch. The exception to the
784 * exception is if we locked the learning table to avoid reflections on
786 verdict = BV_DROP_IF_MOVED;
792 ovs_rwlock_unlock(&rwlock);
797 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
798 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
799 * NULL if the packet should be dropped because no slaves are enabled.
801 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
802 * should be a VID only (i.e. excluding the PCP bits). Second,
803 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
804 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
805 * packet belongs to (so for an access port it will be the access port's VLAN).
807 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
808 * significant in the selection. At some point earlier, 'wc' should
809 * have been initialized (e.g., by flow_wildcards_init_catchall()).
812 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
813 struct flow_wildcards *wc, uint16_t vlan)
815 struct bond_slave *slave;
818 ovs_rwlock_rdlock(&rwlock);
819 slave = choose_output_slave(bond, flow, wc, vlan);
/* Translate the slave into the client's handle before the lock is released;
 * NULL is passed through when no slave was chosen. */
820 aux = slave ? slave->aux : NULL;
821 ovs_rwlock_unlock(&rwlock);
/* Folds a new reading of the byte counter of 'entry''s post-recirculation
 * rule into the entry's statistics: the increase since the previous reading is
 * added to 'tx_bytes' and 'rule_tx_bytes' becomes the new baseline.
 *
 * NOTE(review): any guard around these statements is not visible here. */
828 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
829 OVS_REQ_WRLOCK(rwlock)
834 delta = rule_tx_bytes - entry->pr_tx_bytes;
835 entry->tx_bytes += delta;
836 entry->pr_tx_bytes = rule_tx_bytes;
840 /* Maintain bond stats using post recirculation rule byte counters.*/
842 bond_recirculation_account(struct bond *bond)
843 OVS_REQ_WRLOCK(rwlock)
/* Walk all BOND_BUCKETS hash entries. */
847 for (i=0; i<=BOND_MASK; i++) {
848 struct bond_entry *entry = &bond->hash[i];
849 struct rule *rule = entry->pr_rule;
/* Only the byte count is consumed; the packet count and last-used time are
 * fetched because rule_get_stats() takes all three output arguments. */
852 uint64_t n_packets OVS_UNUSED;
853 long long int used OVS_UNUSED;
856 rule->ofproto->ofproto_class->rule_get_stats(
857 rule, &n_packets, &n_bytes, &used);
858 bond_entry_account(entry, n_bytes);
/* Reports whether 'bond' can use recirculation, which requires balance-tcp
 * mode and an allocated recirculation ID.  In that case the ID is stored in
 * '*recirc_id' and the hash basis in '*hash_bias'.
 *
 * NOTE(review): callers in this file pass NULL for these pointers and use the
 * result as a boolean, so the stores are presumably NULL-guarded and the
 * function presumably returns true/false on lines not visible here. */
864 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
867 if (bond->balance == BM_TCP && bond->recirc_id) {
869 *recirc_id = bond->recirc_id;
872 *hash_bias = bond->basis;
/* Makes sure that every hash bucket of 'bond' points at a usable slave and
 * then refreshes the post-recirculation rules through update_recirc_rules().
 * 'force' requests a rule update even if no bucket had to be reassigned. */
881 bond_update_post_recirc_rules(struct bond* bond, const bool force)
883 struct bond_entry *e;
884 bool update_rules = force; /* Always update rules if caller forces it. */
886 /* Make sure all bond entries are populated */
887 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
888 if (!e->slave || !e->slave->enabled) {
/* Try a random slave of the bond first; if that one is not enabled, fall
 * back to the active slave. */
890 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
891 struct bond_slave, hmap_node);
892 if (!e->slave->enabled) {
893 e->slave = bond->active_slave;
/* NOTE(review): presumably guarded by 'update_rules'; the guard is not
 * visible here. */
899 update_recirc_rules(bond);
/* Returns true if 'bond' takes part in rebalancing: its rebalance interval is
 * nonzero and its mode is balance-slb or balance-tcp. */
906 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
908 return bond->rebalance_interval
909 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
912 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
914 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
917 ovs_rwlock_wrlock(&rwlock);
/* Only balanced bonds track per-bucket byte counts; for other bonds the call
 * has no effect. */
918 if (bond_is_balanced(bond)) {
919 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
921 ovs_rwlock_unlock(&rwlock);
/* Converts 'bal', a 'bal_node' member embedded in a struct bond_slave, back
 * into a pointer to that slave. */
924 static struct bond_slave *
925 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
927 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* Logs, at debug level, the slaves on the 'bals' list of 'bond': each slave's
 * name and load in kB, a "(disabled)" marker where applicable, and the load of
 * each hash entry on the slave's 'entries' list. */
931 log_bals(struct bond *bond, const struct list *bals)
932 OVS_REQ_RDLOCK(rwlock)
/* Skip building the string entirely unless debug logging is enabled. */
934 if (VLOG_IS_DBG_ENABLED()) {
935 struct ds ds = DS_EMPTY_INITIALIZER;
936 const struct bond_slave *slave;
938 LIST_FOR_EACH (slave, bal_node, bals) {
940 ds_put_char(&ds, ',');
942 ds_put_format(&ds, " %s %"PRIu64"kB",
943 slave->name, slave->tx_bytes / 1024);
945 if (!slave->enabled) {
946 ds_put_cstr(&ds, " (disabled)");
948 if (!list_is_empty(&slave->entries)) {
949 struct bond_entry *e;
951 ds_put_cstr(&ds, " (");
952 LIST_FOR_EACH (e, list_node, &slave->entries) {
/* Put " + " between entries, that is, before all but the first. */
953 if (&e->list_node != list_front(&slave->entries)) {
954 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' is the entry's index in the bond's hash table. */
956 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
957 e - bond->hash, e->tx_bytes / 1024);
959 ds_put_cstr(&ds, ")");
962 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
967 /* Shifts 'hash' from its current slave to 'to'. */
969 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
970 OVS_REQ_WRLOCK(rwlock)
/* 'hash' must currently be assigned to a slave: that slave is dereferenced to
 * find the bond. */
972 struct bond_slave *from = hash->slave;
973 struct bond *bond = from->bond;
974 uint64_t delta = hash->tx_bytes;
/* The log message reports the loads as they will be after the shift. */
976 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
977 "from %s to %s (now carrying %"PRIu64"kB and "
978 "%"PRIu64"kB load, respectively)",
979 bond->name, delta / 1024, hash - bond->hash,
980 from->name, to->name,
981 (from->tx_bytes - delta) / 1024,
982 (to->tx_bytes + delta) / 1024);
984 /* Shift load away from 'from' to 'to'. */
985 from->tx_bytes -= delta;
986 to->tx_bytes += delta;
988 /* Arrange for flows to be revalidated. */
990 bond->bond_revalidate = true;
993 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
994 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
995 * given that doing so must decrease the ratio of the load on the two slaves by
996 * at least 0.1. Returns NULL if there is no appropriate entry.
998 * The list of entries isn't sorted. I don't know of a reason to prefer to
999 * shift away small hashes or large hashes. */
1000 static struct bond_entry *
1001 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1002 OVS_REQ_WRLOCK(rwlock)
1004 struct bond_entry *e;
1006 if (list_is_short(&from->entries)) {
1007 /* 'from' carries no more than one MAC hash, so shifting load away from
1008 * it would be pointless. */
1012 LIST_FOR_EACH (e, list_node, &from->entries) {
1013 double old_ratio, new_ratio;
1016 if (to_tx_bytes == 0) {
1017 /* Nothing on the new slave, move it. */
/* The zero case is handled just above, so 'to_tx_bytes' is used as a divisor
 * below on the assumption that it is nonzero. */
1021 delta = e->tx_bytes;
1022 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1023 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1024 if (old_ratio - new_ratio > 0.1
1025 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1026 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1027 and 'to' slave have the same load. Therefore, we only move an
1028 entry if it decreases the load on 'from', and brings us closer
1029 to equal traffic load. */
1037 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
1040 insert_bal(struct list *bals, struct bond_slave *slave)
1042 struct bond_slave *pos;
/* Look for the first listed slave that carries less load than 'slave'.
 * NOTE(review): the loop presumably stops there with a 'break' that is not
 * visible here, so that list_insert() places 'slave' at that position. */
1044 LIST_FOR_EACH (pos, bal_node, bals) {
1045 if (slave->tx_bytes > pos->tx_bytes) {
1049 list_insert(&pos->bal_node, &slave->bal_node);
1052 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1053 * that descending order of 'tx_bytes' is maintained. */
1055 reinsert_bal(struct list *bals, struct bond_slave *slave)
/* Two steps: unlink 'slave', then let insert_bal() find the position that
 * matches its current 'tx_bytes'. */
1057 list_remove(&slave->bal_node);
1058 insert_bal(bals, slave);
1061 /* If 'bond' needs rebalancing, does so.
1063 * The caller should have called bond_account() for each active flow or, if
1064 * recirculation is used, bond_recirculation_account(bond),
1065 * to ensure that flow data is consistently accounted at this point.
1068 bond_rebalance(struct bond *bond)
1070 struct bond_slave *slave;
1071 struct bond_entry *e;
1073 bool rebalanced = false;
1076 ovs_rwlock_wrlock(&rwlock);
1077 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1080 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1082 use_recirc = ofproto_dpif_get_enable_recirc(bond->ofproto) &&
1083 bond_may_recirc(bond, NULL, NULL);
1086 bond_recirculation_account(bond);
1089 /* Add each bond_entry to its slave's 'entries' list.
1090 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1091 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1092 slave->tx_bytes = 0;
1093 list_init(&slave->entries);
/* Entries without a slave or without any recorded traffic are left out. */
1095 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1096 if (e->slave && e->tx_bytes) {
1097 e->slave->tx_bytes += e->tx_bytes;
1098 list_push_back(&e->slave->entries, &e->list_node);
1102 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1104 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1105 * with a proper list sort algorithm. */
1107 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1108 if (slave->enabled) {
1109 insert_bal(&bals, slave);
1112 log_bals(bond, &bals);
1114 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1115 while (!list_is_short(&bals)) {
/* 'bals' is kept in descending order, so the front is the most loaded slave
 * and the back is the least loaded one. */
1116 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1117 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1120 overload = from->tx_bytes - to->tx_bytes;
1121 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1122 /* The extra load on 'from' (and all less-loaded slaves), compared
1123 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1124 * it is less than ~1Mbps. No point in rebalancing. */
1128 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1129 * to move from 'from' to 'to'. */
1130 e = choose_entry_to_migrate(from, to->tx_bytes);
1132 bond_shift_load(e, to);
1134 /* Delete element from from->entries.
1136 * We don't add the element to to->entries. That would only allow
1137 * 'e' to be migrated to another slave in this rebalancing run, and
1138 * there is no point in doing that. */
1139 list_remove(&e->list_node);
1141 /* Re-sort 'bals'. */
1142 reinsert_bal(&bals, from);
1143 reinsert_bal(&bals, to);
1146 /* Can't usefully migrate anything away from 'from'.
1147 * Don't reconsider it. */
1148 list_remove(&from->bal_node);
1152 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1153 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1154 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1155 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
/* With recirculation in use, the post-recirculation rules must be rewritten
 * after entries have been moved between slaves. */
1159 if (use_recirc && rebalanced) {
1160 bond_update_post_recirc_rules(bond,true);
1164 ovs_rwlock_unlock(&rwlock);
1167 /* Bonding unixctl user interface functions. */
/* Looks up the bond named 'name' in 'all_bonds'.  The caller must hold
 * 'rwlock'. */
1169 static struct bond *
1170 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
/* The hash only selects candidates; strcmp() confirms the actual match. */
1174 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1176 if (!strcmp(bond->name, name)) {
/* Searches the slaves of 'bond' for one whose name equals 'slave_name'.  This
 * is a linear scan because bond->slaves is keyed by the client's handle, not
 * by name. */
1183 static struct bond_slave *
1184 bond_lookup_slave(struct bond *bond, const char *slave_name)
1186 struct bond_slave *slave;
1188 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1189 if (!strcmp(slave->name, slave_name)) {
/* unixctl handler that replies with a tab-separated table of all bonds: one
 * header line, then one line per bond with its name, mode, recirculation ID
 * and a comma-separated list of slave names. */
1197 bond_unixctl_list(struct unixctl_conn *conn,
1198 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1199 void *aux OVS_UNUSED)
1201 struct ds ds = DS_EMPTY_INITIALIZER;
1202 const struct bond *bond;
1204 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
/* The read lock is held only while the reply text is being built. */
1206 ovs_rwlock_rdlock(&rwlock);
1207 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1208 const struct bond_slave *slave;
/* NOTE(review): 'recirc_id' is a uint32_t but is printed with "%d"; PRIu32 is
 * used for 'basis' elsewhere in this file. */
1211 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1212 bond_mode_to_string(bond->balance), bond->recirc_id);
1215 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1217 ds_put_cstr(&ds, ", ");
1219 ds_put_cstr(&ds, slave->name);
1221 ds_put_char(&ds, '\n');
1223 ovs_rwlock_unlock(&rwlock);
1224 unixctl_command_reply(conn, ds_cstr(&ds));
/* Appends a human-readable description of 'bond' to 'ds': its mode,
 * recirculation status, hash basis, delays, LACP status and, for each slave,
 * its state and (for balanced bonds) the load of each hash assigned to it. */
1229 bond_print_details(struct ds *ds, const struct bond *bond)
1230 OVS_REQ_RDLOCK(rwlock)
1232 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1233 const struct shash_node **sorted_slaves = NULL;
1234 const struct bond_slave *slave;
1239 ds_put_format(ds, "---- %s ----\n", bond->name);
1240 ds_put_format(ds, "bond_mode: %s\n",
1241 bond_mode_to_string(bond->balance));
/* NOTE(review): bond_may_recirc() takes a uint32_t pointer, so 'recirc_id' is
 * unsigned here while the format below is "%d" with -1 as the fallback --
 * confirm that this is intended. */
1243 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1244 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1245 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1247 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1249 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1250 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
/* The time until the next rebalance is shown only for balanced bonds. */
1252 if (bond_is_balanced(bond)) {
1253 ds_put_format(ds, "next rebalance: %lld ms\n",
1254 bond->next_rebalance - time_msec());
1257 ds_put_cstr(ds, "lacp_status: ");
1258 switch (bond->lacp_status) {
1259 case LACP_NEGOTIATED:
1260 ds_put_cstr(ds, "negotiated\n");
1262 case LACP_CONFIGURED:
1263 ds_put_cstr(ds, "configured\n");
1266 ds_put_cstr(ds, "off\n");
1269 ds_put_cstr(ds, "<unknown>\n");
/* Collect the slaves in a name-keyed shash and print them in the order that
 * shash_sort() returns, instead of in hmap iteration order. */
1273 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1274 shash_add(&slave_shash, slave->name, slave);
1276 sorted_slaves = shash_sort(&slave_shash);
1278 for (i = 0; i < shash_count(&slave_shash); i++) {
1279 struct bond_entry *be;
1281 slave = sorted_slaves[i]->data;
1284 ds_put_format(ds, "\nslave %s: %s\n",
1285 slave->name, slave->enabled ? "enabled" : "disabled");
1286 if (slave == bond->active_slave) {
1287 ds_put_cstr(ds, "\tactive slave\n");
/* A pending timer is a downdelay for an enabled slave and an updelay for a
 * disabled one. */
1289 if (slave->delay_expires != LLONG_MAX) {
1290 ds_put_format(ds, "\t%s expires in %lld ms\n",
1291 slave->enabled ? "downdelay" : "updelay",
1292 slave->delay_expires - time_msec());
1295 ds_put_format(ds, "\tmay_enable: %s\n",
1296 slave->may_enable ? "true" : "false");
1298 if (!bond_is_balanced(bond)) {
/* List the hash buckets currently assigned to this slave. */
1303 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1304 int hash = be - bond->hash;
1307 if (be->slave != slave) {
1311 be_tx_k = be->tx_bytes / 1024;
1313 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1317 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
/* The shash holds only borrowed pointers to the slaves; the sorted array
 * returned by shash_sort() is freed separately. */
1320 shash_destroy(&slave_shash);
1321 free(sorted_slaves);
1322 ds_put_cstr(ds, "\n");
1326 bond_unixctl_show(struct unixctl_conn *conn,
1327 int argc, const char *argv[],
1328 void *aux OVS_UNUSED)
1330 struct ds ds = DS_EMPTY_INITIALIZER;
1332 ovs_rwlock_rdlock(&rwlock);
1334 const struct bond *bond = bond_find(argv[1]);
1337 unixctl_command_reply_error(conn, "no such bond");
1340 bond_print_details(&ds, bond);
1342 const struct bond *bond;
1344 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1345 bond_print_details(&ds, bond);
1349 unixctl_command_reply(conn, ds_cstr(&ds));
1353 ovs_rwlock_unlock(&rwlock);
1357 bond_unixctl_migrate(struct unixctl_conn *conn,
1358 int argc OVS_UNUSED, const char *argv[],
1359 void *aux OVS_UNUSED)
1361 const char *bond_s = argv[1];
1362 const char *hash_s = argv[2];
1363 const char *slave_s = argv[3];
1365 struct bond_slave *slave;
1366 struct bond_entry *entry;
1369 ovs_rwlock_wrlock(&rwlock);
1370 bond = bond_find(bond_s);
1372 unixctl_command_reply_error(conn, "no such bond");
1376 if (bond->balance != BM_SLB) {
1377 unixctl_command_reply_error(conn, "not an SLB bond");
1381 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1382 hash = atoi(hash_s) & BOND_MASK;
1384 unixctl_command_reply_error(conn, "bad hash");
1388 slave = bond_lookup_slave(bond, slave_s);
1390 unixctl_command_reply_error(conn, "no such slave");
1394 if (!slave->enabled) {
1395 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1399 entry = &bond->hash[hash];
1400 bond->bond_revalidate = true;
1401 entry->slave = slave;
1402 unixctl_command_reply(conn, "migrated");
1405 ovs_rwlock_unlock(&rwlock);
1409 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1410 int argc OVS_UNUSED, const char *argv[],
1411 void *aux OVS_UNUSED)
1413 const char *bond_s = argv[1];
1414 const char *slave_s = argv[2];
1416 struct bond_slave *slave;
1418 ovs_rwlock_wrlock(&rwlock);
1419 bond = bond_find(bond_s);
1421 unixctl_command_reply_error(conn, "no such bond");
1425 slave = bond_lookup_slave(bond, slave_s);
1427 unixctl_command_reply_error(conn, "no such slave");
1431 if (!slave->enabled) {
1432 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1436 if (bond->active_slave != slave) {
1437 bond->bond_revalidate = true;
1438 bond->active_slave = slave;
1439 VLOG_INFO("bond %s: active interface is now %s",
1440 bond->name, slave->name);
1441 bond->send_learning_packets = true;
1442 unixctl_command_reply(conn, "done");
1444 unixctl_command_reply(conn, "no change");
1447 ovs_rwlock_unlock(&rwlock);
1451 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1453 const char *bond_s = argv[1];
1454 const char *slave_s = argv[2];
1456 struct bond_slave *slave;
1458 ovs_rwlock_wrlock(&rwlock);
1459 bond = bond_find(bond_s);
1461 unixctl_command_reply_error(conn, "no such bond");
1465 slave = bond_lookup_slave(bond, slave_s);
1467 unixctl_command_reply_error(conn, "no such slave");
1471 bond_enable_slave(slave, enable);
1472 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1475 ovs_rwlock_unlock(&rwlock);
1479 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1480 int argc OVS_UNUSED, const char *argv[],
1481 void *aux OVS_UNUSED)
1483 enable_slave(conn, argv, true);
1487 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1488 int argc OVS_UNUSED, const char *argv[],
1489 void *aux OVS_UNUSED)
1491 enable_slave(conn, argv, false);
1495 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1496 void *aux OVS_UNUSED)
1498 const char *mac_s = argv[1];
1499 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1500 const char *basis_s = argc > 3 ? argv[3] : NULL;
1501 uint8_t mac[ETH_ADDR_LEN];
1508 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1509 unixctl_command_reply_error(conn, "invalid vlan");
1517 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1518 unixctl_command_reply_error(conn, "invalid basis");
1525 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1526 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1528 hash_cstr = xasprintf("%u", hash);
1529 unixctl_command_reply(conn, hash_cstr);
1532 unixctl_command_reply_error(conn, "invalid mac");
1539 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1540 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1542 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1543 bond_unixctl_migrate, NULL);
1544 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1545 bond_unixctl_set_active_slave, NULL);
1546 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1547 bond_unixctl_enable_slave, NULL);
1548 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1549 bond_unixctl_disable_slave, NULL);
1550 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1551 bond_unixctl_hash, NULL);
1555 bond_entry_reset(struct bond *bond)
1557 if (bond->balance != BM_AB) {
1558 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1561 bond->hash = xmalloc(hash_len);
1563 memset(bond->hash, 0, hash_len);
1565 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1572 static struct bond_slave *
1573 bond_slave_lookup(struct bond *bond, const void *slave_)
1575 struct bond_slave *slave;
1577 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1579 if (slave->aux == slave_) {
1588 bond_enable_slave(struct bond_slave *slave, bool enable)
1590 slave->delay_expires = LLONG_MAX;
1591 if (enable != slave->enabled) {
1592 slave->bond->bond_revalidate = true;
1593 slave->enabled = enable;
1595 ovs_mutex_lock(&slave->bond->mutex);
1597 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1599 list_remove(&slave->list_node);
1601 ovs_mutex_unlock(&slave->bond->mutex);
1603 VLOG_INFO("interface %s: %s", slave->name,
1604 slave->enabled ? "enabled" : "disabled");
1609 bond_link_status_update(struct bond_slave *slave)
1611 struct bond *bond = slave->bond;
1614 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1615 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1616 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1617 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1618 slave->name, up ? "up" : "down");
1619 if (up == slave->enabled) {
1620 slave->delay_expires = LLONG_MAX;
1621 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1622 slave->name, up ? "disabled" : "enabled");
1624 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1625 : up ? bond->updelay : bond->downdelay);
1626 slave->delay_expires = time_msec() + delay;
1628 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1631 up ? "enabled" : "disabled",
1638 if (time_msec() >= slave->delay_expires) {
1639 bond_enable_slave(slave, up);
1644 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1646 return hash_mac(mac, vlan, basis);
1650 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1652 struct flow hash_flow = *flow;
1653 hash_flow.vlan_tci = htons(vlan);
1655 /* The symmetric quality of this hash function is not required, but
1656 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1657 * purposes, so we use it out of convenience. */
1658 return flow_hash_symmetric_l4(&hash_flow, basis);
1662 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1664 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1666 return (bond->balance == BM_TCP
1667 ? bond_hash_tcp(flow, vlan, bond->basis)
1668 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1671 static struct bond_entry *
1672 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1675 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1678 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1679 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1681 static struct bond_slave *
1682 get_enabled_slave(struct bond *bond)
1686 ovs_mutex_lock(&bond->mutex);
1687 if (list_is_empty(&bond->enabled_slaves)) {
1688 ovs_mutex_unlock(&bond->mutex);
1692 node = list_pop_front(&bond->enabled_slaves);
1693 list_push_back(&bond->enabled_slaves, node);
1694 ovs_mutex_unlock(&bond->mutex);
1696 return CONTAINER_OF(node, struct bond_slave, list_node);
1699 static struct bond_slave *
1700 choose_output_slave(const struct bond *bond, const struct flow *flow,
1701 struct flow_wildcards *wc, uint16_t vlan)
1703 struct bond_entry *e;
1706 balance = bond->balance;
1707 if (bond->lacp_status == LACP_CONFIGURED) {
1708 /* LACP has been configured on this bond but negotiations were
1709 * unsuccussful. If lacp_fallback_ab is enabled use active-
1710 * backup mode else drop all traffic. */
1711 if (!bond->lacp_fallback_ab) {
1719 return bond->active_slave;
1722 if (bond->lacp_status != LACP_NEGOTIATED) {
1723 /* Must have LACP negotiations for TCP balanced bonds. */
1727 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1732 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1734 e = lookup_bond_entry(bond, flow, vlan);
1735 if (!e->slave || !e->slave->enabled) {
1736 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
1745 static struct bond_slave *
1746 bond_choose_slave(const struct bond *bond)
1748 struct bond_slave *slave, *best;
1750 /* Find an enabled slave. */
1751 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1752 if (slave->enabled) {
1757 /* All interfaces are disabled. Find an interface that will be enabled
1758 * after its updelay expires. */
1760 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1761 if (slave->delay_expires != LLONG_MAX
1762 && slave->may_enable
1763 && (!best || slave->delay_expires < best->delay_expires)) {
1771 bond_choose_active_slave(struct bond *bond)
1773 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1774 struct bond_slave *old_active_slave = bond->active_slave;
1776 bond->active_slave = bond_choose_slave(bond);
1777 if (bond->active_slave) {
1778 if (bond->active_slave->enabled) {
1779 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1780 bond->name, bond->active_slave->name);
1782 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1783 "remaining %lld ms updelay (since no interface was "
1784 "enabled)", bond->name, bond->active_slave->name,
1785 bond->active_slave->delay_expires - time_msec());
1786 bond_enable_slave(bond->active_slave, true);
1789 bond->send_learning_packets = true;
1790 } else if (old_active_slave) {
1791 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);