2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "ofp-actions.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
33 #include "dynamic-string.h"
42 #include "poll-loop.h"
50 VLOG_DEFINE_THIS_MODULE(bond);
/* Module-wide lock. It guards 'all_bonds' and every field or function in
 * this file that is annotated with OVS_GUARDED_BY(rwlock),
 * OVS_REQ_RDLOCK(rwlock) or OVS_REQ_WRLOCK(rwlock). */
52 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
/* Storage for the set of all bonds. The visible uses in this file go through
 * the lock-annotated pointer 'all_bonds'; bond_reconfigure() inserts bonds
 * keyed by hash_string(name, 0). */
53 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
56 /* Bit-mask for hashing a flow down to a bucket. */
57 #define BOND_MASK 0xff
/* Number of hash buckets: one per value that fits under BOND_MASK. */
58 #define BOND_BUCKETS (BOND_MASK + 1)
59 #define RECIRC_RULE_PRIORITY 20 /* Priority level for internal rules */
61 /* A hash bucket for mapping a flow to a slave.
62 * "struct bond" has an array of BOND_BUCKETS of these. */
64 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
65 uint64_t tx_bytes /* Count of bytes recently transmitted. */
66 OVS_GUARDED_BY(rwlock);
67 struct list list_node; /* In bond_slave's 'entries' list. */
71 * 'pr_rule' is the post-recirculation rule for this entry.
72 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
73 * is used to determine delta (applied to 'tx_bytes' above.) */
75 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
78 /* A bond slave, that is, one of the links comprising a bond. */
80 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
81 struct list list_node; /* In struct bond's enabled_slaves list. */
82 struct bond *bond; /* The bond that contains this slave. */
83 void *aux; /* Client-provided handle for this slave. */
85 struct netdev *netdev; /* Network device, owned by the client. */
86 unsigned int change_seq; /* Tracks changes in 'netdev': bond_run() stores
                            * the connectivity seq value here, bond_wait()
                            * waits on it, and it is reset to 0 when
                            * 'netdev' is replaced. */
87 ofp_port_t ofp_port; /* OpenFlow port number. */
88 char *name; /* Name (a copy of netdev_get_name(netdev)). */
91 long long delay_expires; /* Time after which 'enabled' may change.
                            * LLONG_MAX means bond_wait() sets no timer
                            * for this slave. */
92 bool enabled; /* May be chosen for flows? */
93 bool may_enable; /* Client considers this slave bondable. */
95 /* Rebalancing info. Used only by bond_rebalance(). */
96 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
97 struct list entries; /* 'struct bond_entry's assigned here. */
98 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
101 /* A bond, that is, a set of network devices grouped to improve performance or
104 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
105 char *name; /* Name provided by client. */
106 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
113 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
114 * (To prevent the bond_slave from disappearing they must also hold
116 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
117 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
120 enum bond_mode balance; /* Balancing mode, one of BM_*. */
121 struct bond_slave *active_slave;
122 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
123 enum lacp_status lacp_status; /* Status of LACP negotiations. */
124 bool bond_revalidate; /* True if flows need revalidation. */
125 uint32_t basis; /* Basis for flow hash function. */
127 /* SLB specific bonding info. */
128 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
129 int rebalance_interval; /* Interval between rebalances, in ms. */
130 long long int next_rebalance; /* Next rebalancing time. */
131 bool send_learning_packets;
132 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
133 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
135 /* Store active slave to OVSDB. */
136 bool active_slave_changed; /* Set to true whenever the bond changes
137 active slave. It will be reset to false
138 after it is stored into OVSDB */
140 /* The interface name may not be persistent across an OS reboot, so use
141 * the MAC address for identifying the active slave. */
142 uint8_t active_slave_mac[ETH_ADDR_LEN];
143 /* The MAC address of the active interface. */
144 /* Legacy compatibility. */
145 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
146 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
148 struct ovs_refcount ref_cnt;
151 /* What to do with a bond_pr_rule_op. */
153 ADD, /* Add the rule to ofproto's flow table. */
154 DEL, /* Delete the rule from the ofproto's flow table. */
157 /* A rule to add to or delete from ofproto's internal flow table. */
158 struct bond_pr_rule_op {
159 struct hmap_node hmap_node; /* In struct bond's 'pr_rule_ops' hmap. */
161 ofp_port_t out_ofport; /* Port used in the rule's OUTPUT action. */
163 struct rule **pr_rule; /* Location handed to ofproto when the flow is
                          * added; the pointed-to value is set to NULL
                          * when the op is removed in
                          * update_recirc_rules(). */
166 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
167 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
168 OVS_REQ_RDLOCK(rwlock);
169 static void bond_enable_slave(struct bond_slave *, bool enable)
170 OVS_REQ_WRLOCK(rwlock);
171 static void bond_link_status_update(struct bond_slave *)
172 OVS_REQ_WRLOCK(rwlock);
173 static void bond_choose_active_slave(struct bond *)
174 OVS_REQ_WRLOCK(rwlock);
175 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
176 uint16_t vlan, uint32_t basis);
177 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
179 static struct bond_entry *lookup_bond_entry(const struct bond *,
182 OVS_REQ_RDLOCK(rwlock);
183 static struct bond_slave *get_enabled_slave(struct bond *)
184 OVS_REQ_RDLOCK(rwlock);
185 static struct bond_slave *choose_output_slave(const struct bond *,
187 struct flow_wildcards *,
189 OVS_REQ_RDLOCK(rwlock);
190 static void bond_update_fake_slave_stats(struct bond *)
191 OVS_REQ_RDLOCK(rwlock);
193 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
194 * stores the mode in '*balance' and returns true. Otherwise returns false
195 * without modifying '*balance'. */
197 bond_mode_from_string(enum bond_mode *balance, const char *s)
/* The mode names are not spelled out a second time here: 's' is compared
 * against whatever bond_mode_to_string() yields for each mode, so parsing and
 * printing share one set of names. */
199 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
201 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
203 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
211 /* Returns a string representing 'balance'. Every value returned below is a
 * string literal, so the caller does not own it. bond_mode_from_string()
 * compares its input against these same strings. */
213 bond_mode_to_string(enum bond_mode balance) {
216 return "balance-tcp";
218 return "balance-slb";
220 return "active-backup";
226 /* Creates and returns a new bond whose configuration is initially taken from
 * 's', which is applied through the bond_reconfigure() call below.
229 * The caller should register each slave on the new bond by calling
230 * bond_slave_register(). */
232 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
236 bond = xzalloc(sizeof *bond);
237 bond->ofproto = ofproto;
238 hmap_init(&bond->slaves);
239 list_init(&bond->enabled_slaves);
240 ovs_mutex_init(&bond->mutex);
/* LLONG_MAX is the "disabled" value for the fake-interface stats timer. */
241 bond->next_fake_iface_update = LLONG_MAX;
242 ovs_refcount_init(&bond->ref_cnt);
245 hmap_init(&bond->pr_rule_ops);
/* bond->name is not assigned before this call; bond_reconfigure() sets it
 * from 's' and inserts the bond into 'all_bonds'. */
247 bond_reconfigure(bond, s);
/* Takes an additional reference on 'bond_'. The parameter is const-qualified,
 * so the qualifier is cast away before the reference count is incremented. */
252 bond_ref(const struct bond *bond_)
254 struct bond *bond = CONST_CAST(struct bond *, bond_);
257 ovs_refcount_ref(&bond->ref_cnt);
/* Drops one reference to 'bond'. A NULL 'bond' is tolerated: the first test
 * below checks for it before touching the reference count.
 *
 * The teardown that follows removes 'bond' from 'all_bonds' (under 'rwlock'),
 * empties and destroys the 'slaves' and 'pr_rule_ops' maps, destroys 'mutex',
 * and releases the recirculation ID if one is held.
 *
 * NOTE(review): presumably the teardown only runs when this was the last
 * reference (ovs_refcount_unref() reporting 1) -- confirm. */
264 bond_unref(struct bond *bond)
266 struct bond_slave *slave, *next_slave;
267 struct bond_pr_rule_op *pr_op, *next_op;
269 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
/* The write lock is held only while unlinking from the global map. */
273 ovs_rwlock_wrlock(&rwlock);
274 hmap_remove(all_bonds, &bond->hmap_node);
275 ovs_rwlock_unlock(&rwlock);
/* The _SAFE iterator is needed because nodes are removed while iterating. */
277 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
278 hmap_remove(&bond->slaves, &slave->hmap_node);
279 /* Client owns 'slave->netdev'. */
283 hmap_destroy(&bond->slaves);
285 ovs_mutex_destroy(&bond->mutex);
289 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
290 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
293 hmap_destroy(&bond->pr_rule_ops);
/* A recirc_id of zero means none was allocated, so there is nothing to free. */
295 if (bond->recirc_id) {
296 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
/* Records in 'bond->pr_rule_ops' that a rule matching 'match' should output
 * to 'out_ofport', with 'rule' as the location for the rule pointer.
 *
 * Entries are keyed by match_hash(match, 0). An existing entry whose match
 * equals 'match' has its output port and rule location refreshed in place.
 * The allocation path at the bottom creates and inserts a new entry
 * (NOTE(review): presumably only reached when no equal match was found --
 * confirm). */
303 add_pr_rule(struct bond *bond, const struct match *match,
304 ofp_port_t out_ofport, struct rule **rule)
306 uint32_t hash = match_hash(match, 0);
307 struct bond_pr_rule_op *pr_op;
309 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
/* Same hash is not enough; confirm with a full match comparison. */
310 if (match_equal(&pr_op->match, match)) {
312 pr_op->out_ofport = out_ofport;
313 pr_op->pr_rule = rule;
318 pr_op = xmalloc(sizeof *pr_op);
/* The match is copied by value, so the caller's 'match' need not outlive
 * this call. */
319 pr_op->match = *match;
321 pr_op->out_ofport = out_ofport;
322 pr_op->pr_rule = rule;
323 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
/* Brings the post-recirculation rules in ofproto's internal flow table in
 * line with 'bond->hash'.
 *
 * Visible steps: walk the existing entries of 'bond->pr_rule_ops'; when the
 * bond has a hash table and a recirculation ID, call add_pr_rule() for hash
 * buckets; then walk 'pr_rule_ops' again, adding or deleting internal flows
 * at RECIRC_RULE_PRIORITY and logging any failure with VLOG_ERR. */
327 update_recirc_rules(struct bond *bond)
330 struct bond_pr_rule_op *pr_op, *next_op;
/* Small on-stack buffer for the action list; the ofpbuf is released with
 * ofpbuf_uninit() at the end. */
331 uint64_t ofpacts_stub[128 / 8];
332 struct ofpbuf ofpacts;
335 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
337 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
341 if (bond->hash && bond->recirc_id) {
342 for (i = 0; i < BOND_BUCKETS; i++) {
343 struct bond_slave *slave = bond->hash[i].slave;
/* Match this bond's recirculation ID together with dp_hash value 'i' under
 * BOND_MASK, i.e. exactly the traffic that falls into bucket 'i'. */
346 match_init_catchall(&match);
347 match_set_recirc_id(&match, bond->recirc_id);
348 match_set_dp_hash_masked(&match, i, BOND_MASK);
350 add_pr_rule(bond, &match, slave->ofp_port,
351 &bond->hash[i].pr_rule);
/* The _SAFE iterator is used because entries may be removed below. */
356 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
/* The rule's action list is a single OUTPUT to the entry's port. */
360 ofpbuf_clear(&ofpacts);
361 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
362 error = ofproto_dpif_add_internal_flow(bond->ofproto,
364 RECIRC_RULE_PRIORITY,
365 &ofpacts, pr_op->pr_rule);
367 char *err_s = match_to_string(&pr_op->match,
368 RECIRC_RULE_PRIORITY);
370 VLOG_ERR("failed to add post recirculation flow %s", err_s);
376 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
378 RECIRC_RULE_PRIORITY);
380 char *err_s = match_to_string(&pr_op->match,
381 RECIRC_RULE_PRIORITY);
383 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
/* Forget the entry and clear the rule pointer it referred to. */
387 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
388 *pr_op->pr_rule = NULL;
394 ofpbuf_uninit(&ofpacts);
398 /* Updates 'bond''s overall configuration to 's'.
400 * The caller should register each slave on 'bond' by calling
401 * bond_slave_register(). This is optional if none of the slaves'
402 * configuration has changed. In any case it can't hurt.
404 * Returns true if the configuration has changed in such a way that requires
 * flow revalidation. */
/* Applies the settings in 's' to 'bond' while holding 'rwlock' for writing.
 * The whole update, from the rename through clearing 'active_slave_changed',
 * happens inside one critical section. */
408 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
410 bool revalidate = false;
412 ovs_rwlock_wrlock(&rwlock);
/* 'all_bonds' is keyed by hash_string(name, 0), so a new or changed name
 * means the node has to be (re)inserted under the new hash. */
413 if (!bond->name || strcmp(bond->name, s->name)) {
415 hmap_remove(all_bonds, &bond->hmap_node);
418 bond->name = xstrdup(s->name);
419 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
422 bond->updelay = s->up_delay;
423 bond->downdelay = s->down_delay;
/* The next four settings are only written when they actually differ. */
425 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
426 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
430 if (bond->rebalance_interval != s->rebalance_interval) {
431 bond->rebalance_interval = s->rebalance_interval;
435 if (bond->balance != s->balance) {
436 bond->balance = s->balance;
440 if (bond->basis != s->basis) {
441 bond->basis = s->basis;
/* Fake-interface stats timer: LLONG_MAX is the "disabled" value. When the
 * timer is switched on from the disabled state it is armed for right now. */
446 if (bond->next_fake_iface_update == LLONG_MAX) {
447 bond->next_fake_iface_update = time_msec();
450 bond->next_fake_iface_update = LLONG_MAX;
/* A pending revalidation request on the bond is consumed here. */
453 if (bond->bond_revalidate) {
455 bond->bond_revalidate = false;
/* A recirculation ID is held only while the bond is not in active-backup
 * mode: allocate one if missing, release it when switching to BM_AB. */
458 if (bond->balance != BM_AB) {
459 if (!bond->recirc_id) {
460 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
462 } else if (bond->recirc_id) {
463 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
/* Reset the hash entries in active-backup mode, when no hash table exists
 * yet, or when this reconfiguration requires revalidation. */
467 if (bond->balance == BM_AB || !bond->hash || revalidate) {
468 bond_entry_reset(bond);
471 memcpy(bond->active_slave_mac, s->active_slave_mac,
472 sizeof s->active_slave_mac);
474 bond->active_slave_changed = false;
476 ovs_rwlock_unlock(&rwlock);
/* Searches 'bond''s slaves for one whose network device reports, through
 * netdev_get_etheraddr(), an Ethernet address byte-for-byte equal to 'mac'.
 *
 * NOTE(review): a nonzero result from netdev_get_etheraddr() is presumably an
 * error that makes the slave be skipped -- confirm. */
480 static struct bond_slave *
481 bond_find_slave_by_mac(const struct bond *bond, const uint8_t mac[6])
483 struct bond_slave *slave;
485 /* Find the last active slave */
486 HMAP_FOR_EACH(slave, hmap_node, &bond->slaves) {
487 uint8_t slave_mac[6];
489 if (netdev_get_etheraddr(slave->netdev, slave_mac)) {
493 if (!memcmp(slave_mac, mac, sizeof(slave_mac))) {
/* Records that 'bond' has a new active slave: copies the slave's Ethernet
 * address into 'bond->active_slave_mac', sets 'active_slave_changed', and
 * bumps the connectivity seq so that waiters wake up.
 *
 * 'bond->active_slave' is dereferenced, so it must be non-NULL here.
 *
 * NOTE(review): the return value of netdev_get_etheraddr() is not checked;
 * if the call can fail, 'mac' may be copied without having been filled in --
 * confirm. */
502 bond_active_slave_changed(struct bond *bond)
506 netdev_get_etheraddr(bond->active_slave->netdev, mac);
507 memcpy(bond->active_slave_mac, mac, sizeof bond->active_slave_mac);
508 bond->active_slave_changed = true;
509 seq_change(connectivity_seq_get());
/* Points 'slave' at 'netdev' if it is not already using it, and resets
 * 'change_seq' to 0 in that case. Does nothing when the netdev is unchanged.
 * The caller must hold 'rwlock' for writing. */
513 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
514 OVS_REQ_WRLOCK(rwlock)
516 if (slave->netdev != netdev) {
517 slave->netdev = netdev;
518 slave->change_seq = 0;
522 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
523 * arbitrary client-provided pointer that uniquely identifies a slave within a
524 * bond. If 'slave_' already exists within 'bond' then this function
525 * reconfigures the existing slave.
527 * 'netdev' must be the network device that 'slave_' represents. It is owned
528 * by the client, so the client must not close it before either unregistering
529 * 'slave_' or destroying 'bond'. */
532 bond_slave_register(struct bond *bond, void *slave_,
533 ofp_port_t ofport, struct netdev *netdev)
535 struct bond_slave *slave;
537 ovs_rwlock_wrlock(&rwlock);
538 slave = bond_slave_lookup(bond, slave_);
/* New-slave path: the slave is keyed in 'bond->slaves' by
 * hash_pointer(slave_, 0), starts with no pending enable/disable timer
 * (LLONG_MAX), and requests flow revalidation. */
540 slave = xzalloc(sizeof *slave);
542 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
545 slave->ofp_port = ofport;
546 slave->delay_expires = LLONG_MAX;
547 slave->name = xstrdup(netdev_get_name(netdev));
548 bond->bond_revalidate = true;
/* Start from "disabled" and let bond_enable_slave() apply the netdev's
 * current carrier state. */
550 slave->enabled = false;
551 bond_enable_slave(slave, netdev_get_carrier(netdev));
554 bond_slave_set_netdev__(slave, netdev);
/* Refresh the cached name from the netdev.
 * NOTE(review): 'slave->name' already holds an xstrdup()ed string on the
 * new-slave path above -- confirm the old string is freed before this
 * assignment. */
557 slave->name = xstrdup(netdev_get_name(netdev));
558 ovs_rwlock_unlock(&rwlock);
561 /* Updates the network device to be used with 'slave_' to 'netdev'.
563 * This is useful if the caller closes and re-opens the network device
564 * registered with bond_slave_register() but doesn't need to change anything
 * else. */
567 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
569 struct bond_slave *slave;
/* Locked wrapper: look the slave up by its client handle and delegate to
 * bond_slave_set_netdev__(), which requires the write lock. */
571 ovs_rwlock_wrlock(&rwlock);
572 slave = bond_slave_lookup(bond, slave_);
574 bond_slave_set_netdev__(slave, netdev);
576 ovs_rwlock_unlock(&rwlock);
579 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
580 * then this function has no effect.
582 * Unregistering a slave invalidates all flows. */
584 bond_slave_unregister(struct bond *bond, const void *slave_)
586 struct bond_slave *slave;
589 ovs_rwlock_wrlock(&rwlock);
590 slave = bond_slave_lookup(bond, slave_);
/* Request revalidation and disable the slave before it is unlinked. */
595 bond->bond_revalidate = true;
596 bond_enable_slave(slave, false);
/* Remember whether the departing slave was the active one; the comparison
 * has to happen while 'slave' is still a valid pointer. */
598 del_active = bond->active_slave == slave;
600 struct bond_entry *e;
/* Visit every hash bucket that still points at the departing slave. */
601 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
602 if (e->slave == slave) {
610 hmap_remove(&bond->slaves, &slave->hmap_node);
611 /* Client owns 'slave->netdev'. */
/* Pick a replacement active slave and ask the client to send learning
 * packets. */
615 bond_choose_active_slave(bond);
616 bond->send_learning_packets = true;
619 ovs_rwlock_unlock(&rwlock);
622 /* Should be called on each slave in 'bond' before bond_run() to indicate
623 * whether or not 'slave_' may be enabled. This function is intended to allow
624 * other protocols to have some impact on bonding decisions. For example LACP
625 * or high level link monitoring protocols may decide that a given slave should
626 * not be able to send traffic. */
628 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
630 ovs_rwlock_wrlock(&rwlock);
/* NOTE(review): the lookup result is dereferenced directly, so 'slave_' must
 * already be registered on 'bond' -- confirm that callers guarantee this. */
631 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
632 ovs_rwlock_unlock(&rwlock);
635 /* Performs periodic maintenance on 'bond'.
637 * Returns true if the caller should revalidate its flows.
639 * The caller should check bond_should_send_learning_packets() afterward. */
641 bond_run(struct bond *bond, enum lacp_status lacp_status)
643 struct bond_slave *slave;
646 ovs_rwlock_wrlock(&rwlock);
/* A change in LACP status requests flow revalidation. */
647 if (bond->lacp_status != lacp_status) {
648 bond->lacp_status = lacp_status;
649 bond->bond_revalidate = true;
652 /* Enable slaves based on link status and LACP feedback. */
653 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
654 bond_link_status_update(slave);
/* Remember the connectivity seq value just processed; bond_wait() sleeps
 * until the seq moves past it. */
655 slave->change_seq = seq_read(connectivity_seq_get());
/* Re-elect when there is no active slave or the current one is disabled. */
657 if (!bond->active_slave || !bond->active_slave->enabled) {
658 bond_choose_active_slave(bond);
661 /* Update fake bond interface stats. */
662 if (time_msec() >= bond->next_fake_iface_update) {
663 bond_update_fake_slave_stats(bond);
/* Schedule the next stats refresh 1000 ms from now. */
664 bond->next_fake_iface_update = time_msec() + 1000;
/* The revalidation flag is consumed: it is copied out and cleared while the
 * lock is still held. */
667 revalidate = bond->bond_revalidate;
668 bond->bond_revalidate = false;
669 ovs_rwlock_unlock(&rwlock);
674 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
676 bond_wait(struct bond *bond)
678 struct bond_slave *slave;
/* Only reads bond state, so the read lock is sufficient. */
680 ovs_rwlock_rdlock(&rwlock);
681 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* LLONG_MAX means no enable/disable timer is pending for this slave. */
682 if (slave->delay_expires != LLONG_MAX) {
683 poll_timer_wait_until(slave->delay_expires);
/* Wake when the connectivity seq differs from the value bond_run() saw. */
686 seq_wait(connectivity_seq_get(), slave->change_seq);
689 if (bond->next_fake_iface_update != LLONG_MAX) {
690 poll_timer_wait_until(bond->next_fake_iface_update);
/* A pending revalidation request must be handled without sleeping. */
693 if (bond->bond_revalidate) {
694 poll_immediate_wake();
696 ovs_rwlock_unlock(&rwlock);
698 /* We don't wait for bond->next_rebalance because rebalancing can only run
699 * at a flow account checkpoint. ofproto does checkpointing on its own
700 * schedule and bond_rebalance() gets called afterward, so we'd just be
701 * waking up for no purpose. */
704 /* MAC learning table interaction. */
/* Returns true if learning packets may be sent on 'bond'. That requires an
 * active slave and one of:
 *
 * - LACP is disabled and the bond is in SLB or active-backup mode, or
 *
 * - LACP fallback to active-backup is enabled and the LACP status is
 * LACP_CONFIGURED. */
707 may_send_learning_packets(const struct bond *bond)
709 return ((bond->lacp_status == LACP_DISABLED
710 && (bond->balance == BM_SLB || bond->balance == BM_AB))
711 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
712 && bond->active_slave;
715 /* Returns true if 'bond' needs the client to send out packets to assist with
716 * MAC learning on 'bond'. If this function returns true, then the client
717 * should iterate through its MAC learning table for the bridge on which 'bond'
718 * is located. For each MAC that has been learned on a port other than 'bond',
719 * it should call bond_compose_learning_packet().
721 * This function will only return true if 'bond' is in SLB or active-backup
722 * mode and LACP is not negotiated. Otherwise sending learning packets isn't
725 * Calling this function resets the state that it checks. */
727 bond_should_send_learning_packets(struct bond *bond)
/* The write lock is taken because the request flag is cleared below. */
731 ovs_rwlock_wrlock(&rwlock);
732 send = bond->send_learning_packets && may_send_learning_packets(bond);
/* The flag is cleared unconditionally, even when may_send_learning_packets()
 * is false and 'send' therefore ends up false. */
733 bond->send_learning_packets = false;
734 ovs_rwlock_unlock(&rwlock);
738 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
740 * See bond_should_send_learning_packets() for description of usage. The
741 * caller should send the composed packet on the port associated with
742 * port_aux and takes ownership of the returned ofpbuf. */
744 bond_compose_learning_packet(struct bond *bond,
745 const uint8_t eth_src[ETH_ADDR_LEN],
746 uint16_t vlan, void **port_aux)
748 struct bond_slave *slave;
749 struct ofpbuf *packet;
752 ovs_rwlock_rdlock(&rwlock);
/* Calling this when learning packets are not allowed is a caller bug. */
753 ovs_assert(may_send_learning_packets(bond));
/* Select the output slave using a flow in which only dl_src is set (all
 * other fields zero) and without tracking wildcards (wc == NULL). */
754 memset(&flow, 0, sizeof flow);
755 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
756 slave = choose_output_slave(bond, &flow, NULL, vlan);
758 packet = ofpbuf_new(0);
759 compose_rarp(packet, eth_src);
761 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
/* NOTE(review): 'slave' is dereferenced without a NULL test, although
 * bond_choose_output_slave() treats a NULL result from the same helper as
 * possible. The assertion above guarantees an active slave exists -- confirm
 * that this is enough to rule out NULL here. */
764 *port_aux = slave->aux;
765 ovs_rwlock_unlock(&rwlock);
769 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
770 * Ethernet destination address of 'eth_dst', should be admitted.
772 * The return value is one of the following:
774 * - BV_ACCEPT: Admit the packet.
776 * - BV_DROP: Drop the packet.
778 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
779 * Ethernet source address and VLAN. If there is none, or if the packet
780 * is on the learned port, then admit the packet. If a different port has
781 * been learned, however, drop the packet (and do not use it for MAC
 * learning). */
785 bond_check_admissibility(struct bond *bond, const void *slave_,
786 const uint8_t eth_dst[ETH_ADDR_LEN])
/* The verdict defaults to BV_DROP; the cases below only ever relax it. */
788 enum bond_verdict verdict = BV_DROP;
789 struct bond_slave *slave;
/* Only reads bond state, so the read lock is sufficient. */
791 ovs_rwlock_rdlock(&rwlock);
792 slave = bond_slave_lookup(bond, slave_);
797 /* LACP bonds have very loose admissibility restrictions because we can
798 * assume the remote switch is aware of the bond and will "do the right
799 * thing". However, as a precaution we drop packets on disabled slaves
800 * because no correctly implemented partner switch should be sending
803 * If LACP is configured, but LACP negotiations have been unsuccessful, we
804 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
805 switch (bond->lacp_status) {
806 case LACP_NEGOTIATED:
807 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
809 case LACP_CONFIGURED:
810 if (!bond->lacp_fallback_ab) {
817 /* Drop all multicast packets on inactive slaves. */
818 if (eth_addr_is_multicast(eth_dst)) {
819 if (bond->active_slave != slave) {
824 switch (bond->balance) {
826 /* TCP balanced bonds require successful LACP negotiations. Based on the
827 * above check, LACP is off or lacp_fallback_ab is true on this bond.
828 * If lacp_fallback_ab is true, fall through to the BM_AB case; otherwise
829 * drop all incoming traffic. */
830 if (!bond->lacp_fallback_ab) {
835 /* Drop all packets which arrive on backup slaves. This is similar to
836 * how Linux bonding handles active-backup bonds. */
837 if (bond->active_slave != slave) {
838 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
840 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
841 " slave (%s) destined for " ETH_ADDR_FMT,
842 slave->name, ETH_ADDR_ARGS(eth_dst));
849 /* Drop all packets for which we have learned a different input port,
850 * because we probably sent the packet on one slave and got it back on
851 * the other. Gratuitous ARP packets are an exception to this rule:
852 * the host has moved to another switch. The exception to the
853 * exception is if we locked the learning table to avoid reflections on
855 verdict = BV_DROP_IF_MOVED;
861 ovs_rwlock_unlock(&rwlock);
866 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
867 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
868 * NULL if the packet should be dropped because no slaves are enabled.
870 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
871 * should be a VID only (i.e. excluding the PCP bits). Second,
872 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
873 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
874 * packet belongs to (so for an access port it will be the access port's VLAN).
876 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
877 * significant in the selection. At some point earlier, 'wc' should
878 * have been initialized (e.g., by flow_wildcards_init_catchall()). */
881 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
882 struct flow_wildcards *wc, uint16_t vlan)
884 struct bond_slave *slave;
/* Locked wrapper around choose_output_slave(), which requires the read
 * lock. */
887 ovs_rwlock_rdlock(&rwlock);
888 slave = choose_output_slave(bond, flow, wc, vlan);
/* Translate the internal slave into the client's handle while the lock is
 * still held; NULL when no slave was chosen. */
889 aux = slave ? slave->aux : NULL;
890 ovs_rwlock_unlock(&rwlock);
/* Credits 'entry' with the traffic its post-recirculation rule has carried
 * since the previous call. 'rule_tx_bytes' is the rule's current cumulative
 * byte count; the difference from the remembered 'pr_tx_bytes' is added to
 * 'tx_bytes', and 'pr_tx_bytes' is then brought up to date. The caller must
 * hold 'rwlock' for writing.
 *
 * NOTE(review): if 'rule_tx_bytes' can ever be smaller than 'pr_tx_bytes'
 * (for instance after a rule is replaced and its counters restart), the
 * subtraction would wrap -- confirm this cannot happen. */
897 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
898 OVS_REQ_WRLOCK(rwlock)
903 delta = rule_tx_bytes - entry->pr_tx_bytes;
904 entry->tx_bytes += delta;
905 entry->pr_tx_bytes = rule_tx_bytes;
909 /* Maintains bond stats using post-recirculation rule byte counters: for
 * hash buckets of 'bond', reads the statistics of the bucket's rule through
 * the rule's ofproto class and hands the byte count to
 * bond_entry_account(). The caller must hold 'rwlock' for writing. */
911 bond_recirculation_account(struct bond *bond)
912 OVS_REQ_WRLOCK(rwlock)
916 for (i=0; i<=BOND_MASK; i++) {
917 struct bond_entry *entry = &bond->hash[i];
918 struct rule *rule = entry->pr_rule;
/* Only the byte count is used. The packet count and last-used time are
 * received because rule_get_stats() is given all three outputs, hence
 * OVS_UNUSED. */
921 uint64_t n_packets OVS_UNUSED;
922 long long int used OVS_UNUSED;
925 rule->ofproto->ofproto_class->rule_get_stats(
926 rule, &n_packets, &n_bytes, &used);
927 bond_entry_account(entry, n_bytes);
/* Tests whether 'bond' may use recirculation: it must be in BM_TCP mode and
 * hold a nonzero recirculation ID. On that path the ID is reported through
 * '*recirc_id' and 'bond->basis' through '*hash_bias'.
 *
 * NOTE(review): bond_rebalance() passes NULL for both output pointers --
 * confirm that the two stores below are guarded against NULL. */
933 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
936 if (bond->balance == BM_TCP && bond->recirc_id) {
938 *recirc_id = bond->recirc_id;
941 *hash_bias = bond->basis;
/* Makes sure every hash bucket of 'bond' points at a usable slave and then
 * refreshes the post-recirculation rules through update_recirc_rules().
 * 'force' seeds 'update_rules', so a forced call always asks for an
 * update. */
950 bond_update_post_recirc_rules(struct bond* bond, const bool force)
952 struct bond_entry *e;
953 bool update_rules = force; /* Always update rules if caller forces it. */
955 /* Make sure all bond entries are populated */
956 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
/* A bucket needs a new slave when it has none or its slave is disabled. */
957 if (!e->slave || !e->slave->enabled) {
/* Try a random slave first; if that one is disabled too, fall back to the
 * bond's active slave. */
959 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
960 struct bond_slave, hmap_node);
961 if (!e->slave->enabled) {
962 e->slave = bond->active_slave;
968 update_recirc_rules(bond);
/* Returns true if 'bond' takes part in rebalancing: it must have a nonzero
 * rebalance interval and be in SLB or TCP balancing mode. The caller must
 * hold 'rwlock' at least for reading. */
975 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
977 return bond->rebalance_interval
978 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
981 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'.
 * The bytes are added to the 'tx_bytes' of the hash entry that
 * lookup_bond_entry() yields for the flow, and only when bond_is_balanced()
 * is true; otherwise no counter is touched. */
983 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
/* The write lock is taken because a counter is modified below. */
986 ovs_rwlock_wrlock(&rwlock);
987 if (bond_is_balanced(bond)) {
988 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
990 ovs_rwlock_unlock(&rwlock);
/* Converts 'bal', a pointer to the 'bal_node' member embedded in a struct
 * bond_slave, back into a pointer to that bond_slave. */
993 static struct bond_slave *
994 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
996 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* Logs, at debug level, one line summarising the slaves on 'bals': each
 * slave's name and load in kB, a "(disabled)" marker where applicable, and
 * the per-hash loads of the entries assigned to it. The string is only built
 * when debug logging is enabled. */
1000 log_bals(struct bond *bond, const struct list *bals)
1001 OVS_REQ_RDLOCK(rwlock)
1003 if (VLOG_IS_DBG_ENABLED()) {
1004 struct ds ds = DS_EMPTY_INITIALIZER;
1005 const struct bond_slave *slave;
1007 LIST_FOR_EACH (slave, bal_node, bals) {
1009 ds_put_char(&ds, ',');
/* Loads are reported in units of 1024 bytes. */
1011 ds_put_format(&ds, " %s %"PRIu64"kB",
1012 slave->name, slave->tx_bytes / 1024);
1014 if (!slave->enabled) {
1015 ds_put_cstr(&ds, " (disabled)");
1017 if (!list_is_empty(&slave->entries)) {
1018 struct bond_entry *e;
1020 ds_put_cstr(&ds, " (");
1021 LIST_FOR_EACH (e, list_node, &slave->entries) {
/* Separator between entries: emitted before every entry except the first. */
1022 if (&e->list_node != list_front(&slave->entries)) {
1023 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' is the entry's index within the bond's hash table. */
1025 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
1026 e - bond->hash, e->tx_bytes / 1024);
1028 ds_put_cstr(&ds, ")");
1031 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
1036 /* Shifts 'hash' from its current slave to 'to'. The entry's whole
 * 'tx_bytes' is moved from the old slave's total to the new one, and flow
 * revalidation is requested. 'hash->slave' must be non-NULL, because it is
 * dereferenced to find the bond. The caller must hold 'rwlock' for
 * writing. */
1038 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
1039 OVS_REQ_WRLOCK(rwlock)
1041 struct bond_slave *from = hash->slave;
1042 struct bond *bond = from->bond;
1043 uint64_t delta = hash->tx_bytes;
/* The message reports the loads as they will be after the move, which is why
 * it computes them from 'delta' before the counters are adjusted below. */
1045 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
1046 "from %s to %s (now carrying %"PRIu64"kB and "
1047 "%"PRIu64"kB load, respectively)",
1048 bond->name, delta / 1024, hash - bond->hash,
1049 from->name, to->name,
1050 (from->tx_bytes - delta) / 1024,
1051 (to->tx_bytes + delta) / 1024);
1053 /* Shift load away from 'from' to 'to'. */
1054 from->tx_bytes -= delta;
1055 to->tx_bytes += delta;
1057 /* Arrange for flows to be revalidated. */
1059 bond->bond_revalidate = true;
1062 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1063 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1064 * given that doing so must decrease the ratio of the load on the two slaves by
1065 * at least 0.1. Returns NULL if there is no appropriate entry.
1067 * The list of entries isn't sorted. I don't know of a reason to prefer to
1068 * shift away small hashes or large hashes. */
1069 static struct bond_entry *
1070 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1071 OVS_REQ_WRLOCK(rwlock)
1073 struct bond_entry *e;
1075 if (list_is_short(&from->entries)) {
1076 /* 'from' carries no more than one MAC hash, so shifting load away from
1077 * it would be pointless. */
1081 LIST_FOR_EACH (e, list_node, &from->entries) {
1082 double old_ratio, new_ratio;
1085 if (to_tx_bytes == 0) {
1086 /* Nothing on the new slave, move it. */
/* 'old_ratio' and 'new_ratio' are the load of 'from' relative to the target
 * slave before and after moving 'e'. The to_tx_bytes == 0 case is dealt with
 * above, ahead of the first division. */
1090 delta = e->tx_bytes;
1091 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1092 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1093 if (old_ratio - new_ratio > 0.1
1094 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1095 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1096 and 'to' slave have the same load. Therefore, we only move an
1097 entry if it decreases the load on 'from', and brings us closer
1098 to equal traffic load. */
1106 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is
 * maintained. */
1109 insert_bal(struct list *bals, struct bond_slave *slave)
1111 struct bond_slave *pos;
/* Scan for the first element that 'slave' out-weighs; 'slave' is then linked
 * in relative to 'pos'.
 * NOTE(review): when no element is lighter than 'slave', 'pos' is whatever
 * LIST_FOR_EACH leaves behind at loop end -- presumably the container of the
 * list head, which would make this an append at the tail; confirm. */
1113 LIST_FOR_EACH (pos, bal_node, bals) {
1114 if (slave->tx_bytes > pos->tx_bytes) {
1118 list_insert(&pos->bal_node, &slave->bal_node);
1121 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1122 * that descending order of 'tx_bytes' is maintained. bond_rebalance() uses
 * this after a shift has changed the 'tx_bytes' of the two slaves
 * involved. */
1124 reinsert_bal(struct list *bals, struct bond_slave *slave)
1126 list_remove(&slave->bal_node);
1127 insert_bal(bals, slave);
1130 /* If 'bond' needs rebalancing, does so.
1132 * The caller should have called bond_account() for each active flow or, if
1133 * recirculation is used, bond_recirculation_account(bond),
1134 * to ensure that flow data is consistently accounted at this point. */
1137 bond_rebalance(struct bond *bond)
1139 struct bond_slave *slave;
1140 struct bond_entry *e;
1142 bool rebalanced = false;
1145 ovs_rwlock_wrlock(&rwlock);
/* Rebalancing is rate-limited: it is skipped for bonds that are not balanced
 * and before 'next_rebalance', which is then pushed out by
 * 'rebalance_interval' milliseconds. */
1146 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1149 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* Recirculation is used only when it is enabled on the bridge and the bond
 * itself qualifies (see bond_may_recirc()). */
1151 use_recirc = ofproto_dpif_get_enable_recirc(bond->ofproto) &&
1152 bond_may_recirc(bond, NULL, NULL);
1155 bond_recirculation_account(bond);
1158 /* Add each bond_entry to its slave's 'entries' list.
1159 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1160 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1161 slave->tx_bytes = 0;
1162 list_init(&slave->entries);
1164 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
/* Entries with no slave or no recorded traffic are left out. */
1165 if (e->slave && e->tx_bytes) {
1166 e->slave->tx_bytes += e->tx_bytes;
1167 list_push_back(&e->slave->entries, &e->list_node);
1171 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1173 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1174 * with a proper list sort algorithm. */
1176 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1177 if (slave->enabled) {
1178 insert_bal(&bals, slave);
1181 log_bals(bond, &bals);
1183 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1184 while (!list_is_short(&bals)) {
/* 'bals' is sorted in descending order, so the front is the most loaded
 * slave and the back is the least loaded one. */
1185 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1186 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
1189 overload = from->tx_bytes - to->tx_bytes;
1190 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1191 /* The extra load on 'from' (and all less-loaded slaves), compared
1192 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1193 * it is less than ~1Mbps. No point in rebalancing. */
1197 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1198 * to move from 'from' to 'to'. */
1199 e = choose_entry_to_migrate(from, to->tx_bytes);
1201 bond_shift_load(e, to);
1203 /* Delete element from from->entries.
1205 * We don't add the element to to->entries. That would only allow
1206 * 'e' to be migrated to another slave in this rebalancing run, and
1207 * there is no point in doing that. */
1208 list_remove(&e->list_node);
1210 /* Re-sort 'bals'. */
1211 reinsert_bal(&bals, from);
1212 reinsert_bal(&bals, to);
1215 /* Can't usefully migrate anything away from 'from'.
1216 * Don't reconsider it. */
1217 list_remove(&from->bal_node);
1221 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1222 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1223 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1224 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
/* The post-recirculation rules are refreshed (forced) only when
 * recirculation is in use and 'rebalanced' has been set. */
1228 if (use_recirc && rebalanced) {
1229 bond_update_post_recirc_rules(bond,true);
1233 ovs_rwlock_unlock(&rwlock);
1236 /* Bonding unixctl user interface functions. */
/* Searches for the bond called 'name'.  Candidates are taken from the hash
 * bucket selected by hash_string(name, 0) and compared by name with strcmp().
 * The OVS_REQ_RDLOCK annotation requires the caller to hold 'rwlock'.
 *
 * NOTE(review): the map being searched, the declaration of 'bond' and the
 * return statements are not visible in this excerpt; presumably the matching
 * bond is returned, or NULL when there is none -- confirm. */
1238 static struct bond *
1239 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
1243 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1245 if (!strcmp(bond->name, name)) {
/* Scans every slave in 'bond->slaves' and compares its name to 'slave_name'
 * with strcmp().  This is a linear walk over the whole map, not a hashed
 * lookup.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the first slave whose name matches is returned, or NULL if no
 * slave matches -- confirm. */
1252 static struct bond_slave *
1253 bond_lookup_slave(struct bond *bond, const char *slave_name)
1255 struct bond_slave *slave;
1257 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1258 if (!strcmp(slave->name, slave_name)) {
/* unixctl handler (registered in this file under "bond/list").
 *
 * Builds a tab-separated table with one row per bond in 'all_bonds': bond
 * name, balancing mode, recirculation ID and the names of the bond's slaves
 * joined with ", ".  The table is sent back on 'conn'.  'argc', 'argv' and
 * 'aux' are unused. */
1266 bond_unixctl_list(struct unixctl_conn *conn,
1267 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1268 void *aux OVS_UNUSED)
1270 struct ds ds = DS_EMPTY_INITIALIZER;
1271 const struct bond *bond;
/* Column headers. */
1273 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
/* 'all_bonds' is guarded by 'rwlock'; a read lock is enough because nothing
 * is modified while the table is built. */
1275 ovs_rwlock_rdlock(&rwlock);
1276 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1277 const struct bond_slave *slave;
1280 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1281 bond_mode_to_string(bond->balance), bond->recirc_id);
/* NOTE(review): the condition that decides when the ", " separator is
 * emitted is not visible in this excerpt. */
1284 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1286 ds_put_cstr(&ds, ", ");
1288 ds_put_cstr(&ds, slave->name);
1290 ds_put_char(&ds, '\n');
1292 ovs_rwlock_unlock(&rwlock);
/* The reply is sent only after the lock has been released. */
1293 unixctl_command_reply(conn, ds_cstr(&ds));
/* Appends a human-readable description of 'bond' to 'ds'.
 *
 * The output has a bond-level section (mode, recirculation, hash basis,
 * up/down delays, time to next rebalance for balanced bonds, LACP status,
 * active slave MAC) followed by one section per slave.  For balanced bonds
 * each slave section also lists the hash buckets assigned to that slave.
 *
 * The OVS_REQ_RDLOCK annotation requires the caller to hold 'rwlock'. */
1298 bond_print_details(struct ds *ds, const struct bond *bond)
1299 OVS_REQ_RDLOCK(rwlock)
1301 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1302 const struct shash_node **sorted_slaves = NULL;
1303 const struct bond_slave *slave;
/* Bond-level header. */
1308 ds_put_format(ds, "---- %s ----\n", bond->name);
1309 ds_put_format(ds, "bond_mode: %s\n",
1310 bond_mode_to_string(bond->balance));
/* bond_may_recirc() fills in 'recirc_id'; -1 is printed in its place when
 * recirculation is not possible. */
1312 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1313 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1314 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1316 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1318 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1319 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
/* The rebalance countdown is relative to the current time, so it can be
 * negative if the deadline has already passed. */
1321 if (bond_is_balanced(bond)) {
1322 ds_put_format(ds, "next rebalance: %lld ms\n",
1323 bond->next_rebalance - time_msec());
1326 ds_put_cstr(ds, "lacp_status: ");
1327 switch (bond->lacp_status) {
1328 case LACP_NEGOTIATED:
1329 ds_put_cstr(ds, "negotiated\n");
1331 case LACP_CONFIGURED:
1332 ds_put_cstr(ds, "configured\n");
/* NOTE(review): the case labels for the "off" and "<unknown>" branches are
 * not visible in this excerpt. */
1335 ds_put_cstr(ds, "off\n");
1338 ds_put_cstr(ds, "<unknown>\n");
/* Active slave MAC, followed by the name of the slave that currently owns
 * that MAC, or "none" if no slave matches. */
1342 ds_put_cstr(ds, "active slave mac: ");
1343 ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_slave_mac));
1344 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1345 ds_put_format(ds,"(%s)\n", slave ? slave->name : "none");
/* Index the slaves by name in a temporary shash so that they can be printed
 * in the order produced by shash_sort() rather than in hmap order. */
1347 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1348 shash_add(&slave_shash, slave->name, slave);
1350 sorted_slaves = shash_sort(&slave_shash);
1352 for (i = 0; i < shash_count(&slave_shash); i++) {
1353 struct bond_entry *be;
1355 slave = sorted_slaves[i]->data;
/* Per-slave section. */
1358 ds_put_format(ds, "\nslave %s: %s\n",
1359 slave->name, slave->enabled ? "enabled" : "disabled");
1360 if (slave == bond->active_slave) {
1361 ds_put_cstr(ds, "\tactive slave\n");
/* LLONG_MAX in 'delay_expires' means that no delay is pending.  A pending
 * delay on an enabled slave is labelled "downdelay", otherwise "updelay". */
1363 if (slave->delay_expires != LLONG_MAX) {
1364 ds_put_format(ds, "\t%s expires in %lld ms\n",
1365 slave->enabled ? "downdelay" : "updelay",
1366 slave->delay_expires - time_msec());
1369 ds_put_format(ds, "\tmay_enable: %s\n",
1370 slave->may_enable ? "true" : "false");
/* NOTE(review): the body of this test is not visible; presumably hash
 * buckets are skipped for bonds that are not balanced -- confirm. */
1372 if (!bond_is_balanced(bond)) {
/* Walk all BOND_MASK + 1 buckets; 'hash' is the bucket's index. */
1377 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1378 int hash = be - bond->hash;
1381 if (be->slave != slave) {
/* Load is reported in units of 1024 bytes (integer division). */
1385 be_tx_k = be->tx_bytes / 1024;
1387 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1391 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
/* Release the temporary index and the sorted array. */
1394 shash_destroy(&slave_shash);
1395 free(sorted_slaves);
1396 ds_put_cstr(ds, "\n");
/* unixctl handler (registered in this file under "bond/show" with an optional
 * "[port]" argument).
 *
 * Either looks up the single bond named by argv[1] and prints its details, or
 * prints the details of every bond in 'all_bonds'.  An unknown bond name
 * yields the error reply "no such bond".  The whole operation runs under a
 * read lock on 'rwlock'.
 *
 * NOTE(review): the test that selects between the two branches (presumably on
 * 'argc') and the cleanup of 'ds' are not visible in this excerpt. */
1400 bond_unixctl_show(struct unixctl_conn *conn,
1401 int argc, const char *argv[],
1402 void *aux OVS_UNUSED)
1404 struct ds ds = DS_EMPTY_INITIALIZER;
1406 ovs_rwlock_rdlock(&rwlock);
/* Single-bond branch. */
1408 const struct bond *bond = bond_find(argv[1]);
1411 unixctl_command_reply_error(conn, "no such bond");
1414 bond_print_details(&ds, bond);
/* All-bonds branch. */
1416 const struct bond *bond;
1418 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1419 bond_print_details(&ds, bond);
1423 unixctl_command_reply(conn, ds_cstr(&ds));
1427 ovs_rwlock_unlock(&rwlock);
/* unixctl handler (registered in this file under "bond/migrate" with the
 * arguments "port hash slave").
 *
 * Reassigns hash bucket argv[2] of the bond named argv[1] to the slave named
 * argv[3] and flags the bond for revalidation.  The whole operation runs
 * under a write lock on 'rwlock'.
 *
 * Replies "migrated" on success.  Error replies: "no such bond",
 * "not an SLB bond" (only BM_SLB bonds may be migrated by hand), "bad hash"
 * (argv[2] is empty or contains a non-digit), "no such slave" and
 * "cannot migrate to disabled slave".
 *
 * The numeric hash is reduced with BOND_MASK, so any decimal number maps to a
 * valid bucket index. */
1431 bond_unixctl_migrate(struct unixctl_conn *conn,
1432 int argc OVS_UNUSED, const char *argv[],
1433 void *aux OVS_UNUSED)
1435 const char *bond_s = argv[1];
1436 const char *hash_s = argv[2];
1437 const char *slave_s = argv[3];
1439 struct bond_slave *slave;
1440 struct bond_entry *entry;
1443 ovs_rwlock_wrlock(&rwlock);
1444 bond = bond_find(bond_s);
1446 unixctl_command_reply_error(conn, "no such bond");
1450 if (bond->balance != BM_SLB) {
1451 unixctl_command_reply_error(conn, "not an SLB bond");
/* An empty string contains no non-digit characters, so the strspn() test
 * alone would accept it and silently migrate bucket 0; require at least one
 * character.  strtoul() is used instead of atoi() because its result is
 * defined even when the digits do not fit the result type, and the mask
 * keeps the index in range either way. */
1455 if (*hash_s && strspn(hash_s, "0123456789") == strlen(hash_s)) {
1456 hash = strtoul(hash_s, NULL, 10) & BOND_MASK;
1458 unixctl_command_reply_error(conn, "bad hash");
1462 slave = bond_lookup_slave(bond, slave_s);
1464 unixctl_command_reply_error(conn, "no such slave");
1468 if (!slave->enabled) {
1469 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
/* Point the bucket at its new slave and request revalidation. */
1473 entry = &bond->hash[hash];
1474 bond->bond_revalidate = true;
1475 entry->slave = slave;
1476 unixctl_command_reply(conn, "migrated");
1479 ovs_rwlock_unlock(&rwlock);
/* unixctl handler (registered in this file under "bond/set-active-slave" with
 * the arguments "port slave").
 *
 * Makes the slave named argv[2] the active slave of the bond named argv[1].
 * The whole operation runs under a write lock on 'rwlock'.
 *
 * Replies "done" when the active slave changed and "no change" when the
 * requested slave was already active.  Error replies: "no such bond",
 * "no such slave" and "cannot make disabled slave active". */
1483 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1484 int argc OVS_UNUSED, const char *argv[],
1485 void *aux OVS_UNUSED)
1487 const char *bond_s = argv[1];
1488 const char *slave_s = argv[2];
1490 struct bond_slave *slave;
1492 ovs_rwlock_wrlock(&rwlock);
1493 bond = bond_find(bond_s);
1495 unixctl_command_reply_error(conn, "no such bond");
1499 slave = bond_lookup_slave(bond, slave_s);
1501 unixctl_command_reply_error(conn, "no such slave");
1505 if (!slave->enabled) {
1506 unixctl_command_reply_error(conn, "cannot make disabled slave active");
/* On an actual change: request revalidation, switch the active slave, log
 * it, request learning packets, reply, and finally call
 * bond_active_slave_changed(). */
1510 if (bond->active_slave != slave) {
1511 bond->bond_revalidate = true;
1512 bond->active_slave = slave;
1513 VLOG_INFO("bond %s: active interface is now %s",
1514 bond->name, slave->name);
1515 bond->send_learning_packets = true;
1516 unixctl_command_reply(conn, "done");
1517 bond_active_slave_changed(bond);
1519 unixctl_command_reply(conn, "no change");
1522 ovs_rwlock_unlock(&rwlock);
/* Shared implementation of the enable-slave and disable-slave handlers.
 *
 * Looks up the bond named argv[1] and, within it, the slave named argv[2],
 * then calls bond_enable_slave() with 'enable'.  The whole operation runs
 * under a write lock on 'rwlock'.
 *
 * Replies "enabled" or "disabled" according to 'enable'.  Error replies:
 * "no such bond" and "no such slave". */
1526 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1528 const char *bond_s = argv[1];
1529 const char *slave_s = argv[2];
1531 struct bond_slave *slave;
1533 ovs_rwlock_wrlock(&rwlock);
1534 bond = bond_find(bond_s);
1536 unixctl_command_reply_error(conn, "no such bond");
1540 slave = bond_lookup_slave(bond, slave_s);
1542 unixctl_command_reply_error(conn, "no such slave");
1546 bond_enable_slave(slave, enable);
1547 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1550 ovs_rwlock_unlock(&rwlock);
/* unixctl handler (registered in this file under "bond/enable-slave" with the
 * arguments "port slave").  Forwards to enable_slave() with 'enable' set to
 * true; 'argc' and 'aux' are unused. */
1554 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1555 int argc OVS_UNUSED, const char *argv[],
1556 void *aux OVS_UNUSED)
1558 enable_slave(conn, argv, true);
/* unixctl handler (registered in this file under "bond/disable-slave" with
 * the arguments "port slave").  Forwards to enable_slave() with 'enable' set
 * to false; 'argc' and 'aux' are unused. */
1562 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1563 int argc OVS_UNUSED, const char *argv[],
1564 void *aux OVS_UNUSED)
1566 enable_slave(conn, argv, false);
/* unixctl handler (registered in this file under "bond/hash" with the
 * arguments "mac [vlan] [basis]").
 *
 * Parses the Ethernet address in argv[1] plus the optional VLAN (argv[2]) and
 * hash basis (argv[3]), then replies with bond_hash_src() of those values
 * reduced with BOND_MASK, formatted as an unsigned decimal number.
 *
 * Error replies: "invalid vlan", "invalid basis" and "invalid mac".
 *
 * NOTE(review): the values used for 'vlan' and 'basis' when the optional
 * arguments are absent, and the release of 'hash_cstr', are not visible in
 * this excerpt. */
1570 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1571 void *aux OVS_UNUSED)
1573 const char *mac_s = argv[1];
/* Optional arguments are NULL when not supplied. */
1574 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1575 const char *basis_s = argc > 3 ? argv[3] : NULL;
1576 uint8_t mac[ETH_ADDR_LEN];
1583 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1584 unixctl_command_reply_error(conn, "invalid vlan");
1592 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1593 unixctl_command_reply_error(conn, "invalid basis");
1600 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1601 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
/* xasprintf() allocates the reply string. */
1603 hash_cstr = xasprintf("%u", hash);
1604 unixctl_command_reply(conn, hash_cstr);
1607 unixctl_command_reply_error(conn, "invalid mac");
/* Registers the bonding unixctl commands and binds each one to its handler in
 * this file.  Each registration gives the command name, a usage string, two
 * integers and the handler, with NULL as the final argument.
 *
 * NOTE(review): the header of the enclosing function is not visible in this
 * excerpt.  The two integers are presumably the minimum and maximum number of
 * arguments accepted -- confirm against the unixctl API. */
1614 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1615 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1617 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1618 bond_unixctl_migrate, NULL);
1619 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1620 bond_unixctl_set_active_slave, NULL);
1621 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1622 bond_unixctl_enable_slave, NULL);
1623 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1624 bond_unixctl_disable_slave, NULL);
1625 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1626 bond_unixctl_hash, NULL);
/* Resets the hash table of 'bond'.
 *
 * For every mode other than BM_AB the table of BOND_BUCKETS entries is
 * zero-filled, which leaves every bucket unassigned with no recorded traffic,
 * and the next rebalance is scheduled 'rebalance_interval' milliseconds from
 * now.
 *
 * NOTE(review): the condition guarding the xmalloc() call and the handling of
 * BM_AB bonds are not visible in this excerpt. */
1630 bond_entry_reset(struct bond *bond)
1632 if (bond->balance != BM_AB) {
1633 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1636 bond->hash = xmalloc(hash_len);
1638 memset(bond->hash, 0, hash_len);
1640 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* Searches for the slave whose 'aux' pointer equals 'slave_'.  Candidates are
 * taken from the hash bucket selected by hash_pointer(slave_, 0).
 *
 * NOTE(review): the map being searched and the return statements are not
 * visible in this excerpt; presumably the matching slave is returned, or NULL
 * when there is none -- confirm. */
1647 static struct bond_slave *
1648 bond_slave_lookup(struct bond *bond, const void *slave_)
1650 struct bond_slave *slave;
1652 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1654 if (slave->aux == slave_) {
/* Sets the enabled state of 'slave' to 'enable'.
 *
 * Any pending up/down delay is always cancelled.  If the state actually
 * changes, the owning bond is flagged for revalidation, the slave is added to
 * or removed from the bond's 'enabled_slaves' list while holding the bond's
 * mutex, and the new state is logged. */
1663 bond_enable_slave(struct bond_slave *slave, bool enable)
/* LLONG_MAX means that no delay is pending. */
1665 slave->delay_expires = LLONG_MAX;
1666 if (enable != slave->enabled) {
1667 slave->bond->bond_revalidate = true;
1668 slave->enabled = enable;
/* NOTE(review): the test that chooses between list_insert() and
 * list_remove() is not visible; presumably it is on 'enable'. */
1670 ovs_mutex_lock(&slave->bond->mutex);
1672 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1674 list_remove(&slave->list_node);
1676 ovs_mutex_unlock(&slave->bond->mutex);
1678 VLOG_INFO("interface %s: %s", slave->name,
1679 slave->enabled ? "enabled" : "disabled");
/* Re-evaluates whether 'slave' should be enabled and manages its up/down
 * delay timer.
 *
 * A slave counts as "up" when its netdev has carrier and 'may_enable' is set.
 * When the up state disagrees with the enabled state, a delay is started;
 * when they agree again, the pending delay is cancelled.  Once the delay has
 * expired, the slave's enabled state is brought in line with "up". */
1684 bond_link_status_update(struct bond_slave *slave)
1686 struct bond *bond = slave->bond;
1689 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
/* This is true in two cases: the states agree but a delay is still pending,
 * or the states disagree and no delay has been started yet. */
1690 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1691 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1692 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1693 slave->name, up ? "up" : "down");
1694 if (up == slave->enabled) {
/* The link returned to its previous state: cancel the pending change. */
1695 slave->delay_expires = LLONG_MAX;
1696 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1697 slave->name, up ? "disabled" : "enabled");
/* No delay is applied unless LACP is disabled; otherwise 'updelay' or
 * 'downdelay' is used depending on the direction of the change. */
1699 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1700 : up ? bond->updelay : bond->downdelay);
1701 slave->delay_expires = time_msec() + delay;
1703 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1706 up ? "enabled" : "disabled",
/* 'delay_expires' is LLONG_MAX when nothing is pending, so this test only
 * fires for a delay that has actually run out. */
1713 if (time_msec() >= slave->delay_expires) {
1714 bond_enable_slave(slave, up);
/* Returns the hash of source Ethernet address 'mac' combined with 'vlan' and
 * 'basis', as computed by hash_mac().  Callers in this file reduce the result
 * with BOND_MASK to obtain a bucket index. */
1719 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1721 return hash_mac(mac, vlan, basis);
/* Returns a hash of 'flow' computed by flow_hash_symmetric_l4() with 'basis'.
 * The hash is taken over a private copy of the flow whose 'vlan_tci' has been
 * replaced by 'vlan' (converted with htons()), so 'flow' itself is never
 * modified. */
1725 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1727 struct flow hash_flow = *flow;
1728 hash_flow.vlan_tci = htons(vlan);
1730 /* The symmetric quality of this hash function is not required, but
1731 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1732 * purposes, so we use it out of convenience. */
1733 return flow_hash_symmetric_l4(&hash_flow, basis);
/* Returns the hash of 'flow' for 'bond', using the bond's own hash basis.
 * BM_TCP bonds hash with bond_hash_tcp(); BM_SLB bonds hash the flow's source
 * Ethernet address with bond_hash_src().  Any other balancing mode trips the
 * assertion. */
1737 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1739 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1741 return (bond->balance == BM_TCP
1742 ? bond_hash_tcp(flow, vlan, bond->basis)
1743 : bond_hash_src(flow->dl_src, vlan, bond->basis));
/* Returns the hash bucket of 'bond' that 'flow' maps to: the bond_hash()
 * value reduced with BOND_MASK is used as an index into 'bond->hash'.
 *
 * NOTE(review): the last parameter line is not visible in this excerpt; the
 * body uses a 'vlan' value that is presumably declared there. */
1746 static struct bond_entry *
1747 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1750 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
/* Implementation note for get_enabled_slave(): the rotation is done entirely
 * under 'bond->mutex'.  The node at the front of 'enabled_slaves' is popped,
 * pushed onto the back of the same list, and the bond_slave containing that
 * node is returned.  The mutex is released on the empty-list path as well. */
1753 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1754 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1756 static struct bond_slave *
1757 get_enabled_slave(struct bond *bond)
1761 ovs_mutex_lock(&bond->mutex);
1762 if (list_is_empty(&bond->enabled_slaves)) {
1763 ovs_mutex_unlock(&bond->mutex);
1767 node = list_pop_front(&bond->enabled_slaves);
1768 list_push_back(&bond->enabled_slaves, node);
1769 ovs_mutex_unlock(&bond->mutex);
1771 return CONTAINER_OF(node, struct bond_slave, list_node);
/* Chooses the slave of 'bond' that should carry 'flow' on VLAN 'vlan'.
 *
 * The fields that feed the hash are un-wildcarded in 'wc' through
 * flow_mask_hash_fields(): the symmetric L4 fields in one branch and the
 * Ethernet source address in the other.  When the flow's hash bucket has no
 * slave, or its slave is disabled, the bucket is reassigned to the next
 * enabled slave from get_enabled_slave().
 *
 * NOTE(review): the switch on the balancing mode and several return
 * statements are not visible in this excerpt, so the exact mapping of modes
 * to branches and the values returned on the drop paths cannot be confirmed
 * from here. */
1774 static struct bond_slave *
1775 choose_output_slave(const struct bond *bond, const struct flow *flow,
1776 struct flow_wildcards *wc, uint16_t vlan)
1778 struct bond_entry *e;
1781 balance = bond->balance;
1782 if (bond->lacp_status == LACP_CONFIGURED) {
1783 /* LACP has been configured on this bond but negotiations were
1784 * unsuccessful. If lacp_fallback_ab is enabled use active-
1785 * backup mode else drop all traffic. */
1786 if (!bond->lacp_fallback_ab) {
1794 return bond->active_slave;
1797 if (bond->lacp_status != LACP_NEGOTIATED) {
1798 /* Must have LACP negotiations for TCP balanced bonds. */
1802 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1807 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1809 e = lookup_bond_entry(bond, flow, vlan);
/* get_enabled_slave() rotates the bond's enabled list, so the const
 * qualifier has to be cast away for this one call. */
1810 if (!e->slave || !e->slave->enabled) {
1811 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
/* Picks a candidate active slave for 'bond', considering in this order:
 *
 *   1. the slave that owns 'bond->active_slave_mac', if it is enabled;
 *   2. any enabled slave;
 *   3. among slaves with a pending delay and 'may_enable' set, the one whose
 *      delay expires soonest.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably each step returns its candidate and the last step returns NULL
 * when nothing qualifies -- confirm. */
1820 static struct bond_slave *
1821 bond_choose_slave(const struct bond *bond)
1823 struct bond_slave *slave, *best;
1825 /* Find the last active slave. */
1826 slave = bond_find_slave_by_mac(bond, bond->active_slave_mac);
1827 if (slave && slave->enabled) {
1831 /* Find an enabled slave. */
1832 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1833 if (slave->enabled) {
1838 /* All interfaces are disabled. Find an interface that will be enabled
1839 * after its updelay expires. */
1841 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1842 if (slave->delay_expires != LLONG_MAX
1843 && slave->may_enable
1844 && (!best || slave->delay_expires < best->delay_expires)) {
/* Replaces the active slave of 'bond' with the result of bond_choose_slave().
 *
 * If a slave was chosen, the choice is logged and learning packets are
 * requested.  A chosen slave that is not yet enabled is enabled immediately,
 * skipping what remains of its updelay.  bond_active_slave_changed() is
 * called when the choice differs from the previous active slave.
 *
 * If no slave could be chosen and there used to be an active slave, the loss
 * of all interfaces is logged.  Log messages are rate-limited. */
1852 bond_choose_active_slave(struct bond *bond)
1854 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* Remember the previous choice so that a change can be detected below. */
1855 struct bond_slave *old_active_slave = bond->active_slave;
1857 bond->active_slave = bond_choose_slave(bond);
1858 if (bond->active_slave) {
1859 if (bond->active_slave->enabled) {
1860 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1861 bond->name, bond->active_slave->name);
1863 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1864 "remaining %lld ms updelay (since no interface was "
1865 "enabled)", bond->name, bond->active_slave->name,
1866 bond->active_slave->delay_expires - time_msec());
1867 bond_enable_slave(bond->active_slave, true);
1870 bond->send_learning_packets = true;
1872 if (bond->active_slave != old_active_slave) {
1873 bond_active_slave_changed(bond);
1875 } else if (old_active_slave) {
1876 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1880 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1881 * bond interface. */
/* The totals are accumulated over every slave whose statistics can be read
 * and are then written to the "system" netdev that has the same name as the
 * bond.  Slaves whose statistics cannot be read are left out, and nothing is
 * written if that netdev cannot be opened. */
1883 bond_update_fake_slave_stats(struct bond *bond)
1885 struct netdev_stats bond_stats;
1886 struct bond_slave *slave;
1887 struct netdev *bond_dev;
/* Start every counter at zero. */
1889 memset(&bond_stats, 0, sizeof bond_stats);
1891 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1892 struct netdev_stats slave_stats;
/* A zero return from netdev_get_stats() is treated as success. */
1894 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1895 /* XXX: We swap the stats here because they are swapped back when
1896 * reported by the internal device. The reason for this is
1897 * internal devices normally represent packets going into the
1898 * system but when used as fake bond device they represent packets
1899 * leaving the system. We really should do this in the internal
1900 * device itself because changing it here reverses the counts from
1901 * the perspective of the switch. However, the internal device
1902 * doesn't know what type of device it represents so we have to do
1903 * it here for now. */
1904 bond_stats.tx_packets += slave_stats.rx_packets;
1905 bond_stats.tx_bytes += slave_stats.rx_bytes;
1906 bond_stats.rx_packets += slave_stats.tx_packets;
1907 bond_stats.rx_bytes += slave_stats.tx_bytes;
/* A zero return from netdev_open() is treated as success; the device is
 * closed again as soon as the statistics have been stored. */
1911 if (!netdev_open(bond->name, "system", &bond_dev)) {
1912 netdev_set_stats(bond_dev, &bond_stats);
1913 netdev_close(bond_dev);
1918 * Return true if bond has unstored active slave change.
1919 * If return true, 'mac' will store the bond's current active slave's
1922 bond_get_changed_active_slave(const char *name, uint8_t* mac, bool force)
1926 ovs_rwlock_wrlock(&rwlock);
1927 bond = bond_find(name);
1929 if (bond->active_slave_changed || force) {
1930 memcpy(mac, bond->active_slave_mac, ETH_ADDR_LEN);
1931 bond->active_slave_changed = false;
1932 ovs_rwlock_unlock(&rwlock);
1936 ovs_rwlock_unlock(&rwlock);