2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "ofp-actions.h"
29 #include "ofproto/ofproto-provider.h"
30 #include "ofproto/ofproto-dpif.h"
31 #include "connectivity.h"
33 #include "dynamic-string.h"
42 #include "poll-loop.h"
50 VLOG_DEFINE_THIS_MODULE(bond);
/* Lock for this module.  'all_bonds' is annotated as guarded by it, and many
 * functions in this file are annotated as requiring it. */
52 static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
/* Backing storage for 'all_bonds', which is a constant pointer to it.  Bonds
 * are inserted under the hash of their name (see bond_reconfigure()). */
53 static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
54 static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
56 /* Bit-mask for hashing a flow down to a bucket. */
57 #define BOND_MASK 0xff
/* Number of hash buckets: one per value that fits under BOND_MASK (256). */
58 #define BOND_BUCKETS (BOND_MASK + 1)
60 /* A hash bucket for mapping a flow to a slave.
61 * "struct bond" has an array of BOND_BUCKETS of these. */
63 struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */
/* 'tx_bytes' is incremented by bond_account() and bond_entry_account(). */
64 uint64_t tx_bytes /* Count of bytes recently transmitted. */
65 OVS_GUARDED_BY(rwlock);
/* bond_rebalance() pushes this node onto the assigned slave's 'entries'. */
66 struct list list_node; /* In bond_slave's 'entries' list. */
/* NOTE(review): the declaration of the 'pr_rule' member described below is
 * not visible in this view; bond_recirculation_account() reads it into a
 * 'struct rule *'. */
70 * 'pr_rule' is the post-recirculation rule for this entry.
71 * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which
72 * is used to determine delta (applied to 'tx_bytes' above.) */
74 uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock);
77 /* A bond slave, that is, one of the links comprising a bond. */
79 struct hmap_node hmap_node; /* In struct bond's slaves hmap. */
80 struct list list_node; /* In struct bond's enabled_slaves list. */
81 struct bond *bond; /* The bond that contains this slave. */
82 void *aux; /* Client-provided handle for this slave. */
84 struct netdev *netdev; /* Network device, owned by the client. */
/* 'change_seq' is reset to 0 when 'netdev' is replaced and is refreshed from
 * the connectivity sequence in bond_run(). */
85 unsigned int change_seq; /* Tracks changes in 'netdev'. */
86 ofp_port_t ofp_port; /* OpenFlow port number. */
87 char *name; /* Name (a copy of netdev_get_name(netdev)). */
/* 'delay_expires' appears to be LLONG_MAX while no delay timer is pending:
 * it is initialized to that value and bond_wait() skips it in that case. */
90 long long delay_expires; /* Time after which 'enabled' may change. */
91 bool enabled; /* May be chosen for flows? */
92 bool may_enable; /* Client considers this slave bondable. */
94 /* Rebalancing info. Used only by bond_rebalance(). */
95 struct list bal_node; /* In bond_rebalance()'s 'bals' list. */
96 struct list entries; /* 'struct bond_entry's assigned here. */
97 uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */
100 /* A bond, that is, a set of network devices grouped to improve performance or robustness. */
103 struct hmap_node hmap_node; /* In 'all_bonds' hmap. */
104 char *name; /* Name provided by client. */
105 struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */
112 * Any reader or writer of 'enabled_slaves' must hold 'mutex'.
113 * (To prevent the bond_slave from disappearing they must also hold
115 struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock);
116 struct list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */
119 enum bond_mode balance; /* Balancing mode, one of BM_*. */
120 struct bond_slave *active_slave;
121 int updelay, downdelay; /* Delay before slave goes up/down, in ms. */
122 enum lacp_status lacp_status; /* Status of LACP negotiations. */
/* bond_run() reports this flag to its caller and clears it. */
123 bool bond_revalidate; /* True if flows need revalidation. */
124 uint32_t basis; /* Basis for flow hash function. */
126 /* SLB specific bonding info. */
127 struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
128 int rebalance_interval; /* Interval between rebalances, in ms. */
129 long long int next_rebalance; /* Next rebalancing time. */
130 bool send_learning_packets;
/* Allocated in bond_reconfigure() while the bond is not in active-backup
 * mode, and released there or in bond_unref(). */
131 uint32_t recirc_id; /* Non zero if recirculation can be used.*/
132 struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/
134 /* Legacy compatibility. */
/* bond_run() refreshes the fake interface stats once this time is reached. */
135 long long int next_fake_iface_update; /* LLONG_MAX if disabled. */
136 bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */
/* Reference count, manipulated by bond_ref() and bond_unref(). */
138 struct ovs_refcount ref_cnt;
141 /* What to do with a bond_recirc_rule (presumably the 'struct
 * bond_pr_rule_op' declared below -- no type named bond_recirc_rule is
 * visible in this file; TODO confirm). */
143 ADD, /* Add the rule to ofproto's flow table. */
144 DEL, /* Delete the rule from the ofproto's flow table. */
147 /* A rule to add to or delete from ofproto's internal flow table. */
148 struct bond_pr_rule_op {
149 struct hmap_node hmap_node; /* In struct bond's 'pr_rule_ops' hmap. */
151 ofp_port_t out_ofport; /* Port that the rule outputs to. */
/* Location of the rule pointer: handed to ofproto_dpif_add_internal_flow(),
 * and the pointed-to value is set to NULL in update_recirc_rules(). */
153 struct rule **pr_rule;
/* Prototypes for file-local (static) helpers.  The OVS_REQ_RDLOCK and
 * OVS_REQ_WRLOCK annotations name the mode in which 'rwlock' is required by
 * each helper. */
156 static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock);
157 static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_)
158 OVS_REQ_RDLOCK(rwlock);
159 static void bond_enable_slave(struct bond_slave *, bool enable)
160 OVS_REQ_WRLOCK(rwlock);
161 static void bond_link_status_update(struct bond_slave *)
162 OVS_REQ_WRLOCK(rwlock);
163 static void bond_choose_active_slave(struct bond *)
164 OVS_REQ_WRLOCK(rwlock);
165 static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN],
166 uint16_t vlan, uint32_t basis);
167 static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
169 static struct bond_entry *lookup_bond_entry(const struct bond *,
172 OVS_REQ_RDLOCK(rwlock);
173 static struct bond_slave *get_enabled_slave(struct bond *)
174 OVS_REQ_RDLOCK(rwlock);
175 static struct bond_slave *choose_output_slave(const struct bond *,
177 struct flow_wildcards *,
179 OVS_REQ_RDLOCK(rwlock);
180 static void bond_update_fake_slave_stats(struct bond *)
181 OVS_REQ_RDLOCK(rwlock);
183 /* Attempts to parse 's' as the name of a bond balancing mode. If successful,
184 * stores the mode in '*balance' and returns true. Otherwise returns false
185 * without modifying '*balance'. */
187 bond_mode_from_string(enum bond_mode *balance, const char *s)
/* 's' is compared against the canonical name of each mode, as produced by
 * bond_mode_to_string(), so the two functions stay in agreement. */
189 if (!strcmp(s, bond_mode_to_string(BM_TCP))) {
191 } else if (!strcmp(s, bond_mode_to_string(BM_SLB))) {
193 } else if (!strcmp(s, bond_mode_to_string(BM_AB))) {
201 /* Returns a string representing 'balance'. */
203 bond_mode_to_string(enum bond_mode balance) {
/* These strings are also the inputs that bond_mode_from_string() accepts. */
206 return "balance-tcp";
208 return "balance-slb";
210 return "active-backup";
216 /* Creates and returns a new bond whose configuration is initially taken from 's'.
219 * The caller should register each slave on the new bond by calling
220 * bond_slave_register(). */
222 bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto)
/* The bond comes from xzalloc(), so fields not set below (such as 'name',
 * 'hash' and 'recirc_id') presumably start out as NULL/0. */
226 bond = xzalloc(sizeof *bond);
227 bond->ofproto = ofproto;
228 hmap_init(&bond->slaves);
229 list_init(&bond->enabled_slaves);
230 ovs_mutex_init(&bond->mutex);
/* LLONG_MAX marks the fake interface stats update as disabled. */
231 bond->next_fake_iface_update = LLONG_MAX;
232 ovs_refcount_init(&bond->ref_cnt);
235 hmap_init(&bond->pr_rule_ops);
/* Applies 's'.  bond_reconfigure() is also where the bond gets its name and
 * is inserted into 'all_bonds'. */
237 bond_reconfigure(bond, s);
/* Takes an additional reference on 'bond_' by incrementing its 'ref_cnt'. */
242 bond_ref(const struct bond *bond_)
/* The const is cast away in order to update the counter; presumably the
 * parameter is const so that holders of a const pointer can take a
 * reference. */
244 struct bond *bond = CONST_CAST(struct bond *, bond_);
247 ovs_refcount_ref(&bond->ref_cnt);
/* Drops a reference on 'bond' and tears the bond down.  The check at the top
 * singles out a null 'bond' and any ovs_refcount_unref() result other than 1
 * (presumably meaning that other references remain -- confirm against
 * ovs_refcount_unref()). */
254 bond_unref(struct bond *bond)
256 struct bond_slave *slave, *next_slave;
257 struct bond_pr_rule_op *pr_op, *next_op;
259 if (!bond || ovs_refcount_unref(&bond->ref_cnt) != 1) {
/* Only the removal from the global 'all_bonds' map is done under 'rwlock'. */
263 ovs_rwlock_wrlock(&rwlock);
264 hmap_remove(all_bonds, &bond->hmap_node);
265 ovs_rwlock_unlock(&rwlock);
/* The _SAFE iterators are used because nodes are removed while iterating. */
267 HMAP_FOR_EACH_SAFE (slave, next_slave, hmap_node, &bond->slaves) {
268 hmap_remove(&bond->slaves, &slave->hmap_node);
269 /* Client owns 'slave->netdev'. */
273 hmap_destroy(&bond->slaves);
275 ovs_mutex_destroy(&bond->mutex);
279 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
280 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
283 hmap_destroy(&bond->pr_rule_ops);
/* A nonzero 'recirc_id' is handed back to the bond's ofproto. */
285 if (bond->recirc_id) {
286 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
/* Records in 'bond->pr_rule_ops' that a rule for 'match' should output to
 * 'out_ofport', with 'rule' as the location associated with the rule pointer.
 * An existing op with an equal match has its fields updated; the code after
 * the loop allocates and inserts a new op. */
293 add_pr_rule(struct bond *bond, const struct match *match,
294 ofp_port_t out_ofport, struct rule **rule)
/* Ops are stored under the hash of their match, so only ops with the same
 * hash need to be compared. */
296 uint32_t hash = match_hash(match, 0);
297 struct bond_pr_rule_op *pr_op;
299 HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) {
300 if (match_equal(&pr_op->match, match)) {
302 pr_op->out_ofport = out_ofport;
303 pr_op->pr_rule = rule;
/* New op: the match is copied by value into the op. */
308 pr_op = xmalloc(sizeof *pr_op);
309 pr_op->match = *match;
311 pr_op->out_ofport = out_ofport;
312 pr_op->pr_rule = rule;
313 hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash);
/* Brings the post-recirculation rules in 'bond->ofproto' in line with the
 * bucket-to-slave assignments in 'bond->hash': rules for buckets are requested
 * through add_pr_rule(), then the ops in 'bond->pr_rule_ops' are carried out
 * by adding or deleting internal flows. */
317 update_recirc_rules(struct bond *bond)
320 struct bond_pr_rule_op *pr_op, *next_op;
321 uint64_t ofpacts_stub[128 / 8];
322 struct ofpbuf ofpacts;
325 ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
327 HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
/* Rules are only requested when the hash table exists and a recirculation ID
 * has been allocated. */
331 if (bond->hash && bond->recirc_id) {
332 for (i = 0; i < BOND_BUCKETS; i++) {
333 struct bond_slave *slave = bond->hash[i].slave;
/* The match is built from this bond's recirc_id plus the bucket index 'i'
 * under BOND_MASK (presumably compared against the packet's dp_hash). */
336 match_init_catchall(&match);
337 match_set_recirc_id(&match, bond->recirc_id);
338 match_set_dp_hash_masked(&match, i, BOND_MASK);
340 add_pr_rule(bond, &match, slave->ofp_port,
341 &bond->hash[i].pr_rule);
/* Carry out each op.  The _SAFE iterator is used because ops are removed from
 * the map inside the loop. */
346 HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
/* The action list is rebuilt for each rule: a single output to the op's
 * 'out_ofport'. */
350 ofpbuf_clear(&ofpacts);
351 ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport;
352 error = ofproto_dpif_add_internal_flow(bond->ofproto,
354 RECIRC_RULE_PRIORITY, 0,
355 &ofpacts, pr_op->pr_rule);
357 char *err_s = match_to_string(&pr_op->match,
358 RECIRC_RULE_PRIORITY);
360 VLOG_ERR("failed to add post recirculation flow %s", err_s);
366 error = ofproto_dpif_delete_internal_flow(bond->ofproto,
368 RECIRC_RULE_PRIORITY);
370 char *err_s = match_to_string(&pr_op->match,
371 RECIRC_RULE_PRIORITY);
373 VLOG_ERR("failed to remove post recirculation flow %s", err_s);
/* The op is dropped from the map and the rule pointer it refers to is
 * cleared. */
377 hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
378 *pr_op->pr_rule = NULL;
384 ofpbuf_uninit(&ofpacts);
388 /* Updates 'bond''s overall configuration to 's'.
390 * The caller should register each slave on 'bond' by calling
391 * bond_slave_register(). This is optional if none of the slaves'
392 * configuration has changed. In any case it can't hurt.
394 * Returns true if the configuration has changed in such a way that requires flow revalidation. */
398 bond_reconfigure(struct bond *bond, const struct bond_settings *s)
400 bool revalidate = false;
402 ovs_rwlock_wrlock(&rwlock);
/* 'all_bonds' is keyed by the hash of the bond's name, so a new or changed
 * name means (re)inserting the bond under the new hash. */
403 if (!bond->name || strcmp(bond->name, s->name)) {
405 hmap_remove(all_bonds, &bond->hmap_node);
408 bond->name = xstrdup(s->name);
409 hmap_insert(all_bonds, &bond->hmap_node, hash_string(bond->name, 0));
412 bond->updelay = s->up_delay;
413 bond->downdelay = s->down_delay;
/* Each of the following settings is compared with the current value before
 * being copied, so that a change can be detected. */
415 if (bond->lacp_fallback_ab != s->lacp_fallback_ab_cfg) {
416 bond->lacp_fallback_ab = s->lacp_fallback_ab_cfg;
420 if (bond->rebalance_interval != s->rebalance_interval) {
421 bond->rebalance_interval = s->rebalance_interval;
425 if (bond->balance != s->balance) {
426 bond->balance = s->balance;
430 if (bond->basis != s->basis) {
431 bond->basis = s->basis;
/* 'next_fake_iface_update' is either armed with the current time or set to
 * LLONG_MAX, which means disabled. */
436 if (bond->next_fake_iface_update == LLONG_MAX) {
437 bond->next_fake_iface_update = time_msec();
440 bond->next_fake_iface_update = LLONG_MAX;
443 if (bond->bond_revalidate) {
445 bond->bond_revalidate = false;
/* A recirculation ID is held only while the bond is not in active-backup
 * mode: it is allocated on demand here and released otherwise. */
448 if (bond->balance != BM_AB) {
449 if (!bond->recirc_id) {
450 bond->recirc_id = ofproto_dpif_alloc_recirc_id(bond->ofproto);
452 } else if (bond->recirc_id) {
453 ofproto_dpif_free_recirc_id(bond->ofproto, bond->recirc_id);
/* The hash entries are reset in active-backup mode, when no hash table
 * exists yet, or when the configuration change calls for revalidation. */
457 if (bond->balance == BM_AB || !bond->hash || revalidate) {
458 bond_entry_reset(bond);
461 ovs_rwlock_unlock(&rwlock);
/* Points 'slave' at 'netdev'.  When the device actually changes, 'change_seq'
 * is reset to 0.  The caller must hold 'rwlock' for writing. */
466 bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev)
467 OVS_REQ_WRLOCK(rwlock)
469 if (slave->netdev != netdev) {
470 slave->netdev = netdev;
471 slave->change_seq = 0;
475 /* Registers 'slave_' as a slave of 'bond'. The 'slave_' pointer is an
476 * arbitrary client-provided pointer that uniquely identifies a slave within a
477 * bond. If 'slave_' already exists within 'bond' then this function
478 * reconfigures the existing slave.
480 * 'netdev' must be the network device that 'slave_' represents. It is owned
481 * by the client, so the client must not close it before either unregistering
482 * 'slave_' or destroying 'bond'. */
485 bond_slave_register(struct bond *bond, void *slave_,
486 ofp_port_t ofport, struct netdev *netdev)
488 struct bond_slave *slave;
490 ovs_rwlock_wrlock(&rwlock);
491 slave = bond_slave_lookup(bond, slave_);
/* Set up a new slave.  The slaves map is keyed by the hash of the client's
 * 'slave_' pointer. */
493 slave = xzalloc(sizeof *slave);
495 hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0));
498 slave->ofp_port = ofport;
499 slave->delay_expires = LLONG_MAX;
500 slave->name = xstrdup(netdev_get_name(netdev));
501 bond->bond_revalidate = true;
/* The initial enabled state follows the carrier of the device. */
503 slave->enabled = false;
504 bond_enable_slave(slave, netdev_get_carrier(netdev));
507 bond_slave_set_netdev__(slave, netdev);
/* NOTE(review): 'name' is assigned here as well as in the new-slave setup
 * above; confirm that the previous string is freed before this assignment. */
510 slave->name = xstrdup(netdev_get_name(netdev));
511 ovs_rwlock_unlock(&rwlock);
514 /* Updates the network device to be used with 'slave_' to 'netdev'.
516 * This is useful if the caller closes and re-opens the network device
517 * registered with bond_slave_register() but doesn't need to change anything else. */
520 bond_slave_set_netdev(struct bond *bond, void *slave_, struct netdev *netdev)
522 struct bond_slave *slave;
/* Locked wrapper: looks up 'slave_' under the write lock and delegates to
 * bond_slave_set_netdev__(). */
524 ovs_rwlock_wrlock(&rwlock);
525 slave = bond_slave_lookup(bond, slave_);
527 bond_slave_set_netdev__(slave, netdev);
529 ovs_rwlock_unlock(&rwlock);
532 /* Unregisters 'slave_' from 'bond'. If 'bond' does not contain such a slave
533 * then this function has no effect.
535 * Unregistering a slave invalidates all flows. */
537 bond_slave_unregister(struct bond *bond, const void *slave_)
539 struct bond_slave *slave;
542 ovs_rwlock_wrlock(&rwlock);
543 slave = bond_slave_lookup(bond, slave_);
/* Flag revalidation and disable the slave before it is taken apart. */
548 bond->bond_revalidate = true;
549 bond_enable_slave(slave, false);
/* Remember whether the departing slave was the active one (presumably
 * consulted before bond_choose_active_slave() below). */
551 del_active = bond->active_slave == slave;
/* Visit every hash bucket, looking for the ones assigned to 'slave'. */
553 struct bond_entry *e;
554 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
555 if (e->slave == slave) {
563 hmap_remove(&bond->slaves, &slave->hmap_node);
564 /* Client owns 'slave->netdev'. */
568 bond_choose_active_slave(bond);
569 bond->send_learning_packets = true;
572 ovs_rwlock_unlock(&rwlock);
575 /* Should be called on each slave in 'bond' before bond_run() to indicate
576 * whether or not 'slave_' may be enabled. This function is intended to allow
577 * other protocols to have some impact on bonding decisions. For example LACP
578 * or high level link monitoring protocols may decide that a given slave should
579 * not be able to send traffic. */
581 bond_slave_set_may_enable(struct bond *bond, void *slave_, bool may_enable)
583 ovs_rwlock_wrlock(&rwlock);
/* NOTE(review): the lookup result is dereferenced without a check.  If
 * bond_slave_lookup() can return NULL for an unknown 'slave_' (TODO confirm),
 * callers must only pass slaves that are registered on 'bond'. */
584 bond_slave_lookup(bond, slave_)->may_enable = may_enable;
585 ovs_rwlock_unlock(&rwlock);
588 /* Performs periodic maintenance on 'bond'.
590 * Returns true if the caller should revalidate its flows.
592 * The caller should check bond_should_send_learning_packets() afterward. */
594 bond_run(struct bond *bond, enum lacp_status lacp_status)
596 struct bond_slave *slave;
599 ovs_rwlock_wrlock(&rwlock);
/* A change in LACP status flags the flows for revalidation. */
600 if (bond->lacp_status != lacp_status) {
601 bond->lacp_status = lacp_status;
602 bond->bond_revalidate = true;
605 /* Enable slaves based on link status and LACP feedback. */
606 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
607 bond_link_status_update(slave);
/* Record the connectivity sequence number seen on this pass; bond_wait()
 * waits on this value. */
608 slave->change_seq = seq_read(connectivity_seq_get());
/* Pick a new active slave when there is none or the current one is
 * disabled. */
610 if (!bond->active_slave || !bond->active_slave->enabled) {
611 bond_choose_active_slave(bond);
614 /* Update fake bond interface stats. */
615 if (time_msec() >= bond->next_fake_iface_update) {
616 bond_update_fake_slave_stats(bond);
/* Schedule the next stats update 1000 ms from now. */
617 bond->next_fake_iface_update = time_msec() + 1000;
/* Snapshot the revalidation flag and clear it. */
620 revalidate = bond->bond_revalidate;
621 bond->bond_revalidate = false;
622 ovs_rwlock_unlock(&rwlock);
627 /* Causes poll_block() to wake up when 'bond' needs something to be done. */
629 bond_wait(struct bond *bond)
631 struct bond_slave *slave;
633 ovs_rwlock_rdlock(&rwlock);
634 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
/* A 'delay_expires' other than LLONG_MAX is a pending timer to wake up for. */
635 if (slave->delay_expires != LLONG_MAX) {
636 poll_timer_wait_until(slave->delay_expires);
/* Wait on the connectivity sequence relative to the value that bond_run()
 * recorded for this slave. */
639 seq_wait(connectivity_seq_get(), slave->change_seq);
642 if (bond->next_fake_iface_update != LLONG_MAX) {
643 poll_timer_wait_until(bond->next_fake_iface_update);
/* Pending revalidation: wake up right away. */
646 if (bond->bond_revalidate) {
647 poll_immediate_wake();
649 ovs_rwlock_unlock(&rwlock);
651 /* We don't wait for bond->next_rebalance because rebalancing can only run
652 * at a flow account checkpoint. ofproto does checkpointing on its own
653 * schedule and bond_rebalance() gets called afterward, so we'd just be
654 * waking up for no purpose. */
657 /* MAC learning table interaction. */
/* Returns true when 'bond' has an active slave and either LACP is disabled
 * with the bond in SLB or active-backup mode, or LACP is in the configured
 * state and fallback to active-backup is enabled. */
660 may_send_learning_packets(const struct bond *bond)
662 return ((bond->lacp_status == LACP_DISABLED
663 && (bond->balance == BM_SLB || bond->balance == BM_AB))
664 || (bond->lacp_fallback_ab && bond->lacp_status == LACP_CONFIGURED))
665 && bond->active_slave;
668 /* Returns true if 'bond' needs the client to send out packets to assist with
669 * MAC learning on 'bond'. If this function returns true, then the client
670 * should iterate through its MAC learning table for the bridge on which 'bond'
671 * is located. For each MAC that has been learned on a port other than 'bond',
672 * it should call bond_compose_learning_packet().
674 * This function will only return true if 'bond' is in SLB or active-backup
675 * mode and LACP is not negotiated. Otherwise sending learning packets isn't necessary.
678 * Calling this function resets the state that it checks. */
680 bond_should_send_learning_packets(struct bond *bond)
/* The write lock is taken because the pending flag is modified below. */
684 ovs_rwlock_wrlock(&rwlock);
685 send = bond->send_learning_packets && may_send_learning_packets(bond);
/* The flag is cleared unconditionally, whatever the value of 'send'. */
686 bond->send_learning_packets = false;
687 ovs_rwlock_unlock(&rwlock);
691 /* Sends a gratuitous learning packet on 'bond' from 'eth_src' on 'vlan'.
693 * See bond_should_send_learning_packets() for description of usage. The
694 * caller should send the composed packet on the port associated with
695 * port_aux and takes ownership of the returned ofpbuf. */
697 bond_compose_learning_packet(struct bond *bond,
698 const uint8_t eth_src[ETH_ADDR_LEN],
699 uint16_t vlan, void **port_aux)
701 struct bond_slave *slave;
702 struct ofpbuf *packet;
705 ovs_rwlock_rdlock(&rwlock);
/* Calling this when learning packets may not be sent is a caller bug. */
706 ovs_assert(may_send_learning_packets(bond));
/* The flow used for slave selection carries only the source MAC; every other
 * field is zero. */
707 memset(&flow, 0, sizeof flow);
708 memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN);
709 slave = choose_output_slave(bond, &flow, NULL, vlan);
/* The learning packet is composed by compose_rarp() from 'eth_src'. */
711 packet = ofpbuf_new(0);
712 compose_rarp(packet, eth_src);
714 eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan));
/* NOTE(review): no NULL check on 'slave' is visible before this dereference,
 * although bond_choose_output_slave() treats a NULL result from
 * choose_output_slave() as possible -- confirm. */
717 *port_aux = slave->aux;
718 ovs_rwlock_unlock(&rwlock);
722 /* Checks whether a packet that arrived on 'slave_' within 'bond', with an
723 * Ethernet destination address of 'eth_dst', should be admitted.
725 * The return value is one of the following:
727 * - BV_ACCEPT: Admit the packet.
729 * - BV_DROP: Drop the packet.
731 * - BV_DROP_IF_MOVED: Consult the MAC learning table for the packet's
732 * Ethernet source address and VLAN. If there is none, or if the packet
733 * is on the learned port, then admit the packet. If a different port has
734 * been learned, however, drop the packet (and do not use it for MAC learning). */
738 bond_check_admissibility(struct bond *bond, const void *slave_,
739 const uint8_t eth_dst[ETH_ADDR_LEN])
/* The verdict starts out as BV_DROP; the cases below may replace it. */
741 enum bond_verdict verdict = BV_DROP;
742 struct bond_slave *slave;
744 ovs_rwlock_rdlock(&rwlock);
745 slave = bond_slave_lookup(bond, slave_);
750 /* LACP bonds have very loose admissibility restrictions because we can
751 * assume the remote switch is aware of the bond and will "do the right
752 * thing". However, as a precaution we drop packets on disabled slaves
753 * because no correctly implemented partner switch should be sending packets to them.
756 * If LACP is configured, but LACP negotiations have been unsuccessful, we
757 * drop all incoming traffic except if lacp_fallback_ab is enabled. */
758 switch (bond->lacp_status) {
759 case LACP_NEGOTIATED:
760 verdict = slave->enabled ? BV_ACCEPT : BV_DROP;
762 case LACP_CONFIGURED:
763 if (!bond->lacp_fallback_ab) {
770 /* Drop all multicast packets on inactive slaves. */
771 if (eth_addr_is_multicast(eth_dst)) {
772 if (bond->active_slave != slave) {
777 switch (bond->balance) {
779 /* TCP balanced bonds require successful LACP negotiations. Based on the
780 * above check, LACP is off or lacp_fallback_ab is true on this bond.
781 * If lacp_fallback_ab is true fall through to BM_AB case else, we
782 * drop all incoming traffic. */
783 if (!bond->lacp_fallback_ab) {
788 /* Drop all packets which arrive on backup slaves. This is similar to
789 * how Linux bonding handles active-backup bonds. */
790 if (bond->active_slave != slave) {
791 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
793 VLOG_DBG_RL(&rl, "active-backup bond received packet on backup"
794 " slave (%s) destined for " ETH_ADDR_FMT,
795 slave->name, ETH_ADDR_ARGS(eth_dst));
802 /* Drop all packets for which we have learned a different input port,
803 * because we probably sent the packet on one slave and got it back on
804 * the other. Gratuitous ARP packets are an exception to this rule:
805 * the host has moved to another switch. The exception to the
806 * exception is if we locked the learning table to avoid reflections on
808 verdict = BV_DROP_IF_MOVED;
814 ovs_rwlock_unlock(&rwlock);
819 /* Returns the slave (registered on 'bond' by bond_slave_register()) to which
820 * a packet with the given 'flow' and 'vlan' should be forwarded. Returns
821 * NULL if the packet should be dropped because no slaves are enabled.
823 * 'vlan' is not necessarily the same as 'flow->vlan_tci'. First, 'vlan'
824 * should be a VID only (i.e. excluding the PCP bits). Second,
825 * 'flow->vlan_tci' is the VLAN TCI that appeared on the packet (so it will be
826 * nonzero only for trunk ports), whereas 'vlan' is the logical VLAN that the
827 * packet belongs to (so for an access port it will be the access port's VLAN).
829 * If 'wc' is non-NULL, bitwise-OR's 'wc' with the set of bits that were
830 * significant in the selection. At some point earlier, 'wc' should
831 * have been initialized (e.g., by flow_wildcards_init_catchall()). */
834 bond_choose_output_slave(struct bond *bond, const struct flow *flow,
835 struct flow_wildcards *wc, uint16_t vlan)
837 struct bond_slave *slave;
/* Locked wrapper around choose_output_slave().  The chosen slave is
 * translated into the client's 'aux' handle, or NULL when none was chosen. */
840 ovs_rwlock_rdlock(&rwlock);
841 slave = choose_output_slave(bond, flow, wc, vlan);
842 aux = slave ? slave->aux : NULL;
843 ovs_rwlock_unlock(&rwlock);
/* Credits 'entry' with the bytes newly reported for its post-recirculation
 * rule: the difference between 'rule_tx_bytes' and the previously stored
 * 'pr_tx_bytes' is added to 'tx_bytes', and 'rule_tx_bytes' becomes the new
 * stored value.  The caller must hold 'rwlock' for writing. */
850 bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes)
851 OVS_REQ_WRLOCK(rwlock)
856 delta = rule_tx_bytes - entry->pr_tx_bytes;
857 entry->tx_bytes += delta;
858 entry->pr_tx_bytes = rule_tx_bytes;
862 /* Maintain bond stats using post recirculation rule byte counters.*/
864 bond_recirculation_account(struct bond *bond)
865 OVS_REQ_WRLOCK(rwlock)
/* Visits every hash bucket, indexes 0 through BOND_MASK inclusive. */
869 for (i=0; i<=BOND_MASK; i++) {
870 struct bond_entry *entry = &bond->hash[i];
871 struct rule *rule = entry->pr_rule;
/* Only the byte count is used afterwards; the packet count and the last-used
 * time are outputs of rule_get_stats() that go unused here. */
874 uint64_t n_packets OVS_UNUSED;
875 long long int used OVS_UNUSED;
878 rule->ofproto->ofproto_class->rule_get_stats(
879 rule, &n_packets, &n_bytes, &used);
880 bond_entry_account(entry, n_bytes);
/* Reports whether 'bond' may use recirculation, which the visible check ties
 * to balance-tcp mode with a nonzero 'recirc_id'.  In that case the bond's
 * 'recirc_id' and 'basis' are written through the output pointers.
 * NOTE(review): bond_rebalance() passes NULL for the output pointers;
 * confirm that the stores below are guarded accordingly. */
886 bond_may_recirc(const struct bond *bond, uint32_t *recirc_id,
889 if (bond->balance == BM_TCP && bond->recirc_id) {
891 *recirc_id = bond->recirc_id;
894 *hash_bias = bond->basis;
/* Reassigns every hash bucket whose slave is missing or disabled and then
 * refreshes the post-recirculation rules through update_recirc_rules().
 * 'force' seeds the decision on whether the rules get updated. */
903 bond_update_post_recirc_rules(struct bond* bond, const bool force)
905 struct bond_entry *e;
906 bool update_rules = force; /* Always update rules if caller forces it. */
908 /* Make sure all bond entries are populated */
909 for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) {
910 if (!e->slave || !e->slave->enabled) {
/* Try a randomly picked slave first; if that one is not enabled, fall back to
 * the active slave.
 * NOTE(review): confirm that 'bond->slaves' cannot be empty here, since the
 * result of hmap_random_node() is converted and dereferenced without a
 * check.  Likewise 'active_slave' may presumably be NULL. */
912 e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves),
913 struct bond_slave, hmap_node);
914 if (!e->slave->enabled) {
915 e->slave = bond->active_slave;
921 update_recirc_rules(bond);
/* Returns true if 'bond' has a nonzero rebalance interval and is in SLB or
 * TCP balancing mode. */
928 bond_is_balanced(const struct bond *bond) OVS_REQ_RDLOCK(rwlock)
930 return bond->rebalance_interval
931 && (bond->balance == BM_SLB || bond->balance == BM_TCP);
934 /* Notifies 'bond' that 'n_bytes' bytes were sent in 'flow' within 'vlan'. */
936 bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan,
939 ovs_rwlock_wrlock(&rwlock);
/* Bytes are tracked only for bonds that rebalance; they are added to the
 * hash entry that 'flow' and 'vlan' map to. */
940 if (bond_is_balanced(bond)) {
941 lookup_bond_entry(bond, flow, vlan)->tx_bytes += n_bytes;
943 ovs_rwlock_unlock(&rwlock);
/* Converts 'bal', a pointer to the 'bal_node' member of a struct bond_slave,
 * into a pointer to the enclosing slave. */
946 static struct bond_slave *
947 bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock)
949 return CONTAINER_OF(bal, struct bond_slave, bal_node);
/* When debug logging is enabled, logs one line for 'bond' that lists each
 * slave in 'bals' with its load in kB, a "(disabled)" marker where it applies,
 * and the hash entries assigned to the slave with their individual loads. */
953 log_bals(struct bond *bond, const struct list *bals)
954 OVS_REQ_RDLOCK(rwlock)
/* The string is only built when it will actually be logged. */
956 if (VLOG_IS_DBG_ENABLED()) {
957 struct ds ds = DS_EMPTY_INITIALIZER;
958 const struct bond_slave *slave;
960 LIST_FOR_EACH (slave, bal_node, bals) {
962 ds_put_char(&ds, ',');
964 ds_put_format(&ds, " %s %"PRIu64"kB",
965 slave->name, slave->tx_bytes / 1024);
967 if (!slave->enabled) {
968 ds_put_cstr(&ds, " (disabled)");
970 if (!list_is_empty(&slave->entries)) {
971 struct bond_entry *e;
973 ds_put_cstr(&ds, " (");
974 LIST_FOR_EACH (e, list_node, &slave->entries) {
/* Every entry except the first is preceded by a " + " separator. */
975 if (&e->list_node != list_front(&slave->entries)) {
976 ds_put_cstr(&ds, " + ");
/* 'e - bond->hash' is the index of the entry within the hash array. */
978 ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
979 e - bond->hash, e->tx_bytes / 1024);
981 ds_put_cstr(&ds, ")");
984 VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
989 /* Shifts 'hash' from its current slave to 'to'. */
991 bond_shift_load(struct bond_entry *hash, struct bond_slave *to)
992 OVS_REQ_WRLOCK(rwlock)
994 struct bond_slave *from = hash->slave;
995 struct bond *bond = from->bond;
/* The whole load of the entry moves with it. */
996 uint64_t delta = hash->tx_bytes;
/* The log message reports the loads as they will be after the shift;
 * 'hash - bond->hash' is the index of the entry within the hash array. */
998 VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
999 "from %s to %s (now carrying %"PRIu64"kB and "
1000 "%"PRIu64"kB load, respectively)",
1001 bond->name, delta / 1024, hash - bond->hash,
1002 from->name, to->name,
1003 (from->tx_bytes - delta) / 1024,
1004 (to->tx_bytes + delta) / 1024);
1006 /* Shift load away from 'from' to 'to'. */
1007 from->tx_bytes -= delta;
1008 to->tx_bytes += delta;
1010 /* Arrange for flows to be revalidated. */
1012 bond->bond_revalidate = true;
1015 /* Picks and returns a bond_entry to migrate from 'from' (the most heavily
1016 * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
1017 * given that doing so must decrease the ratio of the load on the two slaves by
1018 * at least 0.1. Returns NULL if there is no appropriate entry.
1020 * The list of entries isn't sorted. I don't know of a reason to prefer to
1021 * shift away small hashes or large hashes. */
1022 static struct bond_entry *
1023 choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
1024 OVS_REQ_WRLOCK(rwlock)
1026 struct bond_entry *e;
1028 if (list_is_short(&from->entries)) {
1029 /* 'from' carries no more than one MAC hash, so shifting load away from
1030 * it would be pointless. */
1034 LIST_FOR_EACH (e, list_node, &from->entries) {
1035 double old_ratio, new_ratio;
1038 if (to_tx_bytes == 0) {
1039 /* Nothing on the new slave, move it. */
/* Ratio of the load on 'from' to the load on the target slave, before and
 * after moving 'e'.  The case of a zero 'to_tx_bytes' is handled separately
 * above. */
1043 delta = e->tx_bytes;
1044 old_ratio = (double)from->tx_bytes / to_tx_bytes;
1045 new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta);
1046 if (old_ratio - new_ratio > 0.1
1047 && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) {
1048 /* We're aiming for an ideal ratio of 1, meaning both the 'from'
1049 and 'to' slave have the same load. Therefore, we only move an
1050 entry if it decreases the load on 'from', and brings us closer
1051 to equal traffic load. */
1059 /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is maintained. */
1062 insert_bal(struct list *bals, struct bond_slave *slave)
1064 struct bond_slave *pos;
/* Look for the first slave in 'bals' that carries less load than 'slave'. */
1066 LIST_FOR_EACH (pos, bal_node, bals) {
1067 if (slave->tx_bytes > pos->tx_bytes) {
/* Insert relative to 'pos'.  NOTE(review): when the loop runs to completion
 * this presumably relies on LIST_FOR_EACH leaving '&pos->bal_node' equal to
 * 'bals', which would make this an append -- confirm against the list
 * macros. */
1071 list_insert(&pos->bal_node, &slave->bal_node);
1074 /* Removes 'slave' from its current list and then inserts it into 'bals' so
1075 * that descending order of 'tx_bytes' is maintained. */
1077 reinsert_bal(struct list *bals, struct bond_slave *slave)
/* Unlink first, then let insert_bal() find the position that matches the
 * current 'tx_bytes' of the slave. */
1079 list_remove(&slave->bal_node);
1080 insert_bal(bals, slave);
1083 /* If 'bond' needs rebalancing, does so.
1085 * The caller should have called bond_account() for each active flow, or in case
1086 * of recirculation is used, have called bond_recirculation_account(bond),
1087 * to ensure that flow data is consistently accounted at this point. */
1090 bond_rebalance(struct bond *bond)
1092 struct bond_slave *slave;
1093 struct bond_entry *e;
1095 bool rebalanced = false;
1098 ovs_rwlock_wrlock(&rwlock);
/* Check for bonds that do not rebalance and for an interval that has not yet
 * elapsed. */
1099 if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) {
1102 bond->next_rebalance = time_msec() + bond->rebalance_interval;
/* Recirculation is used only when the ofproto has it enabled and
 * bond_may_recirc() agrees; the output pointers are not needed here. */
1104 use_recirc = ofproto_dpif_get_enable_recirc(bond->ofproto) &&
1105 bond_may_recirc(bond, NULL, NULL);
1108 bond_recirculation_account(bond);
1111 /* Add each bond_entry to its slave's 'entries' list.
1112 * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */
1113 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1114 slave->tx_bytes = 0;
1115 list_init(&slave->entries);
1117 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
1118 if (e->slave && e->tx_bytes) {
1119 e->slave->tx_bytes += e->tx_bytes;
1120 list_push_back(&e->slave->entries, &e->list_node);
1124 /* Add enabled slaves to 'bals' in descending order of tx_bytes.
1126 * XXX This is O(n**2) in the number of slaves but it could be O(n lg n)
1127 * with a proper list sort algorithm. */
1129 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1130 if (slave->enabled) {
1131 insert_bal(&bals, slave);
1134 log_bals(bond, &bals);
1136 /* Shift load from the most-loaded slaves to the least-loaded slaves. */
1137 while (!list_is_short(&bals)) {
1138 struct bond_slave *from = bond_slave_from_bal_node(list_front(&bals));
1139 struct bond_slave *to = bond_slave_from_bal_node(list_back(&bals));
/* 'to->tx_bytes >> 5' is 1/32 of the load on 'to', that is, about 3%. */
1142 overload = from->tx_bytes - to->tx_bytes;
1143 if (overload < to->tx_bytes >> 5 || overload < 100000) {
1144 /* The extra load on 'from' (and all less-loaded slaves), compared
1145 * to that of 'to' (the least-loaded slave), is less than ~3%, or
1146 * it is less than ~1Mbps. No point in rebalancing. */
1150 /* 'from' is carrying significantly more load than 'to'. Pick a hash
1151 * to move from 'from' to 'to'. */
1152 e = choose_entry_to_migrate(from, to->tx_bytes);
1154 bond_shift_load(e, to);
1156 /* Delete element from from->entries.
1158 * We don't add the element to to->entries. That would only allow
1159 * 'e' to be migrated to another slave in this rebalancing run, and
1160 * there is no point in doing that. */
1161 list_remove(&e->list_node);
1163 /* Re-sort 'bals'. */
1164 reinsert_bal(&bals, from);
1165 reinsert_bal(&bals, to);
1168 /* Can't usefully migrate anything away from 'from'.
1169 * Don't reconsider it. */
1170 list_remove(&from->bal_node);
1174 /* Implement exponentially weighted moving average. A weight of 1/2 causes
1175 * historical data to decay to <1% in 7 rebalancing runs. 1,000,000 bytes
1176 * take 20 rebalancing runs to decay to 0 and get deleted entirely. */
1177 for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) {
/* With recirculation in use, a rebalance means that the post-recirculation
 * rules have to follow the new assignments. */
1181 if (use_recirc && rebalanced) {
1182 bond_update_post_recirc_rules(bond,true);
1186 ovs_rwlock_unlock(&rwlock);
1189 /* Bonding unixctl user interface functions. */
/* Looks up a bond by 'name'.  The search uses hash_string(name, 0), the same
 * hash under which bond_reconfigure() inserts bonds into 'all_bonds', and
 * then confirms the match by comparing the names.  The caller must hold
 * 'rwlock'. */
1191 static struct bond *
1192 bond_find(const char *name) OVS_REQ_RDLOCK(rwlock)
1196 HMAP_FOR_EACH_WITH_HASH (bond, hmap_node, hash_string(name, 0),
1198 if (!strcmp(bond->name, name)) {
/* Searches the slaves of 'bond' for one whose name equals 'slave_name'.  The
 * slaves map is not keyed by name, so every slave is visited in turn. */
1205 static struct bond_slave *
1206 bond_lookup_slave(struct bond *bond, const char *slave_name)
1208 struct bond_slave *slave;
1210 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1211 if (!strcmp(slave->name, slave_name)) {
/* unixctl handler that replies on 'conn' with a tab-separated table of all
 * bonds: name, balancing mode, recirculation ID and the names of the slaves.
 * The remaining arguments are unused. */
1219 bond_unixctl_list(struct unixctl_conn *conn,
1220 int argc OVS_UNUSED, const char *argv[] OVS_UNUSED,
1221 void *aux OVS_UNUSED)
1223 struct ds ds = DS_EMPTY_INITIALIZER;
1224 const struct bond *bond;
1226 ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n");
/* 'all_bonds' is walked under the read lock. */
1228 ovs_rwlock_rdlock(&rwlock);
1229 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1230 const struct bond_slave *slave;
/* NOTE(review): 'recirc_id' is declared as uint32_t but is formatted with
 * "%d" here. */
1233 ds_put_format(&ds, "%s\t%s\t%d\t", bond->name,
1234 bond_mode_to_string(bond->balance), bond->recirc_id);
1237 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1239 ds_put_cstr(&ds, ", ");
1241 ds_put_cstr(&ds, slave->name);
1243 ds_put_char(&ds, '\n');
1245 ovs_rwlock_unlock(&rwlock);
1246 unixctl_command_reply(conn, ds_cstr(&ds));
1251 bond_print_details(struct ds *ds, const struct bond *bond)
1252 OVS_REQ_RDLOCK(rwlock)
1254 struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
1255 const struct shash_node **sorted_slaves = NULL;
1256 const struct bond_slave *slave;
1261 ds_put_format(ds, "---- %s ----\n", bond->name);
1262 ds_put_format(ds, "bond_mode: %s\n",
1263 bond_mode_to_string(bond->balance));
1265 may_recirc = bond_may_recirc(bond, &recirc_id, NULL);
1266 ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n",
1267 may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1);
1269 ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
1271 ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
1272 ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
1274 if (bond_is_balanced(bond)) {
1275 ds_put_format(ds, "next rebalance: %lld ms\n",
1276 bond->next_rebalance - time_msec());
1279 ds_put_cstr(ds, "lacp_status: ");
1280 switch (bond->lacp_status) {
1281 case LACP_NEGOTIATED:
1282 ds_put_cstr(ds, "negotiated\n");
1284 case LACP_CONFIGURED:
1285 ds_put_cstr(ds, "configured\n");
1288 ds_put_cstr(ds, "off\n");
1291 ds_put_cstr(ds, "<unknown>\n");
1295 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1296 shash_add(&slave_shash, slave->name, slave);
1298 sorted_slaves = shash_sort(&slave_shash);
1300 for (i = 0; i < shash_count(&slave_shash); i++) {
1301 struct bond_entry *be;
1303 slave = sorted_slaves[i]->data;
1306 ds_put_format(ds, "\nslave %s: %s\n",
1307 slave->name, slave->enabled ? "enabled" : "disabled");
1308 if (slave == bond->active_slave) {
1309 ds_put_cstr(ds, "\tactive slave\n");
1311 if (slave->delay_expires != LLONG_MAX) {
1312 ds_put_format(ds, "\t%s expires in %lld ms\n",
1313 slave->enabled ? "downdelay" : "updelay",
1314 slave->delay_expires - time_msec());
1317 ds_put_format(ds, "\tmay_enable: %s\n",
1318 slave->may_enable ? "true" : "false");
1320 if (!bond_is_balanced(bond)) {
1325 for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) {
1326 int hash = be - bond->hash;
1329 if (be->slave != slave) {
1333 be_tx_k = be->tx_bytes / 1024;
1335 ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n",
1339 /* XXX How can we list the MACs assigned to hashes of SLB bonds? */
1342 shash_destroy(&slave_shash);
1343 free(sorted_slaves);
1344 ds_put_cstr(ds, "\n");
1348 bond_unixctl_show(struct unixctl_conn *conn,
1349 int argc, const char *argv[],
1350 void *aux OVS_UNUSED)
1352 struct ds ds = DS_EMPTY_INITIALIZER;
1354 ovs_rwlock_rdlock(&rwlock);
1356 const struct bond *bond = bond_find(argv[1]);
1359 unixctl_command_reply_error(conn, "no such bond");
1362 bond_print_details(&ds, bond);
1364 const struct bond *bond;
1366 HMAP_FOR_EACH (bond, hmap_node, all_bonds) {
1367 bond_print_details(&ds, bond);
1371 unixctl_command_reply(conn, ds_cstr(&ds));
1375 ovs_rwlock_unlock(&rwlock);
1379 bond_unixctl_migrate(struct unixctl_conn *conn,
1380 int argc OVS_UNUSED, const char *argv[],
1381 void *aux OVS_UNUSED)
1383 const char *bond_s = argv[1];
1384 const char *hash_s = argv[2];
1385 const char *slave_s = argv[3];
1387 struct bond_slave *slave;
1388 struct bond_entry *entry;
1391 ovs_rwlock_wrlock(&rwlock);
1392 bond = bond_find(bond_s);
1394 unixctl_command_reply_error(conn, "no such bond");
1398 if (bond->balance != BM_SLB) {
1399 unixctl_command_reply_error(conn, "not an SLB bond");
1403 if (strspn(hash_s, "0123456789") == strlen(hash_s)) {
1404 hash = atoi(hash_s) & BOND_MASK;
1406 unixctl_command_reply_error(conn, "bad hash");
1410 slave = bond_lookup_slave(bond, slave_s);
1412 unixctl_command_reply_error(conn, "no such slave");
1416 if (!slave->enabled) {
1417 unixctl_command_reply_error(conn, "cannot migrate to disabled slave");
1421 entry = &bond->hash[hash];
1422 bond->bond_revalidate = true;
1423 entry->slave = slave;
1424 unixctl_command_reply(conn, "migrated");
1427 ovs_rwlock_unlock(&rwlock);
1431 bond_unixctl_set_active_slave(struct unixctl_conn *conn,
1432 int argc OVS_UNUSED, const char *argv[],
1433 void *aux OVS_UNUSED)
1435 const char *bond_s = argv[1];
1436 const char *slave_s = argv[2];
1438 struct bond_slave *slave;
1440 ovs_rwlock_wrlock(&rwlock);
1441 bond = bond_find(bond_s);
1443 unixctl_command_reply_error(conn, "no such bond");
1447 slave = bond_lookup_slave(bond, slave_s);
1449 unixctl_command_reply_error(conn, "no such slave");
1453 if (!slave->enabled) {
1454 unixctl_command_reply_error(conn, "cannot make disabled slave active");
1458 if (bond->active_slave != slave) {
1459 bond->bond_revalidate = true;
1460 bond->active_slave = slave;
1461 VLOG_INFO("bond %s: active interface is now %s",
1462 bond->name, slave->name);
1463 bond->send_learning_packets = true;
1464 unixctl_command_reply(conn, "done");
1466 unixctl_command_reply(conn, "no change");
1469 ovs_rwlock_unlock(&rwlock);
1473 enable_slave(struct unixctl_conn *conn, const char *argv[], bool enable)
1475 const char *bond_s = argv[1];
1476 const char *slave_s = argv[2];
1478 struct bond_slave *slave;
1480 ovs_rwlock_wrlock(&rwlock);
1481 bond = bond_find(bond_s);
1483 unixctl_command_reply_error(conn, "no such bond");
1487 slave = bond_lookup_slave(bond, slave_s);
1489 unixctl_command_reply_error(conn, "no such slave");
1493 bond_enable_slave(slave, enable);
1494 unixctl_command_reply(conn, enable ? "enabled" : "disabled");
1497 ovs_rwlock_unlock(&rwlock);
1501 bond_unixctl_enable_slave(struct unixctl_conn *conn,
1502 int argc OVS_UNUSED, const char *argv[],
1503 void *aux OVS_UNUSED)
1505 enable_slave(conn, argv, true);
1509 bond_unixctl_disable_slave(struct unixctl_conn *conn,
1510 int argc OVS_UNUSED, const char *argv[],
1511 void *aux OVS_UNUSED)
1513 enable_slave(conn, argv, false);
1517 bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[],
1518 void *aux OVS_UNUSED)
1520 const char *mac_s = argv[1];
1521 const char *vlan_s = argc > 2 ? argv[2] : NULL;
1522 const char *basis_s = argc > 3 ? argv[3] : NULL;
1523 uint8_t mac[ETH_ADDR_LEN];
1530 if (!ovs_scan(vlan_s, "%u", &vlan)) {
1531 unixctl_command_reply_error(conn, "invalid vlan");
1539 if (!ovs_scan(basis_s, "%"SCNu32, &basis)) {
1540 unixctl_command_reply_error(conn, "invalid basis");
1547 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
1548 hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
1550 hash_cstr = xasprintf("%u", hash);
1551 unixctl_command_reply(conn, hash_cstr);
1554 unixctl_command_reply_error(conn, "invalid mac");
1561 unixctl_command_register("bond/list", "", 0, 0, bond_unixctl_list, NULL);
1562 unixctl_command_register("bond/show", "[port]", 0, 1, bond_unixctl_show,
1564 unixctl_command_register("bond/migrate", "port hash slave", 3, 3,
1565 bond_unixctl_migrate, NULL);
1566 unixctl_command_register("bond/set-active-slave", "port slave", 2, 2,
1567 bond_unixctl_set_active_slave, NULL);
1568 unixctl_command_register("bond/enable-slave", "port slave", 2, 2,
1569 bond_unixctl_enable_slave, NULL);
1570 unixctl_command_register("bond/disable-slave", "port slave", 2, 2,
1571 bond_unixctl_disable_slave, NULL);
1572 unixctl_command_register("bond/hash", "mac [vlan] [basis]", 1, 3,
1573 bond_unixctl_hash, NULL);
1577 bond_entry_reset(struct bond *bond)
1579 if (bond->balance != BM_AB) {
1580 size_t hash_len = BOND_BUCKETS * sizeof *bond->hash;
1583 bond->hash = xmalloc(hash_len);
1585 memset(bond->hash, 0, hash_len);
1587 bond->next_rebalance = time_msec() + bond->rebalance_interval;
1594 static struct bond_slave *
1595 bond_slave_lookup(struct bond *bond, const void *slave_)
1597 struct bond_slave *slave;
1599 HMAP_FOR_EACH_IN_BUCKET (slave, hmap_node, hash_pointer(slave_, 0),
1601 if (slave->aux == slave_) {
1610 bond_enable_slave(struct bond_slave *slave, bool enable)
1612 slave->delay_expires = LLONG_MAX;
1613 if (enable != slave->enabled) {
1614 slave->bond->bond_revalidate = true;
1615 slave->enabled = enable;
1617 ovs_mutex_lock(&slave->bond->mutex);
1619 list_insert(&slave->bond->enabled_slaves, &slave->list_node);
1621 list_remove(&slave->list_node);
1623 ovs_mutex_unlock(&slave->bond->mutex);
1625 VLOG_INFO("interface %s: %s", slave->name,
1626 slave->enabled ? "enabled" : "disabled");
1631 bond_link_status_update(struct bond_slave *slave)
1633 struct bond *bond = slave->bond;
1636 up = netdev_get_carrier(slave->netdev) && slave->may_enable;
1637 if ((up == slave->enabled) != (slave->delay_expires == LLONG_MAX)) {
1638 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1639 VLOG_INFO_RL(&rl, "interface %s: link state %s",
1640 slave->name, up ? "up" : "down");
1641 if (up == slave->enabled) {
1642 slave->delay_expires = LLONG_MAX;
1643 VLOG_INFO_RL(&rl, "interface %s: will not be %s",
1644 slave->name, up ? "disabled" : "enabled");
1646 int delay = (bond->lacp_status != LACP_DISABLED ? 0
1647 : up ? bond->updelay : bond->downdelay);
1648 slave->delay_expires = time_msec() + delay;
1650 VLOG_INFO_RL(&rl, "interface %s: will be %s if it stays %s "
1653 up ? "enabled" : "disabled",
1660 if (time_msec() >= slave->delay_expires) {
1661 bond_enable_slave(slave, up);
1666 bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis)
1668 return hash_mac(mac, vlan, basis);
1672 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
1674 struct flow hash_flow = *flow;
1675 hash_flow.vlan_tci = htons(vlan);
1677 /* The symmetric quality of this hash function is not required, but
1678 * flow_hash_symmetric_l4 already exists, and is sufficient for our
1679 * purposes, so we use it out of convenience. */
1680 return flow_hash_symmetric_l4(&hash_flow, basis);
1684 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
1686 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
1688 return (bond->balance == BM_TCP
1689 ? bond_hash_tcp(flow, vlan, bond->basis)
1690 : bond_hash_src(flow->dl_src, vlan, bond->basis));
1693 static struct bond_entry *
1694 lookup_bond_entry(const struct bond *bond, const struct flow *flow,
1697 return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK];
1700 /* Selects and returns an enabled slave from the 'enabled_slaves' list
1701 * in a round-robin fashion. If the 'enabled_slaves' list is empty,
1703 static struct bond_slave *
1704 get_enabled_slave(struct bond *bond)
1708 ovs_mutex_lock(&bond->mutex);
1709 if (list_is_empty(&bond->enabled_slaves)) {
1710 ovs_mutex_unlock(&bond->mutex);
1714 node = list_pop_front(&bond->enabled_slaves);
1715 list_push_back(&bond->enabled_slaves, node);
1716 ovs_mutex_unlock(&bond->mutex);
1718 return CONTAINER_OF(node, struct bond_slave, list_node);
1721 static struct bond_slave *
1722 choose_output_slave(const struct bond *bond, const struct flow *flow,
1723 struct flow_wildcards *wc, uint16_t vlan)
1725 struct bond_entry *e;
1728 balance = bond->balance;
1729 if (bond->lacp_status == LACP_CONFIGURED) {
1730 /* LACP has been configured on this bond but negotiations were
1731 * unsuccussful. If lacp_fallback_ab is enabled use active-
1732 * backup mode else drop all traffic. */
1733 if (!bond->lacp_fallback_ab) {
1741 return bond->active_slave;
1744 if (bond->lacp_status != LACP_NEGOTIATED) {
1745 /* Must have LACP negotiations for TCP balanced bonds. */
1749 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_SYMMETRIC_L4);
1754 flow_mask_hash_fields(flow, wc, NX_HASH_FIELDS_ETH_SRC);
1756 e = lookup_bond_entry(bond, flow, vlan);
1757 if (!e->slave || !e->slave->enabled) {
1758 e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond));
1767 static struct bond_slave *
1768 bond_choose_slave(const struct bond *bond)
1770 struct bond_slave *slave, *best;
1772 /* Find an enabled slave. */
1773 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1774 if (slave->enabled) {
1779 /* All interfaces are disabled. Find an interface that will be enabled
1780 * after its updelay expires. */
1782 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1783 if (slave->delay_expires != LLONG_MAX
1784 && slave->may_enable
1785 && (!best || slave->delay_expires < best->delay_expires)) {
1793 bond_choose_active_slave(struct bond *bond)
1795 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
1796 struct bond_slave *old_active_slave = bond->active_slave;
1798 bond->active_slave = bond_choose_slave(bond);
1799 if (bond->active_slave) {
1800 if (bond->active_slave->enabled) {
1801 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s",
1802 bond->name, bond->active_slave->name);
1804 VLOG_INFO_RL(&rl, "bond %s: active interface is now %s, skipping "
1805 "remaining %lld ms updelay (since no interface was "
1806 "enabled)", bond->name, bond->active_slave->name,
1807 bond->active_slave->delay_expires - time_msec());
1808 bond_enable_slave(bond->active_slave, true);
1811 bond->send_learning_packets = true;
1812 } else if (old_active_slave) {
1813 VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name);
1817 /* Attempts to make the sum of the bond slaves' statistics appear on the fake
1818 * bond interface. */
1820 bond_update_fake_slave_stats(struct bond *bond)
1822 struct netdev_stats bond_stats;
1823 struct bond_slave *slave;
1824 struct netdev *bond_dev;
1826 memset(&bond_stats, 0, sizeof bond_stats);
1828 HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) {
1829 struct netdev_stats slave_stats;
1831 if (!netdev_get_stats(slave->netdev, &slave_stats)) {
1832 /* XXX: We swap the stats here because they are swapped back when
1833 * reported by the internal device. The reason for this is
1834 * internal devices normally represent packets going into the
1835 * system but when used as fake bond device they represent packets
1836 * leaving the system. We really should do this in the internal
1837 * device itself because changing it here reverses the counts from
1838 * the perspective of the switch. However, the internal device
1839 * doesn't know what type of device it represents so we have to do
1840 * it here for now. */
1841 bond_stats.tx_packets += slave_stats.rx_packets;
1842 bond_stats.tx_bytes += slave_stats.rx_bytes;
1843 bond_stats.rx_packets += slave_stats.tx_packets;
1844 bond_stats.rx_bytes += slave_stats.tx_bytes;
1848 if (!netdev_open(bond->name, "system", &bond_dev)) {
1849 netdev_set_stats(bond_dev, &bond_stats);
1850 netdev_close(bond_dev);