X-Git-Url: http://git.cascardo.eti.br/?a=blobdiff_plain;f=ofproto%2Fbond.c;h=1dbf8f112b62a792901cdc387a8fc1fbff8cfc9a;hb=ac6d120f8e8ad1802b7d89dcf3c6e6d9d399cdf7;hp=ff050f19e8f8001820d20f2c390c55e3f640a525;hpb=428b2eddc9c47d8306252f0fc5218839d2ff017c;p=cascardo%2Fovs.git diff --git a/ofproto/bond.c b/ofproto/bond.c index ff050f19e..1dbf8f112 100644 --- a/ofproto/bond.c +++ b/ofproto/bond.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc. + * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,12 @@ #include #include +#include "ofp-util.h" +#include "ofp-actions.h" +#include "ofpbuf.h" +#include "ofproto/ofproto-provider.h" +#include "ofproto/ofproto-dpif.h" +#include "ofproto/ofproto-dpif-rid.h" #include "connectivity.h" #include "coverage.h" #include "dynamic-string.h" @@ -34,35 +40,52 @@ #include "odp-util.h" #include "ofpbuf.h" #include "packets.h" +#include "dp-packet.h" #include "poll-loop.h" #include "seq.h" +#include "match.h" #include "shash.h" #include "timeval.h" #include "unixctl.h" -#include "vlog.h" +#include "openvswitch/vlog.h" VLOG_DEFINE_THIS_MODULE(bond); -/* Bit-mask for hashing a flow down to a bucket. - * There are (BOND_MASK + 1) buckets. */ +static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER; +static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__); +static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__; + +/* Bit-mask for hashing a flow down to a bucket. */ #define BOND_MASK 0xff +#define BOND_BUCKETS (BOND_MASK + 1) /* A hash bucket for mapping a flow to a slave. - * "struct bond" has an array of (BOND_MASK + 1) of these. */ + * "struct bond" has an array of BOND_BUCKETS of these. */ struct bond_entry { struct bond_slave *slave; /* Assigned slave, NULL if unassigned. */ - uint64_t tx_bytes; /* Count of bytes recently transmitted. */ - struct list list_node; /* In bond_slave's 'entries' list. */ + uint64_t tx_bytes /* Count of bytes recently transmitted. */ + OVS_GUARDED_BY(rwlock); + struct ovs_list list_node; /* In bond_slave's 'entries' list. */ + + /* Recirculation. + * + * 'pr_rule' is the post-recirculation rule for this entry. + * 'pr_tx_bytes' is the most recently seen statistics for 'pr_rule', which + * is used to determine delta (applied to 'tx_bytes' above.) */ + struct rule *pr_rule; + uint64_t pr_tx_bytes OVS_GUARDED_BY(rwlock); }; /* A bond slave, that is, one of the links comprising a bond. */ struct bond_slave { struct hmap_node hmap_node; /* In struct bond's slaves hmap. */ + struct ovs_list list_node; /* In struct bond's enabled_slaves list. */ struct bond *bond; /* The bond that contains this slave. */ void *aux; /* Client-provided handle for this slave. */ struct netdev *netdev; /* Network device, owned by the client. */ unsigned int change_seq; /* Tracks changes in 'netdev'. */ + ofp_port_t ofp_port; /* OpenFlow port number. */ char *name; /* Name (a copy of netdev_get_name(netdev)). */ /* Link status. */ @@ -71,8 +94,8 @@ struct bond_slave { bool may_enable; /* Client considers this slave bondable. */ /* Rebalancing info. Used only by bond_rebalance(). */ - struct list bal_node; /* In bond_rebalance()'s 'bals' list. */ - struct list entries; /* 'struct bond_entry's assigned here. */ + struct ovs_list bal_node; /* In bond_rebalance()'s 'bals' list. */ + struct ovs_list entries; /* 'struct bond_entry's assigned here. */ uint64_t tx_bytes; /* Sum across 'tx_bytes' of entries. */ }; @@ -81,10 +104,19 @@ struct bond_slave { struct bond { struct hmap_node hmap_node; /* In 'all_bonds' hmap. */ char *name; /* Name provided by client. */ + struct ofproto_dpif *ofproto; /* The bridge this bond belongs to. */ /* Slaves. */ struct hmap slaves; + /* Enabled slaves. + * + * Any reader or writer of 'enabled_slaves' must hold 'mutex'. + * (To prevent the bond_slave from disappearing they must also hold + * 'rwlock'.) */ + struct ovs_mutex mutex OVS_ACQ_AFTER(rwlock); + struct ovs_list enabled_slaves OVS_GUARDED; /* Contains struct bond_slaves. */ + /* Bonding info. */ enum bond_mode balance; /* Balancing mode, one of BM_*. */ struct bond_slave *active_slave; @@ -94,21 +126,42 @@ struct bond { uint32_t basis; /* Basis for flow hash function. */ /* SLB specific bonding info. */ - struct bond_entry *hash; /* An array of (BOND_MASK + 1) elements. */ + struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */ int rebalance_interval; /* Interval between rebalances, in ms. */ long long int next_rebalance; /* Next rebalancing time. */ bool send_learning_packets; - + uint32_t recirc_id; /* Non zero if recirculation can be used.*/ + struct hmap pr_rule_ops; /* Helps to maintain post recirculation rules.*/ + + /* Store active slave to OVSDB. */ + bool active_slave_changed; /* Set to true whenever the bond changes + active slave. It will be reset to false + after it is stored into OVSDB */ + + /* Interface name may not be persistent across an OS reboot, use + * MAC address for identifing the active slave */ + struct eth_addr active_slave_mac; + /* The MAC address of the active interface. */ /* Legacy compatibility. */ - long long int next_fake_iface_update; /* LLONG_MAX if disabled. */ bool lacp_fallback_ab; /* Fallback to active-backup on LACP failure. */ - atomic_int ref_cnt; + struct ovs_refcount ref_cnt; }; -static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER; -static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__); -static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__; +/* What to do with an bond_recirc_rule. */ +enum bond_op { + ADD, /* Add the rule to ofproto's flow table. */ + DEL, /* Delete the rule from the ofproto's flow table. */ +}; + +/* A rule to add to or delete from ofproto's internal flow table. */ +struct bond_pr_rule_op { + struct hmap_node hmap_node; + struct match match; + ofp_port_t out_ofport; + enum bond_op op; + struct rule **pr_rule; +}; static void bond_entry_reset(struct bond *) OVS_REQ_WRLOCK(rwlock); static struct bond_slave *bond_slave_lookup(struct bond *, const void *slave_) @@ -118,8 +171,8 @@ static void bond_enable_slave(struct bond_slave *, bool enable) static void bond_link_status_update(struct bond_slave *) OVS_REQ_WRLOCK(rwlock); static void bond_choose_active_slave(struct bond *) - OVS_REQ_WRLOCK(rwlock);; -static unsigned int bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], + OVS_REQ_WRLOCK(rwlock); +static unsigned int bond_hash_src(const struct eth_addr mac, uint16_t vlan, uint32_t basis); static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan, uint32_t basis); @@ -127,13 +180,13 @@ static struct bond_entry *lookup_bond_entry(const struct bond *, const struct flow *, uint16_t vlan) OVS_REQ_RDLOCK(rwlock); +static struct bond_slave *get_enabled_slave(struct bond *) + OVS_REQ_RDLOCK(rwlock); static struct bond_slave *choose_output_slave(const struct bond *, const struct flow *, struct flow_wildcards *, uint16_t vlan) OVS_REQ_RDLOCK(rwlock); -static void bond_update_fake_slave_stats(struct bond *) - OVS_REQ_RDLOCK(rwlock); /* Attempts to parse 's' as the name of a bond balancing mode. If successful, * stores the mode in '*balance' and returns true. Otherwise returns false @@ -174,14 +227,19 @@ bond_mode_to_string(enum bond_mode balance) { * The caller should register each slave on the new bond by calling * bond_slave_register(). */ struct bond * -bond_create(const struct bond_settings *s) +bond_create(const struct bond_settings *s, struct ofproto_dpif *ofproto) { struct bond *bond; bond = xzalloc(sizeof *bond); + bond->ofproto = ofproto; hmap_init(&bond->slaves); - bond->next_fake_iface_update = LLONG_MAX; - atomic_init(&bond->ref_cnt, 1); + list_init(&bond->enabled_slaves); + ovs_mutex_init(&bond->mutex); + ovs_refcount_init(&bond->ref_cnt); + + bond->recirc_id = 0; + hmap_init(&bond->pr_rule_ops); bond_reconfigure(bond, s); return bond; @@ -193,9 +251,7 @@ bond_ref(const struct bond *bond_) struct bond *bond = CONST_CAST(struct bond *, bond_); if (bond) { - int orig; - atomic_add(&bond->ref_cnt, 1, &orig); - ovs_assert(orig > 0); + ovs_refcount_ref(&bond->ref_cnt); } return bond; } @@ -205,15 +261,9 @@ void bond_unref(struct bond *bond) { struct bond_slave *slave, *next_slave; - int orig; - - if (!bond) { - return; - } + struct bond_pr_rule_op *pr_op, *next_op; - atomic_sub(&bond->ref_cnt, 1, &orig); - ovs_assert(orig > 0); - if (orig != 1) { + if (!bond || ovs_refcount_unref_relaxed(&bond->ref_cnt) != 1) { return; } @@ -229,11 +279,120 @@ bond_unref(struct bond *bond) } hmap_destroy(&bond->slaves); + ovs_mutex_destroy(&bond->mutex); free(bond->hash); free(bond->name); + + HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) { + hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node); + free(pr_op); + } + hmap_destroy(&bond->pr_rule_ops); + + if (bond->recirc_id) { + recirc_free_id(bond->recirc_id); + } + free(bond); } +static void +add_pr_rule(struct bond *bond, const struct match *match, + ofp_port_t out_ofport, struct rule **rule) +{ + uint32_t hash = match_hash(match, 0); + struct bond_pr_rule_op *pr_op; + + HMAP_FOR_EACH_WITH_HASH(pr_op, hmap_node, hash, &bond->pr_rule_ops) { + if (match_equal(&pr_op->match, match)) { + pr_op->op = ADD; + pr_op->out_ofport = out_ofport; + pr_op->pr_rule = rule; + return; + } + } + + pr_op = xmalloc(sizeof *pr_op); + pr_op->match = *match; + pr_op->op = ADD; + pr_op->out_ofport = out_ofport; + pr_op->pr_rule = rule; + hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash); +} + +static void +update_recirc_rules(struct bond *bond) + OVS_REQ_WRLOCK(rwlock) +{ + struct match match; + struct bond_pr_rule_op *pr_op, *next_op; + uint64_t ofpacts_stub[128 / 8]; + struct ofpbuf ofpacts; + int i; + + ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub); + + HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) { + pr_op->op = DEL; + } + + if (bond->hash && bond->recirc_id) { + for (i = 0; i < BOND_BUCKETS; i++) { + struct bond_slave *slave = bond->hash[i].slave; + + if (slave) { + match_init_catchall(&match); + match_set_recirc_id(&match, bond->recirc_id); + match_set_dp_hash_masked(&match, i, BOND_MASK); + + add_pr_rule(bond, &match, slave->ofp_port, + &bond->hash[i].pr_rule); + } + } + } + + HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) { + int error; + switch (pr_op->op) { + case ADD: + ofpbuf_clear(&ofpacts); + ofpact_put_OUTPUT(&ofpacts)->port = pr_op->out_ofport; + error = ofproto_dpif_add_internal_flow(bond->ofproto, + &pr_op->match, + RECIRC_RULE_PRIORITY, 0, + &ofpacts, pr_op->pr_rule); + if (error) { + char *err_s = match_to_string(&pr_op->match, + RECIRC_RULE_PRIORITY); + + VLOG_ERR("failed to add post recirculation flow %s", err_s); + free(err_s); + } + break; + + case DEL: + error = ofproto_dpif_delete_internal_flow(bond->ofproto, + &pr_op->match, + RECIRC_RULE_PRIORITY); + if (error) { + char *err_s = match_to_string(&pr_op->match, + RECIRC_RULE_PRIORITY); + + VLOG_ERR("failed to remove post recirculation flow %s", err_s); + free(err_s); + } + + hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node); + *pr_op->pr_rule = NULL; + free(pr_op); + break; + } + } + + ofpbuf_uninit(&ofpacts); +} + + /* Updates 'bond''s overall configuration to 's'. * * The caller should register each slave on 'bond' by calling @@ -281,27 +440,63 @@ bond_reconfigure(struct bond *bond, const struct bond_settings *s) revalidate = true; } - if (s->fake_iface) { - if (bond->next_fake_iface_update == LLONG_MAX) { - bond->next_fake_iface_update = time_msec(); - } - } else { - bond->next_fake_iface_update = LLONG_MAX; - } - if (bond->bond_revalidate) { revalidate = true; bond->bond_revalidate = false; } + if (bond->balance != BM_AB) { + if (!bond->recirc_id) { + bond->recirc_id = recirc_alloc_id(bond->ofproto); + } + } else if (bond->recirc_id) { + recirc_free_id(bond->recirc_id); + bond->recirc_id = 0; + } + if (bond->balance == BM_AB || !bond->hash || revalidate) { bond_entry_reset(bond); } + bond->active_slave_mac = s->active_slave_mac; + bond->active_slave_changed = false; + ovs_rwlock_unlock(&rwlock); return revalidate; } +static struct bond_slave * +bond_find_slave_by_mac(const struct bond *bond, const struct eth_addr mac) +{ + struct bond_slave *slave; + + /* Find the last active slave */ + HMAP_FOR_EACH(slave, hmap_node, &bond->slaves) { + struct eth_addr slave_mac; + + if (netdev_get_etheraddr(slave->netdev, &slave_mac)) { + continue; + } + + if (eth_addr_equals(slave_mac, mac)) { + return slave; + } + } + + return NULL; +} + +static void +bond_active_slave_changed(struct bond *bond) +{ + struct eth_addr mac; + + netdev_get_etheraddr(bond->active_slave->netdev, &mac); + bond->active_slave_mac = mac; + bond->active_slave_changed = true; + seq_change(connectivity_seq_get()); +} + static void bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev) OVS_REQ_WRLOCK(rwlock) @@ -322,7 +517,8 @@ bond_slave_set_netdev__(struct bond_slave *slave, struct netdev *netdev) * 'slave_' or destroying 'bond'. */ void -bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev) +bond_slave_register(struct bond *bond, void *slave_, + ofp_port_t ofport, struct netdev *netdev) { struct bond_slave *slave; @@ -334,6 +530,7 @@ bond_slave_register(struct bond *bond, void *slave_, struct netdev *netdev) hmap_insert(&bond->slaves, &slave->hmap_node, hash_pointer(slave_, 0)); slave->bond = bond; slave->aux = slave_; + slave->ofp_port = ofport; slave->delay_expires = LLONG_MAX; slave->name = xstrdup(netdev_get_name(netdev)); bond->bond_revalidate = true; @@ -449,12 +646,6 @@ bond_run(struct bond *bond, enum lacp_status lacp_status) bond_choose_active_slave(bond); } - /* Update fake bond interface stats. */ - if (time_msec() >= bond->next_fake_iface_update) { - bond_update_fake_slave_stats(bond); - bond->next_fake_iface_update = time_msec() + 1000; - } - revalidate = bond->bond_revalidate; bond->bond_revalidate = false; ovs_rwlock_unlock(&rwlock); @@ -477,10 +668,6 @@ bond_wait(struct bond *bond) seq_wait(connectivity_seq_get(), slave->change_seq); } - if (bond->next_fake_iface_update != LLONG_MAX) { - poll_timer_wait_until(bond->next_fake_iface_update); - } - if (bond->bond_revalidate) { poll_immediate_wake(); } @@ -531,25 +718,24 @@ bond_should_send_learning_packets(struct bond *bond) * See bond_should_send_learning_packets() for description of usage. The * caller should send the composed packet on the port associated with * port_aux and takes ownership of the returned ofpbuf. */ -struct ofpbuf * -bond_compose_learning_packet(struct bond *bond, - const uint8_t eth_src[ETH_ADDR_LEN], +struct dp_packet * +bond_compose_learning_packet(struct bond *bond, const struct eth_addr eth_src, uint16_t vlan, void **port_aux) { struct bond_slave *slave; - struct ofpbuf *packet; + struct dp_packet *packet; struct flow flow; ovs_rwlock_rdlock(&rwlock); ovs_assert(may_send_learning_packets(bond)); memset(&flow, 0, sizeof flow); - memcpy(flow.dl_src, eth_src, ETH_ADDR_LEN); + flow.dl_src = eth_src; slave = choose_output_slave(bond, &flow, NULL, vlan); - packet = ofpbuf_new(0); + packet = dp_packet_new(0); compose_rarp(packet, eth_src); if (vlan) { - eth_push_vlan(packet, htons(vlan)); + eth_push_vlan(packet, htons(ETH_TYPE_VLAN), htons(vlan)); } *port_aux = slave->aux; @@ -574,7 +760,7 @@ bond_compose_learning_packet(struct bond *bond, */ enum bond_verdict bond_check_admissibility(struct bond *bond, const void *slave_, - const uint8_t eth_dst[ETH_ADDR_LEN]) + const struct eth_addr eth_dst) { enum bond_verdict verdict = BV_DROP; struct bond_slave *slave; @@ -683,6 +869,92 @@ bond_choose_output_slave(struct bond *bond, const struct flow *flow, return aux; } +/* Recirculation. */ +static void +bond_entry_account(struct bond_entry *entry, uint64_t rule_tx_bytes) + OVS_REQ_WRLOCK(rwlock) +{ + if (entry->slave) { + uint64_t delta; + + delta = rule_tx_bytes - entry->pr_tx_bytes; + entry->tx_bytes += delta; + entry->pr_tx_bytes = rule_tx_bytes; + } +} + +/* Maintain bond stats using post recirculation rule byte counters.*/ +static void +bond_recirculation_account(struct bond *bond) + OVS_REQ_WRLOCK(rwlock) +{ + int i; + + for (i=0; i<=BOND_MASK; i++) { + struct bond_entry *entry = &bond->hash[i]; + struct rule *rule = entry->pr_rule; + + if (rule) { + uint64_t n_packets OVS_UNUSED; + long long int used OVS_UNUSED; + uint64_t n_bytes; + + rule->ofproto->ofproto_class->rule_get_stats( + rule, &n_packets, &n_bytes, &used); + bond_entry_account(entry, n_bytes); + } + } +} + +bool +bond_may_recirc(const struct bond *bond, uint32_t *recirc_id, + uint32_t *hash_bias) +{ + if (bond->balance == BM_TCP && bond->recirc_id) { + if (recirc_id) { + *recirc_id = bond->recirc_id; + } + if (hash_bias) { + *hash_bias = bond->basis; + } + return true; + } else { + return false; + } +} + +static void +bond_update_post_recirc_rules__(struct bond* bond, const bool force) + OVS_REQ_WRLOCK(rwlock) +{ + struct bond_entry *e; + bool update_rules = force; /* Always update rules if caller forces it. */ + + /* Make sure all bond entries are populated */ + for (e = bond->hash; e <= &bond->hash[BOND_MASK]; e++) { + if (!e->slave || !e->slave->enabled) { + update_rules = true; + e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves), + struct bond_slave, hmap_node); + if (!e->slave->enabled) { + e->slave = bond->active_slave; + } + } + } + + if (update_rules) { + update_recirc_rules(bond); + } +} + +void +bond_update_post_recirc_rules(struct bond* bond, const bool force) +{ + ovs_rwlock_wrlock(&rwlock); + bond_update_post_recirc_rules__(bond, force); + ovs_rwlock_unlock(&rwlock); +} + /* Rebalancing. */ static bool @@ -705,13 +977,14 @@ bond_account(struct bond *bond, const struct flow *flow, uint16_t vlan, } static struct bond_slave * -bond_slave_from_bal_node(struct list *bal) OVS_REQ_RDLOCK(rwlock) +bond_slave_from_bal_node(struct ovs_list *bal) OVS_REQ_RDLOCK(rwlock) { return CONTAINER_OF(bal, struct bond_slave, bal_node); } static void -log_bals(struct bond *bond, const struct list *bals) +log_bals(struct bond *bond, const struct ovs_list *bals) + OVS_REQ_RDLOCK(rwlock) { if (VLOG_IS_DBG_ENABLED()) { struct ds ds = DS_EMPTY_INITIALIZER; @@ -749,6 +1022,7 @@ log_bals(struct bond *bond, const struct list *bals) /* Shifts 'hash' from its current slave to 'to'. */ static void bond_shift_load(struct bond_entry *hash, struct bond_slave *to) + OVS_REQ_WRLOCK(rwlock) { struct bond_slave *from = hash->slave; struct bond *bond = from->bond; @@ -780,6 +1054,7 @@ bond_shift_load(struct bond_entry *hash, struct bond_slave *to) * shift away small hashes or large hashes. */ static struct bond_entry * choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes) + OVS_REQ_WRLOCK(rwlock) { struct bond_entry *e; @@ -790,23 +1065,24 @@ choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes) } LIST_FOR_EACH (e, list_node, &from->entries) { - double old_ratio, new_ratio; - uint64_t delta; - - if (to_tx_bytes == 0) { - /* Nothing on the new slave, move it. */ - return e; - } - - delta = e->tx_bytes; - old_ratio = (double)from->tx_bytes / to_tx_bytes; - new_ratio = (double)(from->tx_bytes - delta) / (to_tx_bytes + delta); - if (old_ratio - new_ratio > 0.1 - && fabs(new_ratio - 1.0) < fabs(old_ratio - 1.0)) { - /* We're aiming for an ideal ratio of 1, meaning both the 'from' - and 'to' slave have the same load. Therefore, we only move an - entry if it decreases the load on 'from', and brings us closer - to equal traffic load. */ + uint64_t delta = e->tx_bytes; /* The amount to rebalance. */ + uint64_t ideal_tx_bytes = (from->tx_bytes + to_tx_bytes)/2; + /* Note, the ideal traffic is the mid point + * between 'from' and 'to'. This value does + * not change by rebalancing. */ + uint64_t new_low; /* The lower bandwidth between 'to' and 'from' + after rebalancing. */ + + new_low = MIN(from->tx_bytes - delta, to_tx_bytes + delta); + + if ((new_low > to_tx_bytes) && + (new_low - to_tx_bytes >= (ideal_tx_bytes - to_tx_bytes) / 10)) { + /* Only rebalance if the new 'low' is closer to to the mid point, + * and the improvement exceeds 10% of current traffic + * deviation from the ideal split. + * + * The improvement on the 'high' side is always the same as the + * 'low' side. Thus consider 'low' side is sufficient. */ return e; } } @@ -817,7 +1093,7 @@ choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes) /* Inserts 'slave' into 'bals' so that descending order of 'tx_bytes' is * maintained. */ static void -insert_bal(struct list *bals, struct bond_slave *slave) +insert_bal(struct ovs_list *bals, struct bond_slave *slave) { struct bond_slave *pos; @@ -832,7 +1108,7 @@ insert_bal(struct list *bals, struct bond_slave *slave) /* Removes 'slave' from its current list and then inserts it into 'bals' so * that descending order of 'tx_bytes' is maintained. */ static void -reinsert_bal(struct list *bals, struct bond_slave *slave) +reinsert_bal(struct ovs_list *bals, struct bond_slave *slave) { list_remove(&slave->bal_node); insert_bal(bals, slave); @@ -840,22 +1116,32 @@ reinsert_bal(struct list *bals, struct bond_slave *slave) /* If 'bond' needs rebalancing, does so. * - * The caller should have called bond_account() for each active flow, to ensure - * that flow data is consistently accounted at this point. */ + * The caller should have called bond_account() for each active flow, or in case + * of recirculation is used, have called bond_recirculation_account(bond), + * to ensure that flow data is consistently accounted at this point. + */ void bond_rebalance(struct bond *bond) { struct bond_slave *slave; struct bond_entry *e; - struct list bals; + struct ovs_list bals; + bool rebalanced = false; + bool use_recirc; ovs_rwlock_wrlock(&rwlock); if (!bond_is_balanced(bond) || time_msec() < bond->next_rebalance) { - ovs_rwlock_unlock(&rwlock); - return; + goto done; } bond->next_rebalance = time_msec() + bond->rebalance_interval; + use_recirc = ofproto_dpif_get_support(bond->ofproto)->odp.recirc && + bond_may_recirc(bond, NULL, NULL); + + if (use_recirc) { + bond_recirculation_account(bond); + } + /* Add each bond_entry to its slave's 'entries' list. * Compute each slave's tx_bytes as the sum of its entries' tx_bytes. */ HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) { @@ -911,6 +1197,7 @@ bond_rebalance(struct bond *bond) /* Re-sort 'bals'. */ reinsert_bal(&bals, from); reinsert_bal(&bals, to); + rebalanced = true; } else { /* Can't usefully migrate anything away from 'from'. * Don't reconsider it. */ @@ -923,10 +1210,13 @@ bond_rebalance(struct bond *bond) * take 20 rebalancing runs to decay to 0 and get deleted entirely. */ for (e = &bond->hash[0]; e <= &bond->hash[BOND_MASK]; e++) { e->tx_bytes /= 2; - if (!e->tx_bytes) { - e->slave = NULL; - } } + + if (use_recirc && rebalanced) { + bond_update_post_recirc_rules__(bond,true); + } + +done: ovs_rwlock_unlock(&rwlock); } @@ -967,15 +1257,15 @@ bond_unixctl_list(struct unixctl_conn *conn, struct ds ds = DS_EMPTY_INITIALIZER; const struct bond *bond; - ds_put_cstr(&ds, "bond\ttype\tslaves\n"); + ds_put_cstr(&ds, "bond\ttype\trecircID\tslaves\n"); ovs_rwlock_rdlock(&rwlock); HMAP_FOR_EACH (bond, hmap_node, all_bonds) { const struct bond_slave *slave; size_t i; - ds_put_format(&ds, "%s\t%s\t", - bond->name, bond_mode_to_string(bond->balance)); + ds_put_format(&ds, "%s\t%s\t%d\t", bond->name, + bond_mode_to_string(bond->balance), bond->recirc_id); i = 0; HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) { @@ -998,12 +1288,18 @@ bond_print_details(struct ds *ds, const struct bond *bond) struct shash slave_shash = SHASH_INITIALIZER(&slave_shash); const struct shash_node **sorted_slaves = NULL; const struct bond_slave *slave; + bool may_recirc; + uint32_t recirc_id; int i; ds_put_format(ds, "---- %s ----\n", bond->name); ds_put_format(ds, "bond_mode: %s\n", bond_mode_to_string(bond->balance)); + may_recirc = bond_may_recirc(bond, &recirc_id, NULL); + ds_put_format(ds, "bond may use recirculation: %s, Recirc-ID : %d\n", + may_recirc ? "yes" : "no", may_recirc ? recirc_id: -1); + ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis); ds_put_format(ds, "updelay: %d ms\n", bond->updelay); @@ -1030,6 +1326,11 @@ bond_print_details(struct ds *ds, const struct bond *bond) break; } + ds_put_cstr(ds, "active slave mac: "); + ds_put_format(ds, ETH_ADDR_FMT, ETH_ADDR_ARGS(bond->active_slave_mac)); + slave = bond_find_slave_by_mac(bond, bond->active_slave_mac); + ds_put_format(ds,"(%s)\n", slave ? slave->name : "none"); + HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) { shash_add(&slave_shash, slave->name, slave); } @@ -1062,13 +1363,17 @@ bond_print_details(struct ds *ds, const struct bond *bond) /* Hashes. */ for (be = bond->hash; be <= &bond->hash[BOND_MASK]; be++) { int hash = be - bond->hash; + uint64_t be_tx_k; if (be->slave != slave) { continue; } - ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n", - hash, be->tx_bytes / 1024); + be_tx_k = be->tx_bytes / 1024; + if (be_tx_k) { + ds_put_format(ds, "\thash %d: %"PRIu64" kB load\n", + hash, be_tx_k); + } /* XXX How can we list the MACs assigned to hashes of SLB bonds? */ } @@ -1196,6 +1501,7 @@ bond_unixctl_set_active_slave(struct unixctl_conn *conn, bond->name, slave->name); bond->send_learning_packets = true; unixctl_command_reply(conn, "done"); + bond_active_slave_changed(bond); } else { unixctl_command_reply(conn, "no change"); } @@ -1254,7 +1560,7 @@ bond_unixctl_hash(struct unixctl_conn *conn, int argc, const char *argv[], const char *mac_s = argv[1]; const char *vlan_s = argc > 2 ? argv[2] : NULL; const char *basis_s = argc > 3 ? argv[3] : NULL; - uint8_t mac[ETH_ADDR_LEN]; + struct eth_addr mac; uint8_t hash; char *hash_cstr; unsigned int vlan; @@ -1311,7 +1617,7 @@ static void bond_entry_reset(struct bond *bond) { if (bond->balance != BM_AB) { - size_t hash_len = (BOND_MASK + 1) * sizeof *bond->hash; + size_t hash_len = BOND_BUCKETS * sizeof *bond->hash; if (!bond->hash) { bond->hash = xmalloc(hash_len); @@ -1347,6 +1653,15 @@ bond_enable_slave(struct bond_slave *slave, bool enable) if (enable != slave->enabled) { slave->bond->bond_revalidate = true; slave->enabled = enable; + + ovs_mutex_lock(&slave->bond->mutex); + if (enable) { + list_insert(&slave->bond->enabled_slaves, &slave->list_node); + } else { + list_remove(&slave->list_node); + } + ovs_mutex_unlock(&slave->bond->mutex); + VLOG_INFO("interface %s: %s", slave->name, slave->enabled ? "enabled" : "disabled"); } @@ -1388,9 +1703,9 @@ bond_link_status_update(struct bond_slave *slave) } static unsigned int -bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan, uint32_t basis) +bond_hash_src(const struct eth_addr mac, uint16_t vlan, uint32_t basis) { - return hash_3words(hash_bytes(mac, ETH_ADDR_LEN, 0), vlan, basis); + return hash_mac(mac, vlan, basis); } static unsigned int @@ -1422,6 +1737,27 @@ lookup_bond_entry(const struct bond *bond, const struct flow *flow, return &bond->hash[bond_hash(bond, flow, vlan) & BOND_MASK]; } +/* Selects and returns an enabled slave from the 'enabled_slaves' list + * in a round-robin fashion. If the 'enabled_slaves' list is empty, + * returns NULL. */ +static struct bond_slave * +get_enabled_slave(struct bond *bond) +{ + struct ovs_list *node; + + ovs_mutex_lock(&bond->mutex); + if (list_is_empty(&bond->enabled_slaves)) { + ovs_mutex_unlock(&bond->mutex); + return NULL; + } + + node = list_pop_front(&bond->enabled_slaves); + list_push_back(&bond->enabled_slaves, node); + ovs_mutex_unlock(&bond->mutex); + + return CONTAINER_OF(node, struct bond_slave, list_node); +} + static struct bond_slave * choose_output_slave(const struct bond *bond, const struct flow *flow, struct flow_wildcards *wc, uint16_t vlan) @@ -1459,11 +1795,7 @@ choose_output_slave(const struct bond *bond, const struct flow *flow, } e = lookup_bond_entry(bond, flow, vlan); if (!e->slave || !e->slave->enabled) { - e->slave = CONTAINER_OF(hmap_random_node(&bond->slaves), - struct bond_slave, hmap_node); - if (!e->slave->enabled) { - e->slave = bond->active_slave; - } + e->slave = get_enabled_slave(CONST_CAST(struct bond*, bond)); } return e->slave; @@ -1477,6 +1809,12 @@ bond_choose_slave(const struct bond *bond) { struct bond_slave *slave, *best; + /* Find the last active slave. */ + slave = bond_find_slave_by_mac(bond, bond->active_slave_mac); + if (slave && slave->enabled) { + return slave; + } + /* Find an enabled slave. */ HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) { if (slave->enabled) { @@ -1517,44 +1855,36 @@ bond_choose_active_slave(struct bond *bond) } bond->send_learning_packets = true; + + if (bond->active_slave != old_active_slave) { + bond_active_slave_changed(bond); + } } else if (old_active_slave) { VLOG_INFO_RL(&rl, "bond %s: all interfaces disabled", bond->name); } } -/* Attempts to make the sum of the bond slaves' statistics appear on the fake - * bond interface. */ -static void -bond_update_fake_slave_stats(struct bond *bond) +/* + * Return true if bond has unstored active slave change. + * If return true, 'mac' will store the bond's current active slave's + * MAC address. */ +bool +bond_get_changed_active_slave(const char *name, struct eth_addr *mac, + bool force) { - struct netdev_stats bond_stats; - struct bond_slave *slave; - struct netdev *bond_dev; - - memset(&bond_stats, 0, sizeof bond_stats); + struct bond *bond; - HMAP_FOR_EACH (slave, hmap_node, &bond->slaves) { - struct netdev_stats slave_stats; - - if (!netdev_get_stats(slave->netdev, &slave_stats)) { - /* XXX: We swap the stats here because they are swapped back when - * reported by the internal device. The reason for this is - * internal devices normally represent packets going into the - * system but when used as fake bond device they represent packets - * leaving the system. We really should do this in the internal - * device itself because changing it here reverses the counts from - * the perspective of the switch. However, the internal device - * doesn't know what type of device it represents so we have to do - * it here for now. */ - bond_stats.tx_packets += slave_stats.rx_packets; - bond_stats.tx_bytes += slave_stats.rx_bytes; - bond_stats.rx_packets += slave_stats.tx_packets; - bond_stats.rx_bytes += slave_stats.tx_bytes; + ovs_rwlock_wrlock(&rwlock); + bond = bond_find(name); + if (bond) { + if (bond->active_slave_changed || force) { + *mac = bond->active_slave_mac; + bond->active_slave_changed = false; + ovs_rwlock_unlock(&rwlock); + return true; } } + ovs_rwlock_unlock(&rwlock); - if (!netdev_open(bond->name, "system", &bond_dev)) { - netdev_set_stats(bond_dev, &bond_stats); - netdev_close(bond_dev); - } + return false; }