X-Git-Url: http://git.cascardo.eti.br/?a=blobdiff_plain;f=lib%2Fdpif-netdev.c;h=500e7cc43a947f671853b0ad65ebe661486cb9ec;hb=cc245ce87d3de9c2a66ee42719ab413e464fb2de;hp=79c4612ea5d334e988155070c1e5547855d055f1;hpb=361d808dd9e4e27fb04c76d3da0cd7a2e9447622;p=cascardo%2Fovs.git

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 79c4612ea..500e7cc43 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
+ * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -31,6 +31,7 @@
#include 
#include 
+#include "bitmap.h"
#include "cmap.h"
#include "csum.h"
#include "dp-packet.h"
@@ -41,10 +42,11 @@
#include "fat-rwlock.h"
#include "flow.h"
#include "cmap.h"
+#include "coverage.h"
+#include "hmapx.h"
#include "latch.h"
#include "list.h"
#include "match.h"
-#include "meta-flow.h"
#include "netdev.h"
#include "netdev-dpdk.h"
#include "netdev-vport.h"
@@ -63,7 +65,8 @@
#include "shash.h"
#include "sset.h"
#include "timeval.h"
-#include "tnl-arp-cache.h"
+#include "tnl-neigh-cache.h"
+#include "tnl-ports.h"
#include "unixctl.h"
#include "util.h"
#include "openvswitch/vlog.h"
@@ -203,6 +206,11 @@ struct dp_netdev {
upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
void *upcall_aux;

+ /* Callback function for notifying the purging of dp flows (during
+ * pmd reset or deletion). */
+ dp_purge_callback *dp_purge_cb;
+ void *dp_purge_aux;
+
/* Stores all 'struct dp_netdev_pmd_thread's. */
struct cmap poll_threads;

@@ -214,9 +222,7 @@ struct dp_netdev {
* 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
ovsthread_key_t per_pmd_key;

- /* Number of rx queues for each dpdk interface and the cpu mask
- * for pin of pmd threads. */
- size_t n_dpdk_rxqs;
+ /* CPU mask for pinning pmd threads. */
char *pmd_cmask;
uint64_t last_tnl_conf_seq;
};
@@ -247,6 +253,8 @@ struct dp_netdev_port {
struct netdev_rxq **rxq;
struct ovs_refcount ref_cnt;
char *type; /* Port type as requested by user. */
+ int latest_requested_n_rxq; /* Latest number of rx queues
+ requested from the netdev. */
};

/* Contained by struct dp_netdev_flow's 'stats' member. */
@@ -365,6 +373,13 @@ struct dp_netdev_pmd_cycles {
atomic_ullong n[PMD_N_CYCLES];
};

+/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
+struct rxq_poll {
+ struct dp_netdev_port *port;
+ struct netdev_rxq *rx;
+ struct ovs_list node;
+};
+
/* PMD: Poll mode drivers. PMD accesses devices via polling to eliminate
* the performance overhead of interrupt processing. Therefore netdev can
* not implement rx-wait for these devices. dpif-netdev needs to poll
@@ -420,9 +435,14 @@ struct dp_netdev_pmd_thread {
/* threads on same numa node. */
unsigned core_id; /* CPU core id of this pmd thread. */
int numa_id; /* numa node id of this pmd thread. */
- int tx_qid; /* Queue id used by this pmd thread to
+ atomic_int tx_qid; /* Queue id used by this pmd thread to
* send packets on all netdevs */

+ struct ovs_mutex poll_mutex; /* Mutex for poll_list. */
+ /* List of rx queues to poll. */
+ struct ovs_list poll_list OVS_GUARDED;
+ int poll_cnt; /* Number of elements in poll_list. */
+
/* Only a pmd thread can write on its own 'cycles' and 'stats'. 
* The main thread keeps 'stats_zero' and 'cycles_zero' as base * values and subtracts them from 'stats' and 'cycles' before @@ -459,10 +479,12 @@ static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd, const struct nlattr *actions, size_t actions_len); static void dp_netdev_input(struct dp_netdev_pmd_thread *, - struct dp_packet **, int cnt); + struct dp_packet **, int cnt, odp_port_t port_no); +static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *, + struct dp_packet **, int cnt); static void dp_netdev_disable_upcall(struct dp_netdev *); -void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd); +static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd); static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp, int index, unsigned core_id, int numa_id); @@ -475,6 +497,18 @@ dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos); static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp); static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id); static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id); +static void dp_netdev_pmd_clear_poll_list(struct dp_netdev_pmd_thread *pmd); +static void dp_netdev_del_port_from_pmd(struct dp_netdev_port *port, + struct dp_netdev_pmd_thread *pmd); +static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp, + struct dp_netdev_port *port); +static void +dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port); +static void +dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_port *port, struct netdev_rxq *rx); +static struct dp_netdev_pmd_thread * +dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id); static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp); static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd); static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd); @@ -488,15 +522,12 @@ emc_cache_init(struct emc_cache *flow_cache) { int i; - BUILD_ASSERT(sizeof(struct miniflow) == 2 * sizeof(uint64_t)); - flow_cache->sweep_idx = 0; for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) { flow_cache->entries[i].flow = NULL; flow_cache->entries[i].key.hash = 0; flow_cache->entries[i].key.len = sizeof(struct miniflow); - flow_cache->entries[i].key.mf.tnl_map = 0; - flow_cache->entries[i].key.mf.pkt_map = 0; + flowmap_init(&flow_cache->entries[i].key.mf.map); } } @@ -544,8 +575,9 @@ get_dp_netdev(const struct dpif *dpif) } enum pmd_info_type { - PMD_INFO_SHOW_STATS, /* show how cpu cycles are spent */ - PMD_INFO_CLEAR_STATS /* set the cycles count to 0 */ + PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */ + PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */ + PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. 
*/ }; static void @@ -653,6 +685,35 @@ pmd_info_clear_stats(struct ds *reply OVS_UNUSED, } } +static void +pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd) +{ + if (pmd->core_id != NON_PMD_CORE_ID) { + struct rxq_poll *poll; + const char *prev_name = NULL; + + ds_put_format(reply, "pmd thread numa_id %d core_id %u:\n", + pmd->numa_id, pmd->core_id); + + ovs_mutex_lock(&pmd->poll_mutex); + LIST_FOR_EACH (poll, node, &pmd->poll_list) { + const char *name = netdev_get_name(poll->port->netdev); + + if (!prev_name || strcmp(name, prev_name)) { + if (prev_name) { + ds_put_cstr(reply, "\n"); + } + ds_put_format(reply, "\tport: %s\tqueue-id:", + netdev_get_name(poll->port->netdev)); + } + ds_put_format(reply, " %d", netdev_rxq_get_queue_id(poll->rx)); + prev_name = name; + } + ovs_mutex_unlock(&pmd->poll_mutex); + ds_put_cstr(reply, "\n"); + } +} + static void dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], void *aux) @@ -679,22 +740,26 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], } CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { - unsigned long long stats[DP_N_STATS]; - uint64_t cycles[PMD_N_CYCLES]; - int i; + if (type == PMD_INFO_SHOW_RXQ) { + pmd_info_show_rxq(&reply, pmd); + } else { + unsigned long long stats[DP_N_STATS]; + uint64_t cycles[PMD_N_CYCLES]; + int i; - /* Read current stats and cycle counters */ - for (i = 0; i < ARRAY_SIZE(stats); i++) { - atomic_read_relaxed(&pmd->stats.n[i], &stats[i]); - } - for (i = 0; i < ARRAY_SIZE(cycles); i++) { - atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]); - } + /* Read current stats and cycle counters */ + for (i = 0; i < ARRAY_SIZE(stats); i++) { + atomic_read_relaxed(&pmd->stats.n[i], &stats[i]); + } + for (i = 0; i < ARRAY_SIZE(cycles); i++) { + atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]); + } - if (type == PMD_INFO_CLEAR_STATS) { - pmd_info_clear_stats(&reply, pmd, stats, cycles); - } else if (type == PMD_INFO_SHOW_STATS) { - pmd_info_show_stats(&reply, pmd, stats, cycles); + if (type == PMD_INFO_CLEAR_STATS) { + pmd_info_clear_stats(&reply, pmd, stats, cycles); + } else if (type == PMD_INFO_SHOW_STATS) { + pmd_info_show_stats(&reply, pmd, stats, cycles); + } } } @@ -708,7 +773,8 @@ static int dpif_netdev_init(void) { static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS, - clear_aux = PMD_INFO_CLEAR_STATS; + clear_aux = PMD_INFO_CLEAR_STATS, + poll_aux = PMD_INFO_SHOW_RXQ; unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]", 0, 1, dpif_netdev_pmd_info, @@ -716,6 +782,9 @@ dpif_netdev_init(void) unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]", 0, 1, dpif_netdev_pmd_info, (void *)&clear_aux); + unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]", + 0, 1, dpif_netdev_pmd_info, + (void *)&poll_aux); return 0; } @@ -843,7 +912,6 @@ create_dp_netdev(const char *name, const struct dpif_class *class, ovsthread_key_create(&dp->per_pmd_key, NULL); dp_netdev_set_nonpmd(dp); - dp->n_dpdk_rxqs = NR_QUEUE; ovs_mutex_lock(&dp->port_mutex); error = do_add_port(dp, name, "internal", ODPP_LOCAL); @@ -906,15 +974,16 @@ dp_netdev_free(struct dp_netdev *dp) shash_find_and_delete(&dp_netdevs, dp->name); dp_netdev_destroy_all_pmds(dp); - cmap_destroy(&dp->poll_threads); ovs_mutex_destroy(&dp->non_pmd_mutex); ovsthread_key_delete(dp->per_pmd_key); ovs_mutex_lock(&dp->port_mutex); CMAP_FOR_EACH (port, node, &dp->ports) { + /* PMD threads are destroyed here. 
do_del_port() cannot quiesce */
do_del_port(dp, port);
}
ovs_mutex_unlock(&dp->port_mutex);
+ cmap_destroy(&dp->poll_threads);
seq_destroy(dp->port_seq);
cmap_destroy(&dp->ports);
@@ -1021,18 +1090,6 @@ dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
ovs_mutex_unlock(&pmd->cond_mutex);
}

-/* Causes all pmd threads to reload its tx/rx devices.
- * Must be called after adding/removing ports. */
-static void
-dp_netdev_reload_pmds(struct dp_netdev *dp)
-{
- struct dp_netdev_pmd_thread *pmd;
-
- CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
- dp_netdev_reload_pmd__(pmd);
- }
-}
-
static uint32_t
hash_port_no(odp_port_t port_no)
{
@@ -1082,7 +1139,8 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
/* There can only be ovs_numa_get_n_cores() pmd threads,
* so creates a txq for each, and one extra for the non
* pmd threads. */
- error = netdev_set_multiq(netdev, n_cores + 1, dp->n_dpdk_rxqs);
+ error = netdev_set_multiq(netdev, n_cores + 1,
+ netdev_requested_n_rxq(netdev));
if (error && (error != EOPNOTSUPP)) {
VLOG_ERR("%s, cannot set multiq", devname);
return errno;
}
@@ -1093,6 +1151,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
port->netdev = netdev;
port->rxq = xmalloc(sizeof *port->rxq * netdev_n_rxq(netdev));
port->type = xstrdup(type);
+ port->latest_requested_n_rxq = netdev_requested_n_rxq(netdev);
for (i = 0; i < netdev_n_rxq(netdev); i++) {
error = netdev_rxq_open(netdev, &port->rxq[i], i);
if (error
@@ -1124,8 +1183,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
cmap_insert(&dp->ports, &port->node, hash_port_no(port_no));

if (netdev_is_pmd(netdev)) {
- dp_netdev_set_pmds_on_numa(dp, netdev_get_numa_id(netdev));
- dp_netdev_reload_pmds(dp);
+ dp_netdev_add_port_to_pmds(dp, port);
}

seq_change(dp->port_seq);
@@ -1222,16 +1280,6 @@ port_ref(struct dp_netdev_port *port)
}
}

-static bool
-port_try_ref(struct dp_netdev_port *port)
-{
- if (port) {
- return ovs_refcount_try_ref_rcu(&port->ref_cnt);
- }
-
- return false;
-}
-
static void
port_unref(struct dp_netdev_port *port)
{
@@ -1267,6 +1315,13 @@ get_port_by_name(struct dp_netdev *dp,
return ENOENT;
}

+static int
+get_n_pmd_threads(struct dp_netdev *dp)
+{
+ /* There is one non-pmd thread in dp->poll_threads. */
+ return cmap_count(&dp->poll_threads) - 1;
+}
+
static int
get_n_pmd_threads_on_numa(struct dp_netdev *dp, int numa_id)
{
@@ -1309,12 +1364,15 @@ do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
if (netdev_is_pmd(port->netdev)) {
int numa_id = netdev_get_numa_id(port->netdev);

+ /* PMD threads cannot be on an invalid numa node. */
+ ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
/* If there is no netdev on the numa node, deletes the pmd threads
- * for that numa. Else, just reloads the queues. */
+ * for that numa. Else, deletes the queues from polling lists. */
if (!has_pmd_port_for_numa(dp, numa_id)) {
dp_netdev_del_pmds_on_numa(dp, numa_id);
+ } else {
+ dp_netdev_del_port_from_all_pmds(dp, port);
}
- dp_netdev_reload_pmds(dp);
}

port_unref(port);
@@ -1521,12 +1579,7 @@ static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
* miniflow_extract(), if the map is different the miniflow is different.
* Therefore we can be faster by comparing the map and the miniflow in a
* single memcmp().
- * - These functions can be inlined by the compiler.
- *
- * The following assertions make sure that what we're doing with miniflow is
- * safe. 
- */ -BUILD_ASSERT_DECL(sizeof(struct miniflow) == 2 * sizeof(uint64_t)); + * - These functions can be inlined by the compiler. */ /* Given the number of bits set in miniflow's maps, returns the size of the * 'netdev_flow_key.mf' */ @@ -1585,47 +1638,32 @@ static inline void netdev_flow_mask_init(struct netdev_flow_key *mask, const struct match *match) { - const uint64_t *mask_u64 = (const uint64_t *) &match->wc.masks; uint64_t *dst = miniflow_values(&mask->mf); - struct miniflow maps; - uint64_t map; + struct flowmap fmap; uint32_t hash = 0; - int n; + size_t idx; /* Only check masks that make sense for the flow. */ - flow_wc_map(&match->flow, &maps); - memset(&mask->mf, 0, sizeof mask->mf); /* Clear maps. */ + flow_wc_map(&match->flow, &fmap); + flowmap_init(&mask->mf.map); - map = maps.tnl_map; - while (map) { - uint64_t rm1bit = rightmost_1bit(map); - int i = raw_ctz(map); + FLOWMAP_FOR_EACH_INDEX(idx, fmap) { + uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx); - if (mask_u64[i]) { - mask->mf.tnl_map |= rm1bit; - *dst++ = mask_u64[i]; - hash = hash_add64(hash, mask_u64[i]); + if (mask_u64) { + flowmap_set(&mask->mf.map, idx, 1); + *dst++ = mask_u64; + hash = hash_add64(hash, mask_u64); } - map -= rm1bit; } - mask_u64 += FLOW_TNL_U64S; - map = maps.pkt_map; - while (map) { - uint64_t rm1bit = rightmost_1bit(map); - int i = raw_ctz(map); - if (mask_u64[i]) { - mask->mf.pkt_map |= rm1bit; - *dst++ = mask_u64[i]; - hash = hash_add64(hash, mask_u64[i]); - } - map -= rm1bit; - } + map_t map; - hash = hash_add64(hash, mask->mf.tnl_map); - hash = hash_add64(hash, mask->mf.pkt_map); + FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) { + hash = hash_add64(hash, map); + } - n = dst - miniflow_get_values(&mask->mf); + size_t n = dst - miniflow_get_values(&mask->mf); mask->hash = hash_finish(hash, n * 8); mask->len = netdev_flow_key_size(n); @@ -1645,7 +1683,7 @@ netdev_flow_key_init_masked(struct netdev_flow_key *dst, dst->len = mask->len; dst->mf = mask->mf; /* Copy maps. */ - FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf) { + FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) { *dst_u64 = value & *mask_u64++; hash = hash_add64(hash, *dst_u64++); } @@ -1653,13 +1691,9 @@ netdev_flow_key_init_masked(struct netdev_flow_key *dst, (dst_u64 - miniflow_get_values(&dst->mf)) * 8); } -/* Iterate through netdev_flow_key TNL u64 values specified by 'MAPS'. */ -#define NETDEV_FLOW_KEY_FOR_EACH_IN_TNL_MAP(VALUE, KEY, MAPS) \ - MINIFLOW_FOR_EACH_IN_TNL_MAP(VALUE, &(KEY)->mf, MAPS) - -/* Iterate through netdev_flow_key PKT u64 values specified by 'MAPS'. */ -#define NETDEV_FLOW_KEY_FOR_EACH_IN_PKT_MAP(VALUE, KEY, MAPS) \ - MINIFLOW_FOR_EACH_IN_PKT_MAP(VALUE, &(KEY)->mf, MAPS) +/* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */ +#define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \ + MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP) /* Returns a hash value for the bits of 'key' where there are 1-bits in * 'mask'. 
*/ @@ -1669,13 +1703,10 @@ netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key, { const uint64_t *p = miniflow_get_values(&mask->mf); uint32_t hash = 0; - uint64_t key_u64; + uint64_t value; - NETDEV_FLOW_KEY_FOR_EACH_IN_TNL_MAP(key_u64, key, mask->mf) { - hash = hash_add64(hash, key_u64 & *p++); - } - NETDEV_FLOW_KEY_FOR_EACH_IN_PKT_MAP(key_u64, key, mask->mf) { - hash = hash_add64(hash, key_u64 & *p++); + NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) { + hash = hash_add64(hash, value & *p++); } return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8); @@ -1879,59 +1910,33 @@ static int dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len, const struct nlattr *mask_key, uint32_t mask_key_len, const struct flow *flow, - struct flow *mask) -{ - if (mask_key_len) { - enum odp_key_fitness fitness; - - fitness = odp_flow_key_to_mask(mask_key, mask_key_len, key, key_len, - mask, flow); - if (fitness) { - /* This should not happen: it indicates that - * odp_flow_key_from_mask() and odp_flow_key_to_mask() - * disagree on the acceptable form of a mask. Log the problem - * as an error, with enough details to enable debugging. */ - static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); - - if (!VLOG_DROP_ERR(&rl)) { - struct ds s; - - ds_init(&s); - odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s, - true); - VLOG_ERR("internal error parsing flow mask %s (%s)", - ds_cstr(&s), odp_key_fitness_to_string(fitness)); - ds_destroy(&s); - } + struct flow_wildcards *wc) +{ + enum odp_key_fitness fitness; - return EINVAL; - } - } else { - enum mf_field_id id; - /* No mask key, unwildcard everything except fields whose - * prerequisities are not met. */ - memset(mask, 0x0, sizeof *mask); - - for (id = 0; id < MFF_N_IDS; ++id) { - /* Skip registers and metadata. */ - if (!(id >= MFF_REG0 && id < MFF_REG0 + FLOW_N_REGS) - && !(id >= MFF_XREG0 && id < MFF_XREG0 + FLOW_N_XREGS) - && id != MFF_METADATA) { - const struct mf_field *mf = mf_from_id(id); - if (mf_are_prereqs_ok(mf, flow)) { - mf_mask_field(mf, mask); - } - } + fitness = odp_flow_key_to_mask_udpif(mask_key, mask_key_len, key, + key_len, wc, flow); + if (fitness) { + /* This should not happen: it indicates that + * odp_flow_key_from_mask() and odp_flow_key_to_mask() + * disagree on the acceptable form of a mask. Log the problem + * as an error, with enough details to enable debugging. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + + if (!VLOG_DROP_ERR(&rl)) { + struct ds s; + + ds_init(&s); + odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s, + true); + VLOG_ERR("internal error parsing flow mask %s (%s)", + ds_cstr(&s), odp_key_fitness_to_string(fitness)); + ds_destroy(&s); } + + return EINVAL; } - /* Force unwildcard the in_port. - * - * We need to do this even in the case where we unwildcard "everything" - * above because "everything" only includes the 16-bit OpenFlow port number - * mask->in_port.ofp_port, which only covers half of the 32-bit datapath - * port number mask->in_port.odp_port. */ - mask->in_port.odp_port = u32_to_odp(UINT32_MAX); return 0; } @@ -1941,7 +1946,7 @@ dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len, { odp_port_t in_port; - if (odp_flow_key_to_flow(key, key_len, flow)) { + if (odp_flow_key_to_flow_udpif(key, key_len, flow)) { /* This should not happen: it indicates that odp_flow_key_from_flow() * and odp_flow_key_to_flow() disagree on the acceptable form of a * flow. 
Log the problem as an error, with enough details to enable @@ -1965,6 +1970,12 @@ dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len, return EINVAL; } + /* Userspace datapath doesn't support conntrack. */ + if (flow->ct_state || flow->ct_zone || flow->ct_mark + || !ovs_u128_is_zero(&flow->ct_label)) { + return EINVAL; + } + return 0; } @@ -2008,8 +2019,8 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd, netdev_flow_mask_init(&mask, match); /* Make sure wc does not have metadata. */ - ovs_assert(!(mask.mf.pkt_map - & (MINIFLOW_PKT_MAP(metadata) | MINIFLOW_PKT_MAP(regs)))); + ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata) + && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs)); /* Do not allocate extra space. */ flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len); @@ -2032,6 +2043,7 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd, struct match match; struct ds ds = DS_EMPTY_INITIALIZER; + match.tun_md.valid = false; match.flow = flow->flow; miniflow_expand(&flow->cr.mask->mf, &match.wc.masks); @@ -2069,7 +2081,7 @@ dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put) } error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len, put->mask, put->mask_len, - &match.flow, &match.wc.masks); + &match.flow, &match.wc); if (error) { return error; } @@ -2402,32 +2414,42 @@ dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops) /* Returns true if the configuration for rx queues or cpu mask * is changed. */ static bool -pmd_config_changed(const struct dp_netdev *dp, size_t rxqs, const char *cmask) +pmd_config_changed(const struct dp_netdev *dp, const char *cmask) { - if (dp->n_dpdk_rxqs != rxqs) { - return true; - } else { - if (dp->pmd_cmask != NULL && cmask != NULL) { - return strcmp(dp->pmd_cmask, cmask); - } else { - return (dp->pmd_cmask != NULL || cmask != NULL); + struct dp_netdev_port *port; + + CMAP_FOR_EACH (port, node, &dp->ports) { + struct netdev *netdev = port->netdev; + int requested_n_rxq = netdev_requested_n_rxq(netdev); + if (netdev_is_pmd(netdev) + && port->latest_requested_n_rxq != requested_n_rxq) { + return true; } } + + if (dp->pmd_cmask != NULL && cmask != NULL) { + return strcmp(dp->pmd_cmask, cmask); + } else { + return (dp->pmd_cmask != NULL || cmask != NULL); + } } /* Resets pmd threads if the configuration for 'rxq's or cpu mask changes. */ static int -dpif_netdev_pmd_set(struct dpif *dpif, unsigned int n_rxqs, const char *cmask) +dpif_netdev_pmd_set(struct dpif *dpif, const char *cmask) { struct dp_netdev *dp = get_dp_netdev(dpif); - if (pmd_config_changed(dp, n_rxqs, cmask)) { + if (pmd_config_changed(dp, cmask)) { struct dp_netdev_port *port; dp_netdev_destroy_all_pmds(dp); CMAP_FOR_EACH (port, node, &dp->ports) { - if (netdev_is_pmd(port->netdev)) { + struct netdev *netdev = port->netdev; + int requested_n_rxq = netdev_requested_n_rxq(netdev); + if (netdev_is_pmd(port->netdev) + && port->latest_requested_n_rxq != requested_n_rxq) { int i, err; /* Closes the existing 'rxq's. */ @@ -2439,14 +2461,14 @@ dpif_netdev_pmd_set(struct dpif *dpif, unsigned int n_rxqs, const char *cmask) /* Sets the new rx queue config. 
*/ err = netdev_set_multiq(port->netdev, ovs_numa_get_n_cores() + 1, - n_rxqs); + requested_n_rxq); if (err && (err != EOPNOTSUPP)) { VLOG_ERR("Failed to set dpdk interface %s rx_queue to:" " %u", netdev_get_name(port->netdev), - n_rxqs); + requested_n_rxq); return err; } - + port->latest_requested_n_rxq = requested_n_rxq; /* If the set_multiq() above succeeds, reopens the 'rxq's. */ port->rxq = xrealloc(port->rxq, sizeof *port->rxq * netdev_n_rxq(port->netdev)); @@ -2455,8 +2477,6 @@ dpif_netdev_pmd_set(struct dpif *dpif, unsigned int n_rxqs, const char *cmask) } } } - dp->n_dpdk_rxqs = n_rxqs; - /* Reconfigures the cpu mask. */ ovs_numa_set_cpu_mask(cmask); free(dp->pmd_cmask); @@ -2552,16 +2572,10 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd, error = netdev_rxq_recv(rxq, packets, &cnt); cycles_count_end(pmd, PMD_CYCLES_POLLING); if (!error) { - int i; - *recirc_depth_get() = 0; - /* XXX: initialize md in netdev implementation. */ - for (i = 0; i < cnt; i++) { - pkt_metadata_init(&packets[i]->md, port->port_no); - } cycles_count_start(pmd); - dp_netdev_input(pmd, packets, cnt); + dp_netdev_input(pmd, packets, cnt, port->port_no); cycles_count_end(pmd, PMD_CYCLES_PROCESSING); } else if (error != EAGAIN && error != EOPNOTSUPP) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); @@ -2594,7 +2608,8 @@ dpif_netdev_run(struct dpif *dpif) ovs_mutex_unlock(&dp->non_pmd_mutex); dp_netdev_pmd_unref(non_pmd); - tnl_arp_cache_run(); + tnl_neigh_cache_run(); + tnl_port_map_run(); new_tnl_seq = seq_read(tnl_conf_seq); if (dp->last_tnl_conf_seq != new_tnl_seq) { @@ -2624,56 +2639,29 @@ dpif_netdev_wait(struct dpif *dpif) seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq); } -struct rxq_poll { - struct dp_netdev_port *port; - struct netdev_rxq *rx; -}; - static int pmd_load_queues(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **ppoll_list, int poll_cnt) + OVS_REQUIRES(pmd->poll_mutex) { struct rxq_poll *poll_list = *ppoll_list; - struct dp_netdev_port *port; - int n_pmds_on_numa, index, i; + struct rxq_poll *poll; + int i; - /* Simple scheduler for netdev rx polling. */ for (i = 0; i < poll_cnt; i++) { port_unref(poll_list[i].port); } - poll_cnt = 0; - n_pmds_on_numa = get_n_pmd_threads_on_numa(pmd->dp, pmd->numa_id); - index = 0; - - CMAP_FOR_EACH (port, node, &pmd->dp->ports) { - /* Calls port_try_ref() to prevent the main thread - * from deleting the port. */ - if (port_try_ref(port)) { - if (netdev_is_pmd(port->netdev) - && netdev_get_numa_id(port->netdev) == pmd->numa_id) { - int i; + poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof *poll_list); - for (i = 0; i < netdev_n_rxq(port->netdev); i++) { - if ((index % n_pmds_on_numa) == pmd->index) { - poll_list = xrealloc(poll_list, - sizeof *poll_list * (poll_cnt + 1)); - - port_ref(port); - poll_list[poll_cnt].port = port; - poll_list[poll_cnt].rx = port->rxq[i]; - poll_cnt++; - } - index++; - } - } - /* Unrefs the port_try_ref(). 
*/ - port_unref(port); - } + i = 0; + LIST_FOR_EACH (poll, node, &pmd->poll_list) { + port_ref(poll->port); + poll_list[i++] = *poll; } *ppoll_list = poll_list; - return poll_cnt; + return pmd->poll_cnt; } static void * @@ -2694,11 +2682,16 @@ pmd_thread_main(void *f_) pmd_thread_setaffinity_cpu(pmd->core_id); reload: emc_cache_init(&pmd->flow_cache); + + ovs_mutex_lock(&pmd->poll_mutex); poll_cnt = pmd_load_queues(pmd, &poll_list, poll_cnt); + ovs_mutex_unlock(&pmd->poll_mutex); /* List port/core affinity */ for (i = 0; i < poll_cnt; i++) { - VLOG_INFO("Core %d processing port \'%s\'\n", pmd->core_id, netdev_get_name(poll_list[i].port->netdev)); + VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n", + pmd->core_id, netdev_get_name(poll_list[i].port->netdev), + netdev_rxq_get_queue_id(poll_list[i].rx)); } /* Signal here to make sure the pmd finishes @@ -2706,8 +2699,6 @@ reload: dp_netdev_pmd_reload_done(pmd); for (;;) { - int i; - for (i = 0; i < poll_cnt; i++) { dp_netdev_process_rxq_port(pmd, poll_list[i].port, poll_list[i].rx); } @@ -2718,6 +2709,7 @@ reload: lc = 0; emc_cache_slow_sweep(&pmd->flow_cache); + coverage_try_clear(); ovsrcu_quiesce(); atomic_read_relaxed(&pmd->change_seq, &seq); @@ -2735,7 +2727,7 @@ reload: } for (i = 0; i < poll_cnt; i++) { - port_unref(poll_list[i].port); + port_unref(poll_list[i].port); } dp_netdev_pmd_reload_done(pmd); @@ -2774,7 +2766,7 @@ dpif_netdev_enable_upcall(struct dpif *dpif) dp_netdev_enable_upcall(dp); } -void +static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd) { ovs_mutex_lock(&pmd->cond_mutex); @@ -2847,16 +2839,6 @@ dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos) return next; } -static int -core_id_to_qid(unsigned core_id) -{ - if (core_id != NON_PMD_CORE_ID) { - return core_id; - } else { - return ovs_numa_get_n_cores(); - } -} - /* Configures the 'pmd' based on the input argument. */ static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp, @@ -2865,8 +2847,13 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp, pmd->dp = dp; pmd->index = index; pmd->core_id = core_id; - pmd->tx_qid = core_id_to_qid(core_id); pmd->numa_id = numa_id; + pmd->poll_cnt = 0; + + atomic_init(&pmd->tx_qid, + (core_id == NON_PMD_CORE_ID) + ? ovs_numa_get_n_cores() + : get_n_pmd_threads(dp)); ovs_refcount_init(&pmd->ref_cnt); latch_init(&pmd->exit_latch); @@ -2874,8 +2861,10 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp, xpthread_cond_init(&pmd->cond, NULL); ovs_mutex_init(&pmd->cond_mutex); ovs_mutex_init(&pmd->flow_mutex); + ovs_mutex_init(&pmd->poll_mutex); dpcls_init(&pmd->cls); cmap_init(&pmd->flow_table); + list_init(&pmd->poll_list); /* init the 'flow_cache' since there is no * actual thread created for NON_PMD_CORE_ID. */ if (core_id == NON_PMD_CORE_ID) { @@ -2895,13 +2884,14 @@ dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd) latch_destroy(&pmd->exit_latch); xpthread_cond_destroy(&pmd->cond); ovs_mutex_destroy(&pmd->cond_mutex); + ovs_mutex_destroy(&pmd->poll_mutex); free(pmd); } /* Stops the pmd thread, removes it from the 'dp->poll_threads', * and unrefs the struct. */ static void -dp_netdev_del_pmd(struct dp_netdev_pmd_thread *pmd) +dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd) { /* Uninit the 'flow_cache' since there is * no actual thread uninit it for NON_PMD_CORE_ID. 
*/ @@ -2913,6 +2903,15 @@ dp_netdev_del_pmd(struct dp_netdev_pmd_thread *pmd) ovs_numa_unpin_core(pmd->core_id); xpthread_join(pmd->thread, NULL); } + + /* Unref all ports and free poll_list. */ + dp_netdev_pmd_clear_poll_list(pmd); + + /* Purges the 'pmd''s flows after stopping the thread, but before + * destroying the flows, so that the flow stats can be collected. */ + if (dp->dp_purge_cb) { + dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id); + } cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0)); dp_netdev_pmd_unref(pmd); } @@ -2922,23 +2921,202 @@ static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp) { struct dp_netdev_pmd_thread *pmd; + struct dp_netdev_pmd_thread **pmd_list; + size_t k = 0, n_pmds; + + n_pmds = cmap_count(&dp->poll_threads); + pmd_list = xcalloc(n_pmds, sizeof *pmd_list); CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { - dp_netdev_del_pmd(pmd); + /* We cannot call dp_netdev_del_pmd(), since it alters + * 'dp->poll_threads' (while we're iterating it) and it + * might quiesce. */ + ovs_assert(k < n_pmds); + pmd_list[k++] = pmd; + } + + for (size_t i = 0; i < k; i++) { + dp_netdev_del_pmd(dp, pmd_list[i]); } + free(pmd_list); } -/* Deletes all pmd threads on numa node 'numa_id'. */ +/* Deletes all pmd threads on numa node 'numa_id' and + * fixes tx_qids of other threads to keep them sequential. */ static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id) { struct dp_netdev_pmd_thread *pmd; + int n_pmds_on_numa, n_pmds; + int *free_idx, k = 0; + struct dp_netdev_pmd_thread **pmd_list; + + n_pmds_on_numa = get_n_pmd_threads_on_numa(dp, numa_id); + free_idx = xcalloc(n_pmds_on_numa, sizeof *free_idx); + pmd_list = xcalloc(n_pmds_on_numa, sizeof *pmd_list); CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + /* We cannot call dp_netdev_del_pmd(), since it alters + * 'dp->poll_threads' (while we're iterating it) and it + * might quiesce. */ if (pmd->numa_id == numa_id) { - dp_netdev_del_pmd(pmd); + atomic_read_relaxed(&pmd->tx_qid, &free_idx[k]); + pmd_list[k] = pmd; + ovs_assert(k < n_pmds_on_numa); + k++; + } + } + + for (int i = 0; i < k; i++) { + dp_netdev_del_pmd(dp, pmd_list[i]); + } + + n_pmds = get_n_pmd_threads(dp); + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + int old_tx_qid; + + atomic_read_relaxed(&pmd->tx_qid, &old_tx_qid); + + if (old_tx_qid >= n_pmds) { + int new_tx_qid = free_idx[--k]; + + atomic_store_relaxed(&pmd->tx_qid, new_tx_qid); + } + } + + free(pmd_list); + free(free_idx); +} + +/* Deletes all rx queues from pmd->poll_list. */ +static void +dp_netdev_pmd_clear_poll_list(struct dp_netdev_pmd_thread *pmd) +{ + struct rxq_poll *poll; + + ovs_mutex_lock(&pmd->poll_mutex); + LIST_FOR_EACH_POP (poll, node, &pmd->poll_list) { + port_unref(poll->port); + free(poll); + } + pmd->poll_cnt = 0; + ovs_mutex_unlock(&pmd->poll_mutex); +} + +/* Deletes all rx queues of 'port' from poll_list of pmd thread and + * reloads it if poll_list was changed. 
*/
+static void
+dp_netdev_del_port_from_pmd(struct dp_netdev_port *port,
+ struct dp_netdev_pmd_thread *pmd)
+{
+ struct rxq_poll *poll, *next;
+ bool found = false;
+
+ ovs_mutex_lock(&pmd->poll_mutex);
+ LIST_FOR_EACH_SAFE (poll, next, node, &pmd->poll_list) {
+ if (poll->port == port) {
+ found = true;
+ port_unref(poll->port);
+ list_remove(&poll->node);
+ pmd->poll_cnt--;
+ free(poll);
+ }
+ }
+ ovs_mutex_unlock(&pmd->poll_mutex);
+ if (found) {
+ dp_netdev_reload_pmd__(pmd);
+ }
+}
+
+/* Deletes all rx queues of 'port' from all pmd threads of dp and
+ * reloads them if needed. */
+static void
+dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
+ struct dp_netdev_port *port)
+{
+ int numa_id = netdev_get_numa_id(port->netdev);
+ struct dp_netdev_pmd_thread *pmd;
+
+ CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+ if (pmd->numa_id == numa_id) {
+ dp_netdev_del_port_from_pmd(port, pmd);
+ }
+ }
+}
+
+/* Returns the PMD thread on this numa node with the fewest rx queues to poll.
+ * Returns NULL if there are no PMD threads on this numa node.
+ * Can be called safely only by the main thread. */
+static struct dp_netdev_pmd_thread *
+dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id)
+{
+ int min_cnt = -1;
+ struct dp_netdev_pmd_thread *pmd, *res = NULL;
+
+ CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+ if (pmd->numa_id == numa_id
+ && (min_cnt > pmd->poll_cnt || res == NULL)) {
+ min_cnt = pmd->poll_cnt;
+ res = pmd;
+ }
+ }
+
+ return res;
+}
+
+/* Adds rx queue to poll_list of PMD thread. */
+static void
+dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
+ struct dp_netdev_port *port, struct netdev_rxq *rx)
+ OVS_REQUIRES(pmd->poll_mutex)
+{
+ struct rxq_poll *poll = xmalloc(sizeof *poll);
+
+ port_ref(port);
+ poll->port = port;
+ poll->rx = rx;
+
+ list_push_back(&pmd->poll_list, &poll->node);
+ pmd->poll_cnt++;
+}
+
+/* Distributes all rx queues of 'port' among all PMD threads and reloads
+ * them if needed. */
+static void
+dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port)
+{
+ int numa_id = netdev_get_numa_id(port->netdev);
+ struct dp_netdev_pmd_thread *pmd;
+ struct hmapx to_reload;
+ struct hmapx_node *node;
+ int i;
+
+ hmapx_init(&to_reload);
+ /* Cannot create pmd threads for invalid numa node. */
+ ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
+
+ for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
+ pmd = dp_netdev_less_loaded_pmd_on_numa(dp, numa_id);
+ if (!pmd) {
+ /* There are no pmd threads on this numa node. */
+ dp_netdev_set_pmds_on_numa(dp, numa_id);
+ /* Assignment of rx queues done. */
+ break;
+ }
+
+ ovs_mutex_lock(&pmd->poll_mutex);
+ dp_netdev_add_rxq_to_pmd(pmd, port, port->rxq[i]);
+ ovs_mutex_unlock(&pmd->poll_mutex);
+
+ hmapx_add(&to_reload, pmd);
+ }
+
+ HMAPX_FOR_EACH (node, &to_reload) {
+ pmd = (struct dp_netdev_pmd_thread *) node->data;
+ dp_netdev_reload_pmd__(pmd);
+ }
+
+ hmapx_destroy(&to_reload);
+}

/* Checks the numa node id of 'netdev' and starts pmd threads for
@@ -2960,7 +3138,9 @@ dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
* in which 'netdev' is on, do nothing. Else, creates the
* pmd threads for the numa node. 
*/ if (!n_pmds) { - int can_have, n_unpinned, i; + int can_have, n_unpinned, i, index = 0; + struct dp_netdev_pmd_thread **pmds; + struct dp_netdev_port *port; n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id); if (!n_unpinned) { @@ -2972,15 +3152,32 @@ dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id) /* If cpu mask is specified, uses all unpinned cores, otherwise * tries creating NR_PMD_THREADS pmd threads. */ can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, NR_PMD_THREADS); + pmds = xzalloc(can_have * sizeof *pmds); for (i = 0; i < can_have; i++) { - struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd); unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id); + pmds[i] = xzalloc(sizeof **pmds); + dp_netdev_configure_pmd(pmds[i], dp, i, core_id, numa_id); + } - dp_netdev_configure_pmd(pmd, dp, i, core_id, numa_id); - /* Each thread will distribute all devices rx-queues among - * themselves. */ - pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd); + /* Distributes rx queues of this numa node between new pmd threads. */ + CMAP_FOR_EACH (port, node, &dp->ports) { + if (netdev_is_pmd(port->netdev) + && netdev_get_numa_id(port->netdev) == numa_id) { + for (i = 0; i < netdev_n_rxq(port->netdev); i++) { + /* Make thread-safety analyser happy. */ + ovs_mutex_lock(&pmds[index]->poll_mutex); + dp_netdev_add_rxq_to_pmd(pmds[index], port, port->rxq[i]); + ovs_mutex_unlock(&pmds[index]->poll_mutex); + index = (index + 1) % can_have; + } + } } + + /* Actual start of pmd threads. */ + for (i = 0; i < can_have; i++) { + pmds[i]->thread = ovs_thread_create("pmd", pmd_thread_main, pmds[i]); + } + free(pmds); VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id); } } @@ -3036,11 +3233,27 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, struct ofpbuf *actions, struct ofpbuf *put_actions) { struct dp_netdev *dp = pmd->dp; + struct flow_tnl orig_tunnel; + int err; if (OVS_UNLIKELY(!dp->upcall_cb)) { return ENODEV; } + /* Upcall processing expects the Geneve options to be in the translated + * format but we need to retain the raw format for datapath use. */ + orig_tunnel.flags = flow->tunnel.flags; + if (flow->tunnel.flags & FLOW_TNL_F_UDPIF) { + orig_tunnel.metadata.present.len = flow->tunnel.metadata.present.len; + memcpy(orig_tunnel.metadata.opts.gnv, flow->tunnel.metadata.opts.gnv, + flow->tunnel.metadata.present.len); + err = tun_metadata_from_geneve_udpif(&orig_tunnel, &orig_tunnel, + &flow->tunnel); + if (err) { + return err; + } + } + if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) { struct ds ds = DS_EMPTY_INITIALIZER; char *packet_str; @@ -3068,8 +3281,48 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, ds_destroy(&ds); } - return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata, - actions, wc, put_actions, dp->upcall_aux); + err = dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata, + actions, wc, put_actions, dp->upcall_aux); + if (err && err != ENOSPC) { + return err; + } + + /* Translate tunnel metadata masks to datapath format. 
*/ + if (wc) { + if (wc->masks.tunnel.metadata.present.map) { + struct geneve_opt opts[TLV_TOT_OPT_SIZE / + sizeof(struct geneve_opt)]; + + if (orig_tunnel.flags & FLOW_TNL_F_UDPIF) { + tun_metadata_to_geneve_udpif_mask(&flow->tunnel, + &wc->masks.tunnel, + orig_tunnel.metadata.opts.gnv, + orig_tunnel.metadata.present.len, + opts); + } else { + orig_tunnel.metadata.present.len = 0; + } + + memset(&wc->masks.tunnel.metadata, 0, + sizeof wc->masks.tunnel.metadata); + memcpy(&wc->masks.tunnel.metadata.opts.gnv, opts, + orig_tunnel.metadata.present.len); + } + wc->masks.tunnel.metadata.present.len = 0xff; + } + + /* Restore tunnel metadata. We need to use the saved options to ensure + * that any unknown options are not lost. The generated mask will have + * the same structure, matching on types and lengths but wildcarding + * option data we don't care about. */ + if (orig_tunnel.flags & FLOW_TNL_F_UDPIF) { + memcpy(&flow->tunnel.metadata.opts.gnv, orig_tunnel.metadata.opts.gnv, + orig_tunnel.metadata.present.len); + flow->tunnel.metadata.present.len = orig_tunnel.metadata.present.len; + flow->tunnel.flags |= FLOW_TNL_F_UDPIF; + } + + return err; } static inline uint32_t @@ -3078,8 +3331,9 @@ dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, { uint32_t hash, recirc_depth; - hash = dp_packet_get_rss_hash(packet); - if (OVS_UNLIKELY(!hash)) { + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { + hash = dp_packet_get_rss_hash(packet); + } else { hash = miniflow_hash_5tuple(mf, 0); dp_packet_set_rss_hash(packet, hash); } @@ -3148,74 +3402,76 @@ dp_netdev_queue_batches(struct dp_packet *pkt, { struct packet_batch *batch = flow->batch; - if (OVS_LIKELY(batch)) { - packet_batch_update(batch, pkt, mf); - return; + if (OVS_UNLIKELY(!batch)) { + batch = &batches[(*n_batches)++]; + packet_batch_init(batch, flow); } - batch = &batches[(*n_batches)++]; - packet_batch_init(batch, flow); packet_batch_update(batch, pkt, mf); } -static inline void -dp_packet_swap(struct dp_packet **a, struct dp_packet **b) -{ - struct dp_packet *tmp = *a; - *a = *b; - *b = tmp; -} - /* Try to process all ('cnt') the 'packets' using only the exact match cache - * 'flow_cache'. If a flow is not found for a packet 'packets[i]', the + * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the * miniflow is copied into 'keys' and the packet pointer is moved at the * beginning of the 'packets' array. * * The function returns the number of packets that needs to be processed in the * 'packets' array (they have been moved to the beginning of the vector). + * + * If 'md_is_valid' is false, the metadata in 'packets' is not valid and must be + * initialized by this function using 'port_no'. 
*/
static inline size_t
emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
size_t cnt, struct netdev_flow_key *keys,
- struct packet_batch batches[], size_t *n_batches)
+ struct packet_batch batches[], size_t *n_batches,
+ bool md_is_valid, odp_port_t port_no)
{
struct emc_cache *flow_cache = &pmd->flow_cache;
- struct netdev_flow_key key;
- size_t i, notfound_cnt = 0;
+ struct netdev_flow_key *key = &keys[0];
+ size_t i, n_missed = 0, n_dropped = 0;

for (i = 0; i < cnt; i++) {
struct dp_netdev_flow *flow;
+ struct dp_packet *packet = packets[i];

- if (OVS_UNLIKELY(dp_packet_size(packets[i]) < ETH_HEADER_LEN)) {
- dp_packet_delete(packets[i]);
+ if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
+ dp_packet_delete(packet);
+ n_dropped++;
continue;
}

if (i != cnt - 1) {
- /* Prefetch next packet data */
+ /* Prefetch next packet data and metadata. */
OVS_PREFETCH(dp_packet_data(packets[i+1]));
+ pkt_metadata_prefetch_init(&packets[i+1]->md);
}

+ if (!md_is_valid) {
+ pkt_metadata_init(&packet->md, port_no);
+ }
- miniflow_extract(packets[i], &key.mf);
- key.len = 0; /* Not computed yet. */
- key.hash = dpif_netdev_packet_get_rss_hash(packets[i], &key.mf);
+ miniflow_extract(packet, &key->mf);
+ key->len = 0; /* Not computed yet. */
+ key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);

- flow = emc_lookup(flow_cache, &key);
+ flow = emc_lookup(flow_cache, key);
if (OVS_LIKELY(flow)) {
- dp_netdev_queue_batches(packets[i], flow, &key.mf, batches,
+ dp_netdev_queue_batches(packet, flow, &key->mf, batches,
n_batches);
} else {
- if (i != notfound_cnt) {
- dp_packet_swap(&packets[i], &packets[notfound_cnt]);
- }
-
- keys[notfound_cnt++] = key;
+ /* Exact match cache missed. Group missed packets together at
+ * the beginning of the 'packets' array. */
+ packets[n_missed] = packet;
+ /* 'keys[n_missed]' contains the key of the current packet and it
+ * must be returned to the caller. The next key should be extracted
+ * to 'keys[n_missed + 1]'. */
+ key = &keys[++n_missed];
}
}

- dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT, cnt - notfound_cnt);
+ dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT, cnt - n_dropped - n_missed);

- return notfound_cnt;
+ return n_missed;
}

static inline void
@@ -3271,6 +3527,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,

miss_cnt++;

+ match.tun_md.valid = false;
miniflow_expand(&keys[i].mf, &match.flow);

ofpbuf_clear(&actions);
@@ -3286,6 +3543,16 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
continue;
}

+ /* The Netlink encoding of datapath flow keys cannot express
+ * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
+ * tag is interpreted as exact match on the fact that there is no
+ * VLAN. Unless we refactor a lot of code that translates between
+ * Netlink and struct flow representations, we have to do the same
+ * here. */
+ if (!match.wc.masks.vlan_tci) {
+ match.wc.masks.vlan_tci = htons(0xffff);
+ }
+
/* We can't allow the packet batching in the next loop to execute
* the actions. Otherwise, if there are any slow path actions,
* we'll send the packet up twice. */
@@ -3346,9 +3613,16 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
}

+/* Packets enter the datapath from a port (or from recirculation) here.
+ *
+ * For performance reasons a caller may choose not to initialize the metadata
+ * in 'packets': in this case 'md_is_valid' is false and this function needs to
+ * initialize it using 'port_no'. 
If the metadata in 'packets' is already + * valid, 'md_is_valid' must be true and 'port_no' will be ignored. */ static void -dp_netdev_input(struct dp_netdev_pmd_thread *pmd, - struct dp_packet **packets, int cnt) +dp_netdev_input__(struct dp_netdev_pmd_thread *pmd, + struct dp_packet **packets, int cnt, + bool md_is_valid, odp_port_t port_no) { #if !defined(__CHECKER__) && !defined(_WIN32) const size_t PKT_ARRAY_SIZE = cnt; @@ -3362,7 +3636,8 @@ dp_netdev_input(struct dp_netdev_pmd_thread *pmd, size_t newcnt, n_batches, i; n_batches = 0; - newcnt = emc_processing(pmd, packets, cnt, keys, batches, &n_batches); + newcnt = emc_processing(pmd, packets, cnt, keys, batches, &n_batches, + md_is_valid, port_no); if (OVS_UNLIKELY(newcnt)) { fast_path_processing(pmd, packets, newcnt, keys, batches, &n_batches); } @@ -3376,10 +3651,34 @@ dp_netdev_input(struct dp_netdev_pmd_thread *pmd, } } +static void +dp_netdev_input(struct dp_netdev_pmd_thread *pmd, + struct dp_packet **packets, int cnt, + odp_port_t port_no) +{ + dp_netdev_input__(pmd, packets, cnt, false, port_no); +} + +static void +dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd, + struct dp_packet **packets, int cnt) +{ + dp_netdev_input__(pmd, packets, cnt, true, 0); +} + struct dp_netdev_execute_aux { struct dp_netdev_pmd_thread *pmd; }; +static void +dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb, + void *aux) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + dp->dp_purge_aux = aux; + dp->dp_purge_cb = cb; +} + static void dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb, void *aux) @@ -3448,7 +3747,11 @@ dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt, case OVS_ACTION_ATTR_OUTPUT: p = dp_netdev_lookup_port(dp, u32_to_odp(nl_attr_get_u32(a))); if (OVS_LIKELY(p)) { - netdev_send(p->netdev, pmd->tx_qid, packets, cnt, may_steal); + int tx_qid; + + atomic_read_relaxed(&pmd->tx_qid, &tx_qid); + + netdev_send(p->netdev, tx_qid, packets, cnt, may_steal); return; } break; @@ -3466,7 +3769,7 @@ dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt, err = push_tnl_action(dp, a, packets, cnt); if (!err) { (*depth)++; - dp_netdev_input(pmd, packets, cnt); + dp_netdev_recirculate(pmd, packets, cnt); (*depth)--; } else { dp_netdev_drop_packets(tnl_pkt, cnt, !may_steal); @@ -3497,7 +3800,7 @@ dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt, } (*depth)++; - dp_netdev_input(pmd, packets, cnt); + dp_netdev_recirculate(pmd, packets, cnt); (*depth)--; } else { dp_netdev_drop_packets(tnl_pkt, cnt, !may_steal); @@ -3555,7 +3858,7 @@ dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt, } (*depth)++; - dp_netdev_input(pmd, packets, cnt); + dp_netdev_recirculate(pmd, packets, cnt); (*depth)--; return; @@ -3564,6 +3867,13 @@ dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt, VLOG_WARN("Packet dropped. Max recirculation depth exceeded."); break; + case OVS_ACTION_ATTR_CT: + /* If a flow with this action is slow-pathed, datapath assistance is + * required to implement it. However, we don't support this action + * in the userspace datapath. 
*/
+ VLOG_WARN("Cannot execute conntrack action in userspace.");
+ break;
+
case OVS_ACTION_ATTR_PUSH_VLAN:
case OVS_ACTION_ATTR_POP_VLAN:
case OVS_ACTION_ATTR_PUSH_MPLS:
@@ -3627,10 +3937,15 @@ const struct dpif_class dpif_netdev_class = {
NULL, /* recv */
NULL, /* recv_wait */
NULL, /* recv_purge */
+ dpif_netdev_register_dp_purge_cb,
dpif_netdev_register_upcall_cb,
dpif_netdev_enable_upcall,
dpif_netdev_disable_upcall,
dpif_netdev_get_datapath_version,
+ NULL, /* ct_dump_start */
+ NULL, /* ct_dump_next */
+ NULL, /* ct_dump_done */
+ NULL, /* ct_flush */
};

static void
@@ -3730,7 +4045,14 @@ dpif_dummy_register__(const char *type)
static void
dpif_dummy_override(const char *type)
{
- if (!dp_unregister_provider(type)) {
+ int error;
+
+ /*
+ * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
+ * a userland-only build. It's useful for the testsuite.
+ */
+ error = dp_unregister_provider(type);
+ if (error == 0 || error == EAFNOSUPPORT) {
dpif_dummy_register__(type);
}
}
@@ -3878,15 +4200,10 @@ dpcls_rule_matches_key(const struct dpcls_rule *rule,
{
const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
- uint64_t target_u64;
+ uint64_t value;

- NETDEV_FLOW_KEY_FOR_EACH_IN_TNL_MAP(target_u64, target, rule->flow.mf) {
- if (OVS_UNLIKELY((target_u64 & *maskp++) != *keyp++)) {
- return false;
- }
- }
- NETDEV_FLOW_KEY_FOR_EACH_IN_PKT_MAP(target_u64, target, rule->flow.mf) {
- if (OVS_UNLIKELY((target_u64 & *maskp++) != *keyp++)) {
+ NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
+ if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
return false;
}
}
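
For reference, the rx-queue/pmd assignment built up by dp_netdev_add_port_to_pmds() can be inspected at runtime through the "dpif-netdev/pmd-rxq-show" unixctl command registered above, invoked via ovs-appctl. The transcript below is a sketch only: the port name "dpdk0", the core ids, and the queue placement are hypothetical (they depend on the local configuration), but the layout follows the format strings in pmd_info_show_rxq(). It assumes one dpdk port with four rx queues distributed by the least-loaded-pmd rule over two pmd threads on numa node 0:

    $ ovs-appctl dpif-netdev/pmd-rxq-show
    pmd thread numa_id 0 core_id 1:
            port: dpdk0     queue-id: 0 2
    pmd thread numa_id 0 core_id 2:
            port: dpdk0     queue-id: 1 3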