 * Copyright (c) 2014, 2015, 2016 Nicira, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
#include <sys/types.h>

#include "dp-packet.h"
#include "dpif-netdev.h"
#include "fatal-signal.h"
#include "netdev-dpdk.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/vlog.h"
#include "ovs-thread.h"
#include "unaligned.h"

#include "rte_config.h"
#include "rte_meter.h"
#include "rte_virtio_net.h"
VLOG_DEFINE_THIS_MODULE(dpdk);
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

#define DPDK_PORT_WATCHDOG_INTERVAL 5

#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
#define OVS_VPORT_DPDK "ovs_dpdk"
 * need to reserve tons of extra space in the mbufs so we can align the
 * DMA addresses to 4KB.
 * The minimum mbuf size is limited to avoid scatter behaviour and drop in
 * performance for standard Ethernet MTU.
#define ETHER_HDR_MAX_LEN (ETHER_HDR_LEN + ETHER_CRC_LEN + (2 * VLAN_HEADER_LEN))
#define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
#define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
#define FRAME_LEN_TO_MTU(frame_len) ((frame_len) - ETHER_HDR_LEN - ETHER_CRC_LEN)
#define MBUF_SIZE(mtu) (MTU_TO_MAX_FRAME_LEN(mtu)    \
                        + sizeof(struct dp_packet)   \
                        + RTE_PKTMBUF_HEADROOM)
#define NETDEV_DPDK_MBUF_ALIGN 1024
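/* Illustrative sanity checks (added here, not in the original source):
 * assuming DPDK's conventional 14-byte Ethernet header, 4-byte CRC and
 * OVS's 4-byte VLAN_HEADER_LEN, a 1500-byte MTU gives a 1518-byte standard
 * frame and a 1526-byte maximum frame (two VLAN tags for QinQ). */
BUILD_ASSERT_DECL(MTU_TO_FRAME_LEN(1500) == 1518);
BUILD_ASSERT_DECL(MTU_TO_MAX_FRAME_LEN(1500) == 1526);
BUILD_ASSERT_DECL(FRAME_LEN_TO_MTU(MTU_TO_FRAME_LEN(1500)) == 1500);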
/* Max and min number of packets in the mempool. OVS tries to allocate a
 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
 * enough hugepages) we keep halving the number until the allocation succeeds
 * or we reach MIN_NB_MBUF. */
#define MAX_NB_MBUF          (4096 * 64)
#define MIN_NB_MBUF          (4096 * 4)
#define MP_CACHE_SZ          RTE_MEMPOOL_CACHE_MAX_SIZE

/* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF. */
BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF) == 0);
/* The smallest possible NB_MBUF that we're going to try should be a multiple
 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
                  % MP_CACHE_SZ == 0);
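/* Illustrative arithmetic (not in the original source): with the values
 * above, MAX_NB_MBUF is 262144 and MIN_NB_MBUF is 16384, so the allocation
 * loop in dpdk_mp_get() tries 262144, 131072, 65536, 32768 and finally
 * 16384 mbufs before giving up. */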
 * DPDK XSTATS Counter names definition
#define XSTAT_RX_64_PACKETS              "rx_size_64_packets"
#define XSTAT_RX_65_TO_127_PACKETS       "rx_size_65_to_127_packets"
#define XSTAT_RX_128_TO_255_PACKETS      "rx_size_128_to_255_packets"
#define XSTAT_RX_256_TO_511_PACKETS      "rx_size_256_to_511_packets"
#define XSTAT_RX_512_TO_1023_PACKETS     "rx_size_512_to_1023_packets"
#define XSTAT_RX_1024_TO_1522_PACKETS    "rx_size_1024_to_1522_packets"
#define XSTAT_RX_1523_TO_MAX_PACKETS     "rx_size_1523_to_max_packets"

#define XSTAT_TX_64_PACKETS              "tx_size_64_packets"
#define XSTAT_TX_65_TO_127_PACKETS       "tx_size_65_to_127_packets"
#define XSTAT_TX_128_TO_255_PACKETS      "tx_size_128_to_255_packets"
#define XSTAT_TX_256_TO_511_PACKETS      "tx_size_256_to_511_packets"
#define XSTAT_TX_512_TO_1023_PACKETS     "tx_size_512_to_1023_packets"
#define XSTAT_TX_1024_TO_1522_PACKETS    "tx_size_1024_to_1522_packets"
#define XSTAT_TX_1523_TO_MAX_PACKETS     "tx_size_1523_to_max_packets"

#define XSTAT_TX_MULTICAST_PACKETS       "tx_multicast_packets"
#define XSTAT_RX_BROADCAST_PACKETS       "rx_broadcast_packets"
#define XSTAT_TX_BROADCAST_PACKETS       "tx_broadcast_packets"
#define XSTAT_RX_UNDERSIZED_ERRORS       "rx_undersized_errors"
#define XSTAT_RX_OVERSIZE_ERRORS         "rx_oversize_errors"
#define XSTAT_RX_FRAGMENTED_ERRORS       "rx_fragmented_errors"
#define XSTAT_RX_JABBER_ERRORS           "rx_jabber_errors"
#define NIC_PORT_RX_Q_SIZE 2048  /* Size of Physical NIC RX Queue, Max (n+32<=4096) */
#define NIC_PORT_TX_Q_SIZE 2048  /* Size of Physical NIC TX Queue, Max (n+32<=4096) */

#define OVS_VHOST_MAX_QUEUE_NUM 1024     /* Maximum number of vHost TX queues. */
#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
#define OVS_VHOST_QUEUE_DISABLED    (-2) /* Queue was disabled by guest and not
                                          * yet mapped to another queue. */
static char *cuse_dev_name = NULL;   /* Name of the vhost-cuse character device. */
static char *vhost_sock_dir = NULL;  /* Location of vhost-user sockets. */
 * Maximum amount of time, in microseconds, to keep retrying enqueues to vhost.
 */
#define VHOST_ENQ_RETRY_USECS 100
static const struct rte_eth_conf port_conf = {
        .mq_mode = ETH_MQ_RX_RSS,
        .header_split   = 0, /* Header split disabled. */
        .hw_ip_checksum = 0, /* IP checksum offload disabled. */
        .hw_vlan_filter = 0, /* VLAN filtering disabled. */
        .jumbo_frame    = 0, /* Jumbo frame support disabled. */
            .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
        .mq_mode = ETH_MQ_TX_NONE,
enum { MAX_TX_QUEUE_LEN = 384 };
enum { DPDK_RING_SIZE = 256 };
BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
enum { DRAIN_TSC = 200000ULL };
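/* Illustrative note (not in the original source): DRAIN_TSC is measured in
 * TSC cycles, so on a hypothetical 2 GHz core the drain deadline used by
 * dpdk_queue_pkts() works out to 200000 / 2e9 s = 100 us. */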
static int rte_eal_init_ret = ENODEV;

static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;

/* Quality of Service */

/* An instance of a QoS configuration. Always associated with a particular
 * Each QoS implementation subclasses this with whatever additional data it

    const struct dpdk_qos_ops *ops;
/* A particular implementation of dpdk QoS operations.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted. All of them must be provided, except
 * where otherwise noted. */
struct dpdk_qos_ops {

    /* Name of the QoS type */
    const char *qos_name;

    /* Called to construct the QoS implementation on 'netdev'. The
     * implementation should make the appropriate calls to configure QoS
     * according to 'details'. The implementation may assume that any current
     * QoS configuration already installed should be destroyed before
     * constructing the new configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->qos_conf'
     * to an initialized 'struct qos_conf'.
     *
     * For all QoS implementations it should always be non-null.
     */
    int (*qos_construct)(struct netdev *netdev, const struct smap *details);

    /* Destroys the data structures allocated by the implementation as part of
     *
     * For all QoS implementations it should always be non-null.
     */
    void (*qos_destruct)(struct netdev *netdev, struct qos_conf *conf);

    /* Retrieves details of 'netdev->qos_conf' configuration into 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     */
    int (*qos_get)(const struct netdev *netdev, struct smap *details);

    /* Reconfigures 'netdev->qos_conf' according to 'details', performing any
     * required calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'qos_conf' is not configurable.
     */
    int (*qos_set)(struct netdev *netdev, const struct smap *details);
    /* Modify an array of rte_mbufs. The modification is specific to
     * each qos implementation.
     *
     * The function should take an array of mbufs and an int representing
     * the current number of mbufs present in the array.
     *
     * After the function has performed a qos modification to the array of
     * mbufs it returns an int representing the number of mbufs now present in
     * the array. This value can then be passed to the port send function
     * along with the modified array for transmission.
     *
     * For all QoS implementations it should always be non-null.
     */
    int (*qos_run)(struct netdev *netdev, struct rte_mbuf **pkts,
                   int cnt);
};
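/*
 * A minimal sketch (illustrative only, not part of the tree) of the
 * 'qos_run' contract described above. A hypothetical pass-through
 * implementation admits every packet and returns 'cnt' unchanged; a real
 * policer such as 'egress_policer_ops' instead frees the mbufs it drops
 * and returns the reduced count:
 *
 *     static int
 *     noop_qos_run(struct netdev *netdev OVS_UNUSED,
 *                  struct rte_mbuf **pkts OVS_UNUSED, int cnt)
 *     {
 *         return cnt;   // nothing dropped; all 'cnt' mbufs remain valid
 *     }
 */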
/* dpdk_qos_ops for each type of user space QoS implementation. */
static const struct dpdk_qos_ops egress_policer_ops;

/*
 * Array of dpdk_qos_ops, contains pointers to all supported QoS
 * operations.
 */
static const struct dpdk_qos_ops *const qos_confs[] = {
/* Contains all 'struct dpdk_dev's. */
static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_list);

static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_mp_list);
/* This mutex must be used by non pmd threads when allocating or freeing
 * mbufs through mempools. Since dpdk_queue_pkts() and dpdk_queue_flush() may
 * use mempools, a non pmd thread should hold this mutex while calling them. */
static struct ovs_mutex nonpmd_mempool_mutex = OVS_MUTEX_INITIALIZER;

    struct rte_mempool *mp;

    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
/* There should be one 'struct dpdk_tx_queue' created for
struct dpdk_tx_queue {
    bool flush_tx;                 /* Set to true to flush queue every time
                                    * pkts are queued. */
    rte_spinlock_t tx_lock;        /* Protects the members and the NIC queue
                                    * from concurrent access. It is used only
                                    * if the queue is shared among different
                                    * pmd threads (see 'txq_needs_locking'). */
    int map;                       /* Mapping of configured vhost-user queue
                                    * to the queue enabled by the guest. */
    struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
/* dpdk has no way to remove dpdk ring Ethernet devices,
 * so we have to keep them around once they've been created. */
static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_ring_list);

    /* For the client rings */
    struct rte_ring *cring_tx;
    struct rte_ring *cring_rx;
    unsigned int user_port_id; /* User-given port number, parsed from the port name. */
    int eth_port_id; /* Ethernet device port id. */
    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
    enum dpdk_dev_type type;

    struct dpdk_tx_queue *tx_q;

    struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);

    struct dpdk_mp *dpdk_mp;

    struct netdev_stats stats;

    rte_spinlock_t stats_lock;

    struct eth_addr hwaddr;
    enum netdev_flags flags;

    struct rte_eth_link link;

    /* The user might request more txqs than the NIC has. We remap those
     * ('up.n_txq') onto these ('real_n_txq').
     * If the numbers match, 'txq_needs_locking' is false, otherwise it is
     * true and we will take a spinlock on transmission. */
    bool txq_needs_locking;

    /* virtio-net structure for vhost device. */
    OVSRCU_TYPE(struct virtio_net *) virtio_dev;

    /* Identifier used to distinguish vhost devices from each other. */
    char vhost_id[PATH_MAX];

    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);

    /* QoS configuration and lock for the device. */
    struct qos_conf *qos_conf;
    rte_spinlock_t qos_lock;

    /* The following properties cannot be changed when a device is running,
     * so we remember the request and update them next time
     * netdev_dpdk*_reconfigure() is called. */
struct netdev_rxq_dpdk {
    struct netdev_rxq up;

static bool dpdk_thread_is_pmd(void);

static int netdev_dpdk_construct(struct netdev *);

struct virtio_net *netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);

is_dpdk_class(const struct netdev_class *class)
{
    return class->construct == netdev_dpdk_construct;
/* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
 * value, insufficient buffers are allocated to accommodate the packet in its
 * entirety. Furthermore, certain drivers need to ensure that there is also
 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
 * frames). If the RX buffer is too small, then the driver enables scatter RX
 * behaviour, which reduces performance. To prevent this, use a buffer size
 * that is closest to 'mtu', but which satisfies the aforementioned criteria.
 */
dpdk_buf_size(int mtu)
{
    return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) + RTE_PKTMBUF_HEADROOM),
                    NETDEV_DPDK_MBUF_ALIGN);
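/* Illustrative (not in the original source): for mtu == 1500 the maximum
 * frame length is 1526; assuming DPDK's default 128-byte
 * RTE_PKTMBUF_HEADROOM, 1526 + 128 = 1654, which rounds up to a 2048-byte
 * buffer, a clean multiple of the 1024-byte driver granularity. */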
/* XXX: use dpdk malloc for entire OVS. In fact, huge pages should be used
 * for all other segments (data, bss and text) as well. */

dpdk_rte_mzalloc(size_t sz)
{
    ptr = rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
/* XXX this function should be called only by pmd threads (or by non pmd
 * threads holding the nonpmd_mempool_mutex). */
free_dpdk_buf(struct dp_packet *p)
{
    struct rte_mbuf *pkt = (struct rte_mbuf *) p;

    rte_pktmbuf_free(pkt);

ovs_rte_pktmbuf_init(struct rte_mempool *mp,
                     void *opaque_arg OVS_UNUSED,
                     void *_m,
                     unsigned i OVS_UNUSED)
{
    struct rte_mbuf *m = _m;

    rte_pktmbuf_init(mp, opaque_arg, _m, i);

    dp_packet_init_dpdk((struct dp_packet *) m, m->buf_len);
static struct dpdk_mp *
dpdk_mp_get(int socket_id, int mtu) OVS_REQUIRES(dpdk_mutex)
{
    struct dpdk_mp *dmp = NULL;
    char mp_name[RTE_MEMPOOL_NAMESIZE];

    struct rte_pktmbuf_pool_private mbp_priv;

    LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
        if (dmp->socket_id == socket_id && dmp->mtu == mtu) {

    dmp = dpdk_rte_mzalloc(sizeof *dmp);
    dmp->socket_id = socket_id;

    mbp_priv.mbuf_data_room_size = MBUF_SIZE(mtu) - sizeof(struct dp_packet);
    mbp_priv.mbuf_priv_size = sizeof(struct dp_packet) - sizeof(struct rte_mbuf);

    mp_size = MAX_NB_MBUF;

        if (snprintf(mp_name, RTE_MEMPOOL_NAMESIZE, "ovs_mp_%d_%d_%u",
                     dmp->mtu, dmp->socket_id, mp_size) < 0) {

        dmp->mp = rte_mempool_create(mp_name, mp_size, MBUF_SIZE(mtu),
                                     sizeof(struct rte_pktmbuf_pool_private),
                                     rte_pktmbuf_pool_init, &mbp_priv,
                                     ovs_rte_pktmbuf_init, NULL,
    } while (!dmp->mp && rte_errno == ENOMEM && (mp_size /= 2) >= MIN_NB_MBUF);

    if (dmp->mp == NULL) {

    VLOG_DBG("Allocated \"%s\" mempool with %u mbufs", mp_name, mp_size);

    ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
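/* Illustrative (not in the original source): for mtu 1500 on socket 0 with
 * the initial MAX_NB_MBUF allocation, the snprintf() above produces the
 * pool name "ovs_mp_1500_0_262144" (mtu, socket id, then size). */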
dpdk_mp_put(struct dpdk_mp *dmp)

    ovs_assert(dmp->refcount >= 0);

    /* I could not find any API to destroy mp. */
    if (dmp->refcount == 0) {
        list_delete(dmp->list_node);
        /* destroy mp-pool. */

check_link_status(struct netdev_dpdk *dev)
{
    struct rte_eth_link link;

    rte_eth_link_get_nowait(dev->port_id, &link);

    if (dev->link.link_status != link.link_status) {
        netdev_change_seq_changed(&dev->up);

        dev->link_reset_cnt++;

        if (dev->link.link_status) {
            VLOG_DBG_RL(&rl, "Port %d Link Up - speed %u Mbps - %s",
                        dev->port_id, (unsigned) dev->link.link_speed,
                        (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                         ("full-duplex") : ("half-duplex"));

            VLOG_DBG_RL(&rl, "Port %d Link Down", dev->port_id);

dpdk_watchdog(void *dummy OVS_UNUSED)
{
    struct netdev_dpdk *dev;

    pthread_detach(pthread_self());

        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            ovs_mutex_lock(&dev->mutex);
            check_link_status(dev);
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
        xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)

    /* A device may report more queues than it makes available (this has
     * been observed for Intel xl710, which reserves some of them for
     * SRIOV): rte_eth_*_queue_setup will fail if a queue is not
     * available. When this happens we can retry the configuration
     * and request fewer queues. */
    while (n_rxq && n_txq) {
        VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);

        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &port_conf);

        for (i = 0; i < n_txq; i++) {
            diag = rte_eth_tx_queue_setup(dev->port_id, i, NIC_PORT_TX_Q_SIZE,
                                          dev->socket_id, NULL);
                VLOG_INFO("Interface %s txq(%d) setup error: %s",
                          dev->up.name, i, rte_strerror(-diag));

        /* Retry with fewer tx queues */

        for (i = 0; i < n_rxq; i++) {
            diag = rte_eth_rx_queue_setup(dev->port_id, i, NIC_PORT_RX_Q_SIZE,
                                          dev->socket_id, NULL,
                VLOG_INFO("Interface %s rxq(%d) setup error: %s",
                          dev->up.name, i, rte_strerror(-diag));

        /* Retry with fewer rx queues */

    dev->up.n_rxq = n_rxq;
    dev->real_n_txq = n_txq;
dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)

    struct rte_pktmbuf_pool_private *mbp_priv;
    struct rte_eth_dev_info info;
    struct ether_addr eth_addr;

    if (dev->port_id < 0 || dev->port_id >= rte_eth_dev_count()) {

    rte_eth_dev_info_get(dev->port_id, &info);

    n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
    n_txq = MIN(info.max_tx_queues, dev->up.n_txq);

    diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq);
        VLOG_ERR("Interface %s(rxq:%d txq:%d) configure error: %s",
                 dev->up.name, n_rxq, n_txq, rte_strerror(-diag));

    diag = rte_eth_dev_start(dev->port_id);
        VLOG_ERR("Interface %s start error: %s", dev->up.name,
                 rte_strerror(-diag));

    rte_eth_promiscuous_enable(dev->port_id);
    rte_eth_allmulticast_enable(dev->port_id);

    memset(&eth_addr, 0x0, sizeof(eth_addr));
    rte_eth_macaddr_get(dev->port_id, &eth_addr);
    VLOG_INFO_RL(&rl, "Port %d: "ETH_ADDR_FMT,
                 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));

    memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
    rte_eth_link_get_nowait(dev->port_id, &dev->link);

    mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
    dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;

    dev->flags = NETDEV_UP | NETDEV_PROMISC;
static struct netdev_dpdk *
netdev_dpdk_cast(const struct netdev *netdev)
{
    return CONTAINER_OF(netdev, struct netdev_dpdk, up);
}

static struct netdev *
netdev_dpdk_alloc(void)
{
    struct netdev_dpdk *dev;

    if (!rte_eal_init_ret) { /* Only after successful initialization */
        dev = dpdk_rte_mzalloc(sizeof *dev);
netdev_dpdk_alloc_txq(struct netdev_dpdk *dev, unsigned int n_txqs)

    dev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *dev->tx_q);
    for (i = 0; i < n_txqs; i++) {
        int numa_id = ovs_numa_get_numa_id(i);

        if (!dev->txq_needs_locking) {
            /* Each index is considered as a cpu core id, since there should
             * be one tx queue for each cpu core. If the corresponding core
             * is not on the same numa node as 'dev', flags the
            dev->tx_q[i].flush_tx = dev->socket_id == numa_id;

            /* Queues are shared among CPUs. Always flush */
            dev->tx_q[i].flush_tx = true;

        /* Initialize map for vhost devices. */
        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
        rte_spinlock_init(&dev->tx_q[i].tx_lock);
netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
                 enum dpdk_dev_type type)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_init(&dev->mutex);
    ovs_mutex_lock(&dev->mutex);

    rte_spinlock_init(&dev->stats_lock);

    /* If the 'sid' is negative, it means that the kernel fails
     * to obtain the pci numa info. In that situation, always
    if (type == DPDK_DEV_ETH) {
        sid = rte_eth_dev_socket_id(port_no);

        sid = rte_lcore_to_socket_id(rte_get_master_lcore());

    dev->socket_id = sid < 0 ? SOCKET0 : sid;
    dev->port_id = port_no;

    dev->mtu = ETHER_MTU;
    dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);

    buf_size = dpdk_buf_size(dev->mtu);
    dev->dpdk_mp = dpdk_mp_get(dev->socket_id, FRAME_LEN_TO_MTU(buf_size));

    /* Initialise QoS configuration to NULL and qos lock to unlocked. */
    dev->qos_conf = NULL;
    rte_spinlock_init(&dev->qos_lock);

    netdev->n_txq = NR_QUEUE;
    netdev->n_rxq = NR_QUEUE;
    dev->requested_n_rxq = NR_QUEUE;
    dev->requested_n_txq = NR_QUEUE;
    dev->real_n_txq = NR_QUEUE;

    if (type == DPDK_DEV_ETH) {
        netdev_dpdk_alloc_txq(dev, NR_QUEUE);
        err = dpdk_eth_dev_init(dev);

        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);

    ovs_list_push_back(&dpdk_list, &dev->list_node);

    ovs_mutex_unlock(&dev->mutex);
/* dev_name must be the prefix followed by a positive decimal number.
 * (no leading + or - signs are allowed) */
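/* Illustrative (not in the original source): with prefix "dpdk", the name
 * "dpdk7" parses to port_no 7, while "dpdk-1", "dpdk+1" or "dpdkx" are
 * rejected. */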
dpdk_dev_parse_name(const char dev_name[], const char prefix[],
                    unsigned int *port_no)
{
    if (strncmp(dev_name, prefix, strlen(prefix))) {

    cport = dev_name + strlen(prefix);

    if (str_to_uint(cport, 10, port_no)) {
vhost_construct_helper(struct netdev *netdev) OVS_REQUIRES(dpdk_mutex)
{
    if (rte_eal_init_ret) {
        return rte_eal_init_ret;
    }

    return netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);

netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    if (rte_eal_init_ret) {
        return rte_eal_init_ret;
    }

    ovs_mutex_lock(&dpdk_mutex);
    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));
    err = vhost_construct_helper(netdev);
    ovs_mutex_unlock(&dpdk_mutex);
netdev_dpdk_vhost_user_construct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const char *name = netdev->name;

    /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
     * the file system. '/' or '\' would traverse directories, so they're not
     * acceptable in 'name'. */
    if (strchr(name, '/') || strchr(name, '\\')) {
        VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
                 "A valid name must not include '/' or '\\'",

    if (rte_eal_init_ret) {
        return rte_eal_init_ret;
    }

    ovs_mutex_lock(&dpdk_mutex);
    /* Take the name of the vhost-user port and append it to the location
     * where the socket is to be created, then register the socket. */
    snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s",
             vhost_sock_dir, name);

    err = rte_vhost_driver_register(dev->vhost_id);
        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",

        fatal_signal_add_file_to_unlink(dev->vhost_id);
        VLOG_INFO("Socket %s created for vhost-user port %s\n",
                  dev->vhost_id, name);
        err = vhost_construct_helper(netdev);

    ovs_mutex_unlock(&dpdk_mutex);
netdev_dpdk_construct(struct netdev *netdev)
{
    unsigned int port_no;

    if (rte_eal_init_ret) {
        return rte_eal_init_ret;
    }

    /* Names always start with "dpdk" */
    err = dpdk_dev_parse_name(netdev->name, "dpdk", &port_no);

    ovs_mutex_lock(&dpdk_mutex);
    err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_ETH);
    ovs_mutex_unlock(&dpdk_mutex);

netdev_dpdk_destruct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    rte_eth_dev_stop(dev->port_id);
    ovs_mutex_unlock(&dev->mutex);

    ovs_mutex_lock(&dpdk_mutex);

    ovs_list_remove(&dev->list_node);
    dpdk_mp_put(dev->dpdk_mp);
    ovs_mutex_unlock(&dpdk_mutex);
netdev_dpdk_vhost_destruct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    /* Guest becomes an orphan if still attached. */
    if (netdev_dpdk_get_virtio(dev) != NULL) {
        VLOG_ERR("Removing port '%s' while vhost device still attached.",
        VLOG_ERR("To restore connectivity after re-adding of port, VM on socket"
                 " '%s' must be restarted.",

    if (rte_vhost_driver_unregister(dev->vhost_id)) {
        VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);

        fatal_signal_remove_file_to_unlink(dev->vhost_id);

    ovs_mutex_lock(&dpdk_mutex);

    ovs_list_remove(&dev->list_node);
    dpdk_mp_put(dev->dpdk_mp);
    ovs_mutex_unlock(&dpdk_mutex);

netdev_dpdk_dealloc(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq);
    smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
    smap_add_format(args, "requested_tx_queues", "%d", netdev->n_txq);
    smap_add_format(args, "configured_tx_queues", "%d", dev->real_n_txq);
    ovs_mutex_unlock(&dev->mutex);
netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    new_n_rxq = MAX(smap_get_int(args, "n_rxq", dev->requested_n_rxq), 1);
    if (new_n_rxq != dev->requested_n_rxq) {
        dev->requested_n_rxq = new_n_rxq;
        netdev_request_reconfigure(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);
netdev_dpdk_get_numa_id(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    return dev->socket_id;

/* Sets the number of tx queues for the dpdk interface. */
netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    if (dev->requested_n_txq == n_txq) {

    dev->requested_n_txq = n_txq;
    netdev_request_reconfigure(netdev);

    ovs_mutex_unlock(&dev->mutex);
static struct netdev_rxq *
netdev_dpdk_rxq_alloc(void)
{
    struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);

static struct netdev_rxq_dpdk *
netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
{
    return CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);

netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

    ovs_mutex_lock(&dev->mutex);
    rx->port_id = dev->port_id;
    ovs_mutex_unlock(&dev->mutex);

netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)

netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);

dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
{
    struct dpdk_tx_queue *txq = &dev->tx_q[qid];

    while (nb_tx != txq->count) {
        ret = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts + nb_tx,
                               txq->count - nb_tx);

    if (OVS_UNLIKELY(nb_tx != txq->count)) {
        /* Free buffers, which we couldn't transmit, one at a time (each
         * packet could come from a different mempool). */
        for (i = nb_tx; i < txq->count; i++) {
            rte_pktmbuf_free(txq->burst_pkts[i]);
        }
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += txq->count - nb_tx;
        rte_spinlock_unlock(&dev->stats_lock);

    txq->tsc = rte_get_timer_cycles();

dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
{
    struct dpdk_tx_queue *txq = &dev->tx_q[qid];

    if (txq->count == 0) {

    dpdk_queue_flush__(dev, qid);

is_vhost_running(struct virtio_net *virtio_dev)
{
    return (virtio_dev != NULL && (virtio_dev->flags & VIRTIO_DEV_RUNNING));
netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
                                          unsigned int packet_size)
{
    /* Hard-coded search for the size bucket. */
    if (packet_size < 256) {
        if (packet_size >= 128) {
            stats->rx_128_to_255_packets++;
        } else if (packet_size <= 64) {
            stats->rx_1_to_64_packets++;

            stats->rx_65_to_127_packets++;

        if (packet_size >= 1523) {
            stats->rx_1523_to_max_packets++;
        } else if (packet_size >= 1024) {
            stats->rx_1024_to_1522_packets++;
        } else if (packet_size < 512) {
            stats->rx_256_to_511_packets++;

            stats->rx_512_to_1023_packets++;

netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
                                     struct dp_packet **packets, int count)
{
    unsigned int packet_size;
    struct dp_packet *packet;

    stats->rx_packets += count;
    for (i = 0; i < count; i++) {
        packet = packets[i];
        packet_size = dp_packet_size(packet);

        if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
            /* This only protects the following multicast counting from
             * too short packets, but it does not stop the packet from
             * further processing. */
            stats->rx_length_errors++;

        netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);

        struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
        if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {

        stats->rx_bytes += packet_size;
/*
 * The receive path for the vhost port is the TX path out from the guest.
 */
netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
                           struct dp_packet **packets, int *c)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
    int qid = rxq->queue_id;

    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {

    if (rxq->queue_id >= dev->real_n_rxq) {

    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM + VIRTIO_TXQ,
                                    (struct rte_mbuf **) packets,

    rte_spinlock_lock(&dev->stats_lock);
    netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets, nb_rx);
    rte_spinlock_unlock(&dev->stats_lock);
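/*
 * Illustrative note (not in the original source): virtio pairs its queues,
 * so VIRTIO_QNUM == 2. For OVS rx queue 'qid', the guest TX virtqueue
 * polled above is qid * VIRTIO_QNUM + VIRTIO_TXQ (e.g. qid 0 -> vring 1),
 * while the send path below enqueues to qid * VIRTIO_QNUM + VIRTIO_RXQ.
 */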
netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet **packets,

    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

    /* There is only one tx queue for this core. Do not flush other
     * queues.
     * Do not flush a tx queue which is shared among CPUs,
     * since it is always flushed. */
    if (rxq->queue_id == rte_lcore_id() &&
        OVS_LIKELY(!dev->txq_needs_locking)) {
        dpdk_queue_flush(dev, rxq->queue_id);
    }

    nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
                             (struct rte_mbuf **) packets,
netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts,

    struct netdev *netdev = &dev->up;

    if (dev->qos_conf != NULL) {
        rte_spinlock_lock(&dev->qos_lock);
        if (dev->qos_conf != NULL) {
            cnt = dev->qos_conf->ops->qos_run(netdev, pkts, cnt);
        }
        rte_spinlock_unlock(&dev->qos_lock);
    }
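/*
 * Note (not in the original source): 'qos_conf' is re-checked under
 * 'qos_lock' above because another thread may clear or replace the
 * configuration between the unlocked check and acquiring the lock
 * (a double-checked locking pattern).
 */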
netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
                                     struct dp_packet **packets,

    int sent = attempted - dropped;

    stats->tx_packets += sent;
    stats->tx_dropped += dropped;

    for (i = 0; i < sent; i++) {
        stats->tx_bytes += dp_packet_size(packets[i]);
__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                         struct dp_packet **pkts, int cnt,

    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
    struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
    unsigned int total_pkts = cnt;
    unsigned int qos_pkts = cnt;

    qid = dev->tx_q[qid % dev->real_n_txq].map;

    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += cnt;
        rte_spinlock_unlock(&dev->stats_lock);

    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

    /* Check whether QoS has been configured for the netdev. */
    cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);

        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
        unsigned int tx_pkts;

        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,
        if (OVS_LIKELY(tx_pkts)) {
            /* Packets have been sent. */

            /* Prepare for possible next iteration. */
            cur_pkts = &cur_pkts[tx_pkts];

            uint64_t timeout = VHOST_ENQ_RETRY_USECS * rte_get_timer_hz() / 1E6;
            unsigned int expired = 0;

                start = rte_get_timer_cycles();

            /*
             * Unable to enqueue packets to vhost interface.
             * Check available entries before retrying.
             */
            while (!rte_vring_available_entries(virtio_dev, vhost_qid)) {
                if (OVS_UNLIKELY((rte_get_timer_cycles() - start) > timeout)) {

                /* break out of main loop. */

    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

    rte_spinlock_lock(&dev->stats_lock);

    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, cnt);
    rte_spinlock_unlock(&dev->stats_lock);

    for (i = 0; i < total_pkts; i++) {
        dp_packet_delete(pkts[i]);
dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
                struct rte_mbuf **pkts, int cnt)
{
    struct dpdk_tx_queue *txq = &dev->tx_q[qid];

        int freeslots = MAX_TX_QUEUE_LEN - txq->count;
        int tocopy = MIN(freeslots, cnt - i);

        memcpy(&txq->burst_pkts[txq->count], &pkts[i],
               tocopy * sizeof (struct rte_mbuf *));

        txq->count += tocopy;

        if (txq->count == MAX_TX_QUEUE_LEN || txq->flush_tx) {
            dpdk_queue_flush__(dev, qid);

        diff_tsc = rte_get_timer_cycles() - txq->tsc;
        if (diff_tsc >= DRAIN_TSC) {
            dpdk_queue_flush__(dev, qid);
/* Tx function. Transmit packets indefinitely */
dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_mbuf *mbufs[PKT_ARRAY_SIZE];

    /* If we are on a non pmd thread we have to use the mempool mutex, because
     * every non pmd thread shares the same mempool cache. */

    if (!dpdk_thread_is_pmd()) {
        ovs_mutex_lock(&nonpmd_mempool_mutex);
    }

    for (i = 0; i < cnt; i++) {
        int size = dp_packet_size(pkts[i]);

        if (OVS_UNLIKELY(size > dev->max_packet_len)) {
            VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
                         (int) size, dev->max_packet_len);

        mbufs[newcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);

        if (!mbufs[newcnt]) {

        /* We have to do a copy for now. */
        memcpy(rte_pktmbuf_mtod(mbufs[newcnt], void *),
               dp_packet_data(pkts[i]), size);

        rte_pktmbuf_data_len(mbufs[newcnt]) = size;
        rte_pktmbuf_pkt_len(mbufs[newcnt]) = size;

    if (dev->type == DPDK_DEV_VHOST) {
        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs,
                                 newcnt, true);

        unsigned int qos_pkts = newcnt;

        /* Check if QoS has been configured for this netdev. */
        newcnt = netdev_dpdk_qos_run__(dev, mbufs, newcnt);

        dropped += qos_pkts - newcnt;
        dpdk_queue_pkts(dev, qid, mbufs, newcnt);
        dpdk_queue_flush(dev, qid);

    if (OVS_UNLIKELY(dropped)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += dropped;
        rte_spinlock_unlock(&dev->stats_lock);
    }

    if (!dpdk_thread_is_pmd()) {
        ovs_mutex_unlock(&nonpmd_mempool_mutex);
    }
netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
                       int cnt, bool may_steal)

    if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {

        dpdk_do_tx_copy(netdev, qid, pkts, cnt);

        for (i = 0; i < cnt; i++) {
            dp_packet_delete(pkts[i]);
        }

        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);

netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
                   struct dp_packet **pkts, int cnt, bool may_steal)

    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
        qid = qid % dev->real_n_txq;
        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
    }

    if (OVS_UNLIKELY(!may_steal ||
                     pkts[0]->source != DPBUF_DPDK)) {
        struct netdev *netdev = &dev->up;

        dpdk_do_tx_copy(netdev, qid, pkts, cnt);

        for (i = 0; i < cnt; i++) {
            dp_packet_delete(pkts[i]);
        }

        int next_tx_idx = 0;

        unsigned int qos_pkts = 0;
        unsigned int temp_cnt = 0;

        for (i = 0; i < cnt; i++) {
            int size = dp_packet_size(pkts[i]);

            if (OVS_UNLIKELY(size > dev->max_packet_len)) {
                if (next_tx_idx != i) {
                    temp_cnt = i - next_tx_idx;
                    qos_pkts = temp_cnt;

                    temp_cnt = netdev_dpdk_qos_run__(dev, (struct rte_mbuf **) pkts,
                    dropped += qos_pkts - temp_cnt;
                    dpdk_queue_pkts(dev, qid,
                                    (struct rte_mbuf **) &pkts[next_tx_idx],

                VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
                             (int) size, dev->max_packet_len);

                dp_packet_delete(pkts[i]);

                next_tx_idx = i + 1;

        if (next_tx_idx != cnt) {

            cnt = netdev_dpdk_qos_run__(dev, (struct rte_mbuf **) pkts, cnt);
            dropped += qos_pkts - cnt;
            dpdk_queue_pkts(dev, qid, (struct rte_mbuf **) &pkts[next_tx_idx],

    if (OVS_UNLIKELY(dropped)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += dropped;
        rte_spinlock_unlock(&dev->stats_lock);
    }

    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
    }
netdev_dpdk_eth_send(struct netdev *netdev, int qid,
                     struct dp_packet **pkts, int cnt, bool may_steal)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);

netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    if (!eth_addr_equals(dev->hwaddr, mac)) {

        netdev_change_seq_changed(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);

netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    ovs_mutex_unlock(&dev->mutex);

netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    ovs_mutex_unlock(&dev->mutex);

netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int old_mtu, err, dpdk_mtu;
    struct dpdk_mp *old_mp;

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);
    if (dev->mtu == mtu) {

    buf_size = dpdk_buf_size(mtu);
    dpdk_mtu = FRAME_LEN_TO_MTU(buf_size);

    mp = dpdk_mp_get(dev->socket_id, dpdk_mtu);

    rte_eth_dev_stop(dev->port_id);

    old_mp = dev->dpdk_mp;

    dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);

    err = dpdk_eth_dev_init(dev);

        dev->dpdk_mp = old_mp;
        dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
        dpdk_eth_dev_init(dev);

    dpdk_mp_put(old_mp);
    netdev_change_seq_changed(netdev);

    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);

netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);

netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
                            struct netdev_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    rte_spinlock_lock(&dev->stats_lock);
    /* Supported Stats */
    stats->rx_packets += dev->stats.rx_packets;
    stats->tx_packets += dev->stats.tx_packets;
    stats->tx_dropped += dev->stats.tx_dropped;
    stats->multicast = dev->stats.multicast;
    stats->rx_bytes = dev->stats.rx_bytes;
    stats->tx_bytes = dev->stats.tx_bytes;
    stats->rx_errors = dev->stats.rx_errors;
    stats->rx_length_errors = dev->stats.rx_length_errors;

    stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
    stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
    stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
    stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
    stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
    stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
    stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;

    rte_spinlock_unlock(&dev->stats_lock);

    ovs_mutex_unlock(&dev->mutex);
netdev_dpdk_convert_xstats(struct netdev_stats *stats,
                           const struct rte_eth_xstats *xstats,
                           const unsigned int size)
{
    /* XXX The current implementation is a simple search through the array
     * for hardcoded counter names. In a future DPDK release (TBD) the
     * XSTATS API will change so that each counter is represented by a
     * unique ID instead of a string. */
    for (unsigned int i = 0; i < size; i++) {
        if (strcmp(XSTAT_RX_64_PACKETS, xstats[i].name) == 0) {
            stats->rx_1_to_64_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_65_TO_127_PACKETS, xstats[i].name) == 0) {
            stats->rx_65_to_127_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_128_TO_255_PACKETS, xstats[i].name) == 0) {
            stats->rx_128_to_255_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_256_TO_511_PACKETS, xstats[i].name) == 0) {
            stats->rx_256_to_511_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_512_TO_1023_PACKETS,
                          xstats[i].name) == 0) {
            stats->rx_512_to_1023_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_1024_TO_1522_PACKETS,
                          xstats[i].name) == 0) {
            stats->rx_1024_to_1522_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_1523_TO_MAX_PACKETS,
                          xstats[i].name) == 0) {
            stats->rx_1523_to_max_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_64_PACKETS, xstats[i].name) == 0) {
            stats->tx_1_to_64_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_65_TO_127_PACKETS, xstats[i].name) == 0) {
            stats->tx_65_to_127_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_128_TO_255_PACKETS, xstats[i].name) == 0) {
            stats->tx_128_to_255_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_256_TO_511_PACKETS, xstats[i].name) == 0) {
            stats->tx_256_to_511_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_512_TO_1023_PACKETS,
                          xstats[i].name) == 0) {
            stats->tx_512_to_1023_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_1024_TO_1522_PACKETS,
                          xstats[i].name) == 0) {
            stats->tx_1024_to_1522_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_1523_TO_MAX_PACKETS,
                          xstats[i].name) == 0) {
            stats->tx_1523_to_max_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_MULTICAST_PACKETS, xstats[i].name) == 0) {
            stats->tx_multicast_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_BROADCAST_PACKETS, xstats[i].name) == 0) {
            stats->rx_broadcast_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_BROADCAST_PACKETS, xstats[i].name) == 0) {
            stats->tx_broadcast_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_UNDERSIZED_ERRORS, xstats[i].name) == 0) {
            stats->rx_undersized_errors = xstats[i].value;
        } else if (strcmp(XSTAT_RX_FRAGMENTED_ERRORS, xstats[i].name) == 0) {
            stats->rx_fragmented_errors = xstats[i].value;
        } else if (strcmp(XSTAT_RX_JABBER_ERRORS, xstats[i].name) == 0) {
            stats->rx_jabber_errors = xstats[i].value;
netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_stats rte_stats;

    netdev_dpdk_get_carrier(netdev, &gg);
    ovs_mutex_lock(&dev->mutex);

    struct rte_eth_xstats *rte_xstats;
    int rte_xstats_len, rte_xstats_ret;

    if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
        VLOG_ERR("Can't get ETH statistics for port: %i.", dev->port_id);
        ovs_mutex_unlock(&dev->mutex);

    rte_xstats_len = rte_eth_xstats_get(dev->port_id, NULL, 0);
    if (rte_xstats_len > 0) {
        rte_xstats = dpdk_rte_mzalloc(sizeof(*rte_xstats) * rte_xstats_len);
        memset(rte_xstats, 0xff, sizeof(*rte_xstats) * rte_xstats_len);
        rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
        if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
            netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_ret);
        }
        rte_free(rte_xstats);

        VLOG_WARN("Can't get XSTATS counters for port: %i.", dev->port_id);
    }

    stats->rx_packets = rte_stats.ipackets;
    stats->tx_packets = rte_stats.opackets;
    stats->rx_bytes = rte_stats.ibytes;
    stats->tx_bytes = rte_stats.obytes;
    /* DPDK counts imissed as errors, but count them here as dropped instead. */
    stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;
    stats->tx_errors = rte_stats.oerrors;
    stats->multicast = rte_stats.imcasts;

    rte_spinlock_lock(&dev->stats_lock);
    stats->tx_dropped = dev->stats.tx_dropped;
    rte_spinlock_unlock(&dev->stats_lock);

    /* These are the available DPDK counters for packets not received due to
     * local resource constraints in DPDK and NIC respectively. */
    stats->rx_dropped = rte_stats.rx_nombuf + rte_stats.imissed;
    stats->rx_missed_errors = rte_stats.imissed;

    ovs_mutex_unlock(&dev->mutex);
netdev_dpdk_get_features(const struct netdev *netdev,
                         enum netdev_features *current,
                         enum netdev_features *advertised OVS_UNUSED,
                         enum netdev_features *supported OVS_UNUSED,
                         enum netdev_features *peer OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_link link;

    ovs_mutex_lock(&dev->mutex);

    ovs_mutex_unlock(&dev->mutex);

    if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
        if (link.link_speed == ETH_SPEED_NUM_10M) {
            *current = NETDEV_F_10MB_HD;
        }
        if (link.link_speed == ETH_SPEED_NUM_100M) {
            *current = NETDEV_F_100MB_HD;
        }
        if (link.link_speed == ETH_SPEED_NUM_1G) {
            *current = NETDEV_F_1GB_HD;
        }
    } else if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
        if (link.link_speed == ETH_SPEED_NUM_10M) {
            *current = NETDEV_F_10MB_FD;
        }
        if (link.link_speed == ETH_SPEED_NUM_100M) {
            *current = NETDEV_F_100MB_FD;
        }
        if (link.link_speed == ETH_SPEED_NUM_1G) {
            *current = NETDEV_F_1GB_FD;
        }
        if (link.link_speed == ETH_SPEED_NUM_10G) {
            *current = NETDEV_F_10GB_FD;
        }
    }

    if (link.link_autoneg) {
        *current |= NETDEV_F_AUTONEG;
    }

netdev_dpdk_get_ifindex(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    ifindex = dev->port_id;
    ovs_mutex_unlock(&dev->mutex);

netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    check_link_status(dev);
    *carrier = dev->link.link_status;

    ovs_mutex_unlock(&dev->mutex);

netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

    ovs_mutex_lock(&dev->mutex);

    if (is_vhost_running(virtio_dev)) {

    ovs_mutex_unlock(&dev->mutex);

static long long int
netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    long long int carrier_resets;

    ovs_mutex_lock(&dev->mutex);
    carrier_resets = dev->link_reset_cnt;
    ovs_mutex_unlock(&dev->mutex);

    return carrier_resets;

netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
                       long long int interval OVS_UNUSED)
netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
                           enum netdev_flags off, enum netdev_flags on,
                           enum netdev_flags *old_flagsp) OVS_REQUIRES(dev->mutex)

    if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {

    *old_flagsp = dev->flags;

    if (dev->flags == *old_flagsp) {

    if (dev->type == DPDK_DEV_ETH) {
        if (dev->flags & NETDEV_UP) {
            err = rte_eth_dev_start(dev->port_id);

        if (dev->flags & NETDEV_PROMISC) {
            rte_eth_promiscuous_enable(dev->port_id);

        if (!(dev->flags & NETDEV_UP)) {
            rte_eth_dev_stop(dev->port_id);

netdev_dpdk_update_flags(struct netdev *netdev,
                         enum netdev_flags off, enum netdev_flags on,
                         enum netdev_flags *old_flagsp)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    error = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
    ovs_mutex_unlock(&dev->mutex);
netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_dev_info dev_info;

    if (dev->port_id < 0)

    ovs_mutex_lock(&dev->mutex);
    rte_eth_dev_info_get(dev->port_id, &dev_info);
    ovs_mutex_unlock(&dev->mutex);
    smap_add_format(args, "port_no", "%d", dev->port_id);
    smap_add_format(args, "numa_id", "%d", rte_eth_dev_socket_id(dev->port_id));
    smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
    smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
    smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
    smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
    smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
    smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
    smap_add_format(args, "max_hash_mac_addrs", "%u", dev_info.max_hash_mac_addrs);
    smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
    smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);

    if (dev_info.pci_dev) {
        smap_add_format(args, "pci-vendor_id", "0x%x",
                        dev_info.pci_dev->id.vendor_id);
        smap_add_format(args, "pci-device_id", "0x%x",
                        dev_info.pci_dev->id.device_id);
netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
    OVS_REQUIRES(dev->mutex)
{
    enum netdev_flags old_flags;

        netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);

        netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);

netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
                            const char *argv[], void *aux OVS_UNUSED)

    if (!strcasecmp(argv[argc - 1], "up")) {

    } else if (!strcasecmp(argv[argc - 1], "down")) {

        unixctl_command_reply_error(conn, "Invalid Admin State");

        struct netdev *netdev = netdev_from_name(argv[1]);
        if (netdev && is_dpdk_class(netdev->netdev_class)) {
            struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);

            ovs_mutex_lock(&dpdk_dev->mutex);
            netdev_dpdk_set_admin_state__(dpdk_dev, up);
            ovs_mutex_unlock(&dpdk_dev->mutex);

            netdev_close(netdev);

            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            netdev_close(netdev);

        struct netdev_dpdk *netdev;

        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (netdev, list_node, &dpdk_list) {
            ovs_mutex_lock(&netdev->mutex);
            netdev_dpdk_set_admin_state__(netdev, up);
            ovs_mutex_unlock(&netdev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);

    unixctl_command_reply(conn, "OK");

/*
 * Set virtqueue flags so that we do not receive interrupts.
 */
set_irq_status(struct virtio_net *virtio_dev)
{
    for (i = 0; i < virtio_dev->virt_qp_nb; i++) {
        idx = i * VIRTIO_QNUM;
        rte_vhost_enable_guest_notification(virtio_dev, idx + VIRTIO_RXQ, 0);
        rte_vhost_enable_guest_notification(virtio_dev, idx + VIRTIO_TXQ, 0);
/*
 * Fixes mapping for vhost-user tx queues. Must be called after each
 * enabling/disabling of queues and real_n_txq modifications.
 */
netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int *enabled_queues, n_enabled = 0;
    int i, k, total_txqs = dev->real_n_txq;

    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);

    for (i = 0; i < total_txqs; i++) {
        /* Enabled queues are always mapped to themselves. */
        if (dev->tx_q[i].map == i) {
            enabled_queues[n_enabled++] = i;

    if (n_enabled == 0 && total_txqs != 0) {
        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;

    for (i = 0; i < total_txqs; i++) {
        if (dev->tx_q[i].map != i) {
            dev->tx_q[i].map = enabled_queues[k];
            k = (k + 1) % n_enabled;

    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
    for (i = 0; i < total_txqs; i++) {
        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
    }

    rte_free(enabled_queues);
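/*
 * Illustrative example (not in the original source): with total_txqs == 4
 * and only queues 0 and 2 enabled by the guest, the loops above yield the
 * mapping [0, 0, 2, 2]: disabled queues 1 and 3 are remapped round-robin
 * onto the enabled ones.
 */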
netdev_dpdk_vhost_set_queues(struct netdev_dpdk *dev, struct virtio_net *virtio_dev)
    OVS_REQUIRES(dev->mutex)

    qp_num = virtio_dev->virt_qp_nb;
    if (qp_num > dev->up.n_rxq) {
        VLOG_ERR("vHost Device '%s' %"PRIu64" can't be added - "
                 "too many queues %d > %d", virtio_dev->ifname,
                 virtio_dev->device_fh, qp_num, dev->up.n_rxq);

    dev->real_n_rxq = qp_num;
    dev->real_n_txq = qp_num;
    dev->txq_needs_locking = true;
    /* Enable TX queue 0 by default if it wasn't disabled. */
    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
        dev->tx_q[0].map = 0;
    }

    netdev_dpdk_remap_txqs(dev);

/*
 * A new virtio-net device is added to a vhost port.
 */
new_device(struct virtio_net *virtio_dev)
{
    struct netdev_dpdk *dev;
    bool exists = false;

    ovs_mutex_lock(&dpdk_mutex);
    /* Add device to the vhost port with the same name as that passed down. */
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
            ovs_mutex_lock(&dev->mutex);
            if (netdev_dpdk_vhost_set_queues(dev, virtio_dev)) {
                ovs_mutex_unlock(&dev->mutex);
                ovs_mutex_unlock(&dpdk_mutex);

            ovsrcu_set(&dev->virtio_dev, virtio_dev);

            virtio_dev->flags |= VIRTIO_DEV_RUNNING;
            /* Disable notifications. */
            set_irq_status(virtio_dev);
            ovs_mutex_unlock(&dev->mutex);

    ovs_mutex_unlock(&dpdk_mutex);

        VLOG_INFO("vHost Device '%s' %"PRIu64" can't be added - name not "
                  "found", virtio_dev->ifname, virtio_dev->device_fh);

    VLOG_INFO("vHost Device '%s' %"PRIu64" has been added", virtio_dev->ifname,
              virtio_dev->device_fh);
/* Clears mapping for all available queues of vhost interface. */
netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)

    for (i = 0; i < dev->real_n_txq; i++) {
        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
/*
 * Remove a virtio-net device from the specific vhost port. Use dev->remove
 * flag to stop any more packets from being sent or received to/from a VM and
 * ensure all currently queued packets have been sent/received before removing
 * the device.
 */
destroy_device(volatile struct virtio_net *virtio_dev)
{
    struct netdev_dpdk *dev;
    bool exists = false;

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (netdev_dpdk_get_virtio(dev) == virtio_dev) {

            ovs_mutex_lock(&dev->mutex);
            virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;
            ovsrcu_set(&dev->virtio_dev, NULL);
            netdev_dpdk_txq_map_clear(dev);

            ovs_mutex_unlock(&dev->mutex);

    ovs_mutex_unlock(&dpdk_mutex);

    if (exists == true) {
        /*
         * Wait for other threads to quiesce after setting the 'virtio_dev'
         * to NULL, before returning.
         */
        ovsrcu_synchronize();
        /*
         * As the call to ovsrcu_synchronize() will end the quiescent state,
         * put the thread back into quiescent state before returning.
         */
        ovsrcu_quiesce_start();
        VLOG_INFO("vHost Device '%s' %"PRIu64" has been removed",
                  virtio_dev->ifname, virtio_dev->device_fh);

        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev->ifname,
                  virtio_dev->device_fh);
vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,

    struct netdev_dpdk *dev;
    bool exists = false;
    int qid = queue_id / VIRTIO_QNUM;

    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
            ovs_mutex_lock(&dev->mutex);

                dev->tx_q[qid].map = qid;

                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;

            netdev_dpdk_remap_txqs(dev);

            ovs_mutex_unlock(&dev->mutex);

    ovs_mutex_unlock(&dpdk_mutex);

        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
                  PRIu64" changed to \'%s\'", queue_id, qid,
                  virtio_dev->ifname, virtio_dev->device_fh,
                  (enable == 1) ? "enabled" : "disabled");

    VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev->ifname,
              virtio_dev->device_fh);
netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
{
    return ovsrcu_get(struct virtio_net *, &dev->virtio_dev);
/*
 * These callbacks allow virtio-net devices to be added to vhost ports when
 * configuration has been fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops =
{
    .new_device = new_device,
    .destroy_device = destroy_device,
    .vring_state_changed = vring_state_changed
start_vhost_loop(void *dummy OVS_UNUSED)
{
    pthread_detach(pthread_self());
    /* Put the vhost thread into quiescent state. */
    ovsrcu_quiesce_start();
    rte_vhost_driver_session_start();
2316 dpdk_vhost_class_init(void)
2318 rte_vhost_driver_callback_register(&virtio_net_device_ops);
2319 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
2320 | 1ULL << VIRTIO_NET_F_HOST_TSO6
2321 | 1ULL << VIRTIO_NET_F_CSUM);
2323 ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
static int
dpdk_vhost_cuse_class_init(void)
{
    return 0;
}

static int
dpdk_vhost_user_class_init(void)
{
    return 0;
}

static void
dpdk_common_init(void)
{
    unixctl_command_register("netdev-dpdk/set-admin-state",
                             "[netdev] up|down", 1, 2,
                             netdev_dpdk_set_admin_state, NULL);
}

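/* Example usage of the command registered above (illustrative):
 *
 *     ovs-appctl netdev-dpdk/set-admin-state dpdk0 down
 *
 * brings a single DPDK port down; with the optional [netdev] argument
 * omitted, the requested state is applied to all DPDK ports. */
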
static int
dpdk_ring_create(const char dev_name[], unsigned int port_no,
                 unsigned int *eth_port_id)
{
    struct dpdk_ring *ivshmem;
    char ring_name[RTE_RING_NAMESIZE];
    int err;

    ivshmem = dpdk_rte_mzalloc(sizeof *ivshmem);
    if (ivshmem == NULL) {
        return ENOMEM;
    }

    /* XXX: Add support for multiqueue ring. */
    err = snprintf(ring_name, sizeof(ring_name), "%s_tx", dev_name);
    if (err < 0) {
        return -err;
    }
    /* Create single producer tx ring, netdev does explicit locking. */
    ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
                                        RING_F_SP_ENQ);
    if (ivshmem->cring_tx == NULL) {
        rte_free(ivshmem);
        return ENOMEM;
    }

    err = snprintf(ring_name, sizeof(ring_name), "%s_rx", dev_name);
    if (err < 0) {
        return -err;
    }
    /* Create single consumer rx ring, netdev does explicit locking. */
    ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
                                        RING_F_SC_DEQ);
    if (ivshmem->cring_rx == NULL) {
        rte_free(ivshmem);
        return ENOMEM;
    }

    err = rte_eth_from_rings(dev_name, &ivshmem->cring_rx, 1,
                             &ivshmem->cring_tx, 1, SOCKET0);
    if (err < 0) {
        rte_free(ivshmem);
        return ENODEV;
    }

    ivshmem->user_port_id = port_no;
    ivshmem->eth_port_id = rte_eth_dev_count() - 1;
    ovs_list_push_back(&dpdk_ring_list, &ivshmem->list_node);

    *eth_port_id = ivshmem->eth_port_id;
    return 0;
}

static int
dpdk_ring_open(const char dev_name[], unsigned int *eth_port_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct dpdk_ring *ivshmem;
    unsigned int port_no;
    int err;

    /* Names always start with "dpdkr". */
    err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
    if (err) {
        return err;
    }

    /* Look through our list to find the device. */
    LIST_FOR_EACH (ivshmem, list_node, &dpdk_ring_list) {
        if (ivshmem->user_port_id == port_no) {
            VLOG_INFO("Found dpdk ring device %s", dev_name);
            /* Really all that is needed. */
            *eth_port_id = ivshmem->eth_port_id;
            return 0;
        }
    }
    /* Need to create the device rings. */
    return dpdk_ring_create(dev_name, port_no, eth_port_id);
}

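/* Example (illustrative, not part of the build): a dpdkr port named
 * "dpdkr0" is added with
 *
 *     ovs-vsctl add-port br0 dpdkr0 -- set Interface dpdkr0 type=dpdkr
 *
 * after which a separate DPDK process sharing the hugepage memory can
 * attach to the rings created above by name, e.g.:
 *
 *     struct rte_ring *from_ovs = rte_ring_lookup("dpdkr0_tx");
 *     struct rte_ring *to_ovs = rte_ring_lookup("dpdkr0_rx");
 *
 * dequeueing mbufs that OVS transmitted from 'from_ovs' and enqueueing
 * packets destined for OVS onto 'to_ovs'. */
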
static int
netdev_dpdk_ring_send(struct netdev *netdev, int qid,
                      struct dp_packet **pkts, int cnt, bool may_steal)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    unsigned i;

    /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that
     * the rss hash field is clear.  This is because the same mbuf may be
     * modified by the consumer of the ring and returned to the datapath
     * without the RSS hash being recalculated. */
    for (i = 0; i < cnt; i++) {
        dp_packet_rss_invalidate(pkts[i]);
    }

    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
    return 0;
}

static int
netdev_dpdk_ring_construct(struct netdev *netdev)
{
    unsigned int port_no = 0;
    int err = 0;

    if (rte_eal_init_ret) {
        return rte_eal_init_ret;
    }

    ovs_mutex_lock(&dpdk_mutex);

    err = dpdk_ring_open(netdev->name, &port_no);
    if (err) {
        goto unlock_dpdk;
    }

    err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_ETH);

unlock_dpdk:
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}

/*
 * Initialize QoS configuration operations.
 */
static void
qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
{
    conf->ops = ops;
}

/*
 * Search existing QoS operations in qos_ops and compare each set of
 * operations qos_name to name.  Return a dpdk_qos_ops pointer to a match,
 * or NULL if no match is found.
 */
static const struct dpdk_qos_ops *
qos_lookup_name(const char *name)
{
    const struct dpdk_qos_ops *const *opsp;

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;
        if (!strcmp(name, ops->qos_name)) {
            return ops;
        }
    }
    return NULL;
}

/*
 * Call qos_destruct to clean up items associated with the netdev's
 * qos_conf.  Set the netdev's qos_conf to NULL.
 */
static void
qos_delete_conf(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    rte_spinlock_lock(&dev->qos_lock);
    if (dev->qos_conf) {
        if (dev->qos_conf->ops->qos_destruct) {
            dev->qos_conf->ops->qos_destruct(netdev, dev->qos_conf);
        }
        dev->qos_conf = NULL;
    }
    rte_spinlock_unlock(&dev->qos_lock);
}

static int
netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
                          struct sset *types)
{
    const struct dpdk_qos_ops *const *opsp;

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;
        if (ops->qos_construct && ops->qos_name[0] != '\0') {
            sset_add(types, ops->qos_name);
        }
    }
    return 0;
}

static int
netdev_dpdk_get_qos(const struct netdev *netdev,
                    const char **typep, struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int error = 0;

    ovs_mutex_lock(&dev->mutex);
    if (dev->qos_conf) {
        *typep = dev->qos_conf->ops->qos_name;
        error = (dev->qos_conf->ops->qos_get
                 ? dev->qos_conf->ops->qos_get(netdev, details) : 0);
    }
    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_set_qos(struct netdev *netdev,
                    const char *type, const struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const struct dpdk_qos_ops *new_ops = NULL;
    int error = 0;

    /* If type is empty or unsupported then the current QoS configuration
     * for the dpdk-netdev can be destroyed. */
    new_ops = qos_lookup_name(type);

    if (type[0] == '\0' || !new_ops || !new_ops->qos_construct) {
        qos_delete_conf(netdev);
        return EOPNOTSUPP;
    }

    ovs_mutex_lock(&dev->mutex);

    if (dev->qos_conf) {
        if (new_ops == dev->qos_conf->ops) {
            error = new_ops->qos_set ? new_ops->qos_set(netdev, details) : 0;
        } else {
            /* Delete existing QoS configuration. */
            qos_delete_conf(netdev);
            ovs_assert(dev->qos_conf == NULL);

            /* Install new QoS configuration. */
            error = new_ops->qos_construct(netdev, details);
            ovs_assert((error == 0) == (dev->qos_conf != NULL));
        }
    } else {
        error = new_ops->qos_construct(netdev, details);
        ovs_assert((error == 0) == (dev->qos_conf != NULL));
    }

    ovs_mutex_unlock(&dev->mutex);
    return error;
}

/* egress-policer details */

struct egress_policer {
    struct qos_conf qos_conf;
    struct rte_meter_srtcm_params app_srtcm_params;
    struct rte_meter_srtcm egress_meter;
};

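/* Example configuration (illustrative; roughly as in the DPDK install guide
 * of this era):
 *
 *     ovs-vsctl set port vhost-user0 qos=@newqos -- \
 *         --id=@newqos create qos type=egress-policer \
 *         other-config:cir=125000000 other-config:cbs=2048
 *
 * 'cir' is in bytes/second and 'cbs' in bytes, so cir=125000000 corresponds
 * to 125000000 * 8 = 1 Gbit/s of committed rate. */
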
static struct egress_policer *
egress_policer_get__(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    return CONTAINER_OF(dev->qos_conf, struct egress_policer, qos_conf);
}

static int
egress_policer_qos_construct(struct netdev *netdev,
                             const struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct egress_policer *policer;
    const char *cir_s;
    const char *cbs_s;
    int err = 0;

    rte_spinlock_lock(&dev->qos_lock);
    policer = xmalloc(sizeof *policer);
    qos_conf_init(&policer->qos_conf, &egress_policer_ops);
    dev->qos_conf = &policer->qos_conf;
    cir_s = smap_get(details, "cir");
    cbs_s = smap_get(details, "cbs");
    policer->app_srtcm_params.cir = cir_s ? strtoull(cir_s, NULL, 10) : 0;
    policer->app_srtcm_params.cbs = cbs_s ? strtoull(cbs_s, NULL, 10) : 0;
    policer->app_srtcm_params.ebs = 0;
    err = rte_meter_srtcm_config(&policer->egress_meter,
                                 &policer->app_srtcm_params);
    rte_spinlock_unlock(&dev->qos_lock);

    return err;
}

static void
egress_policer_qos_destruct(struct netdev *netdev OVS_UNUSED,
                            struct qos_conf *conf)
{
    struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
                                                  qos_conf);
    free(policer);
}

static int
egress_policer_qos_get(const struct netdev *netdev, struct smap *details)
{
    struct egress_policer *policer = egress_policer_get__(netdev);

    smap_add_format(details, "cir", "%llu",
                    1ULL * policer->app_srtcm_params.cir);
    smap_add_format(details, "cbs", "%llu",
                    1ULL * policer->app_srtcm_params.cbs);
    return 0;
}

static int
egress_policer_qos_set(struct netdev *netdev, const struct smap *details)
{
    struct egress_policer *policer;
    const char *cir_s;
    const char *cbs_s;
    int err = 0;

    policer = egress_policer_get__(netdev);
    cir_s = smap_get(details, "cir");
    cbs_s = smap_get(details, "cbs");
    policer->app_srtcm_params.cir = cir_s ? strtoull(cir_s, NULL, 10) : 0;
    policer->app_srtcm_params.cbs = cbs_s ? strtoull(cbs_s, NULL, 10) : 0;
    policer->app_srtcm_params.ebs = 0;
    err = rte_meter_srtcm_config(&policer->egress_meter,
                                 &policer->app_srtcm_params);

    return err;
}

static inline bool
egress_policer_pkt_handle__(struct rte_meter_srtcm *meter,
                            struct rte_mbuf *pkt, uint64_t time)
{
    uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct ether_hdr);

    return rte_meter_srtcm_color_blind_check(meter, time, pkt_len) ==
                                             e_RTE_METER_GREEN;
}

static int
egress_policer_run(struct netdev *netdev, struct rte_mbuf **pkts,
                   int pkt_cnt)
{
    int i = 0;
    int cnt = 0;
    struct egress_policer *policer = egress_policer_get__(netdev);
    struct rte_mbuf *pkt = NULL;
    uint64_t current_time = rte_rdtsc();

    for (i = 0; i < pkt_cnt; i++) {
        pkt = pkts[i];
        /* Handle current packet. */
        if (egress_policer_pkt_handle__(&policer->egress_meter, pkt,
                                        current_time)) {
            if (cnt != i) {
                pkts[cnt] = pkt;
            }
            cnt++;
        } else {
            rte_pktmbuf_free(pkt);
        }
    }

    return cnt;
}

static const struct dpdk_qos_ops egress_policer_ops = {
    "egress-policer",    /* qos_name */
    egress_policer_qos_construct,
    egress_policer_qos_destruct,
    egress_policer_qos_get,
    egress_policer_qos_set
};

static int
netdev_dpdk_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err = 0;

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);

    if (netdev->n_txq == dev->requested_n_txq
        && netdev->n_rxq == dev->requested_n_rxq) {
        /* Reconfiguration is unnecessary. */
        goto out;
    }

    rte_eth_dev_stop(dev->port_id);

    netdev->n_txq = dev->requested_n_txq;
    netdev->n_rxq = dev->requested_n_rxq;

    rte_free(dev->tx_q);
    err = dpdk_eth_dev_init(dev);
    netdev_dpdk_alloc_txq(dev, dev->real_n_txq);

    dev->txq_needs_locking = dev->real_n_txq != netdev->n_txq;

out:
    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);

    return err;
}

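/* Note on the reconfigure flow (summarizing the code above): the desired
 * queue counts are first recorded in dev->requested_n_rxq/n_txq (e.g. by
 * netdev_dpdk_set_tx_multiq() or the device's set_config handler), and the
 * datapath then calls the reconfigure method; only if the requested values
 * differ from the current ones is the port stopped, reinitialized and its
 * tx queue array reallocated. */
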
static int
netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);

    netdev->n_txq = dev->requested_n_txq;
    netdev->n_rxq = dev->requested_n_rxq;

    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);

    return 0;
}

static int
netdev_dpdk_vhost_cuse_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);

    /* vhost-cuse ports only support a single queue pair. */
    netdev->n_txq = dev->requested_n_txq;
    dev->real_n_txq = 1;
    netdev->n_rxq = 1;
    dev->txq_needs_locking = dev->real_n_txq != netdev->n_txq;

    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);

    return 0;
}

#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, SEND, \
                          GET_CARRIER, GET_STATS, GET_FEATURES, \
                          GET_STATUS, RECONFIGURE, RXQ_RECV) \
{ \
    NAME, \
    true, /* is_pmd */ \
    INIT, /* init */ \
    NULL, /* netdev_dpdk_run */ \
    NULL, /* netdev_dpdk_wait */ \
\
    netdev_dpdk_alloc, \
    CONSTRUCT, \
    DESTRUCT, \
    netdev_dpdk_dealloc, \
    netdev_dpdk_get_config, \
    netdev_dpdk_set_config, \
    NULL, /* get_tunnel_config */ \
    NULL, /* build header */ \
    NULL, /* push header */ \
    NULL, /* pop header */ \
    netdev_dpdk_get_numa_id, /* get_numa_id */ \
    netdev_dpdk_set_tx_multiq, \
\
    SEND, /* send */ \
    NULL, /* send_wait */ \
\
    netdev_dpdk_set_etheraddr, \
    netdev_dpdk_get_etheraddr, \
    netdev_dpdk_get_mtu, \
    netdev_dpdk_set_mtu, \
    netdev_dpdk_get_ifindex, \
    GET_CARRIER, \
    netdev_dpdk_get_carrier_resets, \
    netdev_dpdk_set_miimon, \
    GET_STATS, \
    GET_FEATURES, \
    NULL, /* set_advertisements */ \
\
    NULL, /* set_policing */ \
    netdev_dpdk_get_qos_types, \
    NULL, /* get_qos_capabilities */ \
    netdev_dpdk_get_qos, \
    netdev_dpdk_set_qos, \
    NULL, /* get_queue */ \
    NULL, /* set_queue */ \
    NULL, /* delete_queue */ \
    NULL, /* get_queue_stats */ \
    NULL, /* queue_dump_start */ \
    NULL, /* queue_dump_next */ \
    NULL, /* queue_dump_done */ \
    NULL, /* dump_queue_stats */ \
\
    NULL, /* set_in4 */ \
    NULL, /* get_addr_list */ \
    NULL, /* add_router */ \
    NULL, /* get_next_hop */ \
    GET_STATUS, \
    NULL, /* arp_lookup */ \
\
    netdev_dpdk_update_flags, \
    RECONFIGURE, \
\
    netdev_dpdk_rxq_alloc, \
    netdev_dpdk_rxq_construct, \
    netdev_dpdk_rxq_destruct, \
    netdev_dpdk_rxq_dealloc, \
    RXQ_RECV, \
    NULL, /* rx_wait */ \
    NULL, /* rxq_drain */ \
}

static bool
process_vhost_flags(char *flag, char *default_val, int size,
                    const struct smap *ovs_other_config,
                    char **new_val)
{
    const char *val;
    int changed = 0;

    val = smap_get(ovs_other_config, flag);

    /* Depending on which version of vhost is in use, process the vhost-
     * specific flag if it is provided; otherwise resort to the default
     * value. */
    if (val && (strlen(val) <= size)) {
        changed = 1;
        *new_val = xstrdup(val);
        VLOG_INFO("User-provided %s in use: %s", flag, *new_val);
    } else {
        VLOG_INFO("No %s provided - defaulting to %s", flag, default_val);
        *new_val = default_val;
    }

    return changed;
}

static char **
grow_argv(char ***argv, size_t cur_siz, size_t grow_by)
{
    return xrealloc(*argv, sizeof(char *) * (cur_siz + grow_by));
}

static void
dpdk_option_extend(char ***argv, int argc, const char *option,
                   const char *value)
{
    char **newargv = grow_argv(argv, argc, 2);
    *argv = newargv;
    newargv[argc] = xstrdup(option);
    newargv[argc+1] = xstrdup(value);
}

static char **
move_argv(char ***argv, size_t cur_size, char **src_argv, size_t src_argc)
{
    char **newargv = grow_argv(argv, cur_size, src_argc);
    while (src_argc--) {
        newargv[cur_size+src_argc] = src_argv[src_argc];
        src_argv[src_argc] = NULL;
    }
    return newargv;
}

static int
extra_dpdk_args(const char *ovs_extra_config, char ***argv, int argc)
{
    int ret = argc;
    char *release_tok = xstrdup(ovs_extra_config);
    char *tok = release_tok, *endptr = NULL;

    for (tok = strtok_r(release_tok, " ", &endptr); tok != NULL;
         tok = strtok_r(NULL, " ", &endptr)) {
        char **newarg = grow_argv(argv, ret, 1);
        *argv = newarg;
        newarg[ret++] = xstrdup(tok);
    }

    free(release_tok);
    return ret;
}

static bool
argv_contains(char **argv_haystack, const size_t argc_haystack,
              const char *needle)
{
    for (size_t i = 0; i < argc_haystack; ++i) {
        if (!strcmp(argv_haystack[i], needle)) {
            return true;
        }
    }
    return false;
}

static int
construct_dpdk_options(const struct smap *ovs_other_config,
                       char ***argv, const int initial_size,
                       char **extra_args, const size_t extra_argc)
{
    struct dpdk_options_map {
        const char *ovs_configuration;
        const char *dpdk_option;
        bool default_enabled;
        const char *default_value;
    } opts[] = {
        {"dpdk-lcore-mask", "-c", false, NULL},
        {"dpdk-hugepage-dir", "--huge-dir", false, NULL},
    };

    int i, ret = initial_size;

    /* First, construct from the flat options (non-mutually-exclusive). */
    for (i = 0; i < ARRAY_SIZE(opts); ++i) {
        const char *lookup = smap_get(ovs_other_config,
                                      opts[i].ovs_configuration);
        if (!lookup && opts[i].default_enabled) {
            lookup = opts[i].default_value;
        }

        if (lookup) {
            if (!argv_contains(extra_args, extra_argc, opts[i].dpdk_option)) {
                dpdk_option_extend(argv, ret, opts[i].dpdk_option, lookup);
                ret += 2;
            } else {
                VLOG_WARN("Ignoring database defined option '%s' due to "
                          "dpdk_extras config", opts[i].dpdk_option);
            }
        }
    }

    return ret;
}

#define MAX_DPDK_EXCL_OPTS 10

static int
construct_dpdk_mutex_options(const struct smap *ovs_other_config,
                             char ***argv, const int initial_size,
                             char **extra_args, const size_t extra_argc)
{
    struct dpdk_exclusive_options_map {
        const char *category;
        const char *ovs_dpdk_options[MAX_DPDK_EXCL_OPTS];
        const char *eal_dpdk_options[MAX_DPDK_EXCL_OPTS];
        const char *default_value;
        int default_option;
    } excl_opts[] = {
        {"memory type",
         {"dpdk-alloc-mem", "dpdk-socket-mem", NULL,},
         {"-m",             "--socket-mem",    NULL,},
         "1024,0", 1
        },
    };

    int i, ret = initial_size;
    for (i = 0; i < ARRAY_SIZE(excl_opts); ++i) {
        int found_opts = 0, scan, found_pos = -1;
        const char *found_value;
        struct dpdk_exclusive_options_map *popt = &excl_opts[i];

        for (scan = 0; scan < MAX_DPDK_EXCL_OPTS
                 && popt->ovs_dpdk_options[scan]; ++scan) {
            const char *lookup = smap_get(ovs_other_config,
                                          popt->ovs_dpdk_options[scan]);
            if (lookup && strlen(lookup)) {
                found_opts++;
                found_pos = scan;
                found_value = lookup;
            }
        }

        if (!found_opts) {
            if (popt->default_option) {
                found_pos = popt->default_option;
                found_value = popt->default_value;
            } else {
                continue;
            }
        }

        if (found_opts > 1) {
            VLOG_ERR("Multiple defined options for %s. Please check your"
                     " database settings and reconfigure if necessary.",
                     popt->category);
        }

        if (!argv_contains(extra_args, extra_argc,
                           popt->eal_dpdk_options[found_pos])) {
            dpdk_option_extend(argv, ret, popt->eal_dpdk_options[found_pos],
                               found_value);
            ret += 2;
        } else {
            VLOG_WARN("Ignoring database defined option '%s' due to "
                      "dpdk_extras config", popt->eal_dpdk_options[found_pos]);
        }
    }

    return ret;
}

static int
get_dpdk_args(const struct smap *ovs_other_config, char ***argv,
              int argc)
{
    const char *extra_configuration;
    char **extra_args = NULL;
    int i;
    size_t extra_argc = 0;

    extra_configuration = smap_get(ovs_other_config, "dpdk-extra");
    if (extra_configuration) {
        extra_argc = extra_dpdk_args(extra_configuration, &extra_args, 0);
    }

    i = construct_dpdk_options(ovs_other_config, argv, argc, extra_args,
                               extra_argc);
    i = construct_dpdk_mutex_options(ovs_other_config, argv, i, extra_args,
                                     extra_argc);

    if (extra_configuration) {
        *argv = move_argv(argv, i, extra_args, extra_argc);
    }

    return i + extra_argc;
}

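/* Worked example (derived from the option tables above): with
 *
 *     other_config:dpdk-lcore-mask=0x2
 *     other_config:dpdk-socket-mem=1024,0
 *
 * in the database and no dpdk-extra configuration, get_dpdk_args() extends
 * argv with "-c 0x2 --socket-mem 1024,0"; with neither memory option set,
 * the "--socket-mem 1024,0" default from excl_opts is used instead. */
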
static char **dpdk_argv;
static int dpdk_argc;

static void
deferred_argv_release(void)
{
    int result;

    for (result = 0; result < dpdk_argc; ++result) {
        free(dpdk_argv[result]);
    }
    free(dpdk_argv);
}

static void
dpdk_init__(const struct smap *ovs_other_config)
{
    int result;
    int argc = 0, argc_tmp;
    bool auto_determine = true;
    int err = 0;
    cpu_set_t cpuset;
    char **argv = NULL;
#ifndef VHOST_CUSE
    char *sock_dir_subcomponent;
#endif

    if (!smap_get_bool(ovs_other_config, "dpdk-init", false)) {
        VLOG_INFO("DPDK Disabled - to change this requires a restart.");
        return;
    }

    VLOG_INFO("DPDK Enabled, initializing");
#ifdef VHOST_CUSE
    if (process_vhost_flags("cuse-dev-name", xstrdup("vhost-net"),
                            PATH_MAX, ovs_other_config, &cuse_dev_name)) {
#else
    if (process_vhost_flags("vhost-sock-dir", xstrdup(ovs_rundir()),
                            NAME_MAX, ovs_other_config,
                            &sock_dir_subcomponent)) {
        struct stat s;
        if (!strstr(sock_dir_subcomponent, "..")) {
            vhost_sock_dir = xasprintf("%s/%s", ovs_rundir(),
                                       sock_dir_subcomponent);
            err = stat(vhost_sock_dir, &s);
            if (err) {
                VLOG_ERR("vhost-user sock directory '%s' does not exist.",
                         vhost_sock_dir);
            }
        } else {
            vhost_sock_dir = xstrdup(ovs_rundir());
            VLOG_ERR("vhost-user sock directory request '%s/%s' has invalid "
                     "characters '..' - using %s instead.",
                     ovs_rundir(), sock_dir_subcomponent, ovs_rundir());
        }
        free(sock_dir_subcomponent);
    } else {
        vhost_sock_dir = sock_dir_subcomponent;
#endif
    }

    argv = grow_argv(&argv, 0, 1);
    argc = 1;
    argv[0] = xstrdup(ovs_get_program_name());
    argc_tmp = get_dpdk_args(ovs_other_config, &argv, argc);

    while (argc_tmp != argc) {
        if (!strcmp("-c", argv[argc]) || !strcmp("-l", argv[argc])) {
            auto_determine = false;
        }
        argc++;
    }

    /*
     * NOTE: This is an unsophisticated mechanism for determining the DPDK
     * lcore for the DPDK Master.
     */
    if (auto_determine) {
        int i;
        /* Get the main thread affinity. */
        CPU_ZERO(&cpuset);
        err = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t),
                                     &cpuset);
        if (!err) {
            for (i = 0; i < CPU_SETSIZE; i++) {
                if (CPU_ISSET(i, &cpuset)) {
                    argv = grow_argv(&argv, argc, 2);
                    argv[argc++] = xstrdup("-c");
                    argv[argc++] = xasprintf("0x%08llX", (1ULL<<i));
                    break;
                }
            }
        } else {
            VLOG_ERR("Thread getaffinity error %d. Using core 0x1", err);
            /* User did not set dpdk-lcore-mask and unable to get current
             * thread affinity - default to core 0x1. */
            argv = grow_argv(&argv, argc, 2);
            argv[argc++] = xstrdup("-c");
            argv[argc++] = xasprintf("0x%X", 1);
        }
    }

    argv = grow_argv(&argv, argc, 1);
    argv[argc] = NULL;

    if (VLOG_IS_INFO_ENABLED()) {
        struct ds eal_args;
        int opt;

        ds_init(&eal_args);
        ds_put_cstr(&eal_args, "EAL ARGS:");
        for (opt = 0; opt < argc; ++opt) {
            ds_put_cstr(&eal_args, " ");
            ds_put_cstr(&eal_args, argv[opt]);
        }
        VLOG_INFO("%s", ds_cstr_ro(&eal_args));
        ds_destroy(&eal_args);
    }

    /* Make sure things are initialized ... */
    result = rte_eal_init(argc, argv);
    if (result < 0) {
        ovs_abort(result, "Cannot init EAL");
    }

    /* Set the main thread affinity back to the pre rte_eal_init() value. */
    if (auto_determine && !err) {
        err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t),
                                     &cpuset);
        if (err) {
            VLOG_ERR("Thread setaffinity error %d", err);
        }
    }

    dpdk_argv = argv;
    dpdk_argc = argc;
    atexit(deferred_argv_release);

    rte_memzone_dump(stdout);
    rte_eal_init_ret = 0;

    /* We are called from the main thread here. */
    RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;

    ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);

#ifdef VHOST_CUSE
    /* Register CUSE device to handle IOCTLs.
     * Unless otherwise specified, cuse_dev_name is set to vhost-net.
     */
    err = rte_vhost_driver_register(cuse_dev_name);
    if (err != 0) {
        VLOG_ERR("CUSE device setup failure.");
        return;
    }
#endif

    dpdk_vhost_class_init();

    /* Finally, register the dpdk classes. */
    netdev_dpdk_register();
}

void
dpdk_init(const struct smap *ovs_other_config)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;

    if (ovs_other_config && ovsthread_once_start(&once)) {
        dpdk_init__(ovs_other_config);
        ovsthread_once_done(&once);
    }
}

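/* Example (illustrative): initialization is driven entirely from the
 * database, so enabling DPDK looks like
 *
 *     ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-init=true
 *     ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-lcore-mask=0x2
 *
 * followed by a (re)start of ovs-vswitchd; dpdk-init is read once, and as
 * logged in dpdk_init__(), changing it afterwards requires a restart. */
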
static const struct netdev_class dpdk_class =
    NETDEV_DPDK_CLASS(
        "dpdk",
        NULL,
        netdev_dpdk_construct,
        netdev_dpdk_destruct,
        netdev_dpdk_eth_send,
        netdev_dpdk_get_carrier,
        netdev_dpdk_get_stats,
        netdev_dpdk_get_features,
        netdev_dpdk_get_status,
        netdev_dpdk_reconfigure,
        netdev_dpdk_rxq_recv);

static const struct netdev_class dpdk_ring_class =
    NETDEV_DPDK_CLASS(
        "dpdkr",
        NULL,
        netdev_dpdk_ring_construct,
        netdev_dpdk_destruct,
        netdev_dpdk_ring_send,
        netdev_dpdk_get_carrier,
        netdev_dpdk_get_stats,
        netdev_dpdk_get_features,
        netdev_dpdk_get_status,
        netdev_dpdk_reconfigure,
        netdev_dpdk_rxq_recv);

static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =
    NETDEV_DPDK_CLASS(
        "dpdkvhostcuse",
        dpdk_vhost_cuse_class_init,
        netdev_dpdk_vhost_cuse_construct,
        netdev_dpdk_vhost_destruct,
        netdev_dpdk_vhost_send,
        netdev_dpdk_vhost_get_carrier,
        netdev_dpdk_vhost_get_stats,
        NULL,
        NULL,
        netdev_dpdk_vhost_cuse_reconfigure,
        netdev_dpdk_vhost_rxq_recv);

static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class =
    NETDEV_DPDK_CLASS(
        "dpdkvhostuser",
        dpdk_vhost_user_class_init,
        netdev_dpdk_vhost_user_construct,
        netdev_dpdk_vhost_destruct,
        netdev_dpdk_vhost_send,
        netdev_dpdk_vhost_get_carrier,
        netdev_dpdk_vhost_get_stats,
        NULL,
        NULL,
        netdev_dpdk_vhost_user_reconfigure,
        netdev_dpdk_vhost_rxq_recv);

void
netdev_dpdk_register(void)
{
    dpdk_common_init();
    netdev_register_provider(&dpdk_class);
    netdev_register_provider(&dpdk_ring_class);
#ifdef VHOST_CUSE
    netdev_register_provider(&dpdk_vhost_cuse_class);
#else
    netdev_register_provider(&dpdk_vhost_user_class);
#endif
}

int
pmd_thread_setaffinity_cpu(unsigned cpu)
{
    cpu_set_t cpuset;
    int err;

    CPU_ZERO(&cpuset);
    CPU_SET(cpu, &cpuset);
    err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    if (err) {
        VLOG_ERR("Thread affinity error %d", err);
        return err;
    }
    /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
    ovs_assert(cpu != NON_PMD_CORE_ID);
    RTE_PER_LCORE(_lcore_id) = cpu;

    return 0;
}

bool
dpdk_thread_is_pmd(void)
{
    return rte_lcore_id() != NON_PMD_CORE_ID;
}