* The minimum mbuf size is limited to avoid scatter behaviour and drop in
* performance for standard Ethernet MTU.
*/
-#define MTU_TO_MAX_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
-#define MBUF_SIZE_MTU(mtu) (MTU_TO_MAX_LEN(mtu) \
- + sizeof(struct dp_packet) \
- + RTE_PKTMBUF_HEADROOM)
-#define MBUF_SIZE_DRIVER (2048 \
- + sizeof (struct rte_mbuf) \
- + RTE_PKTMBUF_HEADROOM)
-#define MBUF_SIZE(mtu) MAX(MBUF_SIZE_MTU(mtu), MBUF_SIZE_DRIVER)
+#define ETHER_HDR_MAX_LEN (ETHER_HDR_LEN + ETHER_CRC_LEN \
+                           + (2 * VLAN_HEADER_LEN))
+#define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
+#define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
+#define FRAME_LEN_TO_MTU(frame_len) ((frame_len) - ETHER_HDR_LEN - ETHER_CRC_LEN)
+#define MBUF_SIZE(mtu) (MTU_TO_MAX_FRAME_LEN(mtu) \
+                        + sizeof(struct dp_packet) \
+                        + RTE_PKTMBUF_HEADROOM)
+#define NETDEV_DPDK_MBUF_ALIGN 1024
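For reference, a worked sketch of what these macros evaluate to for the standard Ethernet MTU, assuming the usual constant values ETHER_HDR_LEN = 14, ETHER_CRC_LEN = 4 and VLAN_HEADER_LEN = 4 (these definitions come from existing OVS/DPDK headers, not from this patch):

    MTU_TO_FRAME_LEN(1500)     /* 1500 + 14 + 4           == 1518 */
    MTU_TO_MAX_FRAME_LEN(1500) /* 1500 + 14 + 4 + (2 * 4) == 1526 */
    FRAME_LEN_TO_MTU(1518)     /* 1518 - 14 - 4           == 1500 */

Note that FRAME_LEN_TO_MTU() subtracts only the basic header and CRC, so it is not an exact inverse of MTU_TO_MAX_FRAME_LEN(); converting a rounded-up buffer size back into an "mtu" (as netdev_dpdk_init() does later in this patch) therefore yields a mempool with extra room for the two VLAN tags.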
/* Max and min number of packets in the mempool. OVS tries to allocate a
* mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
#define NIC_PORT_RX_Q_SIZE 2048 /* Size of Physical NIC RX Queue, Max (n+32<=4096)*/
#define NIC_PORT_TX_Q_SIZE 2048 /* Size of Physical NIC TX Queue, Max (n+32<=4096)*/
+#define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
+
static char *cuse_dev_name = NULL; /* Character device cuse_dev_name. */
static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */
* from concurrent access. It is used only
* if the queue is shared among different
* pmd threads (see 'txq_needs_locking'). */
+ int map;               /* Mapping of configured vhost-user queue to the
+                         * queue enabled by the guest (-1 if disabled). */
uint64_t tsc;
struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
};
/* For the client rings */
struct rte_ring *cring_tx;
struct rte_ring *cring_rx;
- int user_port_id; /* User given port no, parsed from port name */
+ unsigned int user_port_id; /* User given port no, parsed from port name */
int eth_port_id; /* ethernet device port id */
struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
};
return class->construct == netdev_dpdk_construct;
}
+/* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
+ * aligned at 1k or less. If a declared mbuf size is not a multiple of this
+ * value, insufficient buffers are allocated to accommodate the packet in its
+ * entirety. Furthermore, certain drivers need to ensure that there is also
+ * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
+ * frames). If the RX buffer is too small, then the driver enables scatter RX
+ * behaviour, which reduces performance. To prevent this, use a buffer size that
+ * is closest to 'mtu', but which satisfies the aforementioned criteria.
+ */
+static uint32_t
+dpdk_buf_size(int mtu)
+{
+ return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) + RTE_PKTMBUF_HEADROOM),
+ NETDEV_DPDK_MBUF_ALIGN);
+}
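To make the rounding concrete, a small illustration assuming DPDK's default RTE_PKTMBUF_HEADROOM of 128 bytes (the actual value depends on the DPDK build configuration):

    dpdk_buf_size(1500);   /* ROUND_UP(1526 + 128, 1024) == 2048 */
    dpdk_buf_size(9000);   /* ROUND_UP(9026 + 128, 1024) == 9216 */

Both results are multiples of NETDEV_DPDK_MBUF_ALIGN, so a driver that carves RX buffers at 1k granularity does not fall back to scatter RX for these MTUs.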
+
/* XXX: use dpdk malloc for entire OVS. in fact huge page should be used
* for all other segments data, bss and text. */
rte_pktmbuf_free_seg(pkt);
}
-static void
-__rte_pktmbuf_init(struct rte_mempool *mp,
- void *opaque_arg OVS_UNUSED,
- void *_m,
- unsigned i OVS_UNUSED)
-{
- struct rte_mbuf *m = _m;
- uint32_t buf_len = mp->elt_size - sizeof(struct dp_packet);
-
- RTE_MBUF_ASSERT(mp->elt_size >= sizeof(struct dp_packet));
-
- memset(m, 0, mp->elt_size);
-
- /* start of buffer is just after mbuf structure */
- m->buf_addr = (char *)m + sizeof(struct dp_packet);
- m->buf_physaddr = rte_mempool_virt2phy(mp, m) +
- sizeof(struct dp_packet);
- m->buf_len = (uint16_t)buf_len;
-
- /* keep some headroom between start of buffer and data */
- m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, m->buf_len);
-
- /* init some constant fields */
- m->pool = mp;
- m->nb_segs = 1;
- m->port = 0xff;
-}
-
static void
ovs_rte_pktmbuf_init(struct rte_mempool *mp,
void *opaque_arg OVS_UNUSED,
{
struct rte_mbuf *m = _m;
- __rte_pktmbuf_init(mp, opaque_arg, _m, i);
+ rte_pktmbuf_init(mp, opaque_arg, _m, i);
dp_packet_init_dpdk((struct dp_packet *) m, m->buf_len);
}
struct dpdk_mp *dmp = NULL;
char mp_name[RTE_MEMPOOL_NAMESIZE];
unsigned mp_size;
+ struct rte_pktmbuf_pool_private mbp_priv;
LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
if (dmp->socket_id == socket_id && dmp->mtu == mtu) {
dmp->socket_id = socket_id;
dmp->mtu = mtu;
dmp->refcount = 1;
+ mbp_priv.mbuf_data_room_size = MBUF_SIZE(mtu) - sizeof(struct dp_packet);
+ mbp_priv.mbuf_priv_size = sizeof(struct dp_packet) - sizeof(struct rte_mbuf);
mp_size = MAX_NB_MBUF;
do {
dmp->mp = rte_mempool_create(mp_name, mp_size, MBUF_SIZE(mtu),
MP_CACHE_SZ,
sizeof(struct rte_pktmbuf_pool_private),
- rte_pktmbuf_pool_init, NULL,
+ rte_pktmbuf_pool_init, &mbp_priv,
ovs_rte_pktmbuf_init, NULL,
socket_id, 0);
} while (!dmp->mp && rte_errno == ENOMEM && (mp_size /= 2) >= MIN_NB_MBUF);
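The 'mbp_priv' values passed above tell DPDK's stock pool initializer how each mempool element is carved up, which is what allows the removed local copy of __rte_pktmbuf_init() to be replaced by the library's rte_pktmbuf_init(). A sketch of the resulting element layout, assuming struct dp_packet embeds struct rte_mbuf as its first member:

    /*
     * |<----------------------- MBUF_SIZE(mtu) ---------------------->|
     * +-----------------+----------------+----------------------------+
     * | struct rte_mbuf | dp_packet      | data room                  |
     * |                 | private fields | (headroom + max frame len) |
     * +-----------------+----------------+----------------------------+
     * |<- sizeof(struct dp_packet) ----->|<-- mbuf_data_room_size --->|
     */

mbuf_priv_size covers the dp_packet fields that follow the embedded rte_mbuf, and mbuf_data_room_size is everything after the dp_packet, i.e. MTU_TO_MAX_FRAME_LEN(mtu) + RTE_PKTMBUF_HEADROOM.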
/* Queues are shared among CPUs. Always flush */
netdev->tx_q[i].flush_tx = true;
}
+
+ /* Initialize map for vhost devices. */
+ netdev->tx_q[i].map = -1;
rte_spinlock_init(&netdev->tx_q[i].tx_lock);
}
}
struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
int sid;
int err = 0;
+ uint32_t buf_size;
ovs_mutex_init(&netdev->mutex);
ovs_mutex_lock(&netdev->mutex);
netdev->type = type;
netdev->flags = 0;
netdev->mtu = ETHER_MTU;
- netdev->max_packet_len = MTU_TO_MAX_LEN(netdev->mtu);
+ netdev->max_packet_len = MTU_TO_FRAME_LEN(netdev->mtu);
- netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, netdev->mtu);
+ buf_size = dpdk_buf_size(netdev->mtu);
+ netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, FRAME_LEN_TO_MTU(buf_size));
if (!netdev->dpdk_mp) {
err = ENOMEM;
goto unlock;
if (err) {
goto unlock;
}
+ } else {
+ netdev_dpdk_alloc_txq(netdev, OVS_VHOST_MAX_QUEUE_NUM);
}
list_push_back(&dpdk_list, &netdev->list_node);
return err;
}
+/* 'dev_name' must be the prefix followed by a positive decimal number
+ * (no leading '+' or '-' signs are allowed). */
static int
dpdk_dev_parse_name(const char dev_name[], const char prefix[],
unsigned int *port_no)
}
cport = dev_name + strlen(prefix);
- *port_no = strtol(cport, NULL, 0); /* string must be null terminated */
- return 0;
+
+ if (str_to_uint(cport, 10, port_no)) {
+ return 0;
+ } else {
+ return ENODEV;
+ }
}
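A hypothetical call, to illustrate the stricter parsing (str_to_uint() succeeds only when the entire remainder of the name is a non-negative decimal number):

    unsigned int port_no;
    int err;

    err = dpdk_dev_parse_name("dpdkr7", "dpdkr", &port_no);
    /* err == 0, port_no == 7. */

    err = dpdk_dev_parse_name("dpdkr-7", "dpdkr", &port_no);
    /* err == ENODEV.  The old strtol()-based code accepted such names
     * (and names with trailing garbage, e.g. "dpdkr7x") without error. */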
static int
netdev_dpdk_vhost_user_construct(struct netdev *netdev_)
{
struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
+ const char *name = netdev_->name;
int err;
+ /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
+ * the file system. '/' or '\' would traverse directories, so they're not
+ * acceptable in 'name'. */
+ if (strchr(name, '/') || strchr(name, '\\')) {
+ VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
+ "A valid name must not include '/' or '\\'",
+ name);
+ return EINVAL;
+ }
+
ovs_mutex_lock(&dpdk_mutex);
/* Take the name of the vhost-user port and append it to the location where
* the socket is to be created, then register the socket.
*/
snprintf(netdev->vhost_id, sizeof(netdev->vhost_id), "%s/%s",
- vhost_sock_dir, netdev_->name);
+ vhost_sock_dir, name);
+
err = rte_vhost_driver_register(netdev->vhost_id);
if (err) {
VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
} else {
fatal_signal_add_file_to_unlink(netdev->vhost_id);
VLOG_INFO("Socket %s created for vhost-user port %s\n",
- netdev->vhost_id, netdev_->name);
+ netdev->vhost_id, name);
err = vhost_construct_helper(netdev_);
}
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&netdev->mutex);
- rte_free(netdev->tx_q);
netdev->up.n_txq = n_txq;
netdev->up.n_rxq = n_rxq;
- netdev_dpdk_alloc_txq(netdev, netdev->up.n_txq);
ovs_mutex_unlock(&netdev->mutex);
ovs_mutex_unlock(&dpdk_mutex);
unsigned int total_pkts = cnt;
uint64_t start = 0;
- if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {
+ qid = vhost_dev->tx_q[qid % vhost_dev->real_n_txq].map;
+
+ if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid == -1)) {
rte_spinlock_lock(&vhost_dev->stats_lock);
vhost_dev->stats.tx_dropped+= cnt;
rte_spinlock_unlock(&vhost_dev->stats_lock);
goto out;
}
- if (vhost_dev->txq_needs_locking) {
- qid = qid % vhost_dev->real_n_txq;
- rte_spinlock_lock(&vhost_dev->tx_q[qid].tx_lock);
- }
+ rte_spinlock_lock(&vhost_dev->tx_q[qid].tx_lock);
do {
int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
}
} while (cnt);
- if (vhost_dev->txq_needs_locking) {
- rte_spinlock_unlock(&vhost_dev->tx_q[qid].tx_lock);
- }
+ rte_spinlock_unlock(&vhost_dev->tx_q[qid].tx_lock);
rte_spinlock_lock(&vhost_dev->stats_lock);
netdev_dpdk_vhost_update_tx_counters(&vhost_dev->stats, pkts, total_pkts,
netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
- int old_mtu, err;
+ int old_mtu, err, dpdk_mtu;
struct dpdk_mp *old_mp;
struct dpdk_mp *mp;
+ uint32_t buf_size;
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&dev->mutex);
goto out;
}
- mp = dpdk_mp_get(dev->socket_id, dev->mtu);
+ buf_size = dpdk_buf_size(mtu);
+ dpdk_mtu = FRAME_LEN_TO_MTU(buf_size);
+
+ mp = dpdk_mp_get(dev->socket_id, dpdk_mtu);
if (!mp) {
err = ENOMEM;
goto out;
old_mp = dev->dpdk_mp;
dev->dpdk_mp = mp;
dev->mtu = mtu;
- dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
+ dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
err = dpdk_eth_dev_init(dev);
if (err) {
dpdk_mp_put(mp);
dev->mtu = old_mtu;
dev->dpdk_mp = old_mp;
- dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
+ dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
dpdk_eth_dev_init(dev);
goto out;
}
smap_add_format(args, "numa_id", "%d", rte_eth_dev_socket_id(dev->port_id));
smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
- smap_add_format(args, "max_rx_pktlen", "%u", dev_info.max_rx_pktlen);
+ smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
}
}
+/*
+ * Fixes the mapping for vhost-user tx queues. Must be called after each
+ * queue enabling/disabling and after every modification of 'real_n_txq'.
+ */
+static void
+netdev_dpdk_remap_txqs(struct netdev_dpdk *netdev)
+ OVS_REQUIRES(netdev->mutex)
+{
+ int *enabled_queues, n_enabled = 0;
+ int i, k, total_txqs = netdev->real_n_txq;
+
+ enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);
+
+ for (i = 0; i < total_txqs; i++) {
+ /* Enabled queues are always mapped to themselves. */
+ if (netdev->tx_q[i].map == i) {
+ enabled_queues[n_enabled++] = i;
+ }
+ }
+
+ if (n_enabled == 0 && total_txqs != 0) {
+ enabled_queues[0] = -1;
+ n_enabled = 1;
+ }
+
+ k = 0;
+ for (i = 0; i < total_txqs; i++) {
+ if (netdev->tx_q[i].map != i) {
+ netdev->tx_q[i].map = enabled_queues[k];
+ k = (k + 1) % n_enabled;
+ }
+ }
+
+ VLOG_DBG("TX queue mapping for %s\n", netdev->vhost_id);
+ for (i = 0; i < total_txqs; i++) {
+ VLOG_DBG("%2d --> %2d", i, netdev->tx_q[i].map);
+ }
+
+ rte_free(enabled_queues);
+}
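A worked example of the remapping, with illustrative values: suppose real_n_txq is 4 and the guest has enabled only queue pairs 0 and 2, so vring_state_changed() below has left tx_q[1].map and tx_q[3].map at -1:

    /* Before: tx_q[i].map == { 0, -1, 2, -1 }
     * enabled_queues     == { 0, 2 },  n_enabled == 2
     * After:  tx_q[i].map == { 0, 0, 2, 2 }
     */

Traffic hashed to the disabled queues 1 and 3 is redirected round-robin onto the enabled queues. If the guest enables no queue at all, every entry maps to -1 and the vhost send path earlier in this patch drops the burst (its qid == -1 check).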
static int
netdev_dpdk_vhost_set_queues(struct netdev_dpdk *netdev, struct virtio_net *dev)
+ OVS_REQUIRES(netdev->mutex)
{
uint32_t qp_num;
netdev->real_n_rxq = qp_num;
netdev->real_n_txq = qp_num;
- if (netdev->up.n_txq > netdev->real_n_txq) {
- netdev->txq_needs_locking = true;
- } else {
- netdev->txq_needs_locking = false;
- }
+ netdev->txq_needs_locking = true;
+
+ netdev_dpdk_remap_txqs(netdev);
return 0;
}
}
+static int
+vring_state_changed(struct virtio_net *dev, uint16_t queue_id, int enable)
+{
+ struct netdev_dpdk *vhost_dev;
+ bool exists = false;
+ int qid = queue_id / VIRTIO_QNUM;
+
+ if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
+ return 0;
+ }
+
+ ovs_mutex_lock(&dpdk_mutex);
+ LIST_FOR_EACH (vhost_dev, list_node, &dpdk_list) {
+ if (strncmp(dev->ifname, vhost_dev->vhost_id, IF_NAME_SZ) == 0) {
+ ovs_mutex_lock(&vhost_dev->mutex);
+ if (enable) {
+ vhost_dev->tx_q[qid].map = qid;
+ } else {
+ vhost_dev->tx_q[qid].map = -1;
+ }
+ netdev_dpdk_remap_txqs(vhost_dev);
+ exists = true;
+ ovs_mutex_unlock(&vhost_dev->mutex);
+ break;
+ }
+ }
+ ovs_mutex_unlock(&dpdk_mutex);
+
+ if (exists) {
+ VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
+ PRIu64" changed to \'%s\'", queue_id, qid, dev->ifname,
+ dev->device_fh, (enable == 1) ? "enabled" : "disabled");
+ } else {
+ VLOG_INFO("vHost Device '%s' %"PRIu64" not found", dev->ifname,
+ dev->device_fh);
+ return -1;
+ }
+
+ return 0;
+}
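A short worked example of the ring-index arithmetic in vring_state_changed(), assuming DPDK's usual definitions VIRTIO_QNUM == 2, VIRTIO_RXQ == 0 and VIRTIO_TXQ == 1:

    /* queue_id == 4: 4 % 2 == VIRTIO_RXQ, qid == 2.  This is the guest
     * RX ring of queue pair 2, which OVS transmits into, so tx_q[2].map
     * is updated and the queues are remapped.
     *
     * queue_id == 5: 5 % 2 == VIRTIO_TXQ.  This is the guest TX ring,
     * which OVS only reads from, so the callback returns immediately. */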
+
struct virtio_net *
netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
{
{
.new_device = new_device,
.destroy_device = destroy_device,
+ .vring_state_changed = vring_state_changed
};
static void *