2 * Copyright (c) 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-vport.h"
23 #include <sys/socket.h>
25 #include <sys/ioctl.h>
27 #include "byte-order.h"
32 #include "dp-packet.h"
33 #include "dynamic-string.h"
38 #include "netdev-provider.h"
39 #include "odp-netlink.h"
40 #include "dp-packet.h"
41 #include "ovs-router.h"
43 #include "poll-loop.h"
44 #include "route-table.h"
46 #include "socket-util.h"
47 #include "openvswitch/vlog.h"
48 #include "unaligned.h"
/* NOTE(review): this file is a partial extract; original line numbers are
 * embedded at the start of each line and many intervening lines are missing.
 * Code below is kept verbatim. */
52 VLOG_DEFINE_THIS_MODULE(netdev_vport);
/* Rate limit for error logs: at most 5 messages per 60 seconds. */
53 static struct vlog_rate_limit err_rl = VLOG_RATE_LIMIT_INIT(60, 5);
/* IANA-assigned default UDP destination ports for each tunnel type. */
55 #define GENEVE_DST_PORT 6081
56 #define VXLAN_DST_PORT 4789
57 #define LISP_DST_PORT 4341
/* Total outer-header length for a VXLAN-encapsulated frame:
 * Ethernet + IPv4 (no options) + UDP + VXLAN header. */
59 #define VXLAN_HLEN (sizeof(struct eth_header) + \
60 sizeof(struct ip_header) + \
61 sizeof(struct udp_header) + \
62 sizeof(struct vxlanhdr))
/* Geneve outer-header length excluding variable-length options. */
64 #define GENEVE_BASE_HLEN (sizeof(struct eth_header) + \
65 sizeof(struct ip_header) + \
66 sizeof(struct udp_header) + \
67 sizeof(struct genevehdr))
69 #define DEFAULT_TTL 64
/* Fields below are members of struct netdev_vport / struct vport_class;
 * the struct headers fall in lines missing from this extract. */
74 /* Protects all members below. */
75 struct ovs_mutex mutex;
77 uint8_t etheraddr[ETH_ADDR_LEN];
78 struct netdev_stats stats;
81 struct netdev_tunnel_config tnl_cfg;
82 char egress_iface[IFNAMSIZ];
90 const char *dpif_port;
91 struct netdev_class netdev_class;
94 /* Last read of the route-table's change number. */
95 static uint64_t rt_change_seqno;
/* Forward declarations for functions referenced before their definitions. */
97 static int netdev_vport_construct(struct netdev *);
98 static int get_patch_config(const struct netdev *netdev, struct smap *args);
99 static int get_tunnel_config(const struct netdev *, struct smap *args);
100 static bool tunnel_check_status_change__(struct netdev_vport *);
/* Default UDP source-port range for tunnel traffic; adjustable at runtime
 * via the "tnl/egress_port_range" unixctl command below. */
102 static uint16_t tnl_udp_port_min = 32768;
103 static uint16_t tnl_udp_port_max = 61000;
/* Returns true iff 'class' is a vport class: identified by its construct
 * hook being netdev_vport_construct. */
106 is_vport_class(const struct netdev_class *class)
108 return class->construct == netdev_vport_construct;
/* Public wrapper around is_vport_class(). */
112 netdev_vport_is_vport_class(const struct netdev_class *class)
114 return is_vport_class(class);
/* Downcasts a netdev_class to its enclosing vport_class; asserts it is one. */
117 static const struct vport_class *
118 vport_class_cast(const struct netdev_class *class)
120 ovs_assert(is_vport_class(class));
121 return CONTAINER_OF(class, struct vport_class, netdev_class);
/* Downcasts a netdev to its enclosing netdev_vport; asserts the class. */
124 static struct netdev_vport *
125 netdev_vport_cast(const struct netdev *netdev)
127 ovs_assert(is_vport_class(netdev_get_class(netdev)));
128 return CONTAINER_OF(netdev, struct netdev_vport, up);
/* Returns the tunnel configuration embedded in 'netdev'. */
131 static const struct netdev_tunnel_config *
132 get_netdev_tunnel_config(const struct netdev *netdev)
134 return &netdev_vport_cast(netdev)->tnl_cfg;
/* True iff 'netdev' is a patch port (recognized by its get_config hook). */
138 netdev_vport_is_patch(const struct netdev *netdev)
140 const struct netdev_class *class = netdev_get_class(netdev);
142 return class->get_config == get_patch_config;
/* True iff 'dev' carries L3 packets (currently only the "lisp" type). */
146 netdev_vport_is_layer3(const struct netdev *dev)
148 const char *type = netdev_get_type(dev);
150 return (!strcmp("lisp", type));
/* True for tunnel types that encode a UDP destination port in their dpif
 * port name: geneve, vxlan and lisp. */
154 netdev_vport_needs_dst_port(const struct netdev *dev)
156 const struct netdev_class *class = netdev_get_class(dev);
157 const char *type = netdev_get_type(dev);
159 return (class->get_config == get_tunnel_config &&
160 (!strcmp("geneve", type) || !strcmp("vxlan", type) ||
161 !strcmp("lisp", type)));
/* Returns the class's datapath port-name prefix, or NULL for non-vports. */
165 netdev_vport_class_get_dpif_port(const struct netdev_class *class)
167 return is_vport_class(class) ? vport_class_cast(class)->dpif_port : NULL;
/* Formats the datapath port name for 'netdev' into 'namebuf'.  For types
 * that need a destination port the name is "<prefix>_<dst_port>"; otherwise
 * (per the visible early return) the netdev's own name is used.  Some lines
 * of this function are missing from the extract. */
171 netdev_vport_get_dpif_port(const struct netdev *netdev,
172 char namebuf[], size_t bufsize)
174 const struct netdev_class *class = netdev_get_class(netdev);
175 const char *dpif_port = netdev_vport_class_get_dpif_port(class);
178 return netdev_get_name(netdev);
181 if (netdev_vport_needs_dst_port(netdev)) {
182 const struct netdev_vport *vport = netdev_vport_cast(netdev);
185 * Note: IFNAMSIZ is 16 bytes long. Implementations should choose
186 * a dpif port name that is short enough to fit including any
187 * port numbers but assert just in case.
189 BUILD_ASSERT(NETDEV_VPORT_NAME_BUFSIZE >= IFNAMSIZ);
/* "+ 6" leaves room for '_' plus up to 5 digits of a 16-bit port number. */
190 ovs_assert(strlen(dpif_port) + 6 < IFNAMSIZ);
191 snprintf(namebuf, bufsize, "%s_%d", dpif_port,
192 ntohs(vport->tnl_cfg.dst_port));
/* Heap-allocated convenience wrapper; caller must free the result. */
200 netdev_vport_get_dpif_port_strdup(const struct netdev *netdev)
202 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
204 return xstrdup(netdev_vport_get_dpif_port(netdev, namebuf,
208 /* Whenever the route-table change number is incremented,
209 * netdev_vport_route_changed() should be called to update
210 * the corresponding tunnel interface status. */
212 netdev_vport_route_changed(void)
214 struct netdev **vports;
/* netdev_get_vports() presumably returns referenced netdevs; each one is
 * released with netdev_close() below -- TODO confirm ownership contract. */
217 vports = netdev_get_vports(&n_vports);
218 for (i = 0; i < n_vports; i++) {
219 struct netdev *netdev_ = vports[i];
220 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
222 ovs_mutex_lock(&netdev->mutex);
223 /* Finds all tunnel vports. */
224 if (netdev->tnl_cfg.ip_dst) {
/* Re-evaluate egress iface/carrier; bump the change seq if it moved. */
225 if (tunnel_check_status_change__(netdev)) {
226 netdev_change_seq_changed(netdev_);
229 ovs_mutex_unlock(&netdev->mutex);
231 netdev_close(netdev_);
/* Allocates a zeroed netdev_vport and returns its embedded netdev
 * (the return statement falls in lines missing from this extract). */
237 static struct netdev *
238 netdev_vport_alloc(void)
240 struct netdev_vport *netdev = xzalloc(sizeof *netdev);
/* Initializes a freshly-allocated vport: mutex, random MAC, and a default
 * tunnel destination port keyed on the device type. */
245 netdev_vport_construct(struct netdev *netdev_)
247 struct netdev_vport *dev = netdev_vport_cast(netdev_);
248 const char *type = netdev_get_type(netdev_);
250 ovs_mutex_init(&dev->mutex);
251 eth_addr_random(dev->etheraddr);
253 /* Add a default destination port for tunnel ports if none specified. */
254 if (!strcmp(type, "geneve")) {
255 dev->tnl_cfg.dst_port = htons(GENEVE_DST_PORT);
256 } else if (!strcmp(type, "vxlan")) {
257 dev->tnl_cfg.dst_port = htons(VXLAN_DST_PORT);
258 } else if (!strcmp(type, "lisp")) {
259 dev->tnl_cfg.dst_port = htons(LISP_DST_PORT);
/* Tears down per-vport state; destroys the mutex. */
266 netdev_vport_destruct(struct netdev *netdev_)
268 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
271 ovs_mutex_destroy(&netdev->mutex);
/* Frees the vport structure itself (the free() call is in missing lines). */
275 netdev_vport_dealloc(struct netdev *netdev_)
277 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
/* Stores a new Ethernet address under the vport mutex and signals the
 * change to watchers via netdev_change_seq_changed(). */
282 netdev_vport_set_etheraddr(struct netdev *netdev_,
283 const uint8_t mac[ETH_ADDR_LEN])
285 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
287 ovs_mutex_lock(&netdev->mutex);
288 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
289 ovs_mutex_unlock(&netdev->mutex);
290 netdev_change_seq_changed(netdev_);
/* Copies the current Ethernet address into 'mac' under the vport mutex. */
296 netdev_vport_get_etheraddr(const struct netdev *netdev_,
297 uint8_t mac[ETH_ADDR_LEN])
299 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
301 ovs_mutex_lock(&netdev->mutex);
302 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
303 ovs_mutex_unlock(&netdev->mutex);
308 /* Checks if the tunnel status has changed and returns a boolean.
309 * Updates the tunnel status if it has changed. */
311 tunnel_check_status_change__(struct netdev_vport *netdev)
312 OVS_REQUIRES(netdev->mutex)
314 char iface[IFNAMSIZ];
/* Route lookup for the tunnel's remote IP determines the egress iface;
 * if found, open it as a "system" netdev only to sample carrier state. */
320 route = netdev->tnl_cfg.ip_dst;
321 if (ovs_router_lookup(route, iface, &gw)) {
322 struct netdev *egress_netdev;
324 if (!netdev_open(iface, "system", &egress_netdev)) {
325 status = netdev_get_carrier(egress_netdev);
326 netdev_close(egress_netdev);
/* Record and report a change when either the egress iface name or the
 * carrier status differs from what was cached. */
330 if (strcmp(netdev->egress_iface, iface)
331 || netdev->carrier_status != status) {
332 ovs_strlcpy(netdev->egress_iface, iface, IFNAMSIZ);
333 netdev->carrier_status = status;
/* Publishes the cached egress iface and carrier state into 'smap' for
 * display (e.g. in the database Interface status column). */
342 tunnel_get_status(const struct netdev *netdev_, struct smap *smap)
344 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
346 if (netdev->egress_iface[0]) {
347 smap_add(smap, "tunnel_egress_iface", netdev->egress_iface);
349 smap_add(smap, "tunnel_egress_iface_carrier",
350 netdev->carrier_status ? "up" : "down");
/* Vports are always UP|PROMISC; attempting to clear either flag is
 * rejected (the error return is in lines missing from this extract). */
357 netdev_vport_update_flags(struct netdev *netdev OVS_UNUSED,
358 enum netdev_flags off,
359 enum netdev_flags on OVS_UNUSED,
360 enum netdev_flags *old_flagsp)
362 if (off & (NETDEV_UP | NETDEV_PROMISC)) {
366 *old_flagsp = NETDEV_UP | NETDEV_PROMISC;
/* Periodic hook: when the route table's change sequence advances, refresh
 * tunnel egress status for all vports. */
371 netdev_vport_run(void)
376 seq = route_table_get_change_seq();
377 if (rt_change_seqno != seq) {
378 rt_change_seqno = seq;
379 netdev_vport_route_changed();
/* Wait hook: wake immediately if a route-table change is pending. */
384 netdev_vport_wait(void)
389 seq = route_table_get_change_seq();
390 if (rt_change_seqno != seq) {
391 poll_immediate_wake();
395 /* Code specific to tunnel types. */
/* Parses a tunnel key option 'name' (falling back to the generic "key"
 * option), setting *present/*flow flags.  Returns the key in network byte
 * order; "flow" means the key comes from the flow at runtime.  Several
 * lines (flag assignments, early returns) are missing from this extract. */
398 parse_key(const struct smap *args, const char *name,
399 bool *present, bool *flow)
406 s = smap_get(args, name);
408 s = smap_get(args, "key");
416 if (!strcmp(s, "flow")) {
420 return htonll(strtoull(s, NULL, 0));
/* Applies tunnel configuration from database options 'args' to 'dev_'.
 * Builds a complete netdev_tunnel_config locally, validating each option,
 * then swaps it in under the vport mutex if anything changed.  Many error
 * paths and closing braces fall in lines missing from this extract. */
425 set_tunnel_config(struct netdev *dev_, const struct smap *args)
427 struct netdev_vport *dev = netdev_vport_cast(dev_);
428 const char *name = netdev_get_name(dev_);
429 const char *type = netdev_get_type(dev_);
430 bool ipsec_mech_set, needs_dst_port, has_csum;
431 struct netdev_tunnel_config tnl_cfg;
432 struct smap_node *node;
/* Checksum offload option only applies to gre/geneve/vxlan variants
 * (strstr also matches e.g. "ipsec_gre"). */
434 has_csum = strstr(type, "gre") || strstr(type, "geneve") ||
435 strstr(type, "vxlan");
436 ipsec_mech_set = false;
437 memset(&tnl_cfg, 0, sizeof tnl_cfg);
439 /* Add a default destination port for tunnel ports if none specified. */
440 if (!strcmp(type, "geneve")) {
441 tnl_cfg.dst_port = htons(GENEVE_DST_PORT);
444 if (!strcmp(type, "vxlan")) {
445 tnl_cfg.dst_port = htons(VXLAN_DST_PORT);
448 if (!strcmp(type, "lisp")) {
449 tnl_cfg.dst_port = htons(LISP_DST_PORT);
452 needs_dst_port = netdev_vport_needs_dst_port(dev_);
453 tnl_cfg.ipsec = strstr(type, "ipsec");
/* DF bit set by default; can be disabled via "df_default=false" below. */
454 tnl_cfg.dont_fragment = true;
456 SMAP_FOR_EACH (node, args) {
457 if (!strcmp(node->key, "remote_ip")) {
458 struct in_addr in_addr;
459 if (!strcmp(node->value, "flow")) {
460 tnl_cfg.ip_dst_flow = true;
461 tnl_cfg.ip_dst = htonl(0);
462 } else if (lookup_ip(node->value, &in_addr)) {
463 VLOG_WARN("%s: bad %s 'remote_ip'", name, type);
464 } else if (ip_is_multicast(in_addr.s_addr)) {
465 VLOG_WARN("%s: multicast remote_ip="IP_FMT" not allowed",
466 name, IP_ARGS(in_addr.s_addr));
469 tnl_cfg.ip_dst = in_addr.s_addr;
471 } else if (!strcmp(node->key, "local_ip")) {
472 struct in_addr in_addr;
473 if (!strcmp(node->value, "flow")) {
474 tnl_cfg.ip_src_flow = true;
475 tnl_cfg.ip_src = htonl(0);
476 } else if (lookup_ip(node->value, &in_addr)) {
477 VLOG_WARN("%s: bad %s 'local_ip'", name, type);
479 tnl_cfg.ip_src = in_addr.s_addr;
481 } else if (!strcmp(node->key, "tos")) {
482 if (!strcmp(node->value, "inherit")) {
483 tnl_cfg.tos_inherit = true;
/* Numeric TOS must fit the DSCP mask (ECN bits not allowed here). */
487 tos = strtol(node->value, &endptr, 0);
488 if (*endptr == '\0' && tos == (tos & IP_DSCP_MASK)) {
491 VLOG_WARN("%s: invalid TOS %s", name, node->value);
494 } else if (!strcmp(node->key, "ttl")) {
495 if (!strcmp(node->value, "inherit")) {
496 tnl_cfg.ttl_inherit = true;
498 tnl_cfg.ttl = atoi(node->value);
500 } else if (!strcmp(node->key, "dst_port") && needs_dst_port) {
501 tnl_cfg.dst_port = htons(atoi(node->value));
502 } else if (!strcmp(node->key, "csum") && has_csum) {
503 if (!strcmp(node->value, "true")) {
506 } else if (!strcmp(node->key, "df_default")) {
507 if (!strcmp(node->value, "false")) {
508 tnl_cfg.dont_fragment = false;
510 } else if (!strcmp(node->key, "peer_cert") && tnl_cfg.ipsec) {
511 if (smap_get(args, "certificate")) {
512 ipsec_mech_set = true;
514 const char *use_ssl_cert;
516 /* If the "use_ssl_cert" is true, then "certificate" and
517 * "private_key" will be pulled from the SSL table. The
518 * use of this option is strongly discouraged, since it
519 * will like be removed when multiple SSL configurations
520 * are supported by OVS.
522 use_ssl_cert = smap_get(args, "use_ssl_cert");
523 if (!use_ssl_cert || strcmp(use_ssl_cert, "true")) {
524 VLOG_ERR("%s: 'peer_cert' requires 'certificate' argument",
528 ipsec_mech_set = true;
530 } else if (!strcmp(node->key, "psk") && tnl_cfg.ipsec) {
531 ipsec_mech_set = true;
532 } else if (tnl_cfg.ipsec
533 && (!strcmp(node->key, "certificate")
534 || !strcmp(node->key, "private_key")
535 || !strcmp(node->key, "use_ssl_cert"))) {
536 /* Ignore options not used by the netdev. */
537 } else if (!strcmp(node->key, "key") ||
538 !strcmp(node->key, "in_key") ||
539 !strcmp(node->key, "out_key")) {
540 /* Handled separately below. */
541 } else if (!strcmp(node->key, "exts")) {
/* Comma-separated extension list; only the VXLAN "gbp" extension is
 * recognized today. */
542 char *str = xstrdup(node->value);
543 char *ext, *save_ptr = NULL;
547 ext = strtok_r(str, ",", &save_ptr);
549 if (!strcmp(type, "vxlan") && !strcmp(ext, "gbp")) {
550 tnl_cfg.exts |= (1 << OVS_VXLAN_EXT_GBP);
552 VLOG_WARN("%s: unknown extension '%s'", name, ext);
555 ext = strtok_r(NULL, ",", &save_ptr);
560 VLOG_WARN("%s: unknown %s argument '%s'", name, type, node->key);
/* IPsec sanity checks: the ovs-monitor-ipsec daemon must be running
 * (its pid file is probed once, cached under a local mutex). */
565 static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER;
566 static pid_t pid = 0;
569 ovs_mutex_lock(&mutex);
571 char *file_name = xasprintf("%s/%s", ovs_rundir(),
572 "ovs-monitor-ipsec.pid");
573 pid = read_pidfile(file_name);
576 ovs_mutex_unlock(&mutex);
580 VLOG_ERR("%s: IPsec requires the ovs-monitor-ipsec daemon",
585 if (smap_get(args, "peer_cert") && smap_get(args, "psk")) {
586 VLOG_ERR("%s: cannot define both 'peer_cert' and 'psk'", name);
590 if (!ipsec_mech_set) {
591 VLOG_ERR("%s: IPsec requires an 'peer_cert' or psk' argument",
/* A non-flow tunnel must have a concrete remote IP. */
597 if (!tnl_cfg.ip_dst && !tnl_cfg.ip_dst_flow) {
598 VLOG_ERR("%s: %s type requires valid 'remote_ip' argument",
602 if (tnl_cfg.ip_src_flow && !tnl_cfg.ip_dst_flow) {
603 VLOG_ERR("%s: %s type requires 'remote_ip=flow' with 'local_ip=flow'",
608 tnl_cfg.ttl = DEFAULT_TTL;
611 tnl_cfg.in_key = parse_key(args, "in_key",
612 &tnl_cfg.in_key_present,
613 &tnl_cfg.in_key_flow);
615 tnl_cfg.out_key = parse_key(args, "out_key",
616 &tnl_cfg.out_key_present,
617 &tnl_cfg.out_key_flow);
/* Commit the new config atomically; only signal change if it differs. */
619 ovs_mutex_lock(&dev->mutex);
620 if (memcmp(&dev->tnl_cfg, &tnl_cfg, sizeof tnl_cfg)) {
621 dev->tnl_cfg = tnl_cfg;
622 tunnel_check_status_change__(dev);
623 netdev_change_seq_changed(dev_);
625 ovs_mutex_unlock(&dev->mutex);
/* Serializes the current tunnel configuration back into database-style
 * options in 'args'.  Snapshots tnl_cfg under the mutex, then formats
 * without holding the lock.  Defaults (e.g. standard dst ports, TTL 64)
 * are omitted so the output round-trips with set_tunnel_config(). */
631 get_tunnel_config(const struct netdev *dev, struct smap *args)
633 struct netdev_vport *netdev = netdev_vport_cast(dev);
634 struct netdev_tunnel_config tnl_cfg;
636 ovs_mutex_lock(&netdev->mutex);
637 tnl_cfg = netdev->tnl_cfg;
638 ovs_mutex_unlock(&netdev->mutex);
640 if (tnl_cfg.ip_dst) {
641 smap_add_format(args, "remote_ip", IP_FMT, IP_ARGS(tnl_cfg.ip_dst));
642 } else if (tnl_cfg.ip_dst_flow) {
643 smap_add(args, "remote_ip", "flow");
646 if (tnl_cfg.ip_src) {
647 smap_add_format(args, "local_ip", IP_FMT, IP_ARGS(tnl_cfg.ip_src));
648 } else if (tnl_cfg.ip_src_flow) {
649 smap_add(args, "local_ip", "flow");
/* Collapse in_key/out_key into a single "key" when they agree. */
652 if (tnl_cfg.in_key_flow && tnl_cfg.out_key_flow) {
653 smap_add(args, "key", "flow");
654 } else if (tnl_cfg.in_key_present && tnl_cfg.out_key_present
655 && tnl_cfg.in_key == tnl_cfg.out_key) {
656 smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg.in_key));
658 if (tnl_cfg.in_key_flow) {
659 smap_add(args, "in_key", "flow");
660 } else if (tnl_cfg.in_key_present) {
661 smap_add_format(args, "in_key", "%"PRIu64,
662 ntohll(tnl_cfg.in_key));
665 if (tnl_cfg.out_key_flow) {
666 smap_add(args, "out_key", "flow");
667 } else if (tnl_cfg.out_key_present) {
668 smap_add_format(args, "out_key", "%"PRIu64,
669 ntohll(tnl_cfg.out_key));
673 if (tnl_cfg.ttl_inherit) {
674 smap_add(args, "ttl", "inherit");
675 } else if (tnl_cfg.ttl != DEFAULT_TTL) {
676 smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg.ttl);
679 if (tnl_cfg.tos_inherit) {
680 smap_add(args, "tos", "inherit");
681 } else if (tnl_cfg.tos) {
682 smap_add_format(args, "tos", "0x%x", tnl_cfg.tos);
/* Only emit dst_port when it differs from the type's well-known port. */
685 if (tnl_cfg.dst_port) {
686 uint16_t dst_port = ntohs(tnl_cfg.dst_port);
687 const char *type = netdev_get_type(dev);
689 if ((!strcmp("geneve", type) && dst_port != GENEVE_DST_PORT) ||
690 (!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) ||
691 (!strcmp("lisp", type) && dst_port != LISP_DST_PORT)) {
692 smap_add_format(args, "dst_port", "%d", dst_port);
697 smap_add(args, "csum", "true");
700 if (!tnl_cfg.dont_fragment) {
701 smap_add(args, "df_default", "false");
707 /* Code specific to patch ports. */
709 /* If 'netdev' is a patch port, returns the name of its peer as a malloc()'d
710 * string that the caller must free.
712 * If 'netdev' is not a patch port, returns NULL. */
714 netdev_vport_patch_peer(const struct netdev *netdev_)
718 if (netdev_vport_is_patch(netdev_)) {
719 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
721 ovs_mutex_lock(&netdev->mutex);
/* NOTE(review): a NULL-peer guard presumably sits in the lines missing
 * here; xstrdup(NULL) would otherwise crash -- confirm against upstream. */
723 peer = xstrdup(netdev->peer);
725 ovs_mutex_unlock(&netdev->mutex);
/* Accumulates datapath flow stats into the vport's rx counters.
 * No-op for non-vport netdevs. */
732 netdev_vport_inc_rx(const struct netdev *netdev,
733 const struct dpif_flow_stats *stats)
735 if (is_vport_class(netdev_get_class(netdev))) {
736 struct netdev_vport *dev = netdev_vport_cast(netdev);
738 ovs_mutex_lock(&dev->mutex);
739 dev->stats.rx_packets += stats->n_packets;
740 dev->stats.rx_bytes += stats->n_bytes;
741 ovs_mutex_unlock(&dev->mutex);
/* Same as above, but for the tx direction. */
746 netdev_vport_inc_tx(const struct netdev *netdev,
747 const struct dpif_flow_stats *stats)
749 if (is_vport_class(netdev_get_class(netdev))) {
750 struct netdev_vport *dev = netdev_vport_cast(netdev);
752 ovs_mutex_lock(&dev->mutex);
753 dev->stats.tx_packets += stats->n_packets;
754 dev->stats.tx_bytes += stats->n_bytes;
755 ovs_mutex_unlock(&dev->mutex);
/* Reports the patch port's "peer" option into 'args' (a non-NULL guard on
 * dev->peer presumably sits in the missing lines). */
760 get_patch_config(const struct netdev *dev_, struct smap *args)
762 struct netdev_vport *dev = netdev_vport_cast(dev_);
764 ovs_mutex_lock(&dev->mutex);
766 smap_add(args, "peer", dev->peer);
768 ovs_mutex_unlock(&dev->mutex);
/* Validates and applies patch-port configuration: exactly one "peer"
 * option, which must not name the port itself.  Error returns fall in
 * lines missing from this extract. */
774 set_patch_config(struct netdev *dev_, const struct smap *args)
776 struct netdev_vport *dev = netdev_vport_cast(dev_);
777 const char *name = netdev_get_name(dev_);
780 peer = smap_get(args, "peer");
782 VLOG_ERR("%s: patch type requires valid 'peer' argument", name);
786 if (smap_count(args) > 1) {
787 VLOG_ERR("%s: patch type takes only a 'peer' argument", name);
791 if (!strcmp(name, peer)) {
792 VLOG_ERR("%s: patch peer must not be self", name);
/* Replace the stored peer only when it actually changed (the free of the
 * old string presumably sits in the missing line 798). */
796 ovs_mutex_lock(&dev->mutex);
797 if (!dev->peer || strcmp(dev->peer, peer)) {
799 dev->peer = xstrdup(peer);
800 netdev_change_seq_changed(dev_);
802 ovs_mutex_unlock(&dev->mutex);
/* Copies the vport's accumulated stats out under the mutex (the actual
 * copy statement falls in a missing line). */
808 get_stats(const struct netdev *netdev, struct netdev_stats *stats)
810 struct netdev_vport *dev = netdev_vport_cast(netdev);
812 ovs_mutex_lock(&dev->mutex);
814 ovs_mutex_unlock(&dev->mutex);
820 /* Tunnel push pop ops. */
/* Returns a pointer just past the Ethernet header (i.e. the IP header). */
822 static struct ip_header *
825 return (void *)((char *)eth + sizeof (struct eth_header));
/* Returns a pointer just past the (option-less) IPv4 header. */
828 static struct gre_base_hdr *
829 gre_hdr(struct ip_header *ip)
831 return (void *)((char *)ip + sizeof (struct ip_header));
/* Extracts outer-IP tunnel metadata (src/dst/tos/ttl) from 'packet' into
 * 'tnl' and (per upstream convention) returns the L4 header pointer;
 * validation lines are missing from this extract. */
835 ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl)
837 struct ip_header *nh;
840 nh = dp_packet_l3(packet);
841 l4 = dp_packet_l4(packet);
847 tnl->ip_src = get_16aligned_be32(&nh->ip_src);
848 tnl->ip_dst = get_16aligned_be32(&nh->ip_dst);
849 tnl->ip_tos = nh->ip_tos;
850 tnl->ip_ttl = nh->ip_ttl;
855 /* Pushes the 'size' bytes of 'header' into the headroom of 'packet',
856 * reallocating the packet if necessary. 'header' should contain an Ethernet
857 * header, followed by an IPv4 header (without options), and an L4 header.
859 * This function sets the IP header's ip_tot_len field (which should be zeroed
860 * as part of 'header') and puts its value into '*ip_tot_size' as well. Also
861 * updates IP header checksum.
863 * Return pointer to the L4 header added to 'packet'. */
865 push_ip_header(struct dp_packet *packet,
866 const void *header, int size, int *ip_tot_size)
868 struct eth_header *eth;
869 struct ip_header *ip;
871 eth = dp_packet_push_uninit(packet, size);
872 *ip_tot_size = dp_packet_size(packet) - sizeof (struct eth_header);
874 memcpy(eth, header, size);
876 ip->ip_tot_len = htons(*ip_tot_size);
/* Incremental checksum update: old ip_tot_len in the template was zero. */
879 ip->ip_csum = recalc_csum16(ip->ip_csum, 0, ip->ip_tot_len);
/* Extracts outer UDP src/dst ports into 'tnl' on top of the IP metadata;
 * returns a pointer past the UDP header (checks are in missing lines). */
885 udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl)
887 struct udp_header *udp;
889 udp = ip_extract_tnl_md(packet, tnl);
894 tnl->tp_src = udp->udp_src;
895 tnl->tp_dst = udp->udp_dst;
/* Maps the packet's datapath hash into [tnl_udp_port_min, tnl_udp_port_max)
 * to pick a UDP source port, preserving flow affinity. */
901 get_src_port(struct dp_packet *packet)
905 hash = dp_packet_get_dp_hash(packet);
907 return htons((((uint64_t) hash * (tnl_udp_port_max - tnl_udp_port_min)) >> 32) +
/* Pushes an Eth+IP+UDP template and fills in the dynamic UDP fields
 * (source port from the packet hash, length from the pushed size). */
912 push_udp_header(struct dp_packet *packet, const void *header, int size)
914 struct udp_header *udp;
917 udp = push_ip_header(packet, header, size, &ip_tot_size);
919 /* set udp src port */
920 udp->udp_src = get_src_port(packet);
921 udp->udp_len = htons(ip_tot_size - sizeof (struct ip_header));
922 /* udp_csum is zero */
/* Fills the IP-proto and UDP-dst-port fields of a UDP tunnel header
 * template in 'data'; returns a pointer past the UDP header for the
 * caller to append its tunnel-specific header. */
928 udp_build_header(struct netdev_tunnel_config *tnl_cfg,
929 struct ovs_action_push_tnl *data)
931 struct ip_header *ip;
932 struct udp_header *udp;
934 ip = ip_hdr(data->header);
935 ip->ip_proto = IPPROTO_UDP;
937 udp = (struct udp_header *) (ip + 1);
938 udp->udp_dst = tnl_cfg->dst_port;
/* Computes the total outer-header length for a GRE packet: Eth + IP +
 * 4-byte GRE base, plus 4 bytes per optional CSUM/KEY/SEQ field. */
944 gre_header_len(ovs_be16 flags)
946 int hlen = sizeof(struct eth_header) +
947 sizeof(struct ip_header) + 4;
949 if (flags & htons(GRE_CSUM)) {
952 if (flags & htons(GRE_KEY)) {
955 if (flags & htons(GRE_SEQ)) {
/* Parses a received GRE header into 'tnl': validates flags, requires TEB
 * (transparent Ethernet bridging) payload, verifies the optional checksum,
 * extracts the key.  Returns total header length (error returns fall in
 * missing lines). */
962 parse_gre_header(struct dp_packet *packet,
963 struct flow_tnl *tnl)
965 const struct gre_base_hdr *greh;
966 ovs_16aligned_be32 *options;
969 greh = ip_extract_tnl_md(packet, tnl);
/* Only CSUM, KEY and SEQ flags are understood; reject anything else. */
974 if (greh->flags & ~(htons(GRE_CSUM | GRE_KEY | GRE_SEQ))) {
978 if (greh->protocol != htons(ETH_TYPE_TEB)) {
982 hlen = gre_header_len(greh->flags);
983 if (hlen > dp_packet_size(packet)) {
987 options = (ovs_16aligned_be32 *)(greh + 1);
988 if (greh->flags & htons(GRE_CSUM)) {
/* Checksum covers the GRE header through the end of the packet. */
991 pkt_csum = csum(greh, dp_packet_size(packet) -
992 ((const unsigned char *)greh -
993 (const unsigned char *)dp_packet_l2(packet)));
997 tnl->flags = FLOW_TNL_F_CSUM;
/* GRE key is 32 bits; stored in the upper half of the 64-bit tun_id. */
1001 if (greh->flags & htons(GRE_KEY)) {
1002 tnl->tun_id = (OVS_FORCE ovs_be64) ((OVS_FORCE uint64_t)(get_16aligned_be32(options)) << 32);
1003 tnl->flags |= FLOW_TNL_F_KEY;
1007 if (greh->flags & htons(GRE_SEQ)) {
/* Clears any stale tunnel metadata on a parse failure path. */
1015 reset_tnl_md(struct pkt_metadata *md)
1017 memset(&md->tunnel, 0, sizeof(md->tunnel));
/* Pops the GRE encapsulation from 'packet', filling in tunnel metadata
 * and advancing the packet past the outer headers. */
1021 gre_extract_md(struct dp_packet *packet)
1023 struct pkt_metadata *md = &packet->md;
1024 struct flow_tnl *tnl = &md->tunnel;
/* Minimum possible GRE encapsulation: Eth + IP + 4-byte GRE base. */
1025 int hlen = sizeof(struct eth_header) +
1026 sizeof(struct ip_header) + 4;
1028 memset(md, 0, sizeof *md);
1029 if (hlen > dp_packet_size(packet)) {
1033 hlen = parse_gre_header(packet, tnl);
1038 dp_packet_reset_packet(packet, hlen);
/* netdev-provider pop_header hook: decapsulate a batch of GRE packets. */
1042 netdev_gre_pop_header(struct netdev *netdev_ OVS_UNUSED,
1043 struct dp_packet **pkt, int cnt)
1047 for (i = 0; i < cnt; i++) {
1048 gre_extract_md(pkt[i]);
/* Pushes a prebuilt GRE header template onto one packet and, if the
 * template requests it, recomputes the GRE checksum over the new payload. */
1054 netdev_gre_push_header__(struct dp_packet *packet,
1055 const void *header, int size)
1057 struct gre_base_hdr *greh;
1060 greh = push_ip_header(packet, header, size, &ip_tot_size);
1062 if (greh->flags & htons(GRE_CSUM)) {
1063 ovs_be16 *csum_opt = (ovs_be16 *) (greh + 1);
1064 *csum_opt = csum(greh, ip_tot_size - sizeof (struct ip_header));
/* netdev-provider push_header hook: encapsulate a batch and retarget each
 * packet's metadata at the tunnel's output port. */
1069 netdev_gre_push_header(const struct netdev *netdev OVS_UNUSED,
1070 struct dp_packet **packets, int cnt,
1071 const struct ovs_action_push_tnl *data)
1075 for (i = 0; i < cnt; i++) {
1076 netdev_gre_push_header__(packets[i], data->header, data->header_len);
1077 packets[i]->md = PKT_METADATA_INITIALIZER(u32_to_odp(data->out_port));
/* Builds the static GRE header template for 'data' from the device config
 * and flow: IP proto, TEB ethertype, optional CSUM and KEY fields. */
1084 netdev_gre_build_header(const struct netdev *netdev,
1085 struct ovs_action_push_tnl *data,
1086 const struct flow *tnl_flow)
1088 struct netdev_vport *dev = netdev_vport_cast(netdev);
1089 struct netdev_tunnel_config *tnl_cfg;
1090 struct ip_header *ip;
1091 struct gre_base_hdr *greh;
1092 ovs_16aligned_be32 *options;
1095 /* XXX: RCUfy tnl_cfg. */
1096 ovs_mutex_lock(&dev->mutex);
1097 tnl_cfg = &dev->tnl_cfg;
1099 ip = ip_hdr(data->header);
1100 ip->ip_proto = IPPROTO_GRE;
1103 greh->protocol = htons(ETH_TYPE_TEB);
/* 'options' walks forward one 32-bit word per optional field written. */
1106 options = (ovs_16aligned_be32 *) (greh + 1);
1107 if (tnl_flow->tunnel.flags & FLOW_TNL_F_CSUM) {
1108 greh->flags |= htons(GRE_CSUM);
1109 put_16aligned_be32(options, 0);
/* GRE key: upper 32 bits of the 64-bit tunnel id. */
1113 if (tnl_cfg->out_key_present) {
1114 greh->flags |= htons(GRE_KEY);
1115 put_16aligned_be32(options, (OVS_FORCE ovs_be32)
1116 ((OVS_FORCE uint64_t) tnl_flow->tunnel.tun_id >> 32));
1120 ovs_mutex_unlock(&dev->mutex);
1122 hlen = (uint8_t *) options - (uint8_t *) greh;
1124 data->header_len = sizeof(struct eth_header) +
1125 sizeof(struct ip_header) + hlen;
1126 data->tnl_type = OVS_VPORT_TYPE_GRE;
/* Pops the VXLAN encapsulation from 'packet': validates length and the
 * mandatory flags/reserved bits, then stores the 24-bit VNI as tun_id. */
1131 vxlan_extract_md(struct dp_packet *packet)
1133 struct pkt_metadata *md = &packet->md;
1134 struct flow_tnl *tnl = &md->tunnel;
1135 struct vxlanhdr *vxh;
1137 memset(md, 0, sizeof *md);
1138 if (VXLAN_HLEN > dp_packet_size(packet)) {
1142 vxh = udp_extract_tnl_md(packet, tnl);
/* RFC 7348: flags must equal VXLAN_FLAGS and the low 8 reserved bits of
 * the VNI word must be zero. */
1147 if (get_16aligned_be32(&vxh->vx_flags) != htonl(VXLAN_FLAGS) ||
1148 (get_16aligned_be32(&vxh->vx_vni) & htonl(0xff))) {
1149 VLOG_WARN_RL(&err_rl, "invalid vxlan flags=%#x vni=%#x\n",
1150 ntohl(get_16aligned_be32(&vxh->vx_flags)),
1151 ntohl(get_16aligned_be32(&vxh->vx_vni)));
/* VNI occupies the top 24 bits of the vni word; shift it down. */
1155 tnl->tun_id = htonll(ntohl(get_16aligned_be32(&vxh->vx_vni)) >> 8);
1156 tnl->flags |= FLOW_TNL_F_KEY;
1158 dp_packet_reset_packet(packet, VXLAN_HLEN);
/* netdev-provider pop_header hook for VXLAN batches. */
1162 netdev_vxlan_pop_header(struct netdev *netdev_ OVS_UNUSED,
1163 struct dp_packet **pkt, int cnt)
1167 for (i = 0; i < cnt; i++) {
1168 vxlan_extract_md(pkt[i]);
/* Builds the static VXLAN header template: UDP outer header plus VXLAN
 * flags word and the VNI from the flow's tunnel id. */
1174 netdev_vxlan_build_header(const struct netdev *netdev,
1175 struct ovs_action_push_tnl *data,
1176 const struct flow *tnl_flow)
1178 struct netdev_vport *dev = netdev_vport_cast(netdev);
1179 struct netdev_tunnel_config *tnl_cfg;
1180 struct vxlanhdr *vxh;
1182 /* XXX: RCUfy tnl_cfg. */
1183 ovs_mutex_lock(&dev->mutex);
1184 tnl_cfg = &dev->tnl_cfg;
1186 vxh = udp_build_header(tnl_cfg, data);
1188 put_16aligned_be32(&vxh->vx_flags, htonl(VXLAN_FLAGS));
1189 put_16aligned_be32(&vxh->vx_vni, htonl(ntohll(tnl_flow->tunnel.tun_id) << 8));
1191 ovs_mutex_unlock(&dev->mutex);
1192 data->header_len = VXLAN_HLEN;
1193 data->tnl_type = OVS_VPORT_TYPE_VXLAN;
/* netdev-provider push_header hook: encapsulate a batch of packets and
 * point their metadata at the tunnel's output port. */
1198 netdev_vxlan_push_header(const struct netdev *netdev OVS_UNUSED,
1199 struct dp_packet **packets, int cnt,
1200 const struct ovs_action_push_tnl *data)
1204 for (i = 0; i < cnt; i++) {
1205 push_udp_header(packets[i], data->header, VXLAN_HLEN);
1206 packets[i]->md = PKT_METADATA_INITIALIZER(u32_to_odp(data->out_port));
/* Pops the Geneve encapsulation from 'packet': validates base and full
 * (options-inclusive) lengths, version, critical options, and ethertype;
 * stores the 24-bit VNI as tun_id. */
1212 geneve_extract_md(struct dp_packet *packet)
1214 struct pkt_metadata *md = &packet->md;
1215 struct flow_tnl *tnl = &md->tunnel;
1216 struct genevehdr *gnh;
1219 memset(md, 0, sizeof *md);
1220 if (GENEVE_BASE_HLEN > dp_packet_size(packet)) {
1221 VLOG_WARN_RL(&err_rl, "geneve packet too small: min header=%u packet size=%u\n",
1222 (unsigned int)GENEVE_BASE_HLEN, dp_packet_size(packet));
1226 gnh = udp_extract_tnl_md(packet, tnl);
/* opt_len counts options in 4-byte units. */
1231 hlen = GENEVE_BASE_HLEN + gnh->opt_len * 4;
1232 if (hlen > dp_packet_size(packet)) {
1233 VLOG_WARN_RL(&err_rl, "geneve packet too small: header len=%u packet size=%u\n",
1234 hlen, dp_packet_size(packet));
1239 if (gnh->ver != 0) {
1240 VLOG_WARN_RL(&err_rl, "unknown geneve version: %"PRIu8"\n", gnh->ver);
/* Critical options must be understood; since none are, drop such packets. */
1245 if (gnh->opt_len && gnh->critical) {
1246 VLOG_WARN_RL(&err_rl, "unknown geneve critical options: %"PRIu8" bytes\n",
1252 if (gnh->proto_type != htons(ETH_TYPE_TEB)) {
1253 VLOG_WARN_RL(&err_rl, "unknown geneve encapsulated protocol: %#x\n",
1254 ntohs(gnh->proto_type));
1259 tnl->flags |= gnh->oam ? FLOW_TNL_F_OAM : 0;
1260 tnl->tun_id = htonll(ntohl(get_16aligned_be32(&gnh->vni)) >> 8);
1261 tnl->flags |= FLOW_TNL_F_KEY;
1263 dp_packet_reset_packet(packet, hlen);
/* netdev-provider pop_header hook for Geneve batches. */
1267 netdev_geneve_pop_header(struct netdev *netdev_ OVS_UNUSED,
1268 struct dp_packet **pkt, int cnt)
1272 for (i = 0; i < cnt; i++) {
1273 geneve_extract_md(pkt[i]);
/* Builds the static Geneve header template (no options emitted): OAM bit,
 * TEB ethertype and the VNI from the flow's tunnel id. */
1279 netdev_geneve_build_header(const struct netdev *netdev,
1280 struct ovs_action_push_tnl *data,
1281 const struct flow *tnl_flow)
1283 struct netdev_vport *dev = netdev_vport_cast(netdev);
1284 struct netdev_tunnel_config *tnl_cfg;
1285 struct genevehdr *gnh;
1287 /* XXX: RCUfy tnl_cfg. */
1288 ovs_mutex_lock(&dev->mutex);
1289 tnl_cfg = &dev->tnl_cfg;
1291 gnh = udp_build_header(tnl_cfg, data);
1293 gnh->oam = !!(tnl_flow->tunnel.flags & FLOW_TNL_F_OAM);
1294 gnh->proto_type = htons(ETH_TYPE_TEB);
1295 put_16aligned_be32(&gnh->vni, htonl(ntohll(tnl_flow->tunnel.tun_id) << 8));
1297 ovs_mutex_unlock(&dev->mutex);
1298 data->header_len = GENEVE_BASE_HLEN;
1299 data->tnl_type = OVS_VPORT_TYPE_GENEVE;
/* netdev-provider push_header hook for Geneve batches. */
1304 netdev_geneve_push_header(const struct netdev *netdev OVS_UNUSED,
1305 struct dp_packet **packets, int cnt,
1306 const struct ovs_action_push_tnl *data)
1310 for (i = 0; i < cnt; i++) {
1311 push_udp_header(packets[i], data->header, data->header_len);
1312 packets[i]->md = PKT_METADATA_INITIALIZER(u32_to_odp(data->out_port));
/* unixctl handler for "tnl/egress_port_range": with no arguments reports
 * the current UDP source-port range, with two arguments validates and sets
 * it (the argc dispatch lines are missing from this extract). */
1318 netdev_vport_range(struct unixctl_conn *conn, int argc,
1319 const char *argv[], void *aux OVS_UNUSED)
1324 struct ds ds = DS_EMPTY_INITIALIZER;
1326 ds_put_format(&ds, "Tunnel UDP source port range: %"PRIu16"-%"PRIu16"\n",
1327 tnl_udp_port_min, tnl_udp_port_max);
1329 unixctl_command_reply(conn, ds_cstr(&ds));
1338 val1 = atoi(argv[1]);
1339 if (val1 <= 0 || val1 > UINT16_MAX) {
1340 unixctl_command_reply(conn, "Invalid min.");
1343 val2 = atoi(argv[2]);
1344 if (val2 <= 0 || val2 > UINT16_MAX) {
1345 unixctl_command_reply(conn, "Invalid max.");
/* The two assignment pairs below appear to be the branches of an if/else
 * (condition in a missing line) that swaps min/max when given reversed
 * -- TODO confirm against upstream. */
1350 tnl_udp_port_min = val2;
1351 tnl_udp_port_max = val1;
1353 tnl_udp_port_min = val1;
1354 tnl_udp_port_max = val2;
/* Notify datapaths that tunnel configuration changed. */
1356 seq_change(tnl_conf_seq);
1358 unixctl_command_reply(conn, "OK");
/* Expands to a netdev_class initializer shared by all vport types; the
 * parameters plug in the per-type config/status/header hooks.  (No comment
 * lines are inserted inside the macro: its backslash continuations must
 * stay contiguous.  Several original lines are missing from this extract.) */
1362 #define VPORT_FUNCTIONS(GET_CONFIG, SET_CONFIG, \
1363 GET_TUNNEL_CONFIG, GET_STATUS, \
1365 PUSH_HEADER, POP_HEADER) \
1368 netdev_vport_wait, \
1370 netdev_vport_alloc, \
1371 netdev_vport_construct, \
1372 netdev_vport_destruct, \
1373 netdev_vport_dealloc, \
1376 GET_TUNNEL_CONFIG, \
1380 NULL, /* get_numa_id */ \
1381 NULL, /* set_multiq */ \
1384 NULL, /* send_wait */ \
1386 netdev_vport_set_etheraddr, \
1387 netdev_vport_get_etheraddr, \
1388 NULL, /* get_mtu */ \
1389 NULL, /* set_mtu */ \
1390 NULL, /* get_ifindex */ \
1391 NULL, /* get_carrier */ \
1392 NULL, /* get_carrier_resets */ \
1393 NULL, /* get_miimon */ \
1396 NULL, /* get_features */ \
1397 NULL, /* set_advertisements */ \
1399 NULL, /* set_policing */ \
1400 NULL, /* get_qos_types */ \
1401 NULL, /* get_qos_capabilities */ \
1402 NULL, /* get_qos */ \
1403 NULL, /* set_qos */ \
1404 NULL, /* get_queue */ \
1405 NULL, /* set_queue */ \
1406 NULL, /* delete_queue */ \
1407 NULL, /* get_queue_stats */ \
1408 NULL, /* queue_dump_start */ \
1409 NULL, /* queue_dump_next */ \
1410 NULL, /* queue_dump_done */ \
1411 NULL, /* dump_queue_stats */ \
1413 NULL, /* get_in4 */ \
1414 NULL, /* set_in4 */ \
1415 NULL, /* get_in6 */ \
1416 NULL, /* add_router */ \
1417 NULL, /* get_next_hop */ \
1419 NULL, /* arp_lookup */ \
1421 netdev_vport_update_flags, \
1423 NULL, /* rx_alloc */ \
1424 NULL, /* rx_construct */ \
1425 NULL, /* rx_destruct */ \
1426 NULL, /* rx_dealloc */ \
1427 NULL, /* rx_recv */ \
1428 NULL, /* rx_wait */ \
1429 NULL, /* rx_drain */
/* Expands to a vport_class initializer for one tunnel type, wiring the
 * standard tunnel config/status hooks plus per-type header ops. */
1432 #define TUNNEL_CLASS(NAME, DPIF_PORT, BUILD_HEADER, PUSH_HEADER, POP_HEADER) \
1434 { NAME, VPORT_FUNCTIONS(get_tunnel_config, \
1435 set_tunnel_config, \
1436 get_netdev_tunnel_config, \
1437 tunnel_get_status, \
1438 BUILD_HEADER, PUSH_HEADER, POP_HEADER) }}
/* Registers every tunnel netdev class with the netdev layer, exactly once
 * (guarded by ovsthread_once), and installs the egress-port-range unixctl
 * command.  Types without native userspace header ops pass NULL hooks. */
1441 netdev_vport_tunnel_register(void)
1443 /* The name of the dpif_port should be short enough to accommodate adding
1444 * a port number to the end if one is necessary. */
1445 static const struct vport_class vport_classes[] = {
1446 TUNNEL_CLASS("geneve", "genev_sys", netdev_geneve_build_header,
1447 netdev_geneve_push_header,
1448 netdev_geneve_pop_header),
1449 TUNNEL_CLASS("gre", "gre_sys", netdev_gre_build_header,
1450 netdev_gre_push_header,
1451 netdev_gre_pop_header),
1452 TUNNEL_CLASS("ipsec_gre", "gre_sys", NULL, NULL, NULL),
1453 TUNNEL_CLASS("gre64", "gre64_sys", NULL, NULL, NULL),
1454 TUNNEL_CLASS("ipsec_gre64", "gre64_sys", NULL, NULL, NULL),
1455 TUNNEL_CLASS("vxlan", "vxlan_sys", netdev_vxlan_build_header,
1456 netdev_vxlan_push_header,
1457 netdev_vxlan_pop_header),
1458 TUNNEL_CLASS("lisp", "lisp_sys", NULL, NULL, NULL)
1460 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1462 if (ovsthread_once_start(&once)) {
1465 for (i = 0; i < ARRAY_SIZE(vport_classes); i++) {
1466 netdev_register_provider(&vport_classes[i].netdev_class);
1469 unixctl_command_register("tnl/egress_port_range", "min max", 0, 2,
1470 netdev_vport_range, NULL);
1472 ovsthread_once_done(&once);
1477 netdev_vport_patch_register(void)
1479 static const struct vport_class patch_class =
1481 { "patch", VPORT_FUNCTIONS(get_patch_config,
1484 NULL, NULL, NULL, NULL) }};
1485 netdev_register_provider(&patch_class.netdev_class);