2 * Copyright (c) 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-vport.h"
23 #include <sys/socket.h>
25 #include <netinet/in.h>
26 #include <netinet/ip6.h>
27 #include <sys/ioctl.h>
29 #include "byte-order.h"
34 #include "dp-packet.h"
35 #include "dynamic-string.h"
40 #include "netdev-provider.h"
41 #include "odp-netlink.h"
42 #include "dp-packet.h"
43 #include "ovs-router.h"
45 #include "poll-loop.h"
46 #include "route-table.h"
48 #include "socket-util.h"
49 #include "openvswitch/vlog.h"
50 #include "unaligned.h"
54 VLOG_DEFINE_THIS_MODULE(netdev_vport);
55 static struct vlog_rate_limit err_rl = VLOG_RATE_LIMIT_INIT(60, 5);
57 #define GENEVE_DST_PORT 6081
58 #define VXLAN_DST_PORT 4789
59 #define LISP_DST_PORT 4341
60 #define STT_DST_PORT 7471
62 #define VXLAN_HLEN (sizeof(struct udp_header) + \
63 sizeof(struct vxlanhdr))
65 #define GENEVE_BASE_HLEN (sizeof(struct udp_header) + \
66 sizeof(struct genevehdr))
68 #define DEFAULT_TTL 64
73 /* Protects all members below. */
74 struct ovs_mutex mutex;
76 struct eth_addr etheraddr;
77 struct netdev_stats stats;
80 struct netdev_tunnel_config tnl_cfg;
81 char egress_iface[IFNAMSIZ];
89 const char *dpif_port;
90 struct netdev_class netdev_class;
93 /* Last read of the route-table's change number. */
94 static uint64_t rt_change_seqno;
96 static int netdev_vport_construct(struct netdev *);
97 static int get_patch_config(const struct netdev *netdev, struct smap *args);
98 static int get_tunnel_config(const struct netdev *, struct smap *args);
99 static bool tunnel_check_status_change__(struct netdev_vport *);
101 static uint16_t tnl_udp_port_min = 32768;
102 static uint16_t tnl_udp_port_max = 61000;
/* Reports whether 'class' is a vport class.  A class is recognized as a vport
 * class when its 'construct' callback is netdev_vport_construct(). */
105 is_vport_class(const struct netdev_class *class)
107 return class->construct == netdev_vport_construct;
/* Wrapper around is_vport_class(): returns its verdict on 'class'
 * unchanged. */
111 netdev_vport_is_vport_class(const struct netdev_class *class)
113 return is_vport_class(class);
/* Converts 'class' to the 'struct vport_class' that embeds it as its
 * 'netdev_class' member.  Asserts that 'class' really is a vport class, so
 * callers must check with is_vport_class() first if they are not sure. */
116 static const struct vport_class *
117 vport_class_cast(const struct netdev_class *class)
119 ovs_assert(is_vport_class(class));
120 return CONTAINER_OF(class, struct vport_class, netdev_class);
/* Converts 'netdev' to the 'struct netdev_vport' that embeds it as its 'up'
 * member.  Asserts that the class of 'netdev' is a vport class. */
123 static struct netdev_vport *
124 netdev_vport_cast(const struct netdev *netdev)
126 ovs_assert(is_vport_class(netdev_get_class(netdev)));
127 return CONTAINER_OF(netdev, struct netdev_vport, up);
/* Returns a pointer to the tunnel configuration stored inside the vport that
 * wraps 'netdev'.  The pointer refers to the vport's own 'tnl_cfg' member (no
 * copy is made and the vport mutex is not taken here). */
130 static const struct netdev_tunnel_config *
131 get_netdev_tunnel_config(const struct netdev *netdev)
133 return &netdev_vport_cast(netdev)->tnl_cfg;
/* Reports whether 'netdev' is a patch port.  Patch ports are recognized by
 * their class using get_patch_config() as its 'get_config' callback. */
137 netdev_vport_is_patch(const struct netdev *netdev)
139 const struct netdev_class *class = netdev_get_class(netdev);
141 return class->get_config == get_patch_config;
/* Reports whether 'dev' is a layer-3 port.  Only the "lisp" type qualifies;
 * the decision is made purely on the netdev type string. */
145 netdev_vport_is_layer3(const struct netdev *dev)
147 const char *type = netdev_get_type(dev);
149 return (!strcmp("lisp", type));
/* Reports whether 'dev' is a tunnel whose configuration includes a transport
 * destination port.  This is true only for tunnel classes (those whose
 * 'get_config' callback is get_tunnel_config()) of type "geneve", "vxlan",
 * "lisp" or "stt". */
153 netdev_vport_needs_dst_port(const struct netdev *dev)
155 const struct netdev_class *class = netdev_get_class(dev);
156 const char *type = netdev_get_type(dev);
158 return (class->get_config == get_tunnel_config &&
159 (!strcmp("geneve", type) || !strcmp("vxlan", type) ||
160 !strcmp("lisp", type) || !strcmp("stt", type)) );
/* Returns the 'dpif_port' name recorded in the vport class that embeds
 * 'class', or NULL if 'class' is not a vport class. */
164 netdev_vport_class_get_dpif_port(const struct netdev_class *class)
166 return is_vport_class(class) ? vport_class_cast(class)->dpif_port : NULL;
/* Returns the name under which 'netdev' is known to the datapath.
 *
 * 'dpif_port' is the class-level name obtained from
 * netdev_vport_class_get_dpif_port().  For tunnels that carry a destination
 * port (see netdev_vport_needs_dst_port()) the result is formatted into
 * 'namebuf' (of 'bufsize' bytes) as "<dpif_port>_<dst_port>", with the port
 * converted to host byte order for printing.
 *
 * One path returns netdev_get_name(netdev) instead.
 * NOTE(review): presumably taken when the class has no dpif_port -- confirm. */
170 netdev_vport_get_dpif_port(const struct netdev *netdev,
171 char namebuf[], size_t bufsize)
173 const struct netdev_class *class = netdev_get_class(netdev);
174 const char *dpif_port = netdev_vport_class_get_dpif_port(class);
177 return netdev_get_name(netdev);
180 if (netdev_vport_needs_dst_port(netdev)) {
181 const struct netdev_vport *vport = netdev_vport_cast(netdev);
184 * Note: IFNAMSIZ is 16 bytes long. Implementations should choose
185 * a dpif port name that is short enough to fit including any
186 * port numbers but assert just in case.
188 BUILD_ASSERT(NETDEV_VPORT_NAME_BUFSIZE >= IFNAMSIZ);
189 ovs_assert(strlen(dpif_port) + 6 < IFNAMSIZ);
190 snprintf(namebuf, bufsize, "%s_%d", dpif_port,
191 ntohs(vport->tnl_cfg.dst_port));
/* Like netdev_vport_get_dpif_port(), but returns a copy of the name made with
 * xstrdup().  A local buffer of NETDEV_VPORT_NAME_BUFSIZE bytes is used as
 * scratch space for the formatted name.
 * NOTE(review): the caller presumably owns and must free the result. */
199 netdev_vport_get_dpif_port_strdup(const struct netdev *netdev)
201 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
203 return xstrdup(netdev_vport_get_dpif_port(netdev, namebuf,
207 /* Whenever the route-table change number is incremented,
208 * netdev_vport_route_changed() should be called to update
209 * the corresponding tunnel interface status. */
211 netdev_vport_route_changed(void)
213 struct netdev **vports;
216 vports = netdev_get_vports(&n_vports);
217 for (i = 0; i < n_vports; i++) {
218 struct netdev *netdev_ = vports[i];
219 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
/* The status check and any resulting update run under the vport's mutex. */
221 ovs_mutex_lock(&netdev->mutex);
222 /* Finds all tunnel vports. */
/* A vport with a remote address configured is treated as a tunnel.  Its
 * change sequence is bumped only when the egress status really changed. */
223 if (ipv6_addr_is_set(&netdev->tnl_cfg.ipv6_dst)) {
224 if (tunnel_check_status_change__(netdev)) {
225 netdev_change_seq_changed(netdev_);
228 ovs_mutex_unlock(&netdev->mutex);
/* NOTE(review): presumably drops the reference that netdev_get_vports() took
 * on each returned netdev -- confirm. */
230 netdev_close(netdev_);
/* Allocates a zero-filled 'struct netdev_vport' with xzalloc(). */
236 static struct netdev *
237 netdev_vport_alloc(void)
239 struct netdev_vport *netdev = xzalloc(sizeof *netdev);
/* Initializes the vport behind 'netdev_': sets up its mutex, fills in
 * 'etheraddr' with eth_addr_random(), and seeds the tunnel configuration with
 * defaults (type-specific UDP/TCP destination port for geneve, vxlan, lisp and
 * stt; 'dont_fragment' enabled; TTL set to DEFAULT_TTL). */
244 netdev_vport_construct(struct netdev *netdev_)
246 struct netdev_vport *dev = netdev_vport_cast(netdev_);
247 const char *type = netdev_get_type(netdev_);
249 ovs_mutex_init(&dev->mutex);
250 eth_addr_random(&dev->etheraddr);
252 /* Add a default destination port for tunnel ports if none specified. */
253 if (!strcmp(type, "geneve")) {
254 dev->tnl_cfg.dst_port = htons(GENEVE_DST_PORT);
255 } else if (!strcmp(type, "vxlan")) {
256 dev->tnl_cfg.dst_port = htons(VXLAN_DST_PORT);
257 } else if (!strcmp(type, "lisp")) {
258 dev->tnl_cfg.dst_port = htons(LISP_DST_PORT);
259 } else if (!strcmp(type, "stt")) {
260 dev->tnl_cfg.dst_port = htons(STT_DST_PORT);
263 dev->tnl_cfg.dont_fragment = true;
264 dev->tnl_cfg.ttl = DEFAULT_TTL;
269 netdev_vport_destruct(struct netdev *netdev_)
271 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
274 ovs_mutex_destroy(&netdev->mutex);
278 netdev_vport_dealloc(struct netdev *netdev_)
280 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
/* Stores 'mac' as the Ethernet address of the vport behind 'netdev_'.  The
 * store happens under the vport mutex; the change sequence is bumped after the
 * mutex has been released. */
285 netdev_vport_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
287 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
289 ovs_mutex_lock(&netdev->mutex);
290 netdev->etheraddr = mac;
291 ovs_mutex_unlock(&netdev->mutex);
292 netdev_change_seq_changed(netdev_);
/* Copies the Ethernet address of the vport behind 'netdev_' into '*mac',
 * reading it under the vport mutex. */
298 netdev_vport_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
300 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
302 ovs_mutex_lock(&netdev->mutex);
303 *mac = netdev->etheraddr;
304 ovs_mutex_unlock(&netdev->mutex);
309 /* Checks if the tunnel status has changed and returns a boolean.
310 * Updates the tunnel status if it has changed. */
/* The caller must hold 'netdev->mutex' (see the OVS_REQUIRES annotation).
 * "Status" here means the cached egress interface name and its carrier
 * state. */
312 tunnel_check_status_change__(struct netdev_vport *netdev)
313 OVS_REQUIRES(netdev->mutex)
315 char iface[IFNAMSIZ];
317 struct in6_addr *route;
/* Route lookup is keyed on the tunnel's configured remote address. */
321 route = &netdev->tnl_cfg.ipv6_dst;
322 if (ovs_router_lookup(route, iface, &gw)) {
323 struct netdev *egress_netdev;
/* The egress interface reported by the router is opened as a "system" netdev
 * just long enough to sample its carrier. */
325 if (!netdev_open(iface, "system", &egress_netdev)) {
326 status = netdev_get_carrier(egress_netdev);
327 netdev_close(egress_netdev);
/* A change is recorded when either the interface name or the carrier state
 * differs from the cached copy. */
331 if (strcmp(netdev->egress_iface, iface)
332 || netdev->carrier_status != status) {
333 ovs_strlcpy(netdev->egress_iface, iface, IFNAMSIZ);
334 netdev->carrier_status = status;
/* Adds the cached tunnel egress status of 'netdev_' to 'smap':
 * "tunnel_egress_iface" (guarded by the egress interface name being non-empty)
 * and "tunnel_egress_iface_carrier", reported as "up" or "down" according to
 * the cached carrier status. */
343 tunnel_get_status(const struct netdev *netdev_, struct smap *smap)
345 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
347 if (netdev->egress_iface[0]) {
348 smap_add(smap, "tunnel_egress_iface", netdev->egress_iface);
350 smap_add(smap, "tunnel_egress_iface_carrier",
351 netdev->carrier_status ? "up" : "down");
/* Flag update callback for vports.  'netdev' and 'on' are unused.  A request
 * to turn off NETDEV_UP or NETDEV_PROMISC is singled out by the test below
 * (NOTE(review): presumably rejected with an error -- confirm).  The previous
 * flags reported through '*old_flagsp' are always NETDEV_UP |
 * NETDEV_PROMISC. */
358 netdev_vport_update_flags(struct netdev *netdev OVS_UNUSED,
359 enum netdev_flags off,
360 enum netdev_flags on OVS_UNUSED,
361 enum netdev_flags *old_flagsp)
363 if (off & (NETDEV_UP | NETDEV_PROMISC)) {
367 *old_flagsp = NETDEV_UP | NETDEV_PROMISC;
/* Periodic work for vports: when the route table's change sequence number
 * differs from the cached 'rt_change_seqno', updates the cache and calls
 * netdev_vport_route_changed() to refresh tunnel status. */
372 netdev_vport_run(void)
377 seq = route_table_get_change_seq();
378 if (rt_change_seqno != seq) {
379 rt_change_seqno = seq;
380 netdev_vport_route_changed();
/* Companion to netdev_vport_run(): if the route table's change sequence number
 * no longer matches the cached 'rt_change_seqno', requests an immediate
 * poll-loop wakeup.  The cached value is not modified here. */
385 netdev_vport_wait(void)
390 seq = route_table_get_change_seq();
391 if (rt_change_seqno != seq) {
392 poll_immediate_wake();
396 /* Code specific to tunnel types. */
/* Parses the tunnel key option 'name' (e.g. "in_key" or "out_key") from
 * 'args'.  The generic "key" option is also consulted (NOTE(review):
 * presumably as the fallback when 'name' is absent -- confirm).
 *
 * The literal value "flow" is special-cased; any other value is parsed with
 * strtoull() in base 0 (so decimal, 0x-prefixed hex and 0-prefixed octal are
 * accepted) and returned in network byte order via htonll().  'present' and
 * 'flow' are output flags for the caller. */
399 parse_key(const struct smap *args, const char *name,
400 bool *present, bool *flow)
407 s = smap_get(args, name);
409 s = smap_get(args, "key");
417 if (!strcmp(s, "flow")) {
421 return htonll(strtoull(s, NULL, 0));
/* Parses the tunnel endpoint address in 'value'.
 *
 * The literal "flow" is special-cased (see the 'flow' output).  Otherwise the
 * address is resolved with lookup_ipv6() when addr_is_ipv6() says it is IPv6,
 * or with lookup_ip() when it is not.  An IPv4 result is stored in '*ipv6' as
 * an IPv4-mapped address.  '*protocol' is set to ETH_TYPE_IPV6 or ETH_TYPE_IP
 * accordingly.  Unless 'accept_mcast' is true, a multicast address is singled
 * out (NOTE(review): presumably rejected with an error -- confirm). */
426 parse_tunnel_ip(const char *value, bool accept_mcast, bool *flow,
427 struct in6_addr *ipv6, uint16_t *protocol)
429 if (!strcmp(value, "flow")) {
434 if (addr_is_ipv6(value)) {
435 if (lookup_ipv6(value, ipv6)) {
438 if (!accept_mcast && ipv6_addr_is_multicast(ipv6)) {
441 *protocol = ETH_TYPE_IPV6;
444 if (lookup_ip(value, &ip)) {
447 if (!accept_mcast && ip_is_multicast(ip.s_addr)) {
450 in6_addr_set_mapped_ipv4(ipv6, ip.s_addr);
451 *protocol = ETH_TYPE_IP;
457 set_tunnel_config(struct netdev *dev_, const struct smap *args)
459 struct netdev_vport *dev = netdev_vport_cast(dev_);
460 const char *name = netdev_get_name(dev_);
461 const char *type = netdev_get_type(dev_);
462 bool ipsec_mech_set, needs_dst_port, has_csum;
463 uint16_t dst_proto = 0, src_proto = 0;
464 struct netdev_tunnel_config tnl_cfg;
465 struct smap_node *node;
467 has_csum = strstr(type, "gre") || strstr(type, "geneve") ||
468 strstr(type, "stt") || strstr(type, "vxlan");
469 ipsec_mech_set = false;
470 memset(&tnl_cfg, 0, sizeof tnl_cfg);
472 /* Add a default destination port for tunnel ports if none specified. */
473 if (!strcmp(type, "geneve")) {
474 tnl_cfg.dst_port = htons(GENEVE_DST_PORT);
477 if (!strcmp(type, "vxlan")) {
478 tnl_cfg.dst_port = htons(VXLAN_DST_PORT);
481 if (!strcmp(type, "lisp")) {
482 tnl_cfg.dst_port = htons(LISP_DST_PORT);
485 if (!strcmp(type, "stt")) {
486 tnl_cfg.dst_port = htons(STT_DST_PORT);
489 needs_dst_port = netdev_vport_needs_dst_port(dev_);
490 tnl_cfg.ipsec = strstr(type, "ipsec");
491 tnl_cfg.dont_fragment = true;
493 SMAP_FOR_EACH (node, args) {
494 if (!strcmp(node->key, "remote_ip")) {
496 err = parse_tunnel_ip(node->value, false, &tnl_cfg.ip_dst_flow,
497 &tnl_cfg.ipv6_dst, &dst_proto);
500 VLOG_WARN("%s: bad %s 'remote_ip'", name, type);
503 VLOG_WARN("%s: multicast remote_ip=%s not allowed",
507 if (dst_proto == ETH_TYPE_IPV6) {
508 VLOG_WARN("%s: IPv6 'remote_ip' is not supported", name);
511 } else if (!strcmp(node->key, "local_ip")) {
513 err = parse_tunnel_ip(node->value, true, &tnl_cfg.ip_src_flow,
514 &tnl_cfg.ipv6_src, &src_proto);
517 VLOG_WARN("%s: bad %s 'local_ip'", name, type);
520 if (src_proto == ETH_TYPE_IPV6) {
521 VLOG_WARN("%s: IPv6 'local_ip' is not supported", name);
524 } else if (!strcmp(node->key, "tos")) {
525 if (!strcmp(node->value, "inherit")) {
526 tnl_cfg.tos_inherit = true;
530 tos = strtol(node->value, &endptr, 0);
531 if (*endptr == '\0' && tos == (tos & IP_DSCP_MASK)) {
534 VLOG_WARN("%s: invalid TOS %s", name, node->value);
537 } else if (!strcmp(node->key, "ttl")) {
538 if (!strcmp(node->value, "inherit")) {
539 tnl_cfg.ttl_inherit = true;
541 tnl_cfg.ttl = atoi(node->value);
543 } else if (!strcmp(node->key, "dst_port") && needs_dst_port) {
544 tnl_cfg.dst_port = htons(atoi(node->value));
545 } else if (!strcmp(node->key, "csum") && has_csum) {
546 if (!strcmp(node->value, "true")) {
549 } else if (!strcmp(node->key, "df_default")) {
550 if (!strcmp(node->value, "false")) {
551 tnl_cfg.dont_fragment = false;
553 } else if (!strcmp(node->key, "peer_cert") && tnl_cfg.ipsec) {
554 if (smap_get(args, "certificate")) {
555 ipsec_mech_set = true;
557 const char *use_ssl_cert;
559 /* If the "use_ssl_cert" is true, then "certificate" and
560 * "private_key" will be pulled from the SSL table. The
561 * use of this option is strongly discouraged, since it
562 * will likely be removed when multiple SSL configurations
563 * are supported by OVS.
565 use_ssl_cert = smap_get(args, "use_ssl_cert");
566 if (!use_ssl_cert || strcmp(use_ssl_cert, "true")) {
567 VLOG_ERR("%s: 'peer_cert' requires 'certificate' argument",
571 ipsec_mech_set = true;
573 } else if (!strcmp(node->key, "psk") && tnl_cfg.ipsec) {
574 ipsec_mech_set = true;
575 } else if (tnl_cfg.ipsec
576 && (!strcmp(node->key, "certificate")
577 || !strcmp(node->key, "private_key")
578 || !strcmp(node->key, "use_ssl_cert"))) {
579 /* Ignore options not used by the netdev. */
580 } else if (!strcmp(node->key, "key") ||
581 !strcmp(node->key, "in_key") ||
582 !strcmp(node->key, "out_key")) {
583 /* Handled separately below. */
584 } else if (!strcmp(node->key, "exts")) {
585 char *str = xstrdup(node->value);
586 char *ext, *save_ptr = NULL;
590 ext = strtok_r(str, ",", &save_ptr);
592 if (!strcmp(type, "vxlan") && !strcmp(ext, "gbp")) {
593 tnl_cfg.exts |= (1 << OVS_VXLAN_EXT_GBP);
595 VLOG_WARN("%s: unknown extension '%s'", name, ext);
598 ext = strtok_r(NULL, ",", &save_ptr);
603 VLOG_WARN("%s: unknown %s argument '%s'", name, type, node->key);
608 static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER;
609 static pid_t pid = 0;
612 ovs_mutex_lock(&mutex);
614 char *file_name = xasprintf("%s/%s", ovs_rundir(),
615 "ovs-monitor-ipsec.pid");
616 pid = read_pidfile(file_name);
619 ovs_mutex_unlock(&mutex);
623 VLOG_ERR("%s: IPsec requires the ovs-monitor-ipsec daemon",
628 if (smap_get(args, "peer_cert") && smap_get(args, "psk")) {
629 VLOG_ERR("%s: cannot define both 'peer_cert' and 'psk'", name);
633 if (!ipsec_mech_set) {
634 VLOG_ERR("%s: IPsec requires an 'peer_cert' or psk' argument",
640 if (!ipv6_addr_is_set(&tnl_cfg.ipv6_dst) && !tnl_cfg.ip_dst_flow) {
641 VLOG_ERR("%s: %s type requires valid 'remote_ip' argument",
645 if (tnl_cfg.ip_src_flow && !tnl_cfg.ip_dst_flow) {
646 VLOG_ERR("%s: %s type requires 'remote_ip=flow' with 'local_ip=flow'",
650 if (src_proto && dst_proto && src_proto != dst_proto) {
651 VLOG_ERR("%s: 'remote_ip' and 'local_ip' has to be of the same address family",
656 tnl_cfg.ttl = DEFAULT_TTL;
659 tnl_cfg.in_key = parse_key(args, "in_key",
660 &tnl_cfg.in_key_present,
661 &tnl_cfg.in_key_flow);
663 tnl_cfg.out_key = parse_key(args, "out_key",
664 &tnl_cfg.out_key_present,
665 &tnl_cfg.out_key_flow);
667 ovs_mutex_lock(&dev->mutex);
668 if (memcmp(&dev->tnl_cfg, &tnl_cfg, sizeof tnl_cfg)) {
669 dev->tnl_cfg = tnl_cfg;
670 tunnel_check_status_change__(dev);
671 netdev_change_seq_changed(dev_);
673 ovs_mutex_unlock(&dev->mutex);
/* Writes the tunnel configuration of 'dev' into 'args' as key/value strings,
 * the inverse of what set_tunnel_config() parses.  Most options are emitted
 * only when they differ from their default (e.g. "ttl" only when it is not
 * DEFAULT_TTL, "dst_port" only when it is not the type's well-known port,
 * "df_default" only when fragmentation is allowed). */
679 get_tunnel_config(const struct netdev *dev, struct smap *args)
681 struct netdev_vport *netdev = netdev_vport_cast(dev);
682 struct netdev_tunnel_config tnl_cfg;
/* Work on a private snapshot so that the mutex is held only for the copy. */
684 ovs_mutex_lock(&netdev->mutex);
685 tnl_cfg = netdev->tnl_cfg;
686 ovs_mutex_unlock(&netdev->mutex);
/* Endpoint addresses: a concrete address wins over the "flow" marker. */
688 if (ipv6_addr_is_set(&tnl_cfg.ipv6_dst)) {
689 smap_add_ipv6(args, "remote_ip", &tnl_cfg.ipv6_dst);
690 } else if (tnl_cfg.ip_dst_flow) {
691 smap_add(args, "remote_ip", "flow");
694 if (ipv6_addr_is_set(&tnl_cfg.ipv6_src)) {
695 smap_add_ipv6(args, "local_ip", &tnl_cfg.ipv6_src);
696 } else if (tnl_cfg.ip_src_flow) {
697 smap_add(args, "local_ip", "flow");
/* Keys: collapse to the single "key" option when both directions agree (both
 * flow-based, or both present with the same value); the separate "in_key" and
 * "out_key" options are emitted below. */
700 if (tnl_cfg.in_key_flow && tnl_cfg.out_key_flow) {
701 smap_add(args, "key", "flow");
702 } else if (tnl_cfg.in_key_present && tnl_cfg.out_key_present
703 && tnl_cfg.in_key == tnl_cfg.out_key) {
704 smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg.in_key));
706 if (tnl_cfg.in_key_flow) {
707 smap_add(args, "in_key", "flow");
708 } else if (tnl_cfg.in_key_present) {
709 smap_add_format(args, "in_key", "%"PRIu64,
710 ntohll(tnl_cfg.in_key));
713 if (tnl_cfg.out_key_flow) {
714 smap_add(args, "out_key", "flow");
715 } else if (tnl_cfg.out_key_present) {
716 smap_add_format(args, "out_key", "%"PRIu64,
717 ntohll(tnl_cfg.out_key));
721 if (tnl_cfg.ttl_inherit) {
722 smap_add(args, "ttl", "inherit");
723 } else if (tnl_cfg.ttl != DEFAULT_TTL) {
724 smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg.ttl);
727 if (tnl_cfg.tos_inherit) {
728 smap_add(args, "tos", "inherit");
729 } else if (tnl_cfg.tos) {
730 smap_add_format(args, "tos", "0x%x", tnl_cfg.tos);
/* The destination port is reported only for the types that have a well-known
 * default, and only when the configured port differs from that default. */
733 if (tnl_cfg.dst_port) {
734 uint16_t dst_port = ntohs(tnl_cfg.dst_port);
735 const char *type = netdev_get_type(dev);
737 if ((!strcmp("geneve", type) && dst_port != GENEVE_DST_PORT) ||
738 (!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) ||
739 (!strcmp("lisp", type) && dst_port != LISP_DST_PORT) ||
740 (!strcmp("stt", type) && dst_port != STT_DST_PORT)) {
741 smap_add_format(args, "dst_port", "%d", dst_port);
746 smap_add(args, "csum", "true");
749 if (!tnl_cfg.dont_fragment) {
750 smap_add(args, "df_default", "false");
756 /* Code specific to patch ports. */
758 /* If 'netdev' is a patch port, returns the name of its peer as a malloc()'d
759 * string that the caller must free.
761 * If 'netdev' is not a patch port, returns NULL. */
763 netdev_vport_patch_peer(const struct netdev *netdev_)
767 if (netdev_vport_is_patch(netdev_)) {
768 struct netdev_vport *netdev = netdev_vport_cast(netdev_);
/* The peer name is duplicated while holding the vport mutex, so the returned
 * copy is independent of later changes made by set_patch_config(). */
770 ovs_mutex_lock(&netdev->mutex);
772 peer = xstrdup(netdev->peer);
774 ovs_mutex_unlock(&netdev->mutex);
/* Adds the packet and byte counts in 'stats' to the receive counters of
 * 'netdev', under the vport mutex.  Does nothing to the counters if 'netdev'
 * is not a vport. */
781 netdev_vport_inc_rx(const struct netdev *netdev,
782 const struct dpif_flow_stats *stats)
784 if (is_vport_class(netdev_get_class(netdev))) {
785 struct netdev_vport *dev = netdev_vport_cast(netdev);
787 ovs_mutex_lock(&dev->mutex);
788 dev->stats.rx_packets += stats->n_packets;
789 dev->stats.rx_bytes += stats->n_bytes;
790 ovs_mutex_unlock(&dev->mutex);
/* Adds the packet and byte counts in 'stats' to the transmit counters of
 * 'netdev', under the vport mutex.  Does nothing to the counters if 'netdev'
 * is not a vport. */
795 netdev_vport_inc_tx(const struct netdev *netdev,
796 const struct dpif_flow_stats *stats)
798 if (is_vport_class(netdev_get_class(netdev))) {
799 struct netdev_vport *dev = netdev_vport_cast(netdev);
801 ovs_mutex_lock(&dev->mutex);
802 dev->stats.tx_packets += stats->n_packets;
803 dev->stats.tx_bytes += stats->n_bytes;
804 ovs_mutex_unlock(&dev->mutex);
/* Writes the configuration of patch port 'dev_' into 'args': the single key
 * "peer", holding the configured peer name.  The peer is read under the vport
 * mutex. */
809 get_patch_config(const struct netdev *dev_, struct smap *args)
811 struct netdev_vport *dev = netdev_vport_cast(dev_);
813 ovs_mutex_lock(&dev->mutex);
815 smap_add(args, "peer", dev->peer);
817 ovs_mutex_unlock(&dev->mutex);
/* Applies the configuration in 'args' to patch port 'dev_'.
 *
 * Three problems are logged as errors: a missing "peer" argument, any argument
 * besides "peer", and a peer equal to the port's own name.  When the peer
 * differs from the stored one (or none is stored yet), a copy of the new name
 * is stored under the vport mutex and the change sequence is bumped. */
823 set_patch_config(struct netdev *dev_, const struct smap *args)
825 struct netdev_vport *dev = netdev_vport_cast(dev_);
826 const char *name = netdev_get_name(dev_);
829 peer = smap_get(args, "peer");
831 VLOG_ERR("%s: patch type requires valid 'peer' argument", name);
835 if (smap_count(args) > 1) {
836 VLOG_ERR("%s: patch type takes only a 'peer' argument", name);
840 if (!strcmp(name, peer)) {
841 VLOG_ERR("%s: patch peer must not be self", name);
845 ovs_mutex_lock(&dev->mutex);
846 if (!dev->peer || strcmp(dev->peer, peer)) {
848 dev->peer = xstrdup(peer);
849 netdev_change_seq_changed(dev_);
851 ovs_mutex_unlock(&dev->mutex);
857 get_stats(const struct netdev *netdev, struct netdev_stats *stats)
859 struct netdev_vport *dev = netdev_vport_cast(netdev);
861 ovs_mutex_lock(&dev->mutex);
863 ovs_mutex_unlock(&dev->mutex);
869 /* Tunnel push pop ops. */
871 static struct ip_header *
874 return (void *)((char *)eth + sizeof (struct eth_header));
877 static struct ovs_16aligned_ip6_hdr *
880 return (void *)((char *)eth + sizeof (struct eth_header));
/* Extracts outer IP header fields of 'packet' into 'tnl' and sets '*hlen' to
 * the combined length of the Ethernet header and the IP header.
 *
 * IPv4: the header checksum is verified, the IP total length must not exceed
 * the bytes available from the L3 header onwards, and headers carrying IP
 * options are singled out with a warning.  Source, destination, TOS and TTL
 * are copied into 'tnl'.
 *
 * IPv6: source, destination and hop limit are copied into 'tnl'.
 *
 * Any other IP version is logged as invalid.  Callers in this file use the
 * return value as a pointer to the header that follows the IP header. */
884 ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
888 struct ip_header *ip;
889 struct ovs_16aligned_ip6_hdr *ip6;
893 nh = dp_packet_l3(packet);
896 l4 = dp_packet_l4(packet);
902 *hlen = sizeof(struct eth_header);
/* Number of packet bytes from the start of the L3 header to the end. */
904 l3_size = dp_packet_size(packet) -
905 ((char *)nh - (char *)dp_packet_data(packet));
907 if (IP_VER(ip->ip_ihl_ver) == 4) {
909 ovs_be32 ip_src, ip_dst;
911 if (csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) {
912 VLOG_WARN_RL(&err_rl, "ip packet has invalid checksum");
916 if (ntohs(ip->ip_tot_len) > l3_size) {
917 VLOG_WARN_RL(&err_rl, "ip packet is truncated (IP length %d, actual %d)",
918 ntohs(ip->ip_tot_len), l3_size);
921 if (IP_IHL(ip->ip_ihl_ver) * 4 > sizeof(struct ip_header)) {
922 VLOG_WARN_RL(&err_rl, "ip options not supported on tunnel packets "
923 "(%d bytes)", IP_IHL(ip->ip_ihl_ver) * 4);
927 ip_src = get_16aligned_be32(&ip->ip_src);
928 ip_dst = get_16aligned_be32(&ip->ip_dst);
930 tnl->ip_src = ip_src;
931 tnl->ip_dst = ip_dst;
932 tnl->ip_tos = ip->ip_tos;
933 tnl->ip_ttl = ip->ip_ttl;
935 *hlen += IP_HEADER_LEN;
937 } else if (IP_VER(ip->ip_ihl_ver) == 6) {
939 memcpy(tnl->ipv6_src.s6_addr, ip6->ip6_src.be16, sizeof ip6->ip6_src);
940 memcpy(tnl->ipv6_dst.s6_addr, ip6->ip6_dst.be16, sizeof ip6->ip6_dst);
942 tnl->ip_ttl = ip6->ip6_hlim;
944 *hlen += IPV6_HEADER_LEN;
947 VLOG_WARN_RL(&err_rl, "ipv4 packet has invalid version (%d)",
948 IP_VER(ip->ip_ihl_ver));
/* Reports whether the Ethernet header examined for 'header' carries the IPv6
 * EtherType (ETH_TYPE_IPV6).
 * NOTE(review): 'header' is presumably the start of that Ethernet header --
 * confirm. */
956 is_header_ipv6(const void *header)
958 const struct eth_header *eth;
960 return eth->eth_type == htons(ETH_TYPE_IPV6);
963 /* Pushes the 'size' bytes of 'header' into the headroom of 'packet',
964 * reallocating the packet if necessary. 'header' should contain an Ethernet
965 * header, followed by an IPv4 header (without options), and an L4 header.
967 * This function sets the IP header's ip_tot_len field (which should be zeroed
968 * as part of 'header') and puts its value into '*ip_tot_size' as well. Also
969 * updates IP header checksum.
971 * Return pointer to the L4 header added to 'packet'. */
/* Additional notes on the code below:
 *
 * - An IPv6 outer header is handled as well as IPv4: when 'header' has the
 *   IPv6 EtherType, the payload length field (ip6_plen) is filled in instead
 *   of ip_tot_len, and no IP checksum is touched.
 *
 * - On return '*ip_tot_size' has had the IP header length subtracted
 *   (IP_HEADER_LEN or IPV6_HEADER_LEN), i.e. it is the number of bytes that
 *   follow the IP header, not the IP total length. */
973 push_ip_header(struct dp_packet *packet,
974 const void *header, int size, int *ip_tot_size)
976 struct eth_header *eth;
977 struct ip_header *ip;
978 struct ovs_16aligned_ip6_hdr *ip6;
980 eth = dp_packet_push_uninit(packet, size);
/* Everything after the Ethernet header, including the bytes just pushed. */
981 *ip_tot_size = dp_packet_size(packet) - sizeof (struct eth_header);
983 memcpy(eth, header, size);
985 if (is_header_ipv6(header)) {
987 *ip_tot_size -= IPV6_HEADER_LEN;
988 ip6->ip6_plen = htons(*ip_tot_size);
992 ip->ip_tot_len = htons(*ip_tot_size);
/* Incremental checksum update for ip_tot_len changing from 0 to its final
 * value; this relies on the template header having ip_tot_len zeroed. */
993 ip->ip_csum = recalc_csum16(ip->ip_csum, 0, ip->ip_tot_len);
994 *ip_tot_size -= IP_HEADER_LEN;
/* Extracts outer IP and UDP metadata of 'packet' into 'tnl'.
 *
 * The IP part is delegated to ip_extract_tnl_md(), whose return value is used
 * as the UDP header.  When the UDP checksum field is nonzero, the checksum is
 * recomputed over the IPv4 or IPv6 pseudo-header plus everything from the UDP
 * header to the end of the packet, and FLOW_TNL_F_CSUM is recorded in
 * 'tnl->flags' (NOTE(review): presumably only when the checksum verifies --
 * confirm).  The UDP source and destination ports are copied into 'tnl'. */
1000 udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
1003 struct udp_header *udp;
1005 udp = ip_extract_tnl_md(packet, tnl, hlen);
1010 if (udp->udp_csum) {
1012 if (is_header_ipv6(dp_packet_data(packet))) {
1013 csum = packet_csum_pseudoheader6(dp_packet_l3(packet));
1015 csum = packet_csum_pseudoheader(dp_packet_l3(packet));
1018 csum = csum_continue(csum, udp, dp_packet_size(packet) -
1019 ((const unsigned char *)udp -
1020 (const unsigned char *)dp_packet_l2(packet)));
1021 if (csum_finish(csum)) {
1024 tnl->flags |= FLOW_TNL_F_CSUM;
1027 tnl->tp_src = udp->udp_src;
1028 tnl->tp_dst = udp->udp_dst;
/* Chooses the UDP source port for an encapsulated 'packet' from its RSS hash.
 * The 32-bit hash is scaled by the width of the configured port range
 * (tnl_udp_port_max - tnl_udp_port_min) using a 64-bit multiply and a 32-bit
 * right shift, then an offset is added (NOTE(review): presumably
 * tnl_udp_port_min -- confirm).  The result is in network byte order. */
1034 get_src_port(struct dp_packet *packet)
1038 hash = dp_packet_get_rss_hash(packet);
1040 return htons((((uint64_t) hash * (tnl_udp_port_max - tnl_udp_port_min)) >> 32) +
/* Prepends the prebuilt tunnel header in 'data' to 'packet' and completes the
 * per-packet UDP fields: source port (from get_src_port()), UDP length, and,
 * when the template's checksum field is nonzero, the UDP checksum over the
 * IPv4 or IPv6 pseudo-header and the UDP datagram. */
1045 push_udp_header(struct dp_packet *packet,
1046 const struct ovs_action_push_tnl *data)
1048 struct udp_header *udp;
/* 'ip_tot_size' comes back as the number of bytes following the IP header,
 * which is exactly the UDP length. */
1051 udp = push_ip_header(packet, data->header, data->header_len, &ip_tot_size);
1053 /* set udp src port */
1054 udp->udp_src = get_src_port(packet);
1055 udp->udp_len = htons(ip_tot_size);
/* A nonzero checksum field in the template means "compute a checksum". */
1057 if (udp->udp_csum) {
1059 if (is_header_ipv6(dp_packet_data(packet))) {
1060 csum = packet_csum_pseudoheader6(ipv6_hdr(dp_packet_data(packet)));
1062 csum = packet_csum_pseudoheader(ip_hdr(dp_packet_data(packet)));
1065 csum = csum_continue(csum, udp, ip_tot_size);
1066 udp->udp_csum = csum_finish(csum);
/* A computed checksum of zero is sent as 0xffff, since zero in the field
 * would mean that no checksum is present. */
1068 if (!udp->udp_csum) {
1069 udp->udp_csum = htons(0xffff);
/* Fills in the IP-protocol and UDP parts of the tunnel header template in
 * 'data->header'.  Depending on whether the template is IPv6 or IPv4, the
 * next-header/protocol field is set to IPPROTO_UDP and the UDP header is
 * located directly after the IP header.  '*hlen' is set to the Ethernet plus
 * IP header length.  The UDP destination port comes from 'tnl_cfg'.  The UDP
 * checksum field is pre-marked with 0xffff when the outer header is IPv6 or
 * when the flow requests a checksum (FLOW_TNL_F_CSUM). */
1075 udp_build_header(struct netdev_tunnel_config *tnl_cfg,
1076 const struct flow *tnl_flow,
1077 struct ovs_action_push_tnl *data,
1080 struct ip_header *ip;
1081 struct ovs_16aligned_ip6_hdr *ip6;
1082 struct udp_header *udp;
1085 *hlen = sizeof(struct eth_header);
1087 is_ipv6 = is_header_ipv6(data->header);
1090 ip6 = ipv6_hdr(data->header);
1091 ip6->ip6_nxt = IPPROTO_UDP;
1092 udp = (struct udp_header *) (ip6 + 1);
1093 *hlen += IPV6_HEADER_LEN;
1095 ip = ip_hdr(data->header);
1096 ip->ip_proto = IPPROTO_UDP;
1097 udp = (struct udp_header *) (ip + 1);
1098 *hlen += IP_HEADER_LEN;
1101 udp->udp_dst = tnl_cfg->dst_port;
1103 if (is_ipv6 || tnl_flow->tunnel.flags & FLOW_TNL_F_CSUM) {
1104 /* Write a value in now to mark that we should compute the checksum
1105 * later. 0xffff is handy because it is transparent to the
1107 udp->udp_csum = htons(0xffff);
1114 gre_header_len(ovs_be16 flags)
1118 if (flags & htons(GRE_CSUM)) {
1121 if (flags & htons(GRE_KEY)) {
1124 if (flags & htons(GRE_SEQ)) {
1131 parse_gre_header(struct dp_packet *packet,
1132 struct flow_tnl *tnl)
1134 const struct gre_base_hdr *greh;
1135 ovs_16aligned_be32 *options;
1139 greh = ip_extract_tnl_md(packet, tnl, &ulen);
1144 if (greh->flags & ~(htons(GRE_CSUM | GRE_KEY | GRE_SEQ))) {
1148 if (greh->protocol != htons(ETH_TYPE_TEB)) {
1152 hlen = ulen + gre_header_len(greh->flags);
1153 if (hlen > dp_packet_size(packet)) {
1157 options = (ovs_16aligned_be32 *)(greh + 1);
1158 if (greh->flags & htons(GRE_CSUM)) {
1161 pkt_csum = csum(greh, dp_packet_size(packet) -
1162 ((const unsigned char *)greh -
1163 (const unsigned char *)dp_packet_l2(packet)));
1167 tnl->flags = FLOW_TNL_F_CSUM;
1171 if (greh->flags & htons(GRE_KEY)) {
1172 tnl->tun_id = (OVS_FORCE ovs_be64) ((OVS_FORCE uint64_t)(get_16aligned_be32(options)) << 32);
1173 tnl->flags |= FLOW_TNL_F_KEY;
1177 if (greh->flags & htons(GRE_SEQ)) {
1185 pkt_metadata_init_tnl(struct pkt_metadata *md)
1187 /* Zero up through the tunnel metadata options. The length and table
1188 * are before this and as long as they are empty, the options won't
1190 memset(md, 0, offsetof(struct pkt_metadata, tunnel.metadata.opts));
1194 netdev_gre_pop_header(struct dp_packet *packet)
1196 struct pkt_metadata *md = &packet->md;
1197 struct flow_tnl *tnl = &md->tunnel;
1198 int hlen = sizeof(struct eth_header) + 4;
1200 hlen += is_header_ipv6(dp_packet_data(packet)) ?
1201 IPV6_HEADER_LEN : IP_HEADER_LEN;
1203 pkt_metadata_init_tnl(md);
1204 if (hlen > dp_packet_size(packet)) {
1208 hlen = parse_gre_header(packet, tnl);
1213 dp_packet_reset_packet(packet, hlen);
1219 netdev_gre_push_header(struct dp_packet *packet,
1220 const struct ovs_action_push_tnl *data)
1222 struct gre_base_hdr *greh;
1225 greh = push_ip_header(packet, data->header, data->header_len, &ip_tot_size);
1227 if (greh->flags & htons(GRE_CSUM)) {
1228 ovs_be16 *csum_opt = (ovs_be16 *) (greh + 1);
1229 *csum_opt = csum(greh, ip_tot_size);
1234 netdev_gre_build_header(const struct netdev *netdev,
1235 struct ovs_action_push_tnl *data,
1236 const struct flow *tnl_flow)
1238 struct netdev_vport *dev = netdev_vport_cast(netdev);
1239 struct netdev_tunnel_config *tnl_cfg;
1240 struct ip_header *ip;
1241 struct ovs_16aligned_ip6_hdr *ip6;
1242 struct gre_base_hdr *greh;
1243 ovs_16aligned_be32 *options;
1247 is_ipv6 = is_header_ipv6(data->header);
1249 /* XXX: RCUfy tnl_cfg. */
1250 ovs_mutex_lock(&dev->mutex);
1251 tnl_cfg = &dev->tnl_cfg;
1254 ip6 = ipv6_hdr(data->header);
1255 ip6->ip6_nxt = IPPROTO_GRE;
1256 greh = (struct gre_base_hdr *) (ip6 + 1);
1258 ip = ip_hdr(data->header);
1259 ip->ip_proto = IPPROTO_GRE;
1260 greh = (struct gre_base_hdr *) (ip + 1);
1263 greh->protocol = htons(ETH_TYPE_TEB);
1266 options = (ovs_16aligned_be32 *) (greh + 1);
1267 if (tnl_flow->tunnel.flags & FLOW_TNL_F_CSUM) {
1268 greh->flags |= htons(GRE_CSUM);
1269 put_16aligned_be32(options, 0);
1273 if (tnl_cfg->out_key_present) {
1274 greh->flags |= htons(GRE_KEY);
1275 put_16aligned_be32(options, (OVS_FORCE ovs_be32)
1276 ((OVS_FORCE uint64_t) tnl_flow->tunnel.tun_id >> 32));
1280 ovs_mutex_unlock(&dev->mutex);
1282 hlen = (uint8_t *) options - (uint8_t *) greh;
1284 data->header_len = sizeof(struct eth_header) + hlen +
1285 (is_ipv6 ? IPV6_HEADER_LEN : IP_HEADER_LEN);
1286 data->tnl_type = OVS_VPORT_TYPE_GRE;
/* Decapsulates a VXLAN packet: resets the tunnel metadata, checks that the L4
 * payload is at least VXLAN_HLEN bytes, extracts outer IP/UDP metadata, and
 * validates the VXLAN header (the flags word must equal VXLAN_FLAGS and the low
 * 8 bits of the VNI word must be zero).  The tunnel ID is taken from the upper
 * 24 bits of the VNI word, FLOW_TNL_F_KEY is set, and the outer headers
 * ('hlen' + VXLAN_HLEN bytes) are stripped from 'packet'. */
1291 netdev_vxlan_pop_header(struct dp_packet *packet)
1293 struct pkt_metadata *md = &packet->md;
1294 struct flow_tnl *tnl = &md->tunnel;
1295 struct vxlanhdr *vxh;
1298 pkt_metadata_init_tnl(md);
1299 if (VXLAN_HLEN > dp_packet_l4_size(packet)) {
1303 vxh = udp_extract_tnl_md(packet, tnl, &hlen);
1308 if (get_16aligned_be32(&vxh->vx_flags) != htonl(VXLAN_FLAGS) ||
1309 (get_16aligned_be32(&vxh->vx_vni) & htonl(0xff))) {
1310 VLOG_WARN_RL(&err_rl, "invalid vxlan flags=%#x vni=%#x\n",
1311 ntohl(get_16aligned_be32(&vxh->vx_flags)),
1312 ntohl(get_16aligned_be32(&vxh->vx_vni)));
1315 tnl->tun_id = htonll(ntohl(get_16aligned_be32(&vxh->vx_vni)) >> 8);
1316 tnl->flags |= FLOW_TNL_F_KEY;
1318 dp_packet_reset_packet(packet, hlen + VXLAN_HLEN);
/* Builds the VXLAN encapsulation header template for 'netdev' in 'data'.  The
 * IP/UDP part is produced by udp_build_header() while holding the vport mutex
 * (the tunnel configuration is read in place).  The VXLAN header gets the
 * fixed VXLAN_FLAGS word and the tunnel ID from 'tnl_flow' shifted into the
 * upper 24 bits of the VNI word.  'data->header_len' and 'data->tnl_type' are
 * set accordingly. */
1324 netdev_vxlan_build_header(const struct netdev *netdev,
1325 struct ovs_action_push_tnl *data,
1326 const struct flow *tnl_flow)
1328 struct netdev_vport *dev = netdev_vport_cast(netdev);
1329 struct netdev_tunnel_config *tnl_cfg;
1330 struct vxlanhdr *vxh;
1333 /* XXX: RCUfy tnl_cfg. */
1334 ovs_mutex_lock(&dev->mutex);
1335 tnl_cfg = &dev->tnl_cfg;
1337 vxh = udp_build_header(tnl_cfg, tnl_flow, data, &hlen);
1339 put_16aligned_be32(&vxh->vx_flags, htonl(VXLAN_FLAGS));
1340 put_16aligned_be32(&vxh->vx_vni, htonl(ntohll(tnl_flow->tunnel.tun_id) << 8));
1342 ovs_mutex_unlock(&dev->mutex);
1343 data->header_len = hlen + VXLAN_HLEN;
1344 data->tnl_type = OVS_VPORT_TYPE_VXLAN;
/* Decapsulates a Geneve packet.
 *
 * Resets the tunnel metadata, checks that the L4 payload holds at least the
 * base UDP + Geneve header, extracts outer IP/UDP metadata, and then validates
 * the Geneve header: the full header including options ('opt_len' is counted
 * in 4-byte units) must fit in the packet, the version must be 0 and the
 * encapsulated protocol must be ETH_TYPE_TEB.
 *
 * On success the OAM bit, the tunnel ID (upper 24 bits of the VNI word) and
 * the raw option bytes are recorded in the tunnel metadata, and the outer
 * headers are stripped from 'packet'. */
1349 netdev_geneve_pop_header(struct dp_packet *packet)
1351 struct pkt_metadata *md = &packet->md;
1352 struct flow_tnl *tnl = &md->tunnel;
1353 struct genevehdr *gnh;
1354 unsigned int hlen, opts_len, ulen;
1356 pkt_metadata_init_tnl(md);
1357 if (GENEVE_BASE_HLEN > dp_packet_l4_size(packet)) {
1358 VLOG_WARN_RL(&err_rl, "geneve packet too small: min header=%u packet size=%"PRIuSIZE"\n",
1359 (unsigned int)GENEVE_BASE_HLEN, dp_packet_l4_size(packet));
1363 gnh = udp_extract_tnl_md(packet, tnl, &ulen);
1368 opts_len = gnh->opt_len * 4;
/* Total outer header: Ethernet + IP ('ulen'), UDP + Geneve base, options. */
1369 hlen = ulen + GENEVE_BASE_HLEN + opts_len;
1370 if (hlen > dp_packet_size(packet)) {
1371 VLOG_WARN_RL(&err_rl, "geneve packet too small: header len=%u packet size=%u\n",
1372 hlen, dp_packet_size(packet));
1376 if (gnh->ver != 0) {
1377 VLOG_WARN_RL(&err_rl, "unknown geneve version: %"PRIu8"\n", gnh->ver);
1381 if (gnh->proto_type != htons(ETH_TYPE_TEB)) {
1382 VLOG_WARN_RL(&err_rl, "unknown geneve encapsulated protocol: %#x\n",
1383 ntohs(gnh->proto_type));
1387 tnl->flags |= gnh->oam ? FLOW_TNL_F_OAM : 0;
1388 tnl->tun_id = htonll(ntohl(get_16aligned_be32(&gnh->vni)) >> 8);
1389 tnl->flags |= FLOW_TNL_F_KEY;
/* The options are stored unparsed, in wire format; FLOW_TNL_F_UDPIF is set
 * alongside them. */
1391 memcpy(tnl->metadata.opts.gnv, gnh->options, opts_len);
1392 tnl->metadata.present.len = opts_len;
1393 tnl->flags |= FLOW_TNL_F_UDPIF;
1395 dp_packet_reset_packet(packet, hlen);
/* Builds the Geneve encapsulation header template for 'netdev' in 'data'.  The
 * IP/UDP part is produced by udp_build_header() while holding the vport mutex.
 * The tunnel ID from 'tnl_flow' is shifted into the upper 24 bits of the VNI
 * word.  Options are serialized by tun_metadata_to_geneve_header(), whose
 * byte count is stored in 'opt_len' in 4-byte units.  The OAM and critical
 * bits and the ETH_TYPE_TEB protocol type are then filled in, and
 * 'data->header_len' and 'data->tnl_type' are set accordingly. */
1401 netdev_geneve_build_header(const struct netdev *netdev,
1402 struct ovs_action_push_tnl *data,
1403 const struct flow *tnl_flow)
1405 struct netdev_vport *dev = netdev_vport_cast(netdev);
1406 struct netdev_tunnel_config *tnl_cfg;
1407 struct genevehdr *gnh;
1412 /* XXX: RCUfy tnl_cfg. */
1413 ovs_mutex_lock(&dev->mutex);
1414 tnl_cfg = &dev->tnl_cfg;
1416 gnh = udp_build_header(tnl_cfg, tnl_flow, data, &hlen);
1418 put_16aligned_be32(&gnh->vni, htonl(ntohll(tnl_flow->tunnel.tun_id) << 8));
1420 ovs_mutex_unlock(&dev->mutex);
1422 opt_len = tun_metadata_to_geneve_header(&tnl_flow->tunnel,
1423 gnh->options, &crit_opt);
1425 gnh->opt_len = opt_len / 4;
1426 gnh->oam = !!(tnl_flow->tunnel.flags & FLOW_TNL_F_OAM);
1427 gnh->critical = crit_opt ? 1 : 0;
1428 gnh->proto_type = htons(ETH_TYPE_TEB);
1430 data->header_len = hlen + GENEVE_BASE_HLEN + opt_len;
1431 data->tnl_type = OVS_VPORT_TYPE_GENEVE;
/* unixctl handler for the tunnel UDP source port range (registered as
 * "tnl/egress_port_range").
 *
 * One path replies with the current range.  The other parses argv[1] and
 * argv[2] with atoi(), requires each to be in 1..UINT16_MAX (replying
 * "Invalid min." / "Invalid max." otherwise), stores the pair so that the
 * smaller value becomes the minimum (both assignment orders appear below),
 * signals 'tnl_conf_seq' and replies "OK".
 * NOTE(review): the query path is presumably taken when no arguments are
 * given -- confirm. */
1436 netdev_vport_range(struct unixctl_conn *conn, int argc,
1437 const char *argv[], void *aux OVS_UNUSED)
1442 struct ds ds = DS_EMPTY_INITIALIZER;
1444 ds_put_format(&ds, "Tunnel UDP source port range: %"PRIu16"-%"PRIu16"\n",
1445 tnl_udp_port_min, tnl_udp_port_max);
1447 unixctl_command_reply(conn, ds_cstr(&ds));
1456 val1 = atoi(argv[1]);
1457 if (val1 <= 0 || val1 > UINT16_MAX) {
1458 unixctl_command_reply(conn, "Invalid min.");
1461 val2 = atoi(argv[2]);
1462 if (val2 <= 0 || val2 > UINT16_MAX) {
1463 unixctl_command_reply(conn, "Invalid max.");
1468 tnl_udp_port_min = val2;
1469 tnl_udp_port_max = val1;
1471 tnl_udp_port_min = val1;
1472 tnl_udp_port_max = val2;
1474 seq_change(tnl_conf_seq);
1476 unixctl_command_reply(conn, "OK");
1480 #define VPORT_FUNCTIONS(GET_CONFIG, SET_CONFIG, \
1481 GET_TUNNEL_CONFIG, GET_STATUS, \
1483 PUSH_HEADER, POP_HEADER) \
1486 netdev_vport_wait, \
1488 netdev_vport_alloc, \
1489 netdev_vport_construct, \
1490 netdev_vport_destruct, \
1491 netdev_vport_dealloc, \
1494 GET_TUNNEL_CONFIG, \
1498 NULL, /* get_numa_id */ \
1499 NULL, /* set_multiq */ \
1502 NULL, /* send_wait */ \
1504 netdev_vport_set_etheraddr, \
1505 netdev_vport_get_etheraddr, \
1506 NULL, /* get_mtu */ \
1507 NULL, /* set_mtu */ \
1508 NULL, /* get_ifindex */ \
1509 NULL, /* get_carrier */ \
1510 NULL, /* get_carrier_resets */ \
1511 NULL, /* get_miimon */ \
1514 NULL, /* get_features */ \
1515 NULL, /* set_advertisements */ \
1517 NULL, /* set_policing */ \
1518 NULL, /* get_qos_types */ \
1519 NULL, /* get_qos_capabilities */ \
1520 NULL, /* get_qos */ \
1521 NULL, /* set_qos */ \
1522 NULL, /* get_queue */ \
1523 NULL, /* set_queue */ \
1524 NULL, /* delete_queue */ \
1525 NULL, /* get_queue_stats */ \
1526 NULL, /* queue_dump_start */ \
1527 NULL, /* queue_dump_next */ \
1528 NULL, /* queue_dump_done */ \
1529 NULL, /* dump_queue_stats */ \
1531 NULL, /* get_in4 */ \
1532 NULL, /* set_in4 */ \
1533 NULL, /* get_in6 */ \
1534 NULL, /* add_router */ \
1535 NULL, /* get_next_hop */ \
1537 NULL, /* arp_lookup */ \
1539 netdev_vport_update_flags, \
1541 NULL, /* rx_alloc */ \
1542 NULL, /* rx_construct */ \
1543 NULL, /* rx_destruct */ \
1544 NULL, /* rx_dealloc */ \
1545 NULL, /* rx_recv */ \
1546 NULL, /* rx_wait */ \
1547 NULL, /* rx_drain */
1550 #define TUNNEL_CLASS(NAME, DPIF_PORT, BUILD_HEADER, PUSH_HEADER, POP_HEADER) \
1553 VPORT_FUNCTIONS(get_tunnel_config, \
1554 set_tunnel_config, \
1555 get_netdev_tunnel_config, \
1556 tunnel_get_status, \
1557 BUILD_HEADER, PUSH_HEADER, POP_HEADER) }}
/* Registers the tunnel vport classes (geneve, gre, ipsec_gre, vxlan, lisp,
 * stt) as netdev providers and registers the "tnl/egress_port_range" unixctl
 * command.  The work is guarded by an ovsthread_once, so it is performed only
 * the first time this function is called. */
1560 netdev_vport_tunnel_register(void)
1562 /* The name of the dpif_port should be short enough to accommodate adding
1563 * a port number to the end if one is necessary. */
1564 static const struct vport_class vport_classes[] = {
1565 TUNNEL_CLASS("geneve", "genev_sys", netdev_geneve_build_header,
1567 netdev_geneve_pop_header),
1568 TUNNEL_CLASS("gre", "gre_sys", netdev_gre_build_header,
1569 netdev_gre_push_header,
1570 netdev_gre_pop_header),
1571 TUNNEL_CLASS("ipsec_gre", "gre_sys", NULL, NULL, NULL),
1572 TUNNEL_CLASS("vxlan", "vxlan_sys", netdev_vxlan_build_header,
1574 netdev_vxlan_pop_header),
1575 TUNNEL_CLASS("lisp", "lisp_sys", NULL, NULL, NULL),
1576 TUNNEL_CLASS("stt", "stt_sys", NULL, NULL, NULL),
1578 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1580 if (ovsthread_once_start(&once)) {
1583 for (i = 0; i < ARRAY_SIZE(vport_classes); i++) {
1584 netdev_register_provider(&vport_classes[i].netdev_class);
1587 unixctl_command_register("tnl/egress_port_range", "min max", 0, 2,
1588 netdev_vport_range, NULL);
1590 ovsthread_once_done(&once);
/* Registers the patch port class as a netdev provider.  The class uses
 * get_patch_config() as its 'get_config' callback, which is how
 * netdev_vport_is_patch() recognizes patch ports; the last four
 * VPORT_FUNCTIONS arguments are NULL. */
1595 netdev_vport_patch_register(void)
1597 static const struct vport_class patch_class =
1600 VPORT_FUNCTIONS(get_patch_config,
1603 NULL, NULL, NULL, NULL) }};
1604 netdev_register_provider(&patch_class.netdev_class);