2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/if_vlan.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
39 #include <net/protocol.h>
40 #include <net/ip_tunnels.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
50 #include <net/dst_metadata.h>
52 #if IS_ENABLED(CONFIG_IPV6)
54 #include <net/ip6_fib.h>
55 #include <net/ip6_route.h>
62 1. The most important issue is detecting local dead loops.
63 They would cause complete host lockup in transmit, which
64 would be "resolved" by stack overflow or, if queueing is enabled,
65 with infinite looping in net_bh.
67 We cannot track such dead loops during route installation,
68 it is infeasible task. The most general solutions would be
69 to keep skb->encapsulation counter (sort of local ttl),
70 and silently drop packet when it expires. It is a good
71 solution, but it supposes maintaining new variable in ALL
72 skb, even if no tunneling is used.
74 Current solution: xmit_recursion breaks dead loops. This is a percpu
75 counter, since when we enter the first ndo_xmit(), cpu migration is
76 forbidden. We force an exit if this counter reaches RECURSION_LIMIT
78 2. Networking dead loops would not kill routers, but would really
79 kill network. IP hop limit plays role of "t->recursion" in this case,
80 if we copy it from packet being encapsulated to upper header.
81 It is very good solution, but it introduces two problems:
83 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
84 do not work over tunnels.
85 - traceroute does not work. I planned to relay ICMP from tunnel,
86 so that this problem would be solved and traceroute output
87 would be even more informative. This idea appeared to be wrong:
88 only Linux complies to rfc1812 now (yes, guys, Linux is the only
89 true router now :-)), all routers (at least, in neighbourhood of mine)
90 return only 8 bytes of payload. It is the end.
92 Hence, if we want that OSPF worked or traceroute said something reasonable,
93 we should search for another solution.
95 One of them is to parse packet trying to detect inner encapsulation
96 made by our node. It is difficult or even impossible, especially,
97 taking into account fragmentation. To be short, ttl is not a solution at all.
99 Current solution: The solution was UNEXPECTEDLY SIMPLE.
100 We force DF flag on tunnels with preconfigured hop limit,
101 that is ALL. :-) Well, it does not remove the problem completely,
102 but exponential growth of network traffic is changed to linear
103 (branches, that exceed pmtu are pruned) and tunnel mtu
104 rapidly degrades to value <68, where looping stops.
105 Yes, it is not good if there exists a router in the loop,
106 which does not force DF, even when encapsulating packets have DF set.
107 But it is not our problem! Nobody could accuse us, we made
108 all that we could make. Even if it is your gated who injected
109 fatal route to network, even if it were you who configured
110 fatal static route: you are innocent. :-)
/* When true, log received packets whose ECN bits were corrupted in transit. */
115 static bool log_ecn_error = true;
116 module_param(log_ecn_error, bool, 0644);
117 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
/* Forward declarations used by the netdev/rtnl ops tables below. */
119 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
120 static int ipgre_tunnel_init(struct net_device *dev);
/* Per-network-namespace ids for the gre and gretap tunnel nets. */
122 static int ipgre_net_id __read_mostly;
123 static int gre_tap_net_id __read_mostly;
/*
 * ipgre_err() - handle an ICMP error received for a transmitted GRE packet.
 * Selects the gre/gretap tunnel net by tpi->proto, looks up the tunnel from
 * the inner IP header that follows the ICMP header, and records the error,
 * rate-limited via t->err_time/IPTUNNEL_ERR_TIMEO.
 * NOTE(review): this extract elides several original lines (gaps in the
 * embedded numbering), so parts of the switch/control flow are not shown.
 */
125 static void ipgre_err(struct sk_buff *skb, u32 info,
126 const struct tnl_ptk_info *tpi)
129 /* All the routers (except for Linux) return only
130 8 bytes of packet payload. It means, that precise relaying of
131 ICMP in the real Internet is absolutely infeasible.
133 Moreover, Cisco "wise men" put GRE key to the third word
134 in GRE header. It makes impossible maintaining even soft
135 state for keyed GRE tunnels with enabled checksum. Tell
138 Well, I wonder, rfc1812 was written by Cisco employee,
139 what the hell these idiots break standards established
142 struct net *net = dev_net(skb->dev);
143 struct ip_tunnel_net *itn;
144 const struct iphdr *iph;
145 const int type = icmp_hdr(skb)->type;
146 const int code = icmp_hdr(skb)->code;
151 case ICMP_PARAMETERPROB:
154 case ICMP_DEST_UNREACH:
157 case ICMP_PORT_UNREACH:
158 /* Impossible event. */
161 /* All others are translated to HOST_UNREACH.
162 rfc2003 contains "deep thoughts" about NET_UNREACH,
163 I believe they are just ether pollution. --ANK
169 case ICMP_TIME_EXCEEDED:
170 if (code != ICMP_EXC_TTL)
/* ETH_P_TEB means the error is for a gretap (Ethernet-over-GRE) tunnel. */
178 if (tpi->proto == htons(ETH_P_TEB))
179 itn = net_generic(net, gre_tap_net_id);
181 itn = net_generic(net, ipgre_net_id);
/* Inner IP header follows immediately after the ICMP header. */
183 iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
184 t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
185 iph->daddr, iph->saddr, tpi->key);
/* Do not relay errors for tunnels with unset or multicast destination. */
190 if (t->parms.iph.daddr == 0 ||
191 ipv4_is_multicast(t->parms.iph.daddr))
194 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
/* Rate-limit: drop errors arriving within IPTUNNEL_ERR_TIMEO of the last. */
197 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
201 t->err_time = jiffies;
/*
 * gre_err() - entry point for ICMP errors on IPPROTO_GRE packets.
 * Parses the returned GRE header, handles PMTU and redirect messages
 * directly, and hands everything else to ipgre_err().
 * NOTE(review): several original lines are elided in this extract
 * (e.g. the hdr_len declaration and some closing braces are not shown).
 */
204 static void gre_err(struct sk_buff *skb, u32 info)
206 /* All the routers (except for Linux) return only
207 * 8 bytes of packet payload. It means, that precise relaying of
208 * ICMP in the real Internet is absolutely infeasible.
210 * Moreover, Cisco "wise men" put GRE key to the third word
211 * in GRE header. It makes impossible maintaining even soft
213 * GRE tunnels with enabled checksum. Tell them "thank you".
215 * Well, I wonder, rfc1812 was written by Cisco employee,
216 * what the hell these idiots break standards established
220 const int type = icmp_hdr(skb)->type;
221 const int code = icmp_hdr(skb)->code;
222 struct tnl_ptk_info tpi;
223 bool csum_err = false;
226 if (gre_parse_header(skb, &tpi, &csum_err, &hdr_len)) {
227 if (!csum_err) /* ignore csum errors. */
231 if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false))
/* Fragmentation-needed: update the cached path MTU for this flow. */
234 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
235 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
236 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
237 ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
239 if (type == ICMP_REDIRECT) {
240 ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
245 ipgre_err(skb, info, &tpi);
/*
 * key_to_tunnel_id() - widen a 32-bit GRE key into a 64-bit tunnel id.
 * The two returns are the endianness-dependent variants; the #ifdef
 * lines selecting between them are elided in this extract.
 */
248 static __be64 key_to_tunnel_id(__be32 key)
251 return (__force __be64)((__force u32)key);
253 return (__force __be64)((__force u64)key << 32);
257 /* Returns the least-significant 32 bits of a __be64. */
/* Inverse of key_to_tunnel_id(); endian-selecting #ifdefs are elided here. */
258 static __be32 tunnel_id_to_key(__be64 x)
261 return (__force __be32)x;
263 return (__force __be32)((__force u64)x >> 32);
/*
 * ipgre_rcv() - deliver a decapsulated GRE packet to its tunnel device.
 * Chooses the gre/gretap net by tpi->proto, looks up the tunnel by the
 * outer addresses and key, builds a metadata dst for collect_md tunnels,
 * and passes the skb to ip_tunnel_rcv().
 * Returns PACKET_RCVD on success, PACKET_REJECT otherwise.
 * NOTE(review): some lines are elided (e.g. where iph is assigned and the
 * local variable declarations for flags/tun_id).
 */
267 static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
269 struct net *net = dev_net(skb->dev);
270 struct metadata_dst *tun_dst = NULL;
271 struct ip_tunnel_net *itn;
272 const struct iphdr *iph;
273 struct ip_tunnel *tunnel;
275 if (tpi->proto == htons(ETH_P_TEB))
276 itn = net_generic(net, gre_tap_net_id);
278 itn = net_generic(net, ipgre_net_id);
281 tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
282 iph->saddr, iph->daddr, tpi->key);
285 skb_pop_mac_header(skb);
/* collect_md (flow-based) tunnels carry per-packet metadata in a dst. */
286 if (tunnel->collect_md) {
290 flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
291 tun_id = key_to_tunnel_id(tpi->key);
292 tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
294 return PACKET_REJECT;
297 ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
300 return PACKET_REJECT;
/*
 * gre_rcv() - protocol handler for incoming IPPROTO_GRE packets.
 * Drops looped-back multicast, parses and pulls the GRE header, then
 * hands the packet to ipgre_rcv(); sends ICMP port-unreachable when no
 * tunnel claims it. Some lines (hdr_len declaration, drop labels) are
 * elided in this extract.
 */
303 static int gre_rcv(struct sk_buff *skb)
305 struct tnl_ptk_info tpi;
306 bool csum_err = false;
309 #ifdef CONFIG_NET_IPGRE_BROADCAST
310 if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
311 /* Looped back packet, drop it! */
312 if (rt_is_output_route(skb_rtable(skb)))
317 if (gre_parse_header(skb, &tpi, &csum_err, &hdr_len) < 0)
320 if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false))
323 if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
/* No matching tunnel: tell the sender the GRE "port" is unreachable. */
326 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/*
 * gre_checksum() - compute the folded checksum for an outgoing GRE packet,
 * using the local-checksum-offload value when the skb is CHECKSUM_PARTIAL,
 * otherwise summing the full payload. (The csum declaration line is elided
 * in this extract.)
 */
332 static __sum16 gre_checksum(struct sk_buff *skb)
336 if (skb->ip_summed == CHECKSUM_PARTIAL)
337 csum = lco_csum(skb);
339 csum = skb_checksum(skb, 0, skb->len, 0);
340 return csum_fold(csum);
/*
 * build_header() - push and fill a GRE header of hdr_len bytes onto skb.
 * Writes flags/protocol, then the optional checksum, key and sequence
 * words in reverse order from the end of the header. Lines storing the
 * seq/key values are elided in this extract.
 */
343 static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
344 __be16 proto, __be32 key, __be32 seq)
346 struct gre_base_hdr *greh;
348 skb_push(skb, hdr_len);
350 skb_reset_transport_header(skb);
351 greh = (struct gre_base_hdr *)skb->data;
352 greh->flags = gre_tnl_flags_to_gre_flags(flags);
353 greh->protocol = proto;
355 if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
/* ptr starts at the last optional 32-bit word of the header. */
356 __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
358 if (flags & TUNNEL_SEQ) {
362 if (flags & TUNNEL_KEY) {
/* Checksum is computed in software only when GSO will not do it. */
366 if (flags & TUNNEL_CSUM &&
367 !(skb_shinfo(skb)->gso_type &
368 (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
370 *(__sum16 *)ptr = gre_checksum(skb);
/*
 * __gre_xmit() - common transmit path: pushes the GRE header using the
 * tunnel's configured o_flags/o_key (and sequence number when TUNNEL_SEQ
 * is set), then hands off to ip_tunnel_xmit() with the given outer
 * IP parameters. (The o_seqno increment line is elided in this extract.)
 */
375 static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
376 const struct iphdr *tnl_params,
379 struct ip_tunnel *tunnel = netdev_priv(dev);
381 if (tunnel->parms.o_flags & TUNNEL_SEQ)
384 /* Push GRE header. */
385 build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
386 proto, tunnel->parms.o_key, htonl(tunnel->o_seqno));
388 skb_set_inner_protocol(skb, proto);
389 ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
/* Prepare skb GSO/checksum offload state for GRE, with or without csum. */
392 static int gre_handle_offloads(struct sk_buff *skb, bool csum)
394 return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
/*
 * gre_get_rt() - build a flowi4 from the tunnel key (outer addresses,
 * TOS, mark, IPPROTO_GRE) and resolve the output route for it.
 */
397 static struct rtable *gre_get_rt(struct sk_buff *skb,
398 struct net_device *dev,
400 const struct ip_tunnel_key *key)
402 struct net *net = dev_net(dev);
404 memset(fl, 0, sizeof(*fl));
405 fl->daddr = key->u.ipv4.dst;
406 fl->saddr = key->u.ipv4.src;
407 fl->flowi4_tos = RT_TOS(key->tos);
408 fl->flowi4_mark = skb->mark;
409 fl->flowi4_proto = IPPROTO_GRE;
411 return ip_route_output_key(net, fl);
/*
 * gre_fb_xmit() - transmit path for flow-based (collect_md) GRE devices.
 * Takes the per-packet tunnel key from skb's tunnel info, resolves (or
 * fetches from cache) the route, makes headroom, pushes the GRE header
 * with the packet's tunnel id, and emits via iptunnel_xmit().
 * NOTE(review): several lines (local declarations, error labels, cache
 * conditions) are elided in this extract.
 */
414 static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
416 struct ip_tunnel_info *tun_info;
417 const struct ip_tunnel_key *key;
418 struct rtable *rt = NULL;
/* Only IPv4 TX metadata is usable here. */
426 tun_info = skb_tunnel_info(skb);
427 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
428 ip_tunnel_info_af(tun_info) != AF_INET))
431 key = &tun_info->key;
432 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
434 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr);
436 rt = gre_get_rt(skb, dev, &fl, key);
440 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
/* Headroom must cover link layer + outer IP + GRE header. */
444 tunnel_hlen = gre_calc_hlen(key->tun_flags);
446 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
447 + tunnel_hlen + sizeof(struct iphdr);
448 if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
449 int head_delta = SKB_DATA_ALIGN(min_headroom -
452 err = pskb_expand_head(skb, max_t(int, head_delta, 0),
458 /* Push Tunnel header. */
459 if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
462 flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
463 build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB),
464 tunnel_id_to_key(tun_info->key.tun_id), 0);
466 df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
468 iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
469 key->tos, key->ttl, df, false);
476 dev->stats.tx_dropped++;
/*
 * gre_fill_metadata_dst() - ndo_fill_metadata_dst: resolve the route for
 * the skb's tunnel key and record the chosen source address back into it.
 * (Local declarations and the route-release/return lines are elided.)
 */
479 static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
481 struct ip_tunnel_info *info = skb_tunnel_info(skb);
485 if (ip_tunnel_info_af(info) != AF_INET)
488 rt = gre_get_rt(skb, dev, &fl4, &info->key);
493 info->key.u.ipv4.src = fl4.saddr;
/*
 * ipgre_xmit() - ndo_start_xmit for layer-3 GRE devices.
 * collect_md devices take the flow-based path; devices with header_ops
 * already carry the outer IP header in the skb and only need it pulled,
 * otherwise the tunnel's configured parameters are used.
 * NOTE(review): some lines (free_skb label, return statements) are
 * elided in this extract.
 */
497 static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
498 struct net_device *dev)
500 struct ip_tunnel *tunnel = netdev_priv(dev);
501 const struct iphdr *tnl_params;
503 if (tunnel->collect_md) {
504 gre_fb_xmit(skb, dev);
508 if (dev->header_ops) {
509 /* Need space for new headers */
510 if (skb_cow_head(skb, dev->needed_headroom -
511 (tunnel->hlen + sizeof(struct iphdr))))
/* Outer IP header was pre-built by ipgre_header(); it sits at skb->data. */
514 tnl_params = (const struct iphdr *)skb->data;
516 /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
519 skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
520 skb_reset_mac_header(skb);
522 if (skb_cow_head(skb, dev->needed_headroom))
525 tnl_params = &tunnel->parms.iph;
528 if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
531 __gre_xmit(skb, dev, tnl_params, skb->protocol);
536 dev->stats.tx_dropped++;
/*
 * gre_tap_xmit() - ndo_start_xmit for gretap (Ethernet-over-GRE) devices:
 * flow-based path for collect_md, otherwise offload prep, headroom check,
 * then __gre_xmit() with ETH_P_TEB as the inner protocol. (Error label
 * and return lines are elided in this extract.)
 */
540 static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
541 struct net_device *dev)
543 struct ip_tunnel *tunnel = netdev_priv(dev);
545 if (tunnel->collect_md) {
546 gre_fb_xmit(skb, dev);
550 if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
553 if (skb_cow_head(skb, dev->needed_headroom))
556 __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
561 dev->stats.tx_dropped++;
/*
 * ipgre_tunnel_ioctl() - SIOC{ADD,CHG,DEL,GET}TUNNEL handler.
 * Validates user-supplied parameters for add/change, converts between
 * wire-format GRE flags and internal tunnel flags around the core
 * ip_tunnel_ioctl() call, and copies the result back to userspace.
 */
565 static int ipgre_tunnel_ioctl(struct net_device *dev,
566 struct ifreq *ifr, int cmd)
569 struct ip_tunnel_parm p;
571 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
/* Reject non-IPv4/non-GRE headers, options, and GRE version/routing bits. */
573 if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
574 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
575 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
576 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
579 p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
580 p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
582 err = ip_tunnel_ioctl(dev, &p, cmd);
/* Convert back to wire format before returning to userspace. */
586 p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
587 p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);
589 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
594 /* Nice toy. Unfortunately, useless in real life :-)
595 It allows to construct virtual multiprotocol broadcast "LAN"
596 over the Internet, provided multicast routing is tuned.
599 I have no idea whether this bicycle was invented before me,
600 so that I had to set ARPHRD_IPGRE to a random value.
601 I have an impression, that Cisco could make something similar,
602 but this feature is apparently missing in IOS<=11.2(8).
604 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
605 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
607 ping -t 255 224.66.66.66
609 If nobody answers, mbone does not work.
611 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
612 ip addr add 10.66.66.<somewhat>/24 dev Universe
614 ifconfig Universe add fe80::<Your_real_addr>/10
615 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
618 ftp fec0:6666:6666::193.233.7.65
/*
 * ipgre_header() - header_ops->create: pre-build the outer IP + GRE
 * header on the skb from the tunnel parameters, optionally overriding
 * source/destination addresses. Returns the header length, or its
 * negative when the destination is not yet known (per header_ops
 * convention). (iphdr declaration and the daddr check are elided.)
 */
621 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
623 const void *daddr, const void *saddr, unsigned int len)
625 struct ip_tunnel *t = netdev_priv(dev);
627 struct gre_base_hdr *greh;
629 iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
630 greh = (struct gre_base_hdr *)(iph+1);
631 greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
632 greh->protocol = htons(type);
634 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
636 /* Set the source hardware address. */
638 memcpy(&iph->saddr, saddr, 4);
640 memcpy(&iph->daddr, daddr, 4);
642 return t->hlen + sizeof(*iph);
644 return -(t->hlen + sizeof(*iph));
/* header_ops->parse: report the outer IPv4 source address as the
 * 4-byte hardware address. (The return statement is elided here.) */
647 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
649 const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
650 memcpy(haddr, &iph->saddr, 4);
/* Link-layer header ops for broadcast-capable GRE devices. */
654 static const struct header_ops ipgre_header_ops = {
655 .create = ipgre_header,
656 .parse = ipgre_header_parse,
659 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ipgre_open() - ndo_open for broadcast GRE: when the tunnel destination
 * is multicast, resolve the multicast route and join the group on the
 * underlying device, remembering its ifindex in t->mlink.
 * (Route error handling and some declarations are elided in this extract.)
 */
660 static int ipgre_open(struct net_device *dev)
662 struct ip_tunnel *t = netdev_priv(dev);
664 if (ipv4_is_multicast(t->parms.iph.daddr)) {
668 rt = ip_route_output_gre(t->net, &fl4,
672 RT_TOS(t->parms.iph.tos),
675 return -EADDRNOTAVAIL;
678 if (!__in_dev_get_rtnl(dev))
679 return -EADDRNOTAVAIL;
680 t->mlink = dev->ifindex;
681 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
/*
 * ipgre_close() - ndo_stop counterpart of ipgre_open(): leave the
 * multicast group joined on the device recorded in t->mlink.
 */
686 static int ipgre_close(struct net_device *dev)
688 struct ip_tunnel *t = netdev_priv(dev);
690 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
691 struct in_device *in_dev;
692 in_dev = inetdev_by_index(t->net, t->mlink);
694 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
/* Netdev ops for layer-3 GRE devices; open/stop only exist when
 * broadcast GRE support is compiled in. */
700 static const struct net_device_ops ipgre_netdev_ops = {
701 .ndo_init = ipgre_tunnel_init,
702 .ndo_uninit = ip_tunnel_uninit,
703 #ifdef CONFIG_NET_IPGRE_BROADCAST
704 .ndo_open = ipgre_open,
705 .ndo_stop = ipgre_close,
707 .ndo_start_xmit = ipgre_xmit,
708 .ndo_do_ioctl = ipgre_tunnel_ioctl,
709 .ndo_change_mtu = ip_tunnel_change_mtu,
710 .ndo_get_stats64 = ip_tunnel_get_stats64,
711 .ndo_get_iflink = ip_tunnel_get_iflink,
714 #define GRE_FEATURES (NETIF_F_SG | \
/* rtnl_link_ops->setup for gre devices: install ops, ARP type, and
 * register with the ipgre per-net tunnel table. */
719 static void ipgre_tunnel_setup(struct net_device *dev)
721 dev->netdev_ops = &ipgre_netdev_ops;
722 dev->type = ARPHRD_IPGRE;
723 ip_tunnel_setup(dev, ipgre_net_id);
/*
 * __gre_tunnel_init() - shared init for gre and gretap devices: compute
 * header lengths from the configured output flags, derive needed
 * headroom and MTU, and enable feature flags (GSO only when no sequence
 * numbers are generated; LLTX unless checksumming requires locking —
 * some of those condition lines are elided in this extract).
 */
726 static void __gre_tunnel_init(struct net_device *dev)
728 struct ip_tunnel *tunnel;
731 tunnel = netdev_priv(dev);
732 tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
733 tunnel->parms.iph.protocol = IPPROTO_GRE;
735 tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
737 t_hlen = tunnel->hlen + sizeof(struct iphdr);
/* The extra 4 bytes — NOTE(review): purpose not visible in this extract. */
739 dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
740 dev->mtu = ETH_DATA_LEN - t_hlen - 4;
742 dev->features |= GRE_FEATURES;
743 dev->hw_features |= GRE_FEATURES;
745 if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
746 /* TCP offload with GRE SEQ is not supported, nor
747 * can we support 2 levels of outer headers requiring
750 if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
751 (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
752 dev->features |= NETIF_F_GSO_SOFTWARE;
753 dev->hw_features |= NETIF_F_GSO_SOFTWARE;
756 /* Can use a lockless transmit, unless we generate
759 dev->features |= NETIF_F_LLTX;
/*
 * ipgre_tunnel_init() - ndo_init for layer-3 gre devices: run the shared
 * init, seed dev/broadcast addresses from the tunnel endpoints, and
 * install header_ops (broadcast mode for multicast destinations).
 * (Some conditional lines around the header_ops choice are elided.)
 */
763 static int ipgre_tunnel_init(struct net_device *dev)
765 struct ip_tunnel *tunnel = netdev_priv(dev);
766 struct iphdr *iph = &tunnel->parms.iph;
768 __gre_tunnel_init(dev);
/* Tunnel endpoints double as the 4-byte device/broadcast addresses. */
770 memcpy(dev->dev_addr, &iph->saddr, 4);
771 memcpy(dev->broadcast, &iph->daddr, 4);
773 dev->flags = IFF_NOARP;
778 #ifdef CONFIG_NET_IPGRE_BROADCAST
779 if (ipv4_is_multicast(iph->daddr)) {
782 dev->flags = IFF_BROADCAST;
783 dev->header_ops = &ipgre_header_ops;
787 dev->header_ops = &ipgre_header_ops;
789 return ip_tunnel_init(dev);
/* GRE protocol hooks registered with the GRE demux (handler line elided). */
792 static const struct gre_protocol ipgre_protocol = {
794 .err_handler = gre_err,
/* Per-netns init for layer-3 gre tunnels (no default device name). */
797 static int __net_init ipgre_init_net(struct net *net)
799 return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
/* Per-netns teardown: delete all gre tunnels in this namespace. */
802 static void __net_exit ipgre_exit_net(struct net *net)
804 struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
805 ip_tunnel_delete_net(itn, &ipgre_link_ops);
/* Pernet registration for the layer-3 gre tunnel net (.id line elided). */
808 static struct pernet_operations ipgre_net_ops = {
809 .init = ipgre_init_net,
810 .exit = ipgre_exit_net,
812 .size = sizeof(struct ip_tunnel_net),
/*
 * ipgre_tunnel_validate() - netlink validate: reject GRE version/routing
 * flag bits, which this driver does not support. (The flags declaration
 * and return lines are elided in this extract.)
 */
815 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
823 if (data[IFLA_GRE_IFLAGS])
824 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
825 if (data[IFLA_GRE_OFLAGS])
826 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
827 if (flags & (GRE_VERSION|GRE_ROUTING))
/*
 * ipgre_tap_validate() - netlink validate for gretap: additionally check
 * a supplied Ethernet address, then defer to ipgre_tunnel_validate().
 * (The daddr declaration and its validity check are elided.)
 */
833 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
837 if (tb[IFLA_ADDRESS]) {
838 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
840 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
841 return -EADDRNOTAVAIL;
847 if (data[IFLA_GRE_REMOTE]) {
848 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
854 return ipgre_tunnel_validate(tb, data);
/*
 * ipgre_netlink_parms() - translate IFLA_GRE_* netlink attributes into
 * an ip_tunnel_parm. PMTU discovery (IP_DF) defaults to on unless
 * explicitly disabled; COLLECT_METADATA marks the device flow-based.
 */
857 static void ipgre_netlink_parms(struct net_device *dev,
858 struct nlattr *data[],
860 struct ip_tunnel_parm *parms)
862 memset(parms, 0, sizeof(*parms));
864 parms->iph.protocol = IPPROTO_GRE;
869 if (data[IFLA_GRE_LINK])
870 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
872 if (data[IFLA_GRE_IFLAGS])
873 parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
875 if (data[IFLA_GRE_OFLAGS])
876 parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
878 if (data[IFLA_GRE_IKEY])
879 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
881 if (data[IFLA_GRE_OKEY])
882 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
884 if (data[IFLA_GRE_LOCAL])
885 parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
887 if (data[IFLA_GRE_REMOTE])
888 parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
890 if (data[IFLA_GRE_TTL])
891 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
893 if (data[IFLA_GRE_TOS])
894 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
/* PMTU discovery is on by default: set DF unless explicitly disabled. */
896 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
897 parms->iph.frag_off = htons(IP_DF);
899 if (data[IFLA_GRE_COLLECT_METADATA]) {
900 struct ip_tunnel *t = netdev_priv(dev);
902 t->collect_md = true;
906 /* This function returns true when ENCAP attributes are present in the nl msg */
/* Fills *ipencap from the IFLA_GRE_ENCAP_* attributes; the lines setting
 * the "attributes present" flag and the return are elided in this extract. */
907 static bool ipgre_netlink_encap_parms(struct nlattr *data[],
908 struct ip_tunnel_encap *ipencap)
912 memset(ipencap, 0, sizeof(*ipencap));
917 if (data[IFLA_GRE_ENCAP_TYPE]) {
919 ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
922 if (data[IFLA_GRE_ENCAP_FLAGS]) {
924 ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
927 if (data[IFLA_GRE_ENCAP_SPORT]) {
929 ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
932 if (data[IFLA_GRE_ENCAP_DPORT]) {
934 ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
/* ndo_init for gretap: shared init plus live-address-change support. */
940 static int gre_tap_init(struct net_device *dev)
942 __gre_tunnel_init(dev);
943 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
945 return ip_tunnel_init(dev);
/* Netdev ops for gretap devices (Ethernet semantics, metadata dst fill). */
948 static const struct net_device_ops gre_tap_netdev_ops = {
949 .ndo_init = gre_tap_init,
950 .ndo_uninit = ip_tunnel_uninit,
951 .ndo_start_xmit = gre_tap_xmit,
952 .ndo_set_mac_address = eth_mac_addr,
953 .ndo_validate_addr = eth_validate_addr,
954 .ndo_change_mtu = ip_tunnel_change_mtu,
955 .ndo_get_stats64 = ip_tunnel_get_stats64,
956 .ndo_get_iflink = ip_tunnel_get_iflink,
957 .ndo_fill_metadata_dst = gre_fill_metadata_dst,
/* rtnl_link_ops->setup for gretap: Ethernet-style device registered with
 * the gretap per-net tunnel table. (ether_setup call line elided.) */
960 static void ipgre_tap_setup(struct net_device *dev)
963 dev->netdev_ops = &gre_tap_netdev_ops;
964 dev->priv_flags &= ~IFF_TX_SKB_SHARING;
965 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
966 ip_tunnel_setup(dev, gre_tap_net_id);
/*
 * ipgre_newlink() - rtnl newlink: apply optional FOU/GUE encap settings,
 * parse tunnel parameters, and create the device via ip_tunnel_newlink().
 * (The error-check line after ip_tunnel_encap_setup() is elided.)
 */
969 static int ipgre_newlink(struct net *src_net, struct net_device *dev,
970 struct nlattr *tb[], struct nlattr *data[])
972 struct ip_tunnel_parm p;
973 struct ip_tunnel_encap ipencap;
975 if (ipgre_netlink_encap_parms(data, &ipencap)) {
976 struct ip_tunnel *t = netdev_priv(dev);
977 int err = ip_tunnel_encap_setup(t, &ipencap);
983 ipgre_netlink_parms(dev, data, tb, &p);
984 return ip_tunnel_newlink(dev, tb, &p);
/*
 * ipgre_changelink() - rtnl changelink: same parameter handling as
 * ipgre_newlink(), applied to an existing device via
 * ip_tunnel_changelink().
 */
987 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
988 struct nlattr *data[])
990 struct ip_tunnel_parm p;
991 struct ip_tunnel_encap ipencap;
993 if (ipgre_netlink_encap_parms(data, &ipencap)) {
994 struct ip_tunnel *t = netdev_priv(dev);
995 int err = ip_tunnel_encap_setup(t, &ipencap);
1001 ipgre_netlink_parms(dev, data, tb, &p);
1002 return ip_tunnel_changelink(dev, tb, &p);
/*
 * ipgre_get_size() - upper bound on netlink message size for fill_info.
 * NOTE(review): the nla_total_size() terms themselves are elided in this
 * extract; only the attribute-name comments remain.
 */
1005 static size_t ipgre_get_size(const struct net_device *dev)
1010 /* IFLA_GRE_IFLAGS */
1012 /* IFLA_GRE_OFLAGS */
1018 /* IFLA_GRE_LOCAL */
1020 /* IFLA_GRE_REMOTE */
1026 /* IFLA_GRE_PMTUDISC */
1028 /* IFLA_GRE_ENCAP_TYPE */
1030 /* IFLA_GRE_ENCAP_FLAGS */
1032 /* IFLA_GRE_ENCAP_SPORT */
1034 /* IFLA_GRE_ENCAP_DPORT */
1036 /* IFLA_GRE_COLLECT_METADATA */
/*
 * ipgre_fill_info() - dump tunnel configuration as IFLA_GRE_* attributes,
 * converting internal tunnel flags back to wire-format GRE flags.
 * (The return statements and nla_put_failure label are elided.)
 */
1041 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1043 struct ip_tunnel *t = netdev_priv(dev);
1044 struct ip_tunnel_parm *p = &t->parms;
1046 if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1047 nla_put_be16(skb, IFLA_GRE_IFLAGS,
1048 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
1049 nla_put_be16(skb, IFLA_GRE_OFLAGS,
1050 gre_tnl_flags_to_gre_flags(p->o_flags)) ||
1051 nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1052 nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1053 nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1054 nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1055 nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1056 nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1057 nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1058 !!(p->iph.frag_off & htons(IP_DF))))
1059 goto nla_put_failure;
1061 if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
1063 nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
1065 nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
1067 nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
1069 goto nla_put_failure;
1071 if (t->collect_md) {
1072 if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
1073 goto nla_put_failure;
/* Netlink attribute validation policy for IFLA_GRE_* attributes. */
1082 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1083 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1084 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1085 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1086 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1087 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
1088 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1089 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1090 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1091 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1092 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
1093 [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 },
1094 [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
1095 [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
1096 [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
1097 [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
/* rtnl link ops for the layer-3 "gre" link kind (.kind line elided). */
1100 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1102 .maxtype = IFLA_GRE_MAX,
1103 .policy = ipgre_policy,
1104 .priv_size = sizeof(struct ip_tunnel),
1105 .setup = ipgre_tunnel_setup,
1106 .validate = ipgre_tunnel_validate,
1107 .newlink = ipgre_newlink,
1108 .changelink = ipgre_changelink,
1109 .dellink = ip_tunnel_dellink,
1110 .get_size = ipgre_get_size,
1111 .fill_info = ipgre_fill_info,
1112 .get_link_net = ip_tunnel_get_link_net,
/* rtnl link ops for the "gretap" link kind (.kind line elided). */
1115 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1117 .maxtype = IFLA_GRE_MAX,
1118 .policy = ipgre_policy,
1119 .priv_size = sizeof(struct ip_tunnel),
1120 .setup = ipgre_tap_setup,
1121 .validate = ipgre_tap_validate,
1122 .newlink = ipgre_newlink,
1123 .changelink = ipgre_changelink,
1124 .dellink = ip_tunnel_dellink,
1125 .get_size = ipgre_get_size,
1126 .fill_info = ipgre_fill_info,
1127 .get_link_net = ip_tunnel_get_link_net,
/*
 * gretap_fb_dev_create() - create a flow-based (collect_md) gretap device
 * programmatically (used by openvswitch). Builds the link with empty
 * netlink attributes, marks it collect_md, runs the normal newlink path,
 * and raises the MTU to the maximum.
 * NOTE(review): error-unwind lines between steps are elided in this
 * extract; the visible tail returns ERR_PTR(err) on failure.
 */
1130 struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
1131 u8 name_assign_type)
1133 struct nlattr *tb[IFLA_MAX + 1];
1134 struct net_device *dev;
1135 struct ip_tunnel *t;
1138 memset(&tb, 0, sizeof(tb));
1140 dev = rtnl_create_link(net, name, name_assign_type,
1141 &ipgre_tap_ops, tb);
1145 /* Configure flow based GRE device. */
1146 t = netdev_priv(dev);
1147 t->collect_md = true;
1149 err = ipgre_newlink(net, dev, tb, NULL);
1153 /* openvswitch users expect packet sizes to be unrestricted,
1154 * so set the largest MTU we can.
1156 err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
1163 return ERR_PTR(err);
1165 EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
/* Per-netns init for gretap tunnels; creates the default "gretap0". */
1167 static int __net_init ipgre_tap_init_net(struct net *net)
1169 return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
/* Per-netns teardown: delete all gretap tunnels in this namespace. */
1172 static void __net_exit ipgre_tap_exit_net(struct net *net)
1174 struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
1175 ip_tunnel_delete_net(itn, &ipgre_tap_ops);
/* Pernet registration for the gretap tunnel net. */
1178 static struct pernet_operations ipgre_tap_net_ops = {
1179 .init = ipgre_tap_init_net,
1180 .exit = ipgre_tap_exit_net,
1181 .id = &gre_tap_net_id,
1182 .size = sizeof(struct ip_tunnel_net),
/*
 * ipgre_init() - module init: register both pernet devices, the GRE
 * protocol handler, then both rtnl link kinds, unwinding in reverse
 * order on failure. (Some error-check lines and the success return are
 * elided; the unwind labels at the bottom tear down in reverse order.)
 */
1185 static int __init ipgre_init(void)
1189 pr_info("GRE over IPv4 tunneling driver\n");
1191 err = register_pernet_device(&ipgre_net_ops);
1195 err = register_pernet_device(&ipgre_tap_net_ops);
1197 goto pnet_tap_faied;
1199 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1201 pr_info("%s: can't add protocol\n", __func__);
1202 goto add_proto_failed;
1205 err = rtnl_link_register(&ipgre_link_ops);
1207 goto rtnl_link_failed;
1209 err = rtnl_link_register(&ipgre_tap_ops);
1211 goto tap_ops_failed;
1216 rtnl_link_unregister(&ipgre_link_ops);
1218 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1220 unregister_pernet_device(&ipgre_tap_net_ops);
1222 unregister_pernet_device(&ipgre_net_ops);
/* Module exit: unregister everything in strict reverse order of init. */
1226 static void __exit ipgre_fini(void)
1228 rtnl_link_unregister(&ipgre_tap_ops);
1229 rtnl_link_unregister(&ipgre_link_ops);
1230 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1231 unregister_pernet_device(&ipgre_tap_net_ops);
1232 unregister_pernet_device(&ipgre_net_ops);
1235 module_init(ipgre_init);
1236 module_exit(ipgre_fini);
1237 MODULE_LICENSE("GPL");
1238 MODULE_ALIAS_RTNL_LINK("gre");
1239 MODULE_ALIAS_RTNL_LINK("gretap");
1240 MODULE_ALIAS_NETDEV("gre0");
1241 MODULE_ALIAS_NETDEV("gretap0");