1 /* ip_gre driver port to Linux 2.6.18 and greater plus enhancements */
3 #include <linux/version.h>
4 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
5 #define HAVE_NETDEV_STATS
7 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
8 #define HAVE_NETDEV_HEADER_OPS
10 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
11 #define HAVE_NETDEV_NEEDED_HEADROOM
15 * Linux NET3: GRE over IP protocol decoder.
17 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
26 #include <linux/capability.h>
27 #include <linux/ethtool.h>
28 #include <linux/module.h>
29 #include <linux/types.h>
30 #include <linux/kernel.h>
31 #include <asm/uaccess.h>
32 #include <linux/skbuff.h>
33 #include <linux/netdevice.h>
35 #include <linux/tcp.h>
36 #include <linux/udp.h>
37 #include <linux/if_arp.h>
38 #include <linux/if_vlan.h>
39 #include <linux/mroute.h>
40 #include <linux/init.h>
41 #include <linux/in6.h>
42 #include <linux/inetdevice.h>
43 #include <linux/igmp.h>
44 #include <linux/netfilter_ipv4.h>
45 #include <linux/etherdevice.h>
46 #include <linux/if_ether.h>
51 #include <net/protocol.h>
55 #include <net/checksum.h>
56 #include <net/dsfield.h>
57 #include <net/inet_ecn.h>
59 #include <net/net_namespace.h>
60 #include <net/netns/generic.h>
63 #include <net/ip6_fib.h>
64 #include <net/ip6_route.h>
68 #include "openvswitch/gre.h"
70 #ifndef GRE_IOCTL_ONLY
71 #include <net/rtnetlink.h>
78 1. The most important issue is detecting local dead loops.
79 They would cause complete host lockup in transmit, which
80 would be "resolved" by stack overflow or, if queueing is enabled,
81 with infinite looping in net_bh.
83 We cannot track such dead loops during route installation,
84 it is infeasible task. The most general solutions would be
85 to keep skb->encapsulation counter (sort of local ttl),
86 and silently drop packet when it expires. It is the best
87 solution, but it supposes maintaing new variable in ALL
88 skb, even if no tunneling is used.
90 Current solution: HARD_TX_LOCK lock breaks dead loops.
94 2. Networking dead loops would not kill routers, but would really
95 kill network. IP hop limit plays role of "t->recursion" in this case,
96 if we copy it from packet being encapsulated to upper header.
97 It is very good solution, but it introduces two problems:
99 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
100 do not work over tunnels.
101 - traceroute does not work. I planned to relay ICMP from tunnel,
102 so that this problem would be solved and traceroute output
103 would even more informative. This idea appeared to be wrong:
104 only Linux complies to rfc1812 now (yes, guys, Linux is the only
105 true router now :-)), all routers (at least, in neighbourhood of mine)
106 return only 8 bytes of payload. It is the end.
108 Hence, if we want that OSPF worked or traceroute said something reasonable,
109 we should search for another solution.
111 One of them is to parse packet trying to detect inner encapsulation
112 made by our node. It is difficult or even impossible, especially,
113 taking into account fragmentation. To be short, it is not a solution at all.
115 Current solution: The solution was UNEXPECTEDLY SIMPLE.
116 We force DF flag on tunnels with preconfigured hop limit,
117 that is ALL. :-) Well, it does not remove the problem completely,
118 but exponential growth of network traffic is changed to linear
119 (branches, that exceed pmtu are pruned) and tunnel mtu
120 quickly degrades to a value <68, where looping stops.
121 Yes, it is not good if there exists a router in the loop,
122 which does not force DF, even when encapsulating packets have DF set.
123 But it is not our problem! Nobody could accuse us, we made
124 all that we could make. Even if it is your gated who injected
125 fatal route to network, even if it were you who configured
126 fatal static route: you are innocent. :-)
128 XXX: Forcing the DF flag on was done only when setting up tunnels via the
129 ioctl interface and not Netlink. Since it prevents some operations
130 and isn't very transparent I removed it. It seems nobody really
131 cared about it anyways.
132 Moral: don't create loops.
134 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
135 practically identical code. It would be good to glue them
136 together, but it is not very evident, how to make them modular.
137 sit is integral part of IPv6, ipip and gre are naturally modular.
138 We could extract common parts (hash table, ioctl etc)
139 to a separate module (ip_tunnel.c).
144 #ifndef GRE_IOCTL_ONLY
145 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
146 static struct rtnl_link_ops ipgre_tap_ops __read_mostly;
148 static int ipgre_tunnel_init(struct net_device *dev);
149 static void ipgre_tunnel_setup(struct net_device *dev);
150 static void ipgre_tap_setup(struct net_device *dev);
151 static int ipgre_tunnel_bind_dev(struct net_device *dev);
152 static bool send_frag_needed(struct sk_buff *skb, struct net_device *dev,
157 /* The absolute minimum fragment size. Note that there are many other
158 * definitions of the minimum MTU. */
159 #define IP_MIN_MTU 68
161 static inline __be16 *gre_flags(void *header_start)
166 static inline __be16 *gre_protocol(void *header_start)
168 return header_start + 2;
171 static int ipgre_net_id __read_mostly;
173 struct ip_tunnel *tunnels[4][HASH_SIZE];
175 struct net_device *fb_tunnel_dev;
178 /* Tunnel hash table */
188 We require exact key match i.e. if a key is present in packet
189 it will match only tunnel with the same key; if it is not present,
190 it will match only keyless tunnel.
192 All keyless packets, if not matching configured keyless tunnels,
193 will match the fallback tunnel.
196 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
198 #define tunnels_r_l tunnels[3]
199 #define tunnels_r tunnels[2]
200 #define tunnels_l tunnels[1]
201 #define tunnels_wc tunnels[0]
203 * Locking : hash tables are protected by RCU and a spinlock
205 static DEFINE_SPINLOCK(ipgre_lock);
207 #define for_each_ip_tunnel_rcu(start) \
208 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
210 /* Given src, dst and key, find appropriate for input tunnel. */
/* NOTE(review): this extract appears to be missing lines (the embedded
 * numbering is discontinuous): the per-pass scoring, continue statements
 * and candidate bookkeeping between the visible conditions are not shown.
 * Comments below describe only what is visible.  Four passes walk the
 * four hash tables from most to least specific; the best-scoring match
 * wins, otherwise the per-netns fallback device is used. */
212 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
213 __be32 remote, __be32 local,
214 __be32 key, __be16 gre_proto)
216 struct net *net = dev_net(dev);
217 int link = dev->ifindex;
/* Buckets are chosen by hashing the remote address and the GRE key. */
218 unsigned h0 = HASH(remote);
219 unsigned h1 = HASH(key);
220 struct ip_tunnel *t, *cand = NULL;
221 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
/* ETH_P_TEB payloads belong to gretap (Ethernet) devices. */
222 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
223 ARPHRD_ETHER : ARPHRD_IPGRE;
224 int score, cand_score = 4;
/* Pass 1: tunnels keyed on both remote and local address. */
226 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
227 if (local != t->parms.iph.saddr ||
228 remote != t->parms.iph.daddr ||
229 key != t->parms.i_key ||
230 !(t->dev->flags & IFF_UP))
233 if (t->dev->type != ARPHRD_IPGRE &&
234 t->dev->type != dev_type)
238 if (t->parms.link != link)
240 if (t->dev->type != dev_type)
245 if (score < cand_score) {
/* Pass 2: tunnels keyed on remote address only. */
251 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
252 if (remote != t->parms.iph.daddr ||
253 key != t->parms.i_key ||
254 !(t->dev->flags & IFF_UP))
257 if (t->dev->type != ARPHRD_IPGRE &&
258 t->dev->type != dev_type)
262 if (t->parms.link != link)
264 if (t->dev->type != dev_type)
269 if (score < cand_score) {
/* Pass 3: tunnels keyed on local address (or a multicast remote). */
275 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
276 if ((local != t->parms.iph.saddr &&
277 (local != t->parms.iph.daddr ||
278 !ipv4_is_multicast(local))) ||
279 key != t->parms.i_key ||
280 !(t->dev->flags & IFF_UP))
283 if (t->dev->type != ARPHRD_IPGRE &&
284 t->dev->type != dev_type)
288 if (t->parms.link != link)
290 if (t->dev->type != dev_type)
295 if (score < cand_score) {
/* Pass 4: wildcard tunnels (matched on key only). */
301 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
302 if (t->parms.i_key != key ||
303 !(t->dev->flags & IFF_UP))
306 if (t->dev->type != ARPHRD_IPGRE &&
307 t->dev->type != dev_type)
311 if (t->parms.link != link)
313 if (t->dev->type != dev_type)
318 if (score < cand_score) {
/* Nothing matched: fall back to the netns fallback device if it is up. */
327 dev = ign->fb_tunnel_dev;
328 if (dev->flags & IFF_UP)
329 return netdev_priv(dev);
334 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
335 struct ip_tunnel_parm *parms)
337 __be32 remote = parms->iph.daddr;
338 __be32 local = parms->iph.saddr;
339 __be32 key = parms->i_key;
340 unsigned h = HASH(key);
345 if (remote && !ipv4_is_multicast(remote)) {
350 return &ign->tunnels[prio][h];
353 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
356 return __ipgre_bucket(ign, &t->parms);
359 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
361 struct ip_tunnel **tp = ipgre_bucket(ign, t);
363 spin_lock_bh(&ipgre_lock);
365 rcu_assign_pointer(*tp, t);
366 spin_unlock_bh(&ipgre_lock);
369 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
371 struct ip_tunnel **tp;
373 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
375 spin_lock_bh(&ipgre_lock);
377 spin_unlock_bh(&ipgre_lock);
383 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
384 struct ip_tunnel_parm *parms,
387 __be32 remote = parms->iph.daddr;
388 __be32 local = parms->iph.saddr;
389 __be32 key = parms->i_key;
390 int link = parms->link;
391 struct ip_tunnel *t, **tp;
392 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
394 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
395 if (local == t->parms.iph.saddr &&
396 remote == t->parms.iph.daddr &&
397 key == t->parms.i_key &&
398 link == t->parms.link &&
399 type == t->dev->type)
/* Finds an existing tunnel matching parms, or, if create is set,
 * allocates, registers and links a new net_device for it.
 * NOTE(review): this extract is missing lines (early returns, error
 * unwinding and the final return) — the embedded numbering is
 * discontinuous.  Comments describe only the visible flow. */
405 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
406 struct ip_tunnel_parm *parms, int gretap, int create)
408 struct ip_tunnel *t, *nt;
409 struct net_device *dev;
411 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
/* Exact-match lookup first; device type depends on gretap flag. */
413 t = ipgre_tunnel_find(net, parms, gretap ? ARPHRD_ETHER : ARPHRD_IPGRE);
/* Use the caller-supplied name, else fall back to a "gre%d" template. */
418 strlcpy(name, parms->name, IFNAMSIZ);
420 sprintf(name, "gre%%d");
422 dev = alloc_netdev(sizeof(*t), name, gretap ? ipgre_tap_setup
423 : ipgre_tunnel_setup);
427 dev_net_set(dev, net);
/* '%' in the name means it is a template: pick a free index. */
429 if (strchr(name, '%')) {
430 if (dev_alloc_name(dev, name) < 0)
435 random_ether_addr(dev->dev_addr);
437 #ifndef GRE_IOCTL_ONLY
438 dev->rtnl_link_ops = gretap ? &ipgre_tap_ops : &ipgre_link_ops;
440 nt = netdev_priv(dev);
/* Bind to the underlying device to compute an initial MTU. */
443 dev->mtu = ipgre_tunnel_bind_dev(dev);
445 if (register_netdevice(dev) < 0)
/* Publish the new tunnel in the hash table. */
449 ipgre_tunnel_link(ign, nt);
457 static void ipgre_tunnel_uninit(struct net_device *dev)
459 struct net *net = dev_net(dev);
460 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
462 ipgre_tunnel_unlink(ign, netdev_priv(dev));
466 static unsigned int tunnel_hard_header_len(struct net_device *dev)
468 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
469 return dev->hard_header_len;
471 return (dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0;
/* Handles an ICMP fragmentation-needed error received for one of our
 * encapsulated packets: validates the inner (encapsulated) headers,
 * then relays a PMTU notification toward the inner sender via
 * send_frag_needed().  Restores the skb's header offsets and protocol
 * before returning so the caller's view of the skb is unchanged.
 * NOTE(review): this extract is missing lines (early "return"/"goto out"
 * paths and some closing braces); comments describe the visible flow. */
475 static void icmp_err_frag(struct sk_buff *skb, struct ip_tunnel *t,
/* MTU reported by the router that generated the ICMP error. */
478 int mtu = ntohs(icmp_hdr(skb)->un.frag.mtu);
479 int header_len = t->hlen + tunnel_hard_header_len(t->dev);
/* Save header offsets so they can be restored before returning. */
480 unsigned int orig_mac_header = skb_mac_header(skb) - skb->data;
481 unsigned int orig_nw_header = skb_network_header(skb) - skb->data;
483 /* Add the size of the IP header since this is the smallest
484 * packet size that we might do something with and we might as
485 * well fail early if we don't have it. Plus it allows us to
486 * safely look at the VLAN header if there is one. The final
487 * size is checked before use. */
488 if (!pskb_may_pull(skb, header_len + sizeof(struct iphdr)))
/* For gretap, the inner frame starts with an Ethernet (maybe VLAN) header. */
491 if (t->dev->type == ARPHRD_ETHER) {
492 skb_set_mac_header(skb, t->hlen);
493 encap_proto = eth_hdr(skb)->h_proto;
495 if (encap_proto == htons(ETH_P_8021Q)) {
496 header_len += VLAN_HLEN;
498 vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
502 skb_set_network_header(skb, header_len);
503 skb->protocol = encap_proto;
/* Sanity-check the reported MTU against protocol minimums. */
506 if (skb->protocol == htons(ETH_P_IP)) {
507 if (mtu < IP_MIN_MTU) {
508 if (ntohs(ip_hdr(skb)->tot_len) >= IP_MIN_MTU)
514 header_len += sizeof(struct iphdr);
515 } else if (skb->protocol == htons(ETH_P_IPV6)) {
516 if (mtu < IPV6_MIN_MTU) {
517 unsigned int packet_length;
519 if (!pskb_may_pull(skb, header_len +
520 sizeof(struct ipv6hdr)))
523 packet_length = sizeof(struct ipv6hdr) +
524 ntohs(ipv6_hdr(skb)->payload_len);
526 if (packet_length >= IPV6_MIN_MTU
527 || ntohs(ipv6_hdr(skb)->payload_len) == 0)
533 header_len += sizeof(struct ipv6hdr);
/* Strip the GRE header, relay the PMTU message, then restore it. */
537 if (pskb_may_pull(skb, header_len)) {
538 __pskb_pull(skb, t->hlen);
539 send_frag_needed(skb, t->dev, mtu);
540 skb_push(skb, t->hlen);
/* Restore the caller's header layout and outer protocol. */
544 skb_set_mac_header(skb, orig_mac_header);
545 skb_set_network_header(skb, orig_nw_header);
546 skb->protocol = htons(ETH_P_IP);
/* ICMP error handler for the GRE protocol: called when an ICMP error
 * arrives for a GRE packet we sent.  Parses the embedded outer IP+GRE
 * header, finds the owning tunnel, and either relays fragmentation-
 * needed errors to the inner sender or rate-limits a link-failure
 * record on the tunnel.
 * NOTE(review): this extract is missing lines (returns, switch framing
 * and error paths); comments describe only the visible flow. */
549 static void ipgre_err(struct sk_buff *skb, u32 info)
552 /* All the routers (except for Linux) return only
553 8 bytes of packet payload. It means, that precise relaying of
554 ICMP in the real Internet is absolutely infeasible.
556 Moreover, Cisco "wise men" put GRE key to the third word
557 in GRE header. It makes impossible maintaining even soft state for keyed
558 GRE tunnels with enabled checksum. Tell them "thank you".
560 Well, I wonder, rfc1812 was written by Cisco employee,
561 what the hell these idiots break standards established
/* skb->data points at the embedded (returned) outer IP header. */
565 struct iphdr *iph = (struct iphdr *)skb->data;
/* Minimum GRE header: 4 bytes after the outer IP header. */
567 int grehlen = (iph->ihl << 2) + 4;
568 const int type = icmp_hdr(skb)->type;
569 const int code = icmp_hdr(skb)->code;
574 WARN_ON_ONCE(skb_shared(skb));
576 if (!pskb_may_pull(skb, grehlen))
/* Re-read after pull: pskb_may_pull may have relocated the data. */
579 iph = (struct iphdr *)skb->data;
580 p = (__be16 *)(skb->data + (iph->ihl << 2));
581 flags = *gre_flags(p);
582 gre_proto = *gre_protocol(p);
/* Optional GRE fields enlarge the header; routing/version unsupported. */
584 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
585 if (flags&(GRE_VERSION|GRE_ROUTING))
594 /* If only 8 bytes returned, keyed message will be dropped here */
595 if (!pskb_may_pull(skb, grehlen))
598 iph = (struct iphdr *)skb->data;
/* Classify the ICMP error type. */
602 case ICMP_PARAMETERPROB:
605 case ICMP_DEST_UNREACH:
608 /* Impossible event. */
609 case ICMP_PORT_UNREACH:
611 case ICMP_FRAG_NEEDED:
612 /* Soft state for pmtu is maintained by IP core but we
613 * also want to relay the message back. */
616 /* All others are translated to HOST_UNREACH.
617 rfc2003 contains "deep thoughts" about NET_UNREACH,
618 I believe they are just ether pollution. --ANK
623 case ICMP_TIME_EXCEEDED:
624 if (code != ICMP_EXC_TTL)
/* Look up the tunnel that sent the offending packet; the key (if any)
 * is the last 32-bit word of the parsed GRE header. */
630 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
632 *(((__be32 *)skb->data) + (grehlen / 4) - 1)
635 if (t == NULL || t->parms.iph.daddr == 0 ||
636 ipv4_is_multicast(t->parms.iph.daddr))
/* ttl==0 means we copied the inner TTL; TTL-exceeded is then expected. */
639 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
642 if (code == ICMP_FRAG_NEEDED) {
643 /* Invalidates pointers. */
644 icmp_err_frag(skb, t, gre_proto);
/* Rate-limit err_count bumps to one per IPTUNNEL_ERR_TIMEO. */
648 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
652 t->err_time = jiffies;
658 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
660 if (INET_ECN_is_ce(iph->tos)) {
661 __be16 protocol = skb->protocol;
662 unsigned int nw_header = skb_network_header(skb) - skb->data;
664 if (skb->dev->type == ARPHRD_ETHER
665 && skb->protocol == htons(ETH_P_8021Q)) {
666 if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
669 protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
670 nw_header += VLAN_HLEN;
673 if (protocol == htons(ETH_P_IP)) {
674 if (unlikely(!pskb_may_pull(skb, nw_header
675 + sizeof(struct iphdr))))
678 IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data));
679 } else if (protocol == htons(ETH_P_IPV6)) {
680 if (unlikely(!pskb_may_pull(skb, nw_header
681 + sizeof(struct ipv6hdr))))
684 IP6_ECN_set_ce((struct ipv6hdr *)(nw_header
691 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
694 if (skb->protocol == htons(ETH_P_IP))
695 inner = old_iph->tos;
696 else if (skb->protocol == htons(ETH_P_IPV6))
697 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
698 return INET_ECN_encapsulate(tos, inner);
/* GRE protocol receive handler: validates the GRE header (flags,
 * optional checksum/key/sequence), demuxes to the owning tunnel,
 * strips the encapsulation, updates statistics and hands the inner
 * packet back to the stack.  Sends port-unreachable when no tunnel
 * matches.
 * NOTE(review): this extract is missing lines (drops, #else branches,
 * gro/netif delivery and exit paths); comments cover only what is
 * visible. */
701 static int ipgre_rcv(struct sk_buff *skb)
709 struct ip_tunnel *tunnel;
/* 16 bytes = outer GRE header plus the largest optional field set we
 * examine up front. */
714 if (!pskb_may_pull(skb, 16))
719 flags = *gre_flags(h);
721 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
722 /* - Version must be 0.
723 - We do not support routing headers.
725 if (flags&(GRE_VERSION|GRE_ROUTING))
/* Verify the GRE checksum, reusing hardware csum when available. */
728 if (flags&GRE_CSUM) {
729 switch (skb->ip_summed) {
730 case CHECKSUM_COMPLETE:
731 csum = csum_fold(skb->csum);
737 csum = __skb_checksum_complete(skb);
738 skb->ip_summed = CHECKSUM_COMPLETE;
/* Optional key and sequence number follow the base header. */
743 key = *(__be32*)(h + offset);
747 seqno = ntohl(*(__be32*)(h + offset));
752 gre_proto = *gre_protocol(h);
755 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
756 iph->saddr, iph->daddr, key,
758 struct net_device_stats *stats;
759 #ifdef HAVE_NETDEV_STATS
760 stats = &tunnel->dev->stats;
762 stats = &tunnel->stat;
767 skb->protocol = gre_proto;
768 /* WCCP version 1 and 2 protocol decoding.
769 * - Change protocol to IP
770 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
772 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
773 skb->protocol = htons(ETH_P_IP);
/* 0x40 = IPv4 version nibble; anything else means WCCPv2 padding. */
774 if ((*(h + offset) & 0xF0) != 0x40)
/* Strip the GRE encapsulation and fix up the receive checksum. */
778 skb->mac_header = skb->network_header;
779 __pskb_pull(skb, offset);
780 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
781 skb->pkt_type = PACKET_HOST;
782 #ifdef CONFIG_NET_IPGRE_BROADCAST
783 if (ipv4_is_multicast(iph->daddr)) {
784 /* Looped back packet, drop it! */
785 if (skb_rtable(skb)->fl.iif == 0)
788 skb->pkt_type = PACKET_BROADCAST;
/* A bad checksum, or a checksum where none was expected, is an error. */
792 if (((flags&GRE_CSUM) && csum) ||
793 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
794 stats->rx_crc_errors++;
/* Enforce in-order delivery when sequence numbers are configured. */
798 if (tunnel->parms.i_flags&GRE_SEQ) {
799 if (!(flags&GRE_SEQ) ||
800 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
801 stats->rx_fifo_errors++;
805 tunnel->i_seqno = seqno + 1;
810 /* Warning: All skb pointers will be invalidated! */
811 if (tunnel->dev->type == ARPHRD_ETHER) {
812 if (!pskb_may_pull(skb, ETH_HLEN)) {
813 stats->rx_length_errors++;
819 skb->protocol = eth_type_trans(skb, tunnel->dev);
820 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
824 stats->rx_bytes += len;
825 skb->dev = tunnel->dev;
829 skb_reset_network_header(skb);
831 /* Invalidates pointers. */
832 ipgre_ecn_decapsulate(iph, skb);
/* No tunnel matched: behave like a closed port. */
838 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
847 static bool check_ipv4_address(__be32 addr)
849 if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr)
850 || ipv4_is_loopback(addr) || ipv4_is_zeronet(addr))
856 static bool ipv4_should_icmp(struct sk_buff *skb)
858 struct iphdr *old_iph = ip_hdr(skb);
860 /* Don't respond to L2 broadcast. */
861 if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
864 /* Don't respond to L3 broadcast or invalid addresses. */
865 if (!check_ipv4_address(old_iph->daddr) ||
866 !check_ipv4_address(old_iph->saddr))
869 /* Only respond to the first fragment. */
870 if (old_iph->frag_off & htons(IP_OFFSET))
873 /* Don't respond to ICMP error messages. */
874 if (old_iph->protocol == IPPROTO_ICMP) {
875 u8 icmp_type, *icmp_typep;
877 icmp_typep = skb_header_pointer(skb, (u8 *)old_iph +
878 (old_iph->ihl << 2) +
879 offsetof(struct icmphdr, type) -
880 skb->data, sizeof(icmp_type),
886 if (*icmp_typep > NR_ICMP_TYPES
887 || (*icmp_typep <= ICMP_PARAMETERPROB
888 && *icmp_typep != ICMP_ECHOREPLY
889 && *icmp_typep != ICMP_ECHO))
896 static void ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
897 unsigned int mtu, unsigned int payload_length)
899 struct iphdr *iph, *old_iph = ip_hdr(skb);
900 struct icmphdr *icmph;
903 iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
904 icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
905 payload = skb_put(nskb, payload_length);
909 iph->ihl = sizeof(struct iphdr) >> 2;
910 iph->tos = (old_iph->tos & IPTOS_TOS_MASK) |
911 IPTOS_PREC_INTERNETCONTROL;
912 iph->tot_len = htons(sizeof(struct iphdr)
913 + sizeof(struct icmphdr)
915 get_random_bytes(&iph->id, sizeof iph->id);
918 iph->protocol = IPPROTO_ICMP;
919 iph->daddr = old_iph->saddr;
920 iph->saddr = old_iph->daddr;
925 icmph->type = ICMP_DEST_UNREACH;
926 icmph->code = ICMP_FRAG_NEEDED;
927 icmph->un.gateway = htonl(mtu);
930 nskb->csum = csum_partial((u8 *)icmph, sizeof *icmph, 0);
931 nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data,
932 payload, payload_length,
934 icmph->checksum = csum_fold(nskb->csum);
937 static bool ipv6_should_icmp(struct sk_buff *skb)
939 struct ipv6hdr *old_ipv6h = ipv6_hdr(skb);
941 int payload_off = (u8 *)(old_ipv6h + 1) - skb->data;
942 u8 nexthdr = ipv6_hdr(skb)->nexthdr;
944 /* Check source address is valid. */
945 addr_type = ipv6_addr_type(&old_ipv6h->saddr);
946 if (addr_type & IPV6_ADDR_MULTICAST || addr_type == IPV6_ADDR_ANY)
949 /* Don't reply to unspecified addresses. */
950 if (ipv6_addr_type(&old_ipv6h->daddr) == IPV6_ADDR_ANY)
953 /* Don't respond to ICMP error messages. */
954 payload_off = ipv6_skip_exthdr(skb, payload_off, &nexthdr);
958 if (nexthdr == NEXTHDR_ICMP) {
959 u8 icmp_type, *icmp_typep;
961 icmp_typep = skb_header_pointer(skb, payload_off +
962 offsetof(struct icmp6hdr,
964 sizeof(icmp_type), &icmp_type);
966 if (!icmp_typep || !(*icmp_typep & ICMPV6_INFOMSG_MASK))
973 static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
974 unsigned int mtu, unsigned int payload_length)
976 struct ipv6hdr *ipv6h, *old_ipv6h = ipv6_hdr(skb);
977 struct icmp6hdr *icmp6h;
980 ipv6h = (struct ipv6hdr *)skb_put(nskb, sizeof(struct ipv6hdr));
981 icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr));
982 payload = skb_put(nskb, payload_length);
987 memset(&ipv6h->flow_lbl, 0, sizeof ipv6h->flow_lbl);
988 ipv6h->payload_len = htons(sizeof(struct icmp6hdr)
990 ipv6h->nexthdr = NEXTHDR_ICMP;
991 ipv6h->hop_limit = IPV6_DEFAULT_HOPLIMIT;
992 ipv6_addr_copy(&ipv6h->daddr, &old_ipv6h->saddr);
993 ipv6_addr_copy(&ipv6h->saddr, &old_ipv6h->daddr);
996 icmp6h->icmp6_type = ICMPV6_PKT_TOOBIG;
997 icmp6h->icmp6_code = 0;
998 icmp6h->icmp6_cksum = 0;
999 icmp6h->icmp6_mtu = htonl(mtu);
1001 nskb->csum = csum_partial((u8 *)icmp6h, sizeof *icmp6h, 0);
1002 nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_ipv6h - skb->data,
1003 payload, payload_length,
1005 icmp6h->icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
1006 sizeof(struct icmp6hdr)
1008 ipv6h->nexthdr, nskb->csum);
/* Sends a PMTU notification (ICMP frag-needed or ICMPv6 Packet Too
 * Big) back toward the sender of skb.  If the device is not attached
 * to a bridge the normal IP stack helpers are used; otherwise a
 * complete reply frame (Ethernet/VLAN + IP/IPv6 + ICMP + quoted
 * payload) is synthesized and injected as if received on dev.
 * NOTE(review): this extract is missing lines (early returns, the
 * "else" framing between the v4/v6 arms, the netif_rx delivery and
 * the final return); comments cover only the visible flow. */
1011 static bool send_frag_needed(struct sk_buff *skb, struct net_device *dev,
1014 unsigned int eth_hdr_len = ETH_HLEN;
1015 unsigned int total_length, header_length, payload_length;
1016 struct ethhdr *eh, *old_eh = eth_hdr(skb);
1017 struct sk_buff *nskb;
1018 struct net_device_stats *stats;
1020 /* Normal IP stack. */
1021 if (!dev->br_port) {
1022 if (skb->protocol == htons(ETH_P_IP)) {
1023 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
1028 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
/* Bridged path: validate that an ICMP reply is permitted at all. */
1037 if (skb->protocol == htons(ETH_P_IP)) {
1038 if (mtu < IP_MIN_MTU)
1041 if (!ipv4_should_icmp(skb))
1044 if (mtu < IPV6_MIN_MTU)
1047 /* In theory we should do PMTUD on IPv6 multicast messages but
1048 * we don't have an address to send from so just fragment. */
1049 if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST)
1052 if (!ipv6_should_icmp(skb))
/* Size the reply: headers plus as much quoted payload as allowed. */
1057 if (old_eh->h_proto == htons(ETH_P_8021Q))
1058 eth_hdr_len = VLAN_ETH_HLEN;
1060 payload_length = skb->len - eth_hdr_len;
1061 if (skb->protocol == htons(ETH_P_IP)) {
1062 header_length = sizeof(struct iphdr) + sizeof(struct icmphdr);
/* 576 = minimum IPv4 reassembly buffer; cap the reply at that. */
1063 total_length = min_t(unsigned int, header_length +
1064 payload_length, 576);
1066 header_length = sizeof(struct ipv6hdr) +
1067 sizeof(struct icmp6hdr);
1068 total_length = min_t(unsigned int, header_length +
1069 payload_length, IPV6_MIN_MTU);
1071 total_length = min(total_length, dev->mtu);
1072 payload_length = total_length - header_length;
1074 nskb = netdev_alloc_skb_ip_align(dev, eth_hdr_len + header_length
1079 /* Ethernet / VLAN */
1080 eh = (struct ethhdr *)skb_put(nskb, eth_hdr_len);
1081 memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN);
1082 memcpy(eh->h_source, dev->dev_addr, ETH_ALEN);
1083 eh->h_proto = old_eh->h_proto;
1084 if (old_eh->h_proto == htons(ETH_P_8021Q)) {
1085 struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;
1087 vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
1088 vh->h_vlan_encapsulated_proto = skb->protocol;
1090 nskb->protocol = eth_type_trans(nskb, dev);
/* Protocol-specific ICMP body. */
1093 if (skb->protocol == htons(ETH_P_IP))
1094 ipv4_build_icmp(skb, nskb, mtu, payload_length);
1096 ipv6_build_icmp(skb, nskb, mtu, payload_length);
/* Account the synthesized frame as received on this device. */
1099 #ifdef HAVE_NETDEV_STATS
1100 stats = &dev->stats;
1102 stats = &((struct ip_tunnel *)netdev_priv(dev))->stat;
1104 stats->rx_packets++;
1105 stats->rx_bytes += nskb->len;
/* Transmit path: encapsulates skb in IP+GRE and sends it via the
 * routed underlay device.  Handles VLAN-tagged gretap payloads,
 * NBMA-mode destination resolution, PMTU discovery/relaying, headroom
 * reallocation and the optional GRE key/checksum/sequence fields.
 * NOTE(review): this extract is missing many lines (the embedded
 * numbering is discontinuous): error labels, several #else branches,
 * the flowi daddr initializer, loop-detection, header construction
 * lines and the final IPTUNNEL_XMIT() are not visible.  Comments
 * below annotate only what is visible; they do not reconstruct the
 * missing control flow. */
1111 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
1113 struct ip_tunnel *tunnel = netdev_priv(dev);
1114 struct net_device_stats *stats;
1115 #ifdef HAVE_NETDEV_QUEUE_STATS
1116 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
1118 struct iphdr *old_iph;
1119 struct ipv6hdr *old_ipv6h;
1123 struct rtable *rt; /* Route to the other host */
1124 struct net_device *tdev; /* Device to other host */
1125 struct iphdr *iph; /* Our new IP header */
1126 unsigned int max_headroom; /* The extra header space needed */
1130 __be16 original_protocol;
1131 bool is_vlan = false;
1133 #ifdef HAVE_NETDEV_STATS
1134 stats = &dev->stats;
1136 stats = &tunnel->stat;
1139 WARN_ON_ONCE(skb_shared(skb));
1141 /* Validate the protocol headers before we try to use them. */
1142 original_protocol = skb->protocol;
/* Peel a VLAN tag off gretap payloads so the inner L3 header is known. */
1144 if (dev->type == ARPHRD_ETHER && skb->protocol == htons(ETH_P_8021Q)) {
1145 if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
1148 skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
1149 skb_set_network_header(skb, VLAN_ETH_HLEN);
1153 old_iph = ip_hdr(skb);
1154 old_ipv6h = ipv6_hdr(skb);
/* Make sure the inner L3 header is actually present. */
1156 if (skb->protocol == htons(ETH_P_IP)) {
1157 if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
1158 + sizeof(struct iphdr) - skb->data)))
1160 } else if (skb->protocol == htons(ETH_P_IPV6)) {
1161 if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
1162 + sizeof(struct ipv6hdr) - skb->data)))
1166 if (dev->type == ARPHRD_ETHER)
1167 IPCB(skb)->flags = 0;
/* A prebuilt header (hard_header path) carries its own outer iph. */
1169 #ifdef HAVE_NETDEV_HEADER_OPS
1170 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
1172 if (dev->hard_header && dev->type == ARPHRD_IPGRE) {
1175 tiph = (struct iphdr *)skb->data;
1177 gre_hlen = tunnel->hlen;
1178 tiph = &tunnel->parms.iph;
/* NBMA mode: no configured destination, derive it from the route or
 * the IPv6 neighbour entry. */
1181 if ((dst = tiph->daddr) == 0) {
1184 if (skb_dst(skb) == NULL) {
1185 stats->tx_fifo_errors++;
1189 if (skb->protocol == htons(ETH_P_IP)) {
1190 rt = skb_rtable(skb);
1191 if ((dst = rt->rt_gateway) == 0)
1195 else if (skb->protocol == htons(ETH_P_IPV6)) {
1196 struct in6_addr *addr6;
1198 struct neighbour *neigh = skb_dst(skb)->neighbour;
1203 addr6 = (struct in6_addr *)&neigh->primary_key;
1204 addr_type = ipv6_addr_type(addr6);
1206 if (addr_type == IPV6_ADDR_ANY) {
1207 addr6 = &ipv6_hdr(skb)->daddr;
1208 addr_type = ipv6_addr_type(addr6);
/* Only v4-compatible IPv6 addresses can yield an IPv4 endpoint. */
1211 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
1214 dst = addr6->s6_addr32[3];
/* Inherit the ToS/DSCP from the inner packet when configured. */
1224 if (skb->protocol == htons(ETH_P_IP))
1226 else if (skb->protocol == htons(ETH_P_IPV6))
1227 tos = ipv6_get_dsfield(ipv6_hdr(skb));
/* Route the outer packet. */
1231 struct flowi fl = { .oif = tunnel->parms.link,
1234 .saddr = tiph->saddr,
1235 .tos = RT_TOS(tos) } },
1236 .proto = IPPROTO_GRE };
1237 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
1238 stats->tx_carrier_errors++;
1242 tdev = rt->u.dst.dev;
/* Routing back to ourselves would loop. */
1246 stats->collisions++;
1250 df = tiph->frag_off;
/* Effective inner MTU: underlay PMTU minus our encapsulation. */
1252 mtu = dst_mtu(&rt->u.dst) - tunnel_hard_header_len(dev)
1253 - (is_vlan ? VLAN_HLEN : 0)
1256 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
1258 if (skb->protocol == htons(ETH_P_IP))
1259 mtu = max(mtu, IP_MIN_MTU);
1260 if (skb->protocol == htons(ETH_P_IPV6))
1261 mtu = max(mtu, IPV6_MIN_MTU);
1264 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
/* PMTU enforcement: IPv4 honours DF, IPv6 always above the minimum. */
1266 if (skb->protocol == htons(ETH_P_IP)) {
1267 df |= (old_iph->frag_off&htons(IP_DF));
1269 if ((old_iph->frag_off&htons(IP_DF)) &&
1270 mtu < ntohs(old_iph->tot_len)) {
1271 if (send_frag_needed(skb, dev, mtu)) {
1276 } else if (skb->protocol == htons(ETH_P_IPV6)) {
1277 unsigned int packet_length = skb->len
1278 - tunnel_hard_header_len(dev)
1279 - (is_vlan ? VLAN_HLEN : 0);
1282 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
1284 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
1285 if ((tunnel->parms.iph.daddr &&
1286 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
1287 rt6->rt6i_dst.plen == 128) {
1288 rt6->rt6i_flags |= RTF_MODIFIED;
1289 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
1294 /* IPv6 requires PMTUD if the packet is above the minimum MTU.*/
1295 if (packet_length > IPV6_MIN_MTU)
1298 if (mtu < packet_length - tunnel->hlen + gre_hlen) {
1299 if (send_frag_needed(skb, dev, mtu)) {
/* Rate-limited link-failure reporting recorded by ipgre_err(). */
1306 if (tunnel->err_count > 0) {
1307 if (time_before(jiffies,
1308 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
1309 tunnel->err_count--;
1311 dst_link_failure(skb);
1313 tunnel->err_count = 0;
/* Ensure enough headroom for the outer headers; reallocate if not. */
1316 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
1318 if (skb_headroom(skb) < max_headroom ||
1319 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
1320 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
1323 #ifdef HAVE_NETDEV_QUEUE_STATS
1326 stats->tx_dropped++;
1329 return NETDEV_TX_OK;
1332 skb_set_owner_w(new_skb, skb->sk);
1335 old_iph = ip_hdr(skb);
/* Prepend the outer IP+GRE header. */
1338 skb_reset_transport_header(skb);
1339 skb_push(skb, gre_hlen);
1340 skb_reset_network_header(skb);
1341 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1342 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
1346 skb_dst_set(skb, &rt->u.dst);
1349 * Push down and install the GRE header.
1354 iph->ihl = sizeof(struct iphdr) >> 2;
1356 iph->protocol = IPPROTO_GRE;
1357 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
1358 iph->daddr = rt->rt_dst;
1359 iph->saddr = rt->rt_src;
1361 /* Allow our local IP stack to fragment the outer packet even if the
1362 * DF bit is set. If we got this far there is nothing more that we
1363 * can do with the inner packet. */
/* TTL: configured value, else inherit from the inner packet/route. */
1366 if ((iph->ttl = tiph->ttl) == 0) {
1367 if (skb->protocol == htons(ETH_P_IP))
1368 iph->ttl = old_iph->ttl;
1369 else if (skb->protocol == htons(ETH_P_IPV6))
1370 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
1372 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
/* GRE header: flags, protocol, then optional seq/key/csum fields,
 * filled back-to-front from the end of the header. */
1375 *gre_flags(iph + 1) = tunnel->parms.o_flags;
1376 *gre_protocol(iph + 1) = (dev->type == ARPHRD_ETHER) ?
1377 htons(ETH_P_TEB) : original_protocol;
1379 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
1380 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
1382 if (tunnel->parms.o_flags&GRE_SEQ) {
1384 *ptr = htonl(tunnel->o_seqno);
1387 if (tunnel->parms.o_flags&GRE_KEY) {
1388 *ptr = tunnel->parms.o_key;
1391 if (tunnel->parms.o_flags&GRE_CSUM) {
1393 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
1400 return NETDEV_TX_OK;
1403 dst_link_failure(skb);
1408 return NETDEV_TX_OK;
/* Binds the tunnel to its underlay device: routes to the configured
 * destination to discover the output device, then derives the tunnel
 * device's headroom, GRE header length (tunnel->hlen) and MTU from it.
 * Returns the computed MTU.
 * NOTE(review): this extract is missing lines (variable declarations
 * for rt/iph, ip_rt_put, #else/#endif framing and the returns);
 * comments annotate only the visible flow. */
1411 static int ipgre_tunnel_bind_dev(struct net_device *dev)
1413 struct net_device *tdev = NULL;
1414 struct ip_tunnel *tunnel;
1416 int hlen = LL_MAX_HEADER;
1417 int mtu = ETH_DATA_LEN;
/* Base GRE overhead: outer IP header plus 4-byte GRE header. */
1418 int addend = sizeof(struct iphdr) + 4;
1420 tunnel = netdev_priv(dev);
1421 iph = &tunnel->parms.iph;
1423 /* Guess output device to choose reasonable mtu and needed_headroom */
1426 struct flowi fl = { .oif = tunnel->parms.link,
1428 { .daddr = iph->daddr,
1429 .saddr = iph->saddr,
1430 .tos = RT_TOS(iph->tos) } },
1431 .proto = IPPROTO_GRE };
1433 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
1434 tdev = rt->u.dst.dev;
1438 if (dev->type != ARPHRD_ETHER)
1439 dev->flags |= IFF_POINTOPOINT;
/* No route: fall back to the explicitly configured link, if any. */
1442 if (!tdev && tunnel->parms.link)
1443 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
1446 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
1447 hlen = tdev->hard_header_len + tdev->needed_headroom;
1449 hlen = tdev->hard_header_len;
1453 dev->iflink = tunnel->parms.link;
1455 /* Precalculate GRE options length */
1456 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1457 if (tunnel->parms.o_flags&GRE_CSUM)
1459 if (tunnel->parms.o_flags&GRE_KEY)
1461 if (tunnel->parms.o_flags&GRE_SEQ)
1464 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
1465 dev->needed_headroom = hlen + addend;
1467 dev->hard_header_len = hlen + addend;
1469 mtu -= tunnel_hard_header_len(dev) + addend;
/* hlen on the tunnel records the full outer IP+GRE header size. */
1470 tunnel->hlen = addend;
1472 if (mtu < IP_MIN_MTU)
1475 /* If we could be connected to a bridge set the normal Ethernet MTU
1476 * since all devices on the bridge are required to have the same MTU.
1477 * Even though this isn't our optimal MTU we can handle it. */
1478 if (dev->type == ARPHRD_ETHER)
/* Tunnel configuration ioctl handler.  Handles get/add/change/delete of
 * GRE (and gretap) tunnels: validates user-supplied parameters, locates
 * or creates the tunnel, updates its addresses/keys/ttl/tos, and rebinds
 * the device when the underlying link changes. */
1485 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1488 struct ip_tunnel_parm p;
1489 struct ip_tunnel *t;
1490 struct net *net = dev_net(dev);
1491 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1492 int add_tunnel, gretap;
/* GET: on the fallback device, look up the tunnel the user described;
 * on a real tunnel device, report its own parameters. */
1497 if (dev == ign->fb_tunnel_dev) {
1498 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1502 t = ipgre_tunnel_locate(net, &p, false, 0);
1505 t = netdev_priv(dev);
1506 memcpy(&p, &t->parms, sizeof(p));
1507 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
/* ADD/CHG: privileged; reject malformed outer IP headers and GRE
 * version/routing bits we do not implement. */
1516 if (!capable(CAP_NET_ADMIN))
1520 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1524 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1525 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1526 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1529 add_tunnel = (cmd == SIOCADDTUNNEL || cmd == SIOCADDGRETAP);
1530 gretap = (cmd == SIOCADDGRETAP || cmd == SIOCCHGGRETAP);
1532 if (!(p.i_flags&GRE_KEY))
1534 if (!(p.o_flags&GRE_KEY))
1537 t = ipgre_tunnel_locate(net, &p, gretap, add_tunnel);
1539 if (dev != ign->fb_tunnel_dev && !add_tunnel) {
1541 if (t->dev != dev) {
1546 unsigned nflags = 0;
1548 t = netdev_priv(dev);
/* Device flags must match the new destination type (multicast vs
 * unicast vs unset); changing that requires delete+re-add. */
1550 if (ipv4_is_multicast(p.iph.daddr))
1551 nflags = IFF_BROADCAST;
1552 else if (p.iph.daddr)
1553 nflags = IFF_POINTOPOINT;
1555 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
/* Addresses/keys participate in the hash lookup, so the tunnel
 * must be unlinked and relinked around the update. */
1559 ipgre_tunnel_unlink(ign, t);
1560 t->parms.iph.saddr = p.iph.saddr;
1561 t->parms.iph.daddr = p.iph.daddr;
1562 t->parms.i_key = p.i_key;
1563 t->parms.o_key = p.o_key;
1564 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1565 memcpy(dev->broadcast, &p.iph.daddr, 4);
1566 ipgre_tunnel_link(ign, t);
1567 netdev_state_change(dev);
1574 t->parms.iph.ttl = p.iph.ttl;
1575 t->parms.iph.tos = p.iph.tos;
1576 t->parms.iph.frag_off = p.iph.frag_off;
1577 if (t->parms.link != p.link) {
1578 t->parms.link = p.link;
/* New underlying link => recompute headroom and MTU. */
1579 dev->mtu = ipgre_tunnel_bind_dev(dev);
1580 netdev_state_change(dev);
1583 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1586 err = (add_tunnel ? -ENOBUFS : -ENOENT);
/* DEL: privileged; the fallback tunnel itself may never be deleted. */
1591 if (!capable(CAP_NET_ADMIN))
1594 if (dev == ign->fb_tunnel_dev) {
1596 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1599 if ((t = ipgre_tunnel_locate(net, &p, false, 0)) == NULL)
1602 if (t == netdev_priv(ign->fb_tunnel_dev))
1606 unregister_netdevice(dev);
/* Compat for pre-2.6.22 kernels without netdev->stats: return the
 * per-tunnel stats structure stored in the device private area. */
1618 #ifndef HAVE_NETDEV_STATS
1619 static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
1621 return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
/* Validate and apply a new MTU.  Upper bound leaves room for the link
 * header plus the precomputed GRE overhead (tunnel->hlen). */
1625 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1627 struct ip_tunnel *tunnel = netdev_priv(dev);
1628 if (new_mtu < IP_MIN_MTU ||
1629 new_mtu > 0xFFF8 - tunnel_hard_header_len(dev) - tunnel->hlen)
1635 /* Nice toy. Unfortunately, useless in real life :-)
1636 It allows one to construct a virtual multiprotocol broadcast "LAN"
1637 over the Internet, provided multicast routing is tuned.
1640 I have no idea whether this bicycle was invented before me,
1641 so I had to set ARPHRD_IPGRE to a random value.
1642 I have an impression, that Cisco could make something similar,
1643 but this feature is apparently missing in IOS<=11.2(8).
1645 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1646 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1648 ping -t 255 224.66.66.66
1650 If nobody answers, mbone does not work.
1652 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1653 ip addr add 10.66.66.<somewhat>/24 dev Universe
1654 ifconfig Universe up
1655 ifconfig Universe add fe80::<Your_real_addr>/10
1656 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1659 ftp fec0:6666:6666::193.233.7.65
/* hard_header/header_ops .create hook: prepend the outer IP + GRE header
 * for broadcast-mode tunnels.  daddr/saddr are 4-byte IPv4 addresses
 * used as the "hardware" addresses of this link type.  Two prototypes:
 * const-qualified for kernels with header_ops, legacy otherwise. */
1664 #ifdef HAVE_NETDEV_HEADER_OPS
1665 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1666 unsigned short type,
1667 const void *daddr, const void *saddr, unsigned len)
1669 static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
1670 void *daddr, void *saddr, unsigned len)
1673 struct ip_tunnel *t = netdev_priv(dev);
/* Reserve the full precomputed tunnel header in front of the payload. */
1674 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1675 __be16 *p = (__be16*)(iph+1);
/* Template outer IP header, then GRE flags immediately after it. */
1677 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1678 p[0] = t->parms.o_flags;
1682 * Set the source hardware address.
1686 memcpy(&iph->saddr, saddr, 4);
1689 memcpy(&iph->daddr, daddr, 4);
1692 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
/* header_ops .parse hook: report the outer source IPv4 address as this
 * link's 4-byte "hardware" address. */
1698 #ifdef HAVE_NETDEV_HEADER_OPS
1699 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1701 static int ipgre_header_parse(struct sk_buff *skb, unsigned char *haddr)
1704 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1705 memcpy(haddr, &iph->saddr, 4);
/* header_ops table used by broadcast-capable GRE tunnels (>= 2.6.24). */
1709 #ifdef HAVE_NETDEV_HEADER_OPS
1710 static const struct header_ops ipgre_header_ops = {
1711 .create = ipgre_header,
1712 .parse = ipgre_header_parse,
/* ndo_open for broadcast GRE: if the remote endpoint is multicast, route
 * toward it to find the physical device and join the multicast group
 * there, remembering the ifindex in t->mlink for ipgre_close(). */
1716 #ifdef CONFIG_NET_IPGRE_BROADCAST
1717 static int ipgre_open(struct net_device *dev)
1719 struct ip_tunnel *t = netdev_priv(dev);
1721 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1722 struct flowi fl = { .oif = t->parms.link,
1724 { .daddr = t->parms.iph.daddr,
1725 .saddr = t->parms.iph.saddr,
1726 .tos = RT_TOS(t->parms.iph.tos) } },
1727 .proto = IPPROTO_GRE };
1729 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1730 return -EADDRNOTAVAIL;
/* From here on, dev is the underlying (routed-to) device. */
1731 dev = rt->u.dst.dev;
1733 if (__in_dev_get_rtnl(dev) == NULL)
1734 return -EADDRNOTAVAIL;
1735 t->mlink = dev->ifindex;
1736 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
/* ndo_stop counterpart of ipgre_open(): leave the multicast group on the
 * device recorded in t->mlink, if we joined one. */
1741 static int ipgre_close(struct net_device *dev)
1743 struct ip_tunnel *t = netdev_priv(dev);
1745 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1746 struct in_device *in_dev;
1747 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1749 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
/* ethtool get_drvinfo: identify the driver, OVS build version, and
 * whether this device is a layer-2 "gretap" or layer-3 "gre" tunnel. */
1758 static void ethtool_getinfo(struct net_device *dev,
1759 struct ethtool_drvinfo *info)
1761 strcpy(info->driver, "ip_gre");
1762 strcpy(info->version, "Open vSwitch "VERSION BUILDNR);
1763 strcpy(info->bus_info, dev->type == ARPHRD_ETHER ? "gretap" : "gre");
/* Minimal ethtool ops: only drvinfo is provided. */
1766 static struct ethtool_ops ethtool_ops = {
1767 .get_drvinfo = ethtool_getinfo,
/* net_device_ops for layer-3 GRE tunnel devices (kernels that have the
 * net_device_ops structure); open/stop only exist for broadcast mode. */
1770 #ifdef HAVE_NET_DEVICE_OPS
1771 static const struct net_device_ops ipgre_netdev_ops = {
1772 .ndo_init = ipgre_tunnel_init,
1773 .ndo_uninit = ipgre_tunnel_uninit,
1774 #ifdef CONFIG_NET_IPGRE_BROADCAST
1775 .ndo_open = ipgre_open,
1776 .ndo_stop = ipgre_close,
1778 .ndo_start_xmit = ipgre_tunnel_xmit,
1779 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1780 .ndo_change_mtu = ipgre_tunnel_change_mtu,
/* Device setup callback for layer-3 GRE tunnels: wire up the device ops
 * (via net_device_ops or the legacy per-field hooks), set link type,
 * default headroom/MTU for IP+GRE overhead, and flags (NOARP, netns
 * local, keep dst on xmit for ipgre_tunnel_xmit). */
1784 static void ipgre_tunnel_setup(struct net_device *dev)
1786 #ifdef HAVE_NET_DEVICE_OPS
1787 dev->netdev_ops = &ipgre_netdev_ops;
1789 dev->init = ipgre_tunnel_init;
1790 dev->uninit = ipgre_tunnel_uninit;
1791 dev->hard_start_xmit = ipgre_tunnel_xmit;
1792 #ifndef HAVE_NETDEV_STATS
1793 dev->get_stats = ipgre_tunnel_get_stats;
1795 dev->do_ioctl = ipgre_tunnel_ioctl;
1796 dev->change_mtu = ipgre_tunnel_change_mtu;
1797 #endif /* HAVE_NET_DEVICE_OPS */
1798 dev->destructor = free_netdev;
1800 dev->type = ARPHRD_IPGRE;
/* Default overhead assumes plain IP + 4-byte GRE header. */
1801 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
1802 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1804 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1806 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1807 dev->flags = IFF_NOARP;
1810 dev->features |= NETIF_F_NETNS_LOCAL;
1811 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
/* Fixed mojibake: "&ethtool_ops" had been mangled into an HTML
 * entity ("ð"), which cannot compile. */
1813 SET_ETHTOOL_OPS(dev, &ethtool_ops);
/* ndo_init for layer-3 GRE tunnels: name the tunnel after the device,
 * publish the endpoint addresses as dev_addr/broadcast, and install
 * header ops.  Multicast destinations additionally get broadcast-mode
 * flags and open/stop hooks (legacy kernels only). */
1816 static int ipgre_tunnel_init(struct net_device *dev)
1818 struct ip_tunnel *tunnel;
1821 tunnel = netdev_priv(dev);
1822 iph = &tunnel->parms.iph;
1825 strcpy(tunnel->parms.name, dev->name);
/* The 4-byte tunnel endpoints double as the link-layer addresses. */
1827 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1828 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1831 #ifdef CONFIG_NET_IPGRE_BROADCAST
1832 if (ipv4_is_multicast(iph->daddr)) {
1835 dev->flags = IFF_BROADCAST;
1836 #ifdef HAVE_NETDEV_HEADER_OPS
1837 dev->header_ops = &ipgre_header_ops;
1839 dev->hard_header = ipgre_header;
1840 dev->hard_header_parse = ipgre_header_parse;
1842 #ifndef HAVE_NET_DEVICE_OPS
1843 dev->open = ipgre_open;
1844 dev->stop = ipgre_close;
1849 #ifdef HAVE_NETDEV_HEADER_OPS
1850 dev->header_ops = &ipgre_header_ops;
1852 dev->hard_header = ipgre_header;
1853 dev->hard_header_parse = ipgre_header_parse;
/* Initialize the per-netns fallback ("fb") tunnel device and register it
 * in the wildcard hash slot.  Return type differs by kernel generation:
 * void when called directly, int when used as the legacy dev->init. */
1860 #ifdef HAVE_NET_DEVICE_OPS
1861 static void ipgre_fb_tunnel_init(struct net_device *dev)
1863 static int ipgre_fb_tunnel_init(struct net_device *dev)
1866 struct ip_tunnel *tunnel = netdev_priv(dev);
1867 struct iphdr *iph = &tunnel->parms.iph;
1868 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1871 strcpy(tunnel->parms.name, dev->name);
1874 iph->protocol = IPPROTO_GRE;
/* Minimal header: outer IP + 4-byte GRE flags/protocol, no options. */
1876 tunnel->hlen = sizeof(struct iphdr) + 4;
/* Wildcard slot: the fb device catches otherwise-unmatched GRE. */
1879 ign->tunnels_wc[0] = tunnel;
1881 #ifndef HAVE_NET_DEVICE_OPS
/* IPPROTO_GRE protocol handler; const-qualified on kernels >= 2.6.32
 * where inet_add_protocol() takes a const pointer. */
1886 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32)
1887 static struct net_protocol ipgre_protocol = {
1889 static const struct net_protocol ipgre_protocol = {
1891 .handler = ipgre_rcv,
1892 .err_handler = ipgre_err,
1893 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
/* Queue every tunnel device in this netns for unregistration.  Walks all
 * four priority hash tables (keyed/unkeyed x addressed/wildcard) and all
 * hash buckets. */
1898 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1902 for (prio = 0; prio < 4; prio++) {
1904 for (h = 0; h < HASH_SIZE; h++) {
1905 struct ip_tunnel *t = ign->tunnels[prio][h];
1908 unregister_netdevice_queue(t->dev, head);
/* Per-netns init: allocate, initialize, and register the fallback GRE
 * device; frees it again on registration failure. */
1915 static int ipgre_init_net(struct net *net)
1917 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1920 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), GRE_IOCTL_DEVICE,
1921 ipgre_tunnel_setup);
1922 if (!ign->fb_tunnel_dev) {
1926 dev_net_set(ign->fb_tunnel_dev, net);
/* With net_device_ops, init directly; otherwise via legacy dev->init. */
1928 #ifdef HAVE_NET_DEVICE_OPS
1929 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1931 ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1933 #ifndef GRE_IOCTL_ONLY
1934 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1937 if ((err = register_netdev(ign->fb_tunnel_dev)))
/* Error path: undo the allocation. */
1943 free_netdev(ign->fb_tunnel_dev);
/* Per-netns teardown: collect all tunnels onto a list and unregister
 * them in one batch. */
1948 static void ipgre_exit_net(struct net *net)
1950 struct ipgre_net *ign;
1953 ign = net_generic(net, ipgre_net_id);
1955 ipgre_destroy_tunnels(ign, &list);
1956 unregister_netdevice_many(&list);
/* Per-network-namespace lifecycle hooks for this module. */
1960 static struct pernet_operations ipgre_net_ops = {
1961 .init = ipgre_init_net,
1962 .exit = ipgre_exit_net,
1963 .id = &ipgre_net_id,
1964 .size = sizeof(struct ipgre_net),
/* ndo_init for gretap (Ethernet-over-GRE) devices: record the name and
 * bind to the underlying device to size headroom/MTU. */
1967 static int ipgre_tap_init(struct net_device *dev)
1969 struct ip_tunnel *tunnel;
1971 tunnel = netdev_priv(dev);
1974 strcpy(tunnel->parms.name, dev->name);
1976 ipgre_tunnel_bind_dev(dev);
/* net_device_ops for gretap devices; Ethernet MAC helpers are used since
 * these present as ARPHRD_ETHER links. */
1981 #ifdef HAVE_NET_DEVICE_OPS
1982 static const struct net_device_ops ipgre_tap_netdev_ops = {
1983 .ndo_init = ipgre_tap_init,
1984 .ndo_uninit = ipgre_tunnel_uninit,
1985 .ndo_start_xmit = ipgre_tunnel_xmit,
1986 .ndo_set_mac_address = eth_mac_addr,
1987 .ndo_validate_addr = eth_validate_addr,
1988 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1989 .ndo_change_mtu = ipgre_tunnel_change_mtu,
/* Device setup callback for gretap devices: install the tap device ops
 * (net_device_ops or legacy per-field hooks), mark the device netns
 * local with no tx queue, and attach ethtool ops. */
1993 static void ipgre_tap_setup(struct net_device *dev)
1997 #ifdef HAVE_NET_DEVICE_OPS
1998 dev->netdev_ops = &ipgre_tap_netdev_ops;
2000 dev->init = ipgre_tap_init;
2001 dev->uninit = ipgre_tunnel_uninit;
2002 dev->hard_start_xmit = ipgre_tunnel_xmit;
2003 #ifndef HAVE_NETDEV_STATS
2004 dev->get_stats = ipgre_tunnel_get_stats;
2006 dev->do_ioctl = ipgre_tunnel_ioctl;
2007 dev->change_mtu = ipgre_tunnel_change_mtu;
2008 #endif /* HAVE_NET_DEVICE_OPS */
2009 dev->destructor = free_netdev;
2012 dev->features |= NETIF_F_NETNS_LOCAL;
2013 dev->tx_queue_len = 0;
/* Fixed mojibake: "&ethtool_ops" had been mangled into an HTML
 * entity ("ð"), which cannot compile. */
2015 SET_ETHTOOL_OPS(dev, &ethtool_ops);
/* rtnl validate for plain GRE links: reject GRE flag combinations we do
 * not implement (version != 0 or source routing). */
2018 #ifndef GRE_IOCTL_ONLY
2019 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
2027 if (data[IFLA_GRE_IFLAGS])
2028 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
2029 if (data[IFLA_GRE_OFLAGS])
2030 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
2031 if (flags & (GRE_VERSION|GRE_ROUTING))
/* rtnl validate for gretap links: additionally require a well-formed
 * unicast MAC (when given) and check the remote address, then defer to
 * the common GRE validation. */
2037 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
2041 if (tb[IFLA_ADDRESS]) {
2042 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
2044 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
2045 return -EADDRNOTAVAIL;
2051 if (data[IFLA_GRE_REMOTE]) {
2052 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
2058 return ipgre_tunnel_validate(tb, data);
/* Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm.
 * Unspecified attributes leave the zeroed defaults; path-MTU discovery
 * (DF on the outer header) defaults to enabled. */
2061 static void ipgre_netlink_parms(struct nlattr *data[],
2062 struct ip_tunnel_parm *parms)
2064 memset(parms, 0, sizeof(*parms));
2066 parms->iph.protocol = IPPROTO_GRE;
2071 if (data[IFLA_GRE_LINK])
2072 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
2074 if (data[IFLA_GRE_IFLAGS])
2075 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
2077 if (data[IFLA_GRE_OFLAGS])
2078 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
2080 if (data[IFLA_GRE_IKEY])
2081 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
2083 if (data[IFLA_GRE_OKEY])
2084 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
2086 if (data[IFLA_GRE_LOCAL])
2087 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
2089 if (data[IFLA_GRE_REMOTE])
2090 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
2092 if (data[IFLA_GRE_TTL])
2093 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
2095 if (data[IFLA_GRE_TOS])
2096 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
/* PMTU discovery on unless explicitly disabled. */
2098 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
2099 parms->iph.frag_off = htons(IP_DF);
/* rtnl newlink: create a GRE/gretap device from netlink attributes.
 * Rejects duplicates of an existing tunnel, assigns a random MAC for
 * gretap devices without one, binds to the underlying device, then
 * registers and links the new tunnel.  The src_net parameter only exists
 * on kernels >= 2.6.33. */
2102 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33)
2103 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
2104 struct nlattr *data[])
2106 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
2107 struct nlattr *data[])
2110 struct ip_tunnel *nt;
2111 struct net *net = dev_net(dev);
2112 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
2116 nt = netdev_priv(dev);
2117 ipgre_netlink_parms(data, &nt->parms);
/* Refuse to create a second tunnel with identical parameters. */
2119 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
2122 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
2123 random_ether_addr(dev->dev_addr);
2125 mtu = ipgre_tunnel_bind_dev(dev);
2129 err = register_netdevice(dev);
2134 ipgre_tunnel_link(ign, nt);
/* rtnl changelink: update an existing tunnel's parameters.  The fallback
 * device cannot be modified.  Fields that feed the tunnel hash lookup
 * (addresses, i_key) require an unlink/relink; a link change triggers a
 * device rebind to refresh headroom/MTU. */
2140 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
2141 struct nlattr *data[])
2143 struct ip_tunnel *t, *nt;
2144 struct net *net = dev_net(dev);
2145 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
2146 struct ip_tunnel_parm p;
2149 if (dev == ign->fb_tunnel_dev)
2152 nt = netdev_priv(dev);
2153 ipgre_netlink_parms(data, &p);
2155 t = ipgre_tunnel_locate(net, &p, false, 0);
/* Non-Ethernet tunnels must keep a consistent destination type
 * (broadcast vs point-to-point) — changing it is not allowed here. */
2163 if (dev->type != ARPHRD_ETHER) {
2164 unsigned nflags = 0;
2166 if (ipv4_is_multicast(p.iph.daddr))
2167 nflags = IFF_BROADCAST;
2168 else if (p.iph.daddr)
2169 nflags = IFF_POINTOPOINT;
2171 if ((dev->flags ^ nflags) &
2172 (IFF_POINTOPOINT | IFF_BROADCAST))
/* Hash-relevant fields change: rehash via unlink + link. */
2176 ipgre_tunnel_unlink(ign, t);
2177 t->parms.iph.saddr = p.iph.saddr;
2178 t->parms.iph.daddr = p.iph.daddr;
2179 t->parms.i_key = p.i_key;
2180 if (dev->type != ARPHRD_ETHER) {
2181 memcpy(dev->dev_addr, &p.iph.saddr, 4);
2182 memcpy(dev->broadcast, &p.iph.daddr, 4);
2184 ipgre_tunnel_link(ign, t);
2185 netdev_state_change(dev);
2188 t->parms.o_key = p.o_key;
2189 t->parms.iph.ttl = p.iph.ttl;
2190 t->parms.iph.tos = p.iph.tos;
2191 t->parms.iph.frag_off = p.iph.frag_off;
2193 if (t->parms.link != p.link) {
2194 t->parms.link = p.link;
2195 mtu = ipgre_tunnel_bind_dev(dev);
2198 netdev_state_change(dev);
/* rtnl get_size: upper bound on the netlink attribute payload emitted by
 * ipgre_fill_info(), one term per IFLA_GRE_* attribute. */
2204 static size_t ipgre_get_size(const struct net_device *dev)
2209 /* IFLA_GRE_IFLAGS */
2211 /* IFLA_GRE_OFLAGS */
2217 /* IFLA_GRE_LOCAL */
2219 /* IFLA_GRE_REMOTE */
2225 /* IFLA_GRE_PMTUDISC */
/* rtnl fill_info: dump the tunnel's current parameters as IFLA_GRE_*
 * attributes.  The NLA_PUT_* macros jump to a failure label on overflow
 * (label outside this excerpt). */
2230 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
2232 struct ip_tunnel *t = netdev_priv(dev);
2233 struct ip_tunnel_parm *p = &t->parms;
2235 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
2236 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
2237 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
2238 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
2239 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
2240 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
2241 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
2242 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
2243 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
/* PMTUDISC reported as a boolean derived from the outer DF bit. */
2244 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
/* Netlink attribute validation policy for IFLA_GRE_* attributes. */
2252 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
2253 [IFLA_GRE_LINK] = { .type = NLA_U32 },
2254 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
2255 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
2256 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
2257 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
2258 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
2259 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
2260 [IFLA_GRE_TTL] = { .type = NLA_U8 },
2261 [IFLA_GRE_TOS] = { .type = NLA_U8 },
2262 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
/* rtnl_link_ops for plain (layer-3) "gre" links. */
2265 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
2267 .maxtype = IFLA_GRE_MAX,
2268 .policy = ipgre_policy,
2269 .priv_size = sizeof(struct ip_tunnel),
2270 .setup = ipgre_tunnel_setup,
2271 .validate = ipgre_tunnel_validate,
2272 .newlink = ipgre_newlink,
2273 .changelink = ipgre_changelink,
2274 .get_size = ipgre_get_size,
2275 .fill_info = ipgre_fill_info,
/* rtnl_link_ops for Ethernet-over-GRE "gretap" links; shares the policy
 * and newlink/changelink paths with plain GRE, differing in setup and
 * validation. */
2278 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
2280 .maxtype = IFLA_GRE_MAX,
2281 .policy = ipgre_policy,
2282 .priv_size = sizeof(struct ip_tunnel),
2283 .setup = ipgre_tap_setup,
2284 .validate = ipgre_tap_validate,
2285 .newlink = ipgre_newlink,
2286 .changelink = ipgre_changelink,
2287 .get_size = ipgre_get_size,
2288 .fill_info = ipgre_fill_info,
2293 * And now the modules code and kernel interface.
/* Module init: register the IPPROTO_GRE protocol handler, the per-netns
 * operations, and (unless GRE_IOCTL_ONLY) the rtnl link ops for "gre"
 * and "gretap", unwinding each registration on failure.
 * NOTE(review): the protocol handler is registered before the pernet
 * device — confirm no GRE packet can arrive before fb devices exist. */
2296 static int __init ipgre_init(void)
2300 printk(KERN_INFO "Open vSwitch GRE over IPv4, built "__DATE__" "
2303 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
2304 printk(KERN_INFO "ipgre init: can't add protocol\n");
2308 err = register_pernet_device(&ipgre_net_ops);
2310 goto pernet_device_failed;
2312 #ifndef GRE_IOCTL_ONLY
2313 err = rtnl_link_register(&ipgre_link_ops);
2315 goto rtnl_link_failed;
2317 err = rtnl_link_register(&ipgre_tap_ops);
2319 goto tap_ops_failed;
/* Error unwind: undo registrations in reverse order. */
2325 #ifndef GRE_IOCTL_ONLY
2327 rtnl_link_unregister(&ipgre_link_ops);
2329 unregister_pernet_device(&ipgre_net_ops);
2331 pernet_device_failed:
2332 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
/* Module exit: unregister in reverse order of ipgre_init(). */
2337 static void __exit ipgre_fini(void)
2339 #ifndef GRE_IOCTL_ONLY
2340 rtnl_link_unregister(&ipgre_tap_ops);
2341 rtnl_link_unregister(&ipgre_link_ops);
2343 unregister_pernet_device(&ipgre_net_ops);
2344 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
2345 printk(KERN_INFO "ipgre close: can't remove protocol\n");
/* Module metadata; the rtnl aliases let "ip link add type gre|gretap"
 * autoload this module. */
2348 module_init(ipgre_init);
2349 module_exit(ipgre_fini);
2350 MODULE_DESCRIPTION("GRE over IPv4 tunneling driver");
2351 MODULE_LICENSE("GPL");
2352 #ifndef GRE_IOCTL_ONLY
2353 MODULE_ALIAS_RTNL_LINK("gre");
2354 MODULE_ALIAS_RTNL_LINK("gretap");