datapath: backport: ip_tunnel_core: iptunnel_handle_offloads returns int and doesn...
[cascardo/ovs.git] / datapath / linux / compat / geneve.c
1 /*
2  * GENEVE: Generic Network Virtualization Encapsulation
3  *
4  * Copyright (c) 2015 Red Hat, Inc.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 #include <linux/netdevice.h>
16 #include <linux/etherdevice.h>
17 #include <linux/hash.h>
18 #include <linux/if_link.h>
19 #include <linux/if_vlan.h>
20
21 #include <net/dst_metadata.h>
22 #include <net/net_namespace.h>
23 #include <net/netns/generic.h>
24 #include <net/rtnetlink.h>
25 #include <net/geneve.h>
26 #include <net/protocol.h>
27
28 #include "gso.h"
29 #include "vport-netdev.h"
30 #include "compat.h"
31
32 #ifndef HAVE_METADATA_DST
/* Version string reported through the ethtool drvinfo callback. */
#define GENEVE_NETDEV_VER "0.6"

/* UDP destination port used as the starting value for new links. */
#define GENEVE_UDP_PORT 6081

/* A VNI is three bytes wide: 2^24 distinct values, mask of the low 24 bits. */
#define GENEVE_N_VID (1u << 24)
#define GENEVE_VID_MASK (GENEVE_N_VID - 1)

/* Every socket keeps 2^VNI_HASH_BITS buckets of devices keyed by VNI. */
#define VNI_HASH_BITS 10
#define VNI_HASH_SIZE (1 << VNI_HASH_BITS)

/* The only header version accepted on receive and written on transmit. */
#define GENEVE_VER 0
/* UDP header plus the fixed part of the Geneve header (options excluded). */
#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
45
46 /* per-network namespace private data for this module */
47 struct geneve_net {
48         struct list_head        geneve_list;
49         struct list_head        sock_list;
50 };
51
52 static int geneve_net_id;
53
54 /* Pseudo network device */
55 struct geneve_dev {
56         struct hlist_node  hlist;       /* vni hash table */
57         struct net         *net;        /* netns for packet i/o */
58         struct net_device  *dev;        /* netdev for geneve tunnel */
59         struct geneve_sock *sock;       /* socket used for geneve tunnel */
60         u8                 vni[3];      /* virtual network ID for tunnel */
61         u8                 ttl;         /* TTL override */
62         u8                 tos;         /* TOS override */
63         struct sockaddr_in remote;      /* IPv4 address for link partner */
64         struct list_head   next;        /* geneve's per namespace list */
65         __be16             dst_port;
66         bool               collect_md;
67 };
68
69 struct geneve_sock {
70         bool                    collect_md;
71         struct list_head        list;
72         struct socket           *sock;
73         struct rcu_head         rcu;
74         int                     refcnt;
75 #ifdef HAVE_UDP_OFFLOAD
76         struct udp_offload      udp_offloads;
77 #endif
78         struct hlist_head       vni_list[VNI_HASH_SIZE];
79 };
80
81 static inline __u32 geneve_net_vni_hash(u8 vni[3])
82 {
83         __u32 vnid;
84
85         vnid = (vni[0] << 16) | (vni[1] << 8) | vni[2];
86         return hash_32(vnid, VNI_HASH_BITS);
87 }
88
89 static __be64 vni_to_tunnel_id(const __u8 *vni)
90 {
91 #ifdef __BIG_ENDIAN
92         return (vni[0] << 16) | (vni[1] << 8) | vni[2];
93 #else
94         return (__force __be64)(((__force u64)vni[0] << 40) |
95                                 ((__force u64)vni[1] << 48) |
96                                 ((__force u64)vni[2] << 56));
97 #endif
98 }
99
100 static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
101                                         __be32 addr, u8 vni[])
102 {
103         struct hlist_head *vni_list_head;
104         struct geneve_dev *geneve;
105         __u32 hash;
106
107         /* Find the device for this VNI */
108         hash = geneve_net_vni_hash(vni);
109         vni_list_head = &gs->vni_list[hash];
110         hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
111                 if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
112                     addr == geneve->remote.sin_addr.s_addr)
113                         return geneve;
114         }
115         return NULL;
116 }
117
118 static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
119 {
120         return (struct genevehdr *)(udp_hdr(skb) + 1);
121 }
122
123 /* geneve receive/decap routine */
124 static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
125 {
126         struct genevehdr *gnvh = geneve_hdr(skb);
127         struct metadata_dst *tun_dst;
128         struct geneve_dev *geneve = NULL;
129         struct pcpu_sw_netstats *stats;
130         struct iphdr *iph;
131         u8 *vni;
132         __be32 addr;
133         int err;
134         union {
135                 struct metadata_dst dst;
136                 char buf[sizeof(struct metadata_dst) + 256];
137         } buf;
138
139         iph = ip_hdr(skb); /* outer IP header... */
140
141         if (gs->collect_md) {
142                 static u8 zero_vni[3];
143
144                 vni = zero_vni;
145                 addr = 0;
146         } else {
147                 vni = gnvh->vni;
148                 addr = iph->saddr;
149         }
150
151         geneve = geneve_lookup(gs, addr, vni);
152         if (!geneve)
153                 goto drop;
154
155         if (ip_tunnel_collect_metadata() || gs->collect_md) {
156                 __be16 flags;
157
158                 flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
159                         (gnvh->oam ? TUNNEL_OAM : 0) |
160                         (gnvh->critical ? TUNNEL_CRIT_OPT : 0);
161
162                 tun_dst = &buf.dst;
163                 ovs_udp_tun_rx_dst(&tun_dst->u.tun_info, skb, AF_INET, flags,
164                                    vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4);
165                 /* Update tunnel dst according to Geneve options. */
166                 ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
167                                         gnvh->options, gnvh->opt_len * 4);
168         } else {
169                 /* Drop packets w/ critical options,
170                  * since we don't support any...
171                  */
172                 tun_dst = NULL;
173                 if (gnvh->critical)
174                         goto drop;
175         }
176
177         skb_reset_mac_header(skb);
178         skb->protocol = eth_type_trans(skb, geneve->dev);
179         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
180
181         if (tun_dst)
182                 ovs_skb_dst_set(skb, &tun_dst->dst);
183         else
184                 goto drop;
185         /* Ignore packet loops (and multicast echo) */
186         if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr))
187                 goto drop;
188
189         skb_reset_network_header(skb);
190
191         err = IP_ECN_decapsulate(iph, skb);
192
193         if (unlikely(err)) {
194                 if (err > 1) {
195                         ++geneve->dev->stats.rx_frame_errors;
196                         ++geneve->dev->stats.rx_errors;
197                         goto drop;
198                 }
199         }
200
201         stats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)geneve->dev->tstats);
202         u64_stats_update_begin(&stats->syncp);
203         stats->rx_packets++;
204         stats->rx_bytes += skb->len;
205         u64_stats_update_end(&stats->syncp);
206         netdev_port_receive(skb, &tun_dst->u.tun_info);
207         return;
208 drop:
209         /* Consume bad packet */
210         kfree_skb(skb);
211 }
212
213 /* Setup stats when device is created */
214 static int geneve_init(struct net_device *dev)
215 {
216         dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
217         if (!dev->tstats)
218                 return -ENOMEM;
219
220         return 0;
221 }
222
223 static void geneve_uninit(struct net_device *dev)
224 {
225         free_percpu(dev->tstats);
226 }
227
228 /* Callback from net/ipv4/udp.c to receive packets */
229 static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
230 {
231         struct genevehdr *geneveh;
232         struct geneve_sock *gs;
233         int opts_len;
234
235         /* Need Geneve and inner Ethernet header to be present */
236         if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
237                 goto error;
238
239         /* Return packets with reserved bits set */
240         geneveh = geneve_hdr(skb);
241         if (unlikely(geneveh->ver != GENEVE_VER))
242                 goto error;
243
244         if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
245                 goto error;
246
247         opts_len = geneveh->opt_len * 4;
248         if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
249                                  htons(ETH_P_TEB), false))
250                 goto drop;
251
252         gs = rcu_dereference_sk_user_data(sk);
253         if (!gs)
254                 goto drop;
255
256         geneve_rx(gs, skb);
257         return 0;
258
259 drop:
260         /* Consume bad packet */
261         kfree_skb(skb);
262         return 0;
263
264 error:
265         /* Let the UDP layer deal with the skb */
266         return 1;
267 }
268
269 static struct socket *geneve_create_sock(struct net *net, bool ipv6,
270                                          __be16 port)
271 {
272         struct socket *sock;
273         struct udp_port_cfg udp_conf;
274         int err;
275
276         memset(&udp_conf, 0, sizeof(udp_conf));
277
278         if (ipv6) {
279                 udp_conf.family = AF_INET6;
280         } else {
281                 udp_conf.family = AF_INET;
282                 udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
283         }
284
285         udp_conf.local_udp_port = port;
286
287         /* Open UDP socket */
288         err = udp_sock_create(net, &udp_conf, &sock);
289         if (err < 0)
290                 return ERR_PTR(err);
291
292         return sock;
293 }
294
295 #ifdef HAVE_UDP_OFFLOAD
296 static void geneve_notify_add_rx_port(struct geneve_sock *gs)
297 {
298         struct sock *sk = gs->sock->sk;
299         sa_family_t sa_family = sk->sk_family;
300         int err;
301
302         if (sa_family == AF_INET) {
303                 err = udp_add_offload(&gs->udp_offloads);
304                 if (err)
305                         pr_warn("geneve: udp_add_offload failed with status %d\n",
306                                 err);
307         }
308 }
309
310 static int geneve_hlen(struct genevehdr *gh)
311 {
312         return sizeof(*gh) + gh->opt_len * 4;
313 }
314
315 #ifndef HAVE_UDP_OFFLOAD_ARG_UOFF
316 static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
317                                            struct sk_buff *skb)
318 #else
319 static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
320                                            struct sk_buff *skb,
321                                            struct udp_offload *uoff)
322 #endif
323 {
324         struct sk_buff *p, **pp = NULL;
325         struct genevehdr *gh, *gh2;
326         unsigned int hlen, gh_len, off_gnv;
327         const struct packet_offload *ptype;
328         __be16 type;
329         int flush = 1;
330
331         off_gnv = skb_gro_offset(skb);
332         hlen = off_gnv + sizeof(*gh);
333         gh = skb_gro_header_fast(skb, off_gnv);
334         if (skb_gro_header_hard(skb, hlen)) {
335                 gh = skb_gro_header_slow(skb, hlen, off_gnv);
336                 if (unlikely(!gh))
337                         goto out;
338         }
339
340         if (gh->ver != GENEVE_VER || gh->oam)
341                 goto out;
342         gh_len = geneve_hlen(gh);
343
344         hlen = off_gnv + gh_len;
345         if (skb_gro_header_hard(skb, hlen)) {
346                 gh = skb_gro_header_slow(skb, hlen, off_gnv);
347                 if (unlikely(!gh))
348                         goto out;
349         }
350
351         flush = 0;
352
353         for (p = *head; p; p = p->next) {
354                 if (!NAPI_GRO_CB(p)->same_flow)
355                         continue;
356
357                 gh2 = (struct genevehdr *)(p->data + off_gnv);
358                 if (gh->opt_len != gh2->opt_len ||
359                     memcmp(gh, gh2, gh_len)) {
360                         NAPI_GRO_CB(p)->same_flow = 0;
361                         continue;
362                 }
363         }
364
365         type = gh->proto_type;
366
367         rcu_read_lock();
368         ptype = gro_find_receive_by_type(type);
369         if (!ptype) {
370                 flush = 1;
371                 goto out_unlock;
372         }
373
374         skb_gro_pull(skb, gh_len);
375         skb_gro_postpull_rcsum(skb, gh, gh_len);
376         pp = ptype->callbacks.gro_receive(head, skb);
377
378 out_unlock:
379         rcu_read_unlock();
380 out:
381         NAPI_GRO_CB(skb)->flush |= flush;
382
383         return pp;
384 }
385
386 #ifndef HAVE_UDP_OFFLOAD_ARG_UOFF
387 static int geneve_gro_complete(struct sk_buff *skb, int nhoff)
388 #else
389 static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
390                                struct udp_offload *uoff)
391 #endif
392 {
393         struct genevehdr *gh;
394         struct packet_offload *ptype;
395         __be16 type;
396         int gh_len;
397         int err = -ENOSYS;
398
399         udp_tunnel_gro_complete(skb, nhoff);
400
401         gh = (struct genevehdr *)(skb->data + nhoff);
402         gh_len = geneve_hlen(gh);
403         type = gh->proto_type;
404
405         rcu_read_lock();
406         ptype = gro_find_complete_by_type(type);
407         if (ptype)
408                 err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
409
410         rcu_read_unlock();
411         return err;
412 }
413 #endif
414
415 /* Create new listen socket if needed */
416 static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
417                                                 bool ipv6)
418 {
419         struct geneve_net *gn = net_generic(net, geneve_net_id);
420         struct geneve_sock *gs;
421         struct socket *sock;
422         struct udp_tunnel_sock_cfg tunnel_cfg;
423         int h;
424
425         gs = kzalloc(sizeof(*gs), GFP_KERNEL);
426         if (!gs)
427                 return ERR_PTR(-ENOMEM);
428
429         sock = geneve_create_sock(net, ipv6, port);
430         if (IS_ERR(sock)) {
431                 kfree(gs);
432                 return ERR_CAST(sock);
433         }
434
435         gs->sock = sock;
436         gs->refcnt = 1;
437         for (h = 0; h < VNI_HASH_SIZE; ++h)
438                 INIT_HLIST_HEAD(&gs->vni_list[h]);
439
440         /* Initialize the geneve udp offloads structure */
441 #ifdef HAVE_UDP_OFFLOAD
442         gs->udp_offloads.port = port;
443         gs->udp_offloads.callbacks.gro_receive  = geneve_gro_receive;
444         gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete;
445         geneve_notify_add_rx_port(gs);
446 #endif
447         /* Mark socket as an encapsulation socket */
448         tunnel_cfg.sk_user_data = gs;
449         tunnel_cfg.encap_type = 1;
450         tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
451         tunnel_cfg.encap_destroy = NULL;
452         setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
453         list_add(&gs->list, &gn->sock_list);
454         return gs;
455 }
456
/* Unregister the socket's UDP offload callbacks (IPv4 sockets only);
 * does nothing when built without HAVE_UDP_OFFLOAD.
 */
static void geneve_notify_del_rx_port(struct geneve_sock *gs)
{
#ifdef HAVE_UDP_OFFLOAD
        if (gs->sock->sk->sk_family == AF_INET)
                udp_del_offload(&gs->udp_offloads);
#endif
}
467
468 static void free_gs_rcu(struct rcu_head *rcu)
469 {
470         struct geneve_sock *gs = container_of(rcu, struct geneve_sock, rcu);
471
472         kfree(gs);
473 }
474
475 static void geneve_sock_release(struct geneve_sock *gs)
476 {
477         if (--gs->refcnt)
478                 return;
479
480         list_del(&gs->list);
481         geneve_notify_del_rx_port(gs);
482         udp_tunnel_sock_release(gs->sock);
483         call_rcu(&gs->rcu, free_gs_rcu);
484 }
485
486 static struct geneve_sock *geneve_find_sock(struct geneve_net *gn,
487                                             __be16 dst_port)
488 {
489         struct geneve_sock *gs;
490
491         list_for_each_entry(gs, &gn->sock_list, list) {
492                 if (inet_sk(gs->sock->sk)->inet_sport == dst_port &&
493                     inet_sk(gs->sock->sk)->sk.sk_family == AF_INET) {
494                         return gs;
495                 }
496         }
497         return NULL;
498 }
499
500 static int geneve_open(struct net_device *dev)
501 {
502         struct geneve_dev *geneve = netdev_priv(dev);
503         struct net *net = geneve->net;
504         struct geneve_net *gn = net_generic(net, geneve_net_id);
505         struct geneve_sock *gs;
506         __u32 hash;
507
508         gs = geneve_find_sock(gn, geneve->dst_port);
509         if (gs) {
510                 gs->refcnt++;
511                 goto out;
512         }
513
514         gs = geneve_socket_create(net, geneve->dst_port, false);
515         if (IS_ERR(gs))
516                 return PTR_ERR(gs);
517
518 out:
519         gs->collect_md = geneve->collect_md;
520         geneve->sock = gs;
521
522         hash = geneve_net_vni_hash(geneve->vni);
523         hlist_add_head_rcu(&geneve->hlist, &gs->vni_list[hash]);
524         return 0;
525 }
526
527 static int geneve_stop(struct net_device *dev)
528 {
529         struct geneve_dev *geneve = netdev_priv(dev);
530         struct geneve_sock *gs = geneve->sock;
531
532         if (!hlist_unhashed(&geneve->hlist))
533                 hlist_del_rcu(&geneve->hlist);
534         geneve_sock_release(gs);
535         return 0;
536 }
537
538 static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb,
539                             __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
540                             bool csum)
541 {
542         struct genevehdr *gnvh;
543         int min_headroom;
544         int err;
545
546         min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
547                         + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
548                         + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
549         err = skb_cow_head(skb, min_headroom);
550         if (unlikely(err))
551                 goto free_rt;
552
553         skb = vlan_hwaccel_push_inside(skb);
554         if (!skb) {
555                 err = -ENOMEM;
556                 goto free_rt;
557         }
558
559         err = udp_tunnel_handle_offloads(skb, csum, false);
560         if (err)
561                 goto free_rt;
562         gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
563         gnvh->ver = GENEVE_VER;
564         gnvh->opt_len = opt_len / 4;
565         gnvh->oam = !!(tun_flags & TUNNEL_OAM);
566         gnvh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
567         gnvh->rsvd1 = 0;
568         memcpy(gnvh->vni, vni, 3);
569         gnvh->proto_type = htons(ETH_P_TEB);
570         gnvh->rsvd2 = 0;
571         memcpy(gnvh->options, opt, opt_len);
572
573         ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
574         return 0;
575
576 free_rt:
577         ip_rt_put(rt);
578         return err;
579 }
580
581 static struct rtable *geneve_get_rt(struct sk_buff *skb,
582                                     struct net_device *dev,
583                                     struct flowi4 *fl4,
584                                     struct ip_tunnel_info *info)
585 {
586         struct geneve_dev *geneve = netdev_priv(dev);
587         struct rtable *rt = NULL;
588         __u8 tos;
589
590         memset(fl4, 0, sizeof(*fl4));
591         fl4->flowi4_mark = skb->mark;
592         fl4->flowi4_proto = IPPROTO_UDP;
593
594         if (info) {
595                 fl4->daddr = info->key.u.ipv4.dst;
596                 fl4->saddr = info->key.u.ipv4.src;
597                 fl4->flowi4_tos = RT_TOS(info->key.tos);
598         } else {
599                 tos = geneve->tos;
600                 if (tos == 1) {
601                         const struct iphdr *iip = ip_hdr(skb);
602
603                         tos = ip_tunnel_get_dsfield(iip, skb);
604                 }
605
606                 fl4->flowi4_tos = RT_TOS(tos);
607                 fl4->daddr = geneve->remote.sin_addr.s_addr;
608         }
609
610         rt = ip_route_output_key(geneve->net, fl4);
611         if (IS_ERR(rt)) {
612                 netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr);
613                 return ERR_PTR(-ENETUNREACH);
614         }
615         if (rt->dst.dev == dev) { /* is this necessary? */
616                 netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr);
617                 ip_rt_put(rt);
618                 return ERR_PTR(-ELOOP);
619         }
620         return rt;
621 }
622
623 /* Convert 64 bit tunnel ID to 24 bit VNI. */
624 static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
625 {
626 #ifdef __BIG_ENDIAN
627         vni[0] = (__force __u8)(tun_id >> 16);
628         vni[1] = (__force __u8)(tun_id >> 8);
629         vni[2] = (__force __u8)tun_id;
630 #else
631         vni[0] = (__force __u8)((__force u64)tun_id >> 40);
632         vni[1] = (__force __u8)((__force u64)tun_id >> 48);
633         vni[2] = (__force __u8)((__force u64)tun_id >> 56);
634 #endif
635 }
636
637 netdev_tx_t rpl_geneve_xmit(struct sk_buff *skb)
638 {
639         struct net_device *dev = skb->dev;
640         struct geneve_dev *geneve = netdev_priv(dev);
641         struct geneve_sock *gs = geneve->sock;
642         struct ip_tunnel_info *info = NULL;
643         struct rtable *rt = NULL;
644         const struct iphdr *iip; /* interior IP header */
645         int err = -EINVAL;
646         struct flowi4 fl4;
647         __u8 tos, ttl;
648         __be16 sport;
649         bool udp_csum;
650         __be16 df;
651
652         if (geneve->collect_md) {
653                 info = skb_tunnel_info(skb);
654                 if (unlikely(info && !(info->mode & IP_TUNNEL_INFO_TX))) {
655                         netdev_dbg(dev, "no tunnel metadata\n");
656                         goto tx_error;
657                 }
658                 if (info && ip_tunnel_info_af(info) != AF_INET)
659                         goto tx_error;
660         }
661
662         rt = geneve_get_rt(skb, dev, &fl4, info);
663         if (IS_ERR(rt)) {
664                 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
665                 err = PTR_ERR(rt);
666                 goto tx_error;
667         }
668
669         sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
670         skb_reset_mac_header(skb);
671
672         iip = ip_hdr(skb);
673
674         if (info) {
675                 const struct ip_tunnel_key *key = &info->key;
676                 u8 *opts = NULL;
677                 u8 vni[3];
678
679                 tunnel_id_to_vni(key->tun_id, vni);
680                 if (key->tun_flags & TUNNEL_GENEVE_OPT)
681                         opts = ip_tunnel_info_opts(info);
682
683                 udp_csum = !!(key->tun_flags & TUNNEL_CSUM);
684                 err = geneve_build_skb(rt, skb, key->tun_flags, vni,
685                                        info->options_len, opts, udp_csum);
686                 if (unlikely(err))
687                         goto tx_error;
688
689                 tos = ip_tunnel_ecn_encap(key->tos, iip, skb);
690                 ttl = key->ttl;
691                 df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
692         } else {
693                 udp_csum = false;
694                 err = geneve_build_skb(rt, skb, 0, geneve->vni,
695                                        0, NULL, udp_csum);
696                 if (unlikely(err))
697                         goto tx_error;
698
699                 tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb);
700                 ttl = geneve->ttl;
701                 if (!ttl && IN_MULTICAST(ntohl(fl4.daddr)))
702                         ttl = 1;
703                 ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
704                 df = 0;
705         }
706         err = udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, fl4.saddr, fl4.daddr,
707                                   tos, ttl, df, sport, geneve->dst_port,
708                                   !net_eq(geneve->net, dev_net(geneve->dev)),
709                                   !udp_csum);
710
711         iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *) dev->tstats);
712         return NETDEV_TX_OK;
713
714 tx_error:
715         dev_kfree_skb(skb);
716
717         if (err == -ELOOP)
718                 dev->stats.collisions++;
719         else if (err == -ENETUNREACH)
720                 dev->stats.tx_carrier_errors++;
721         else
722                 dev->stats.tx_errors++;
723         return NETDEV_TX_OK;
724 }
725 EXPORT_SYMBOL(rpl_geneve_xmit);
726
727 static netdev_tx_t geneve_dev_xmit(struct sk_buff *skb, struct net_device *dev)
728 {
729         /* Drop All packets coming from networking stack. OVS-CB is
730          * not initialized for these packets.
731          */
732
733         dev_kfree_skb(skb);
734         dev->stats.tx_dropped++;
735         return NETDEV_TX_OK;
736 }
737
738 static int __geneve_change_mtu(struct net_device *dev, int new_mtu, bool strict)
739 {
740         /* The max_mtu calculation does not take account of GENEVE
741          * options, to avoid excluding potentially valid
742          * configurations.
743          */
744         int max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - sizeof(struct iphdr)
745                       - dev->hard_header_len;
746
747         if (new_mtu < 68)
748                 return -EINVAL;
749
750         if (new_mtu > max_mtu) {
751                 if (strict)
752                         return -EINVAL;
753
754                 new_mtu = max_mtu;
755         }
756
757         dev->mtu = new_mtu;
758         return 0;
759 }
760
761 static int geneve_change_mtu(struct net_device *dev, int new_mtu)
762 {
763         return __geneve_change_mtu(dev, new_mtu, true);
764 }
765
766 int ovs_geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
767 {
768         struct ip_tunnel_info *info = skb_tunnel_info(skb);
769         struct geneve_dev *geneve = netdev_priv(dev);
770         struct rtable *rt;
771         struct flowi4 fl4;
772
773         if (ip_tunnel_info_af(info) != AF_INET)
774                 return -EINVAL;
775
776         rt = geneve_get_rt(skb, dev, &fl4, info);
777         if (IS_ERR(rt))
778                 return PTR_ERR(rt);
779
780         ip_rt_put(rt);
781         info->key.u.ipv4.src = fl4.saddr;
782         info->key.tp_src = udp_flow_src_port(geneve->net, skb,
783                                              1, USHRT_MAX, true);
784         info->key.tp_dst = geneve->dst_port;
785         return 0;
786 }
787 EXPORT_SYMBOL_GPL(ovs_geneve_fill_metadata_dst);
788
789 static const struct net_device_ops geneve_netdev_ops = {
790         .ndo_init               = geneve_init,
791         .ndo_uninit             = geneve_uninit,
792         .ndo_get_stats64        = ip_tunnel_get_stats64,
793         .ndo_open               = geneve_open,
794         .ndo_stop               = geneve_stop,
795         .ndo_start_xmit         = geneve_dev_xmit,
796         .ndo_change_mtu         = geneve_change_mtu,
797         .ndo_validate_addr      = eth_validate_addr,
798         .ndo_set_mac_address    = eth_mac_addr,
799 #ifdef HAVE_NDO_FILL_METADATA_DST
800         .ndo_fill_metadata_dst  = geneve_fill_metadata_dst,
801 #endif
802 };
803
804 static void geneve_get_drvinfo(struct net_device *dev,
805                                struct ethtool_drvinfo *drvinfo)
806 {
807         strlcpy(drvinfo->version, GENEVE_NETDEV_VER, sizeof(drvinfo->version));
808         strlcpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver));
809 }
810
811 static const struct ethtool_ops geneve_ethtool_ops = {
812         .get_drvinfo    = geneve_get_drvinfo,
813         .get_link       = ethtool_op_get_link,
814 };
815
816 /* Info for udev, that this is a virtual tunnel endpoint */
817 static struct device_type geneve_type = {
818         .name = "geneve",
819 };
820
821 /* Initialize the device structure. */
822 static void geneve_setup(struct net_device *dev)
823 {
824         ether_setup(dev);
825
826         dev->netdev_ops = &geneve_netdev_ops;
827         dev->ethtool_ops = &geneve_ethtool_ops;
828         dev->destructor = free_netdev;
829
830         SET_NETDEV_DEVTYPE(dev, &geneve_type);
831
832         dev->features    |= NETIF_F_LLTX;
833         dev->features    |= NETIF_F_SG | NETIF_F_HW_CSUM;
834         dev->features    |= NETIF_F_RXCSUM;
835         dev->features    |= NETIF_F_GSO_SOFTWARE;
836
837 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
838         dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
839         dev->hw_features |= NETIF_F_GSO_SOFTWARE;
840 #endif
841 #if 0
842         /* Not required */
843         netif_keep_dst(dev);
844 #endif
845         dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
846         eth_hw_addr_random(dev);
847 }
848
849 static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
850         [IFLA_GENEVE_ID]                = { .type = NLA_U32 },
851         [IFLA_GENEVE_REMOTE]            = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
852         [IFLA_GENEVE_TTL]               = { .type = NLA_U8 },
853         [IFLA_GENEVE_TOS]               = { .type = NLA_U8 },
854         [IFLA_GENEVE_PORT]              = { .type = NLA_U16 },
855         [IFLA_GENEVE_COLLECT_METADATA]  = { .type = NLA_FLAG },
856 };
857
858 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
859 {
860         if (tb[IFLA_ADDRESS]) {
861                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
862                         return -EINVAL;
863
864                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
865                         return -EADDRNOTAVAIL;
866         }
867
868         if (!data)
869                 return -EINVAL;
870
871         if (data[IFLA_GENEVE_ID]) {
872                 __u32 vni =  nla_get_u32(data[IFLA_GENEVE_ID]);
873
874                 if (vni >= GENEVE_VID_MASK)
875                         return -ERANGE;
876         }
877
878         return 0;
879 }
880
881 static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
882                                           __be16 dst_port,
883                                           __be32 rem_addr,
884                                           u8 vni[],
885                                           bool *tun_on_same_port,
886                                           bool *tun_collect_md)
887 {
888         struct geneve_dev *geneve, *t;
889
890         *tun_on_same_port = false;
891         *tun_collect_md = false;
892         t = NULL;
893         list_for_each_entry(geneve, &gn->geneve_list, next) {
894                 if (geneve->dst_port == dst_port) {
895                         *tun_collect_md = geneve->collect_md;
896                         *tun_on_same_port = true;
897                 }
898                 if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
899                     rem_addr == geneve->remote.sin_addr.s_addr &&
900                     dst_port == geneve->dst_port)
901                         t = geneve;
902         }
903         return t;
904 }
905
906 static int geneve_configure(struct net *net, struct net_device *dev,
907                             __be32 rem_addr, __u32 vni, __u8 ttl, __u8 tos,
908                             __be16 dst_port, bool metadata)
909 {
910         struct geneve_net *gn = net_generic(net, geneve_net_id);
911         struct geneve_dev *t, *geneve = netdev_priv(dev);
912         bool tun_collect_md, tun_on_same_port;
913         int err;
914
915         if (metadata) {
916                 if (rem_addr || vni || tos || ttl)
917                         return -EINVAL;
918         }
919
920         geneve->net = net;
921         geneve->dev = dev;
922
923         geneve->vni[0] = (vni & 0x00ff0000) >> 16;
924         geneve->vni[1] = (vni & 0x0000ff00) >> 8;
925         geneve->vni[2] =  vni & 0x000000ff;
926
927         geneve->remote.sin_addr.s_addr = rem_addr;
928         if (IN_MULTICAST(ntohl(geneve->remote.sin_addr.s_addr)))
929                 return -EINVAL;
930
931         geneve->ttl = ttl;
932         geneve->tos = tos;
933         geneve->dst_port = dst_port;
934         geneve->collect_md = metadata;
935
936         t = geneve_find_dev(gn, dst_port, rem_addr, geneve->vni,
937                             &tun_on_same_port, &tun_collect_md);
938         if (t)
939                 return -EBUSY;
940
941         if (metadata) {
942                 if (tun_on_same_port)
943                         return -EPERM;
944         } else {
945                 if (tun_collect_md)
946                         return -EPERM;
947         }
948
949         err = register_netdevice(dev);
950         if (err)
951                 return err;
952
953         list_add(&geneve->next, &gn->geneve_list);
954         return 0;
955 }
956
957 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
958 static int geneve_newlink(struct net_device *dev,
959                           struct nlattr *tb[], struct nlattr *data[])
960 {
961         struct net *net = &init_net;
962 #else
963 static int geneve_newlink(struct net *net, struct net_device *dev,
964                           struct nlattr *tb[], struct nlattr *data[])
965 {
966 #endif
967         __be16 dst_port = htons(GENEVE_UDP_PORT);
968         __u8 ttl = 0, tos = 0;
969         bool metadata = false;
970         __be32 rem_addr;
971         __u32 vni;
972
973         if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE])
974                 return -EINVAL;
975
976         vni = nla_get_u32(data[IFLA_GENEVE_ID]);
977         rem_addr = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
978
979         if (data[IFLA_GENEVE_TTL])
980                 ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
981
982         if (data[IFLA_GENEVE_TOS])
983                 tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
984
985         if (data[IFLA_GENEVE_PORT])
986                 dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]);
987
988         if (data[IFLA_GENEVE_COLLECT_METADATA])
989                 metadata = true;
990
991         return geneve_configure(net, dev, rem_addr, vni,
992                                 ttl, tos, dst_port, metadata);
993 }
994
995 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
996 static void geneve_dellink(struct net_device *dev)
997 #else
998 static void geneve_dellink(struct net_device *dev, struct list_head *head)
999 #endif
1000 {
1001         struct geneve_dev *geneve = netdev_priv(dev);
1002
1003         list_del(&geneve->next);
1004         unregister_netdevice_queue(dev, head);
1005 }
1006
1007 static size_t geneve_get_size(const struct net_device *dev)
1008 {
1009         return nla_total_size(sizeof(__u32)) +  /* IFLA_GENEVE_ID */
1010                 nla_total_size(sizeof(struct in_addr)) + /* IFLA_GENEVE_REMOTE */
1011                 nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
1012                 nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
1013                 nla_total_size(sizeof(__be16)) +  /* IFLA_GENEVE_PORT */
1014                 nla_total_size(0) +      /* IFLA_GENEVE_COLLECT_METADATA */
1015                 0;
1016 }
1017
1018 static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
1019 {
1020         struct geneve_dev *geneve = netdev_priv(dev);
1021         __u32 vni;
1022
1023         vni = (geneve->vni[0] << 16) | (geneve->vni[1] << 8) | geneve->vni[2];
1024         if (nla_put_u32(skb, IFLA_GENEVE_ID, vni))
1025                 goto nla_put_failure;
1026
1027         if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE,
1028                             geneve->remote.sin_addr.s_addr))
1029                 goto nla_put_failure;
1030
1031         if (nla_put_u8(skb, IFLA_GENEVE_TTL, geneve->ttl) ||
1032             nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos))
1033                 goto nla_put_failure;
1034
1035         if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port))
1036                 goto nla_put_failure;
1037
1038         if (geneve->collect_md) {
1039                 if (nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA))
1040                         goto nla_put_failure;
1041         }
1042
1043         return 0;
1044
1045 nla_put_failure:
1046         return -EMSGSIZE;
1047 }
1048
1049 static struct rtnl_link_ops geneve_link_ops __read_mostly = {
1050         .kind           = "ovs_geneve",
1051         .maxtype        = IFLA_GENEVE_MAX,
1052         .policy         = geneve_policy,
1053         .priv_size      = sizeof(struct geneve_dev),
1054         .setup          = geneve_setup,
1055         .validate       = geneve_validate,
1056         .newlink        = geneve_newlink,
1057         .dellink        = geneve_dellink,
1058         .get_size       = geneve_get_size,
1059         .fill_info      = geneve_fill_info,
1060 };
1061
1062 struct net_device *rpl_geneve_dev_create_fb(struct net *net, const char *name,
1063                                         u8 name_assign_type, u16 dst_port)
1064 {
1065         struct nlattr *tb[IFLA_MAX + 1];
1066         struct net_device *dev;
1067         int err;
1068
1069         memset(tb, 0, sizeof(tb));
1070         dev = rtnl_create_link(net, (char *) name, name_assign_type,
1071                                &geneve_link_ops, tb);
1072         if (IS_ERR(dev))
1073                 return dev;
1074
1075         err = geneve_configure(net, dev, 0, 0, 0, 0, htons(dst_port), true);
1076         if (err)
1077                 goto err;
1078
1079         /* openvswitch users expect packet sizes to be unrestricted,
1080          * so set the largest MTU we can.
1081          */
1082         err = __geneve_change_mtu(dev, IP_MAX_MTU, false);
1083         if (err)
1084                 goto err;
1085
1086         return dev;
1087
1088 err:
1089         free_netdev(dev);
1090         return ERR_PTR(err);
1091 }
1092 EXPORT_SYMBOL_GPL(rpl_geneve_dev_create_fb);
1093
1094 static __net_init int geneve_init_net(struct net *net)
1095 {
1096         struct geneve_net *gn = net_generic(net, geneve_net_id);
1097
1098         INIT_LIST_HEAD(&gn->geneve_list);
1099         INIT_LIST_HEAD(&gn->sock_list);
1100         return 0;
1101 }
1102
1103 static void __net_exit geneve_exit_net(struct net *net)
1104 {
1105         struct geneve_net *gn = net_generic(net, geneve_net_id);
1106         struct geneve_dev *geneve, *next;
1107         struct net_device *dev, *aux;
1108         LIST_HEAD(list);
1109
1110         rtnl_lock();
1111
1112         /* gather any geneve devices that were moved into this ns */
1113         for_each_netdev_safe(net, dev, aux)
1114                 if (dev->rtnl_link_ops == &geneve_link_ops)
1115                         unregister_netdevice_queue(dev, &list);
1116
1117         /* now gather any other geneve devices that were created in this ns */
1118         list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) {
1119                 /* If geneve->dev is in the same netns, it was already added
1120                  * to the list by the previous loop.
1121                  */
1122                 if (!net_eq(dev_net(geneve->dev), net))
1123                         unregister_netdevice_queue(geneve->dev, &list);
1124         }
1125
1126         /* unregister the devices gathered above */
1127         unregister_netdevice_many(&list);
1128         rtnl_unlock();
1129 }
1130
1131 static struct pernet_operations geneve_net_ops = {
1132         .init = geneve_init_net,
1133         .exit = geneve_exit_net,
1134         .id   = &geneve_net_id,
1135         .size = sizeof(struct geneve_net),
1136 };
1137
1138 int rpl_geneve_init_module(void)
1139 {
1140         int rc;
1141
1142         rc = register_pernet_subsys(&geneve_net_ops);
1143         if (rc)
1144                 goto out1;
1145
1146         rc = rtnl_link_register(&geneve_link_ops);
1147         if (rc)
1148                 goto out2;
1149
1150         pr_info("Geneve tunneling driver\n");
1151         return 0;
1152 out2:
1153         unregister_pernet_subsys(&geneve_net_ops);
1154 out1:
1155         return rc;
1156 }
1157
1158 void rpl_geneve_cleanup_module(void)
1159 {
1160         rtnl_link_unregister(&geneve_link_ops);
1161         unregister_pernet_subsys(&geneve_net_ops);
1162 }
1163 #endif