datapath: backport: openvswitch: allow management from inside user namespaces
/*
 * Copyright (c) 2007-2015 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/genetlink.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/version.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#include "datapath.h"
#include "conntrack.h"
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
#include "gso.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"

int ovs_net_id __read_mostly;
EXPORT_SYMBOL_GPL(ovs_net_id);

static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
static struct genl_family dp_datapath_genl_family;

static const struct nla_policy flow_policy[];

static struct genl_multicast_group ovs_dp_flow_multicast_group = {
        .name = OVS_FLOW_MCGROUP
};

static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
        .name = OVS_DATAPATH_MCGROUP
};

struct genl_multicast_group ovs_dp_vport_multicast_group = {
        .name = OVS_VPORT_MCGROUP
};
/* Check if we need to build a reply message.
 * OVS userspace sets the NLM_F_ECHO flag if it needs the reply.
 */
static bool ovs_must_notify(struct genl_family *family, struct genl_info *info,
                            unsigned int group)
{
        return info->nlhdr->nlmsg_flags & NLM_F_ECHO ||
               genl_has_listeners(family, genl_info_net(info), group);
}

static void ovs_notify(struct genl_family *family, struct genl_multicast_group *grp,
                       struct sk_buff *skb, struct genl_info *info)
{
        genl_notify(family, skb, genl_info_net(info),
                    info->snd_portid, GROUP_ID(grp), info->nlhdr, GFP_KERNEL);
}

/**
 * DOC: Locking:
 *
 * All writes to device state (add/remove datapath, port, set operations on
 * vports, etc.) and writes to other state (flow table modifications, set
 * miscellaneous datapath parameters, etc.) are protected by ovs_lock.
 *
 * Reads are protected by RCU.
 *
 * There are a few special cases (mostly stats) that have their own
 * synchronization but they nest under all of the above and don't interact
 * with each other.
 *
 * The RTNL lock nests inside ovs_mutex.
 */
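
/* Illustrative usage sketch (added commentary, not part of the original
 * file): under this scheme a control-path writer serializes on ovs_lock()
 * around table mutation, e.g.
 *
 *	ovs_lock();
 *	err = ovs_flow_tbl_insert(&dp->table, flow, &mask);
 *	ovs_unlock();
 *
 * while a fast-path reader only takes rcu_read_lock(), as
 * ovs_dp_process_packet() below does.
 */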

static DEFINE_MUTEX(ovs_mutex);

void ovs_lock(void)
{
        mutex_lock(&ovs_mutex);
}

void ovs_unlock(void)
{
        mutex_unlock(&ovs_mutex);
}

#ifdef CONFIG_LOCKDEP
int lockdep_ovsl_is_held(void)
{
        if (debug_locks)
                return lockdep_is_held(&ovs_mutex);
        else
                return 1;
}
EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
#endif

static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
                             const struct sw_flow_key *,
                             const struct dp_upcall_info *,
                             uint32_t cutlen);
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
                                  const struct sw_flow_key *,
                                  const struct dp_upcall_info *,
                                  uint32_t cutlen);

/* Must be called with rcu_read_lock. */
static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
{
        struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);

        if (dev) {
                struct vport *vport = ovs_internal_dev_get_vport(dev);
                if (vport)
                        return vport->dp;
        }

        return NULL;
}

/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
 * returned dp pointer valid.
 */
static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
{
        struct datapath *dp;

        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
        rcu_read_lock();
        dp = get_dp_rcu(net, dp_ifindex);
        rcu_read_unlock();

        return dp;
}

/* Must be called with rcu_read_lock or ovs_mutex. */
const char *ovs_dp_name(const struct datapath *dp)
{
        struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
        return ovs_vport_name(vport);
}

static int get_dpifindex(const struct datapath *dp)
{
        struct vport *local;
        int ifindex;

        rcu_read_lock();

        local = ovs_vport_rcu(dp, OVSP_LOCAL);
        if (local)
                ifindex = local->dev->ifindex;
        else
                ifindex = 0;

        rcu_read_unlock();

        return ifindex;
}

static void destroy_dp_rcu(struct rcu_head *rcu)
{
        struct datapath *dp = container_of(rcu, struct datapath, rcu);

        ovs_flow_tbl_destroy(&dp->table);
        free_percpu(dp->stats_percpu);
        kfree(dp->ports);
        kfree(dp);
}

static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
                                            u16 port_no)
{
        return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
}
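
/* Note (added commentary): the mask above relies on DP_VPORT_HASH_BUCKETS
 * being a power of two; "port_no & (DP_VPORT_HASH_BUCKETS - 1)" is then
 * equivalent to "port_no % DP_VPORT_HASH_BUCKETS" without a division.
 */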

/* Called with ovs_mutex or RCU read lock. */
struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
{
        struct vport *vport;
        struct hlist_head *head;

        head = vport_hash_bucket(dp, port_no);
        hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
                if (vport->port_no == port_no)
                        return vport;
        }
        return NULL;
}

/* Called with ovs_mutex. */
static struct vport *new_vport(const struct vport_parms *parms)
{
        struct vport *vport;

        vport = ovs_vport_add(parms);
        if (!IS_ERR(vport)) {
                struct datapath *dp = parms->dp;
                struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);

                hlist_add_head_rcu(&vport->dp_hash_node, head);
        }
        return vport;
}

void ovs_dp_detach_port(struct vport *p)
{
        ASSERT_OVSL();

        /* First drop references to device. */
        hlist_del_rcu(&p->dp_hash_node);

        /* Then destroy it. */
        ovs_vport_del(p);
}

/* Must be called with rcu_read_lock. */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
        const struct vport *p = OVS_CB(skb)->input_vport;
        struct datapath *dp = p->dp;
        struct sw_flow *flow;
        struct sw_flow_actions *sf_acts;
        struct dp_stats_percpu *stats;
        u64 *stats_counter;
        u32 n_mask_hit;

        stats = this_cpu_ptr(dp->stats_percpu);

        /* Look up flow. */
        flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),
                                         &n_mask_hit);
        if (unlikely(!flow)) {
                struct dp_upcall_info upcall;
                int error;

                memset(&upcall, 0, sizeof(upcall));
                upcall.cmd = OVS_PACKET_CMD_MISS;
                upcall.portid = ovs_vport_find_upcall_portid(p, skb);
                upcall.mru = OVS_CB(skb)->mru;
                error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
                if (unlikely(error))
                        kfree_skb(skb);
                else
                        consume_skb(skb);
                stats_counter = &stats->n_missed;
                goto out;
        }

        ovs_flow_stats_update(flow, key->tp.flags, skb);
        sf_acts = rcu_dereference(flow->sf_acts);
        ovs_execute_actions(dp, skb, sf_acts, key);

        stats_counter = &stats->n_hit;

out:
        /* Update datapath statistics. */
        u64_stats_update_begin(&stats->syncp);
        (*stats_counter)++;
        stats->n_mask_hit += n_mask_hit;
        u64_stats_update_end(&stats->syncp);
}

int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
                  const struct sw_flow_key *key,
                  const struct dp_upcall_info *upcall_info,
                  uint32_t cutlen)
{
        struct dp_stats_percpu *stats;
        int err;

        if (upcall_info->portid == 0) {
                err = -ENOTCONN;
                goto err;
        }

        if (!skb_is_gso(skb))
                err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
        else
                err = queue_gso_packets(dp, skb, key, upcall_info, cutlen);
        if (err)
                goto err;

        return 0;

err:
        stats = this_cpu_ptr(dp->stats_percpu);

        u64_stats_update_begin(&stats->syncp);
        stats->n_lost++;
        u64_stats_update_end(&stats->syncp);

        return err;
}

static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
                             const struct sw_flow_key *key,
                             const struct dp_upcall_info *upcall_info,
                             uint32_t cutlen)
{
        unsigned short gso_type = skb_shinfo(skb)->gso_type;
        struct sw_flow_key later_key;
        struct sk_buff *segs, *nskb;
        struct ovs_skb_cb ovs_cb;
        int err;

        ovs_cb = *OVS_CB(skb);
        segs = __skb_gso_segment(skb, NETIF_F_SG, false);
        *OVS_CB(skb) = ovs_cb;
        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (segs == NULL)
                return -EINVAL;

        if (gso_type & SKB_GSO_UDP) {
                /* The initial flow key extracted by ovs_flow_key_extract()
                 * in this case is for a first fragment, so we need to
                 * properly mark later fragments.
                 */
                later_key = *key;
                later_key.ip.frag = OVS_FRAG_TYPE_LATER;
        }

        /* Queue all of the segments. */
        skb = segs;
        do {
                *OVS_CB(skb) = ovs_cb;
                if (gso_type & SKB_GSO_UDP && skb != segs)
                        key = &later_key;

                err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
                if (err)
                        break;

        } while ((skb = skb->next));

        /* Free all of the segments. */
        skb = segs;
        do {
                nskb = skb->next;
                if (err)
                        kfree_skb(skb);
                else
                        consume_skb(skb);
        } while ((skb = nskb));
        return err;
}

static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
                              unsigned int hdrlen)
{
        size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
                + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
                + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
                + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */

        /* OVS_PACKET_ATTR_USERDATA */
        if (upcall_info->userdata)
                size += NLA_ALIGN(upcall_info->userdata->nla_len);

        /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
        if (upcall_info->egress_tun_info)
                size += nla_total_size(ovs_tun_key_attr_size());

        /* OVS_PACKET_ATTR_ACTIONS */
        if (upcall_info->actions_len)
                size += nla_total_size(upcall_info->actions_len);

        /* OVS_PACKET_ATTR_MRU */
        if (upcall_info->mru)
                size += nla_total_size(sizeof(upcall_info->mru));

        return size;
}

static void pad_packet(struct datapath *dp, struct sk_buff *skb)
{
        if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
                size_t plen = NLA_ALIGN(skb->len) - skb->len;

                if (plen > 0)
                        memset(skb_put(skb, plen), 0, plen);
        }
}
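
/* Worked example (added commentary): NLA_ALIGNTO is 4, so for an skb->len
 * of 61 bytes NLA_ALIGN(61) is 64 and pad_packet() appends 3 zero bytes;
 * nothing is added when the length is already a multiple of 4.
 */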

static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
                                  const struct sw_flow_key *key,
                                  const struct dp_upcall_info *upcall_info,
                                  uint32_t cutlen)
{
        struct ovs_header *upcall;
        struct sk_buff *nskb = NULL;
        struct sk_buff *user_skb = NULL; /* to be queued to userspace */
        struct nlattr *nla;
        struct genl_info info = {
#ifdef HAVE_GENLMSG_NEW_UNICAST
                .dst_sk = ovs_dp_get_net(dp)->genl_sock,
#endif
                .snd_portid = upcall_info->portid,
        };
        size_t len;
        unsigned int hlen;
        int err, dp_ifindex;

        dp_ifindex = get_dpifindex(dp);
        if (!dp_ifindex)
                return -ENODEV;

        if (skb_vlan_tag_present(skb)) {
                nskb = skb_clone(skb, GFP_ATOMIC);
                if (!nskb)
                        return -ENOMEM;

                nskb = __vlan_hwaccel_push_inside(nskb);
                if (!nskb)
                        return -ENOMEM;

                skb = nskb;
        }

        if (nla_attr_size(skb->len) > USHRT_MAX) {
                err = -EFBIG;
                goto out;
        }

        /* Complete checksum if needed */
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto out;

        /* Older versions of OVS user space enforce alignment of the last
         * Netlink attribute to NLA_ALIGNTO which would require extensive
         * padding logic. Only perform zerocopy if padding is not required.
         */
        if (dp->user_features & OVS_DP_F_UNALIGNED)
                hlen = skb_zerocopy_headlen(skb);
        else
                hlen = skb->len;

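        /* (Added commentary) skb_zerocopy_headlen() computes how much of
         * the packet must be copied linearly (at least the skb head);
         * skb_zerocopy() below attaches the remainder as page frags, which
         * is why this path is only taken when userspace accepts an
         * unaligned final OVS_PACKET_ATTR_PACKET attribute.
         */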
        len = upcall_msg_size(upcall_info, hlen - cutlen);
        user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
        if (!user_skb) {
                err = -ENOMEM;
                goto out;
        }

        upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
                             0, upcall_info->cmd);
        upcall->dp_ifindex = dp_ifindex;

        err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb);
        BUG_ON(err);

        if (upcall_info->userdata)
                __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
                          nla_len(upcall_info->userdata),
                          nla_data(upcall_info->userdata));

        if (upcall_info->egress_tun_info) {
                nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
                err = ovs_nla_put_tunnel_info(user_skb,
                                              upcall_info->egress_tun_info);
                BUG_ON(err);
                nla_nest_end(user_skb, nla);
        }

        if (upcall_info->actions_len) {
                nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS);
                err = ovs_nla_put_actions(upcall_info->actions,
                                          upcall_info->actions_len,
                                          user_skb);
                if (!err)
                        nla_nest_end(user_skb, nla);
                else
                        nla_nest_cancel(user_skb, nla);
        }

        /* Add OVS_PACKET_ATTR_MRU */
        if (upcall_info->mru) {
                if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
                                upcall_info->mru)) {
                        err = -ENOBUFS;
                        goto out;
                }
                pad_packet(dp, user_skb);
        }

        /* Add OVS_PACKET_ATTR_LEN when packet is truncated */
        if (cutlen > 0) {
                if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN,
                                skb->len)) {
                        err = -ENOBUFS;
                        goto out;
                }
                pad_packet(dp, user_skb);
        }

        /* Only reserve room for attribute header, packet data is added
         * in skb_zerocopy()
         */
        if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
                err = -ENOBUFS;
                goto out;
        }
        nla->nla_len = nla_attr_size(skb->len - cutlen);

        err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen);
        if (err)
                goto out;

        /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
        pad_packet(dp, user_skb);

        ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;

        err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
        user_skb = NULL;
out:
        if (err)
                skb_tx_error(skb);
        kfree_skb(user_skb);
        kfree_skb(nskb);
        return err;
}
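
/* (Added commentary) The resulting upcall message is laid out roughly as:
 *
 *	nlmsghdr | genlmsghdr | struct ovs_header |
 *	OVS_PACKET_ATTR_KEY | [USERDATA] | [EGRESS_TUN_KEY] | [ACTIONS] |
 *	[MRU] | [LEN] | OVS_PACKET_ATTR_PACKET (payload)
 *
 * with optional attributes in brackets, matching upcall_msg_size() above.
 */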

static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
        struct ovs_header *ovs_header = info->userhdr;
        struct net *net = sock_net(skb->sk);
        struct nlattr **a = info->attrs;
        struct sw_flow_actions *acts;
        struct sk_buff *packet;
        struct sw_flow *flow;
        struct sw_flow_actions *sf_acts;
        struct datapath *dp;
        struct ethhdr *eth;
        struct vport *input_vport;
        u16 mru = 0;
        int len;
        int err;
        bool log = !a[OVS_PACKET_ATTR_PROBE];

        err = -EINVAL;
        if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
            !a[OVS_PACKET_ATTR_ACTIONS])
                goto err;

        len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
        packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
        err = -ENOMEM;
        if (!packet)
                goto err;
        skb_reserve(packet, NET_IP_ALIGN);

        nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);

        skb_reset_mac_header(packet);
        eth = eth_hdr(packet);

        /* Normally, setting the skb 'protocol' field would be handled by a
         * call to eth_type_trans(), but it assumes there's a sending
         * device, which we may not have.
         */
        if (eth_proto_is_802_3(eth->h_proto))
                packet->protocol = eth->h_proto;
        else
                packet->protocol = htons(ETH_P_802_2);

        /* Set packet's mru */
        if (a[OVS_PACKET_ATTR_MRU]) {
                mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
                packet->ignore_df = 1;
        }
        OVS_CB(packet)->mru = mru;

        /* Build an sw_flow for sending this packet. */
        flow = ovs_flow_alloc();
        err = PTR_ERR(flow);
        if (IS_ERR(flow))
                goto err_kfree_skb;

        err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY],
                                             packet, &flow->key, log);
        if (err)
                goto err_flow_free;

        err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS],
                                   &flow->key, &acts, log);
        if (err)
                goto err_flow_free;

        rcu_assign_pointer(flow->sf_acts, acts);
        packet->priority = flow->key.phy.priority;
        packet->mark = flow->key.phy.skb_mark;

        rcu_read_lock();
        dp = get_dp_rcu(net, ovs_header->dp_ifindex);
        err = -ENODEV;
        if (!dp)
                goto err_unlock;

        input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port);
        if (!input_vport)
                input_vport = ovs_vport_rcu(dp, OVSP_LOCAL);

        if (!input_vport)
                goto err_unlock;

        packet->dev = input_vport->dev;
        OVS_CB(packet)->input_vport = input_vport;
        sf_acts = rcu_dereference(flow->sf_acts);

        local_bh_disable();
        err = ovs_execute_actions(dp, packet, sf_acts, &flow->key);
        local_bh_enable();
        rcu_read_unlock();

        ovs_flow_free(flow, false);
        return err;

err_unlock:
        rcu_read_unlock();
err_flow_free:
        ovs_flow_free(flow, false);
err_kfree_skb:
        kfree_skb(packet);
err:
        return err;
}

static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
        [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN },
        [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
        [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
        [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
        [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 },
};
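
/* (Added commentary) With no .type given, OVS_PACKET_ATTR_PACKET is
 * NLA_UNSPEC and ".len = ETH_HLEN" acts as a minimum payload length:
 * packets shorter than an Ethernet header are rejected during parsing.
 */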

static struct genl_ops dp_packet_genl_ops[] = {
        { .cmd = OVS_PACKET_CMD_EXECUTE,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN in the netns' user namespace. */
          .policy = packet_policy,
          .doit = ovs_packet_cmd_execute
        }
};

static struct genl_family dp_packet_genl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = sizeof(struct ovs_header),
        .name = OVS_PACKET_FAMILY,
        .version = OVS_PACKET_VERSION,
        .maxattr = OVS_PACKET_ATTR_MAX,
        .netnsok = true,
        .parallel_ops = true,
        .ops = dp_packet_genl_ops,
        .n_ops = ARRAY_SIZE(dp_packet_genl_ops),
};

static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
                         struct ovs_dp_megaflow_stats *mega_stats)
{
        int i;

        memset(mega_stats, 0, sizeof(*mega_stats));

        stats->n_flows = ovs_flow_tbl_count(&dp->table);
        mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);

        stats->n_hit = stats->n_missed = stats->n_lost = 0;

        for_each_possible_cpu(i) {
                const struct dp_stats_percpu *percpu_stats;
                struct dp_stats_percpu local_stats;
                unsigned int start;

                percpu_stats = per_cpu_ptr(dp->stats_percpu, i);

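                /* Snapshot the counters under the u64_stats seqcount and
                 * retry if a writer raced with us (added commentary).
                 */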
                do {
                        start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
                        local_stats = *percpu_stats;
                } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));

                stats->n_hit += local_stats.n_hit;
                stats->n_missed += local_stats.n_missed;
                stats->n_lost += local_stats.n_lost;
                mega_stats->n_mask_hit += local_stats.n_mask_hit;
        }
}

static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags)
{
        return ovs_identifier_is_ufid(sfid) &&
               !(ufid_flags & OVS_UFID_F_OMIT_KEY);
}

static bool should_fill_mask(uint32_t ufid_flags)
{
        return !(ufid_flags & OVS_UFID_F_OMIT_MASK);
}

static bool should_fill_actions(uint32_t ufid_flags)
{
        return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS);
}
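
/* (Added commentary) The OVS_UFID_F_OMIT_* flags let userspace that
 * identifies flows purely by UFID trim replies: key, mask and actions
 * are each skipped when the corresponding flag is set.
 */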

static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts,
                                    const struct sw_flow_id *sfid,
                                    uint32_t ufid_flags)
{
        size_t len = NLMSG_ALIGN(sizeof(struct ovs_header));

        /* OVS_FLOW_ATTR_UFID */
        if (sfid && ovs_identifier_is_ufid(sfid))
                len += nla_total_size(sfid->ufid_len);

        /* OVS_FLOW_ATTR_KEY */
        if (!sfid || should_fill_key(sfid, ufid_flags))
                len += nla_total_size(ovs_key_attr_size());

        /* OVS_FLOW_ATTR_MASK */
        if (should_fill_mask(ufid_flags))
                len += nla_total_size(ovs_key_attr_size());

        /* OVS_FLOW_ATTR_ACTIONS */
        if (should_fill_actions(ufid_flags))
                len += nla_total_size(acts->orig_len);

        return len
                + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
                + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
                + nla_total_size(8); /* OVS_FLOW_ATTR_USED */
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
                                   struct sk_buff *skb)
{
        struct ovs_flow_stats stats;
        __be16 tcp_flags;
        unsigned long used;

        ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);

        if (used &&
            nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
                return -EMSGSIZE;

        if (stats.n_packets &&
            nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats))
                return -EMSGSIZE;

        if ((u8)ntohs(tcp_flags) &&
             nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags)))
                return -EMSGSIZE;

        return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
                                     struct sk_buff *skb, int skb_orig_len)
{
        struct nlattr *start;
        int err;

        /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
         * this is the first flow to be dumped into 'skb'.  This is unusual for
         * Netlink but individual action lists can be longer than
         * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
         * The userspace caller can always fetch the actions separately if it
         * really wants them.  (Most userspace callers in fact don't care.)
         *
         * This can only fail for dump operations because the skb is always
         * properly sized for single flows.
         */
        start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
        if (start) {
                const struct sw_flow_actions *sf_acts;

                sf_acts = rcu_dereference_ovsl(flow->sf_acts);
                err = ovs_nla_put_actions(sf_acts->actions,
                                          sf_acts->actions_len, skb);

                if (!err)
                        nla_nest_end(skb, start);
                else {
                        if (skb_orig_len)
                                return err;

                        nla_nest_cancel(skb, start);
                }
        } else if (skb_orig_len) {
                return -EMSGSIZE;
        }

        return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
                                  struct sk_buff *skb, u32 portid,
                                  u32 seq, u32 flags, u8 cmd, u32 ufid_flags)
{
        const int skb_orig_len = skb->len;
        struct ovs_header *ovs_header;
        int err;

        ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family,
                                 flags, cmd);
        if (!ovs_header)
                return -EMSGSIZE;

        ovs_header->dp_ifindex = dp_ifindex;

        err = ovs_nla_put_identifier(flow, skb);
        if (err)
                goto error;

        if (should_fill_key(&flow->id, ufid_flags)) {
                err = ovs_nla_put_masked_key(flow, skb);
                if (err)
                        goto error;
        }

        if (should_fill_mask(ufid_flags)) {
                err = ovs_nla_put_mask(flow, skb);
                if (err)
                        goto error;
        }

        err = ovs_flow_cmd_fill_stats(flow, skb);
        if (err)
                goto error;

        if (should_fill_actions(ufid_flags)) {
                err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);
                if (err)
                        goto error;
        }

        genlmsg_end(skb, ovs_header);
        return 0;

error:
        genlmsg_cancel(skb, ovs_header);
        return err;
}

/* May not be called with RCU read lock. */
static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts,
                                               const struct sw_flow_id *sfid,
                                               struct genl_info *info,
                                               bool always,
                                               uint32_t ufid_flags)
{
        struct sk_buff *skb;
        size_t len;

        if (!always && !ovs_must_notify(&dp_flow_genl_family, info,
                                        GROUP_ID(&ovs_dp_flow_multicast_group)))
                return NULL;

        len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags);
        skb = genlmsg_new_unicast(len, info, GFP_KERNEL);
        if (!skb)
                return ERR_PTR(-ENOMEM);

        return skb;
}

/* Called with ovs_mutex. */
static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
                                               int dp_ifindex,
                                               struct genl_info *info, u8 cmd,
                                               bool always, u32 ufid_flags)
{
        struct sk_buff *skb;
        int retval;

        skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts),
                                      &flow->id, info, always, ufid_flags);
        if (IS_ERR_OR_NULL(skb))
                return skb;

        retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
                                        info->snd_portid, info->snd_seq, 0,
                                        cmd, ufid_flags);
        BUG_ON(retval < 0);
        return skb;
}

static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct sw_flow *flow = NULL, *new_flow;
        struct sw_flow_mask mask;
        struct sk_buff *reply;
        struct datapath *dp;
        struct sw_flow_key key;
        struct sw_flow_actions *acts;
        struct sw_flow_match match;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int error;
        bool log = !a[OVS_FLOW_ATTR_PROBE];

        /* Must have key and actions. */
        error = -EINVAL;
        if (!a[OVS_FLOW_ATTR_KEY]) {
                OVS_NLERR(log, "Flow key attr not present in new flow.");
                goto error;
        }
        if (!a[OVS_FLOW_ATTR_ACTIONS]) {
                OVS_NLERR(log, "Flow actions attr not present in new flow.");
                goto error;
        }

        /* Most of the time we need to allocate a new flow, do it before
         * locking.
         */
        new_flow = ovs_flow_alloc();
        if (IS_ERR(new_flow)) {
                error = PTR_ERR(new_flow);
                goto error;
        }

        /* Extract key. */
        ovs_match_init(&match, &key, &mask);
        error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
                                  a[OVS_FLOW_ATTR_MASK], log);
        if (error)
                goto err_kfree_flow;

        ovs_flow_mask_key(&new_flow->key, &key, true, &mask);

        /* Extract flow identifier. */
        error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID],
                                       &key, log);
        if (error)
                goto err_kfree_flow;

        /* Validate actions. */
        error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS],
                                     &new_flow->key, &acts, log);
        if (error) {
                OVS_NLERR(log, "Flow actions may not be safe on all matching packets.");
                goto err_kfree_flow;
        }

        reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false,
                                        ufid_flags);
        if (IS_ERR(reply)) {
                error = PTR_ERR(reply);
                goto err_kfree_acts;
        }

        ovs_lock();
        dp = get_dp(net, ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
                error = -ENODEV;
                goto err_unlock_ovs;
        }

        /* Check if this is a duplicate flow */
        if (ovs_identifier_is_ufid(&new_flow->id))
                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
        if (!flow)
                flow = ovs_flow_tbl_lookup(&dp->table, &key);
        if (likely(!flow)) {
                rcu_assign_pointer(new_flow->sf_acts, acts);

                /* Put flow in bucket. */
                error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
                if (unlikely(error)) {
                        acts = NULL;
                        goto err_unlock_ovs;
                }

                if (unlikely(reply)) {
                        error = ovs_flow_cmd_fill_info(new_flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
                                                       OVS_FLOW_CMD_NEW,
                                                       ufid_flags);
                        BUG_ON(error < 0);
                }
                ovs_unlock();
        } else {
                struct sw_flow_actions *old_acts;

                /* Bail out if we're not allowed to modify an existing flow.
                 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
                 * because Generic Netlink treats the latter as a dump
                 * request.  We also accept NLM_F_EXCL in case that bug ever
                 * gets fixed.
                 */
                if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
                                                         | NLM_F_EXCL))) {
                        error = -EEXIST;
                        goto err_unlock_ovs;
                }
                /* The flow identifier has to be the same for flow updates.
                 * Look for any overlapping flow.
                 */
                if (unlikely(!ovs_flow_cmp(flow, &match))) {
                        if (ovs_identifier_is_key(&flow->id))
                                flow = ovs_flow_tbl_lookup_exact(&dp->table,
                                                                 &match);
                        else /* UFID matches but key is different */
                                flow = NULL;
                        if (!flow) {
                                error = -ENOENT;
                                goto err_unlock_ovs;
                        }
                }
                /* Update actions. */
                old_acts = ovsl_dereference(flow->sf_acts);
                rcu_assign_pointer(flow->sf_acts, acts);

                if (unlikely(reply)) {
                        error = ovs_flow_cmd_fill_info(flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
                                                       OVS_FLOW_CMD_NEW,
                                                       ufid_flags);
                        BUG_ON(error < 0);
                }
                ovs_unlock();

                ovs_nla_free_flow_actions_rcu(old_acts);
                ovs_flow_free(new_flow, false);
        }

        if (reply)
                ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info);
        return 0;

err_unlock_ovs:
        ovs_unlock();
        kfree_skb(reply);
err_kfree_acts:
        ovs_nla_free_flow_actions(acts);
err_kfree_flow:
        ovs_flow_free(new_flow, false);
error:
        return error;
}

/* Factor out action copy to avoid "-Wframe-larger-than=1024" warning. */
static struct sw_flow_actions *get_flow_actions(struct net *net,
                                                const struct nlattr *a,
                                                const struct sw_flow_key *key,
                                                const struct sw_flow_mask *mask,
                                                bool log)
{
        struct sw_flow_actions *acts;
        struct sw_flow_key masked_key;
        int error;

        ovs_flow_mask_key(&masked_key, key, true, mask);
        error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log);
        if (error) {
                OVS_NLERR(log,
                          "Actions may not be safe on all matching packets");
                return ERR_PTR(error);
        }

        return acts;
}

static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct sw_flow_key key;
        struct sw_flow *flow;
        struct sw_flow_mask mask;
        struct sk_buff *reply = NULL;
        struct datapath *dp;
        struct sw_flow_actions *old_acts = NULL, *acts = NULL;
        struct sw_flow_match match;
        struct sw_flow_id sfid;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int error;
        bool log = !a[OVS_FLOW_ATTR_PROBE];
        bool ufid_present;

        /* Extract key. */
        error = -EINVAL;
        if (!a[OVS_FLOW_ATTR_KEY]) {
                OVS_NLERR(log, "Flow key attribute not present in set flow.");
                goto error;
        }

        ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
        ovs_match_init(&match, &key, &mask);
        error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
                                  a[OVS_FLOW_ATTR_MASK], log);
        if (error)
                goto error;

        /* Validate actions. */
        if (a[OVS_FLOW_ATTR_ACTIONS]) {
                acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
                                        &mask, log);
                if (IS_ERR(acts)) {
                        error = PTR_ERR(acts);
                        goto error;
                }

                /* Can allocate before locking if have acts. */
                reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false,
                                                ufid_flags);
                if (IS_ERR(reply)) {
                        error = PTR_ERR(reply);
                        goto err_kfree_acts;
                }
        }

        ovs_lock();
        dp = get_dp(net, ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
                error = -ENODEV;
                goto err_unlock_ovs;
        }
        /* Check that the flow exists. */
        if (ufid_present)
                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid);
        else
                flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
        if (unlikely(!flow)) {
                error = -ENOENT;
                goto err_unlock_ovs;
        }

        /* Update actions, if present. */
        if (likely(acts)) {
                old_acts = ovsl_dereference(flow->sf_acts);
                rcu_assign_pointer(flow->sf_acts, acts);

                if (unlikely(reply)) {
                        error = ovs_flow_cmd_fill_info(flow,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
                                                       OVS_FLOW_CMD_NEW,
                                                       ufid_flags);
                        BUG_ON(error < 0);
                }
        } else {
                /* Could not alloc without acts before locking. */
                reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex,
                                                info, OVS_FLOW_CMD_NEW, false,
                                                ufid_flags);

                if (unlikely(IS_ERR(reply))) {
                        error = PTR_ERR(reply);
                        goto err_unlock_ovs;
                }
        }

        /* Clear stats. */
        if (a[OVS_FLOW_ATTR_CLEAR])
                ovs_flow_stats_clear(flow);
        ovs_unlock();

        if (reply)
                ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info);
        if (old_acts)
                ovs_nla_free_flow_actions_rcu(old_acts);

        return 0;

err_unlock_ovs:
        ovs_unlock();
        kfree_skb(reply);
err_kfree_acts:
        ovs_nla_free_flow_actions(acts);
error:
        return error;
}

static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct net *net = sock_net(skb->sk);
        struct sw_flow_key key;
        struct sk_buff *reply;
        struct sw_flow *flow;
        struct datapath *dp;
        struct sw_flow_match match;
        struct sw_flow_id ufid;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int err = 0;
        bool log = !a[OVS_FLOW_ATTR_PROBE];
        bool ufid_present;

        ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
        if (a[OVS_FLOW_ATTR_KEY]) {
                ovs_match_init(&match, &key, NULL);
                err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL,
                                        log);
        } else if (!ufid_present) {
                OVS_NLERR(log,
                          "Flow get message rejected, Key attribute missing.");
                err = -EINVAL;
        }
        if (err)
                return err;

        ovs_lock();
        dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
        if (!dp) {
                err = -ENODEV;
                goto unlock;
        }

        if (ufid_present)
                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
        else
                flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
        if (!flow) {
                err = -ENOENT;
                goto unlock;
        }

        reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info,
                                        OVS_FLOW_CMD_NEW, true, ufid_flags);
        if (IS_ERR(reply)) {
                err = PTR_ERR(reply);
                goto unlock;
        }

        ovs_unlock();
        return genlmsg_reply(reply, info);
unlock:
        ovs_unlock();
        return err;
}

static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct net *net = sock_net(skb->sk);
        struct sw_flow_key key;
        struct sk_buff *reply;
        struct sw_flow *flow = NULL;
        struct datapath *dp;
        struct sw_flow_match match;
        struct sw_flow_id ufid;
        u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
        int err;
        bool log = !a[OVS_FLOW_ATTR_PROBE];
        bool ufid_present;

        ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
        if (a[OVS_FLOW_ATTR_KEY]) {
                ovs_match_init(&match, &key, NULL);
                err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
                                        NULL, log);
                if (unlikely(err))
                        return err;
        }

        ovs_lock();
        dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
                err = -ENODEV;
                goto unlock;
        }

        if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) {
                err = ovs_flow_tbl_flush(&dp->table);
                goto unlock;
        }

        if (ufid_present)
                flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
        else
                flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
        if (unlikely(!flow)) {
                err = -ENOENT;
                goto unlock;
        }

        ovs_flow_tbl_remove(&dp->table, flow);
        ovs_unlock();

        reply = ovs_flow_cmd_alloc_info(rcu_dereference_raw(flow->sf_acts),
                                        &flow->id, info, false, ufid_flags);

        if (likely(reply)) {
                if (likely(!IS_ERR(reply))) {
                        rcu_read_lock();        /* To keep RCU checker happy. */
                        err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex,
                                                     reply, info->snd_portid,
                                                     info->snd_seq, 0,
                                                     OVS_FLOW_CMD_DEL,
                                                     ufid_flags);
                        rcu_read_unlock();
                        BUG_ON(err < 0);
                        ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info);
                } else {
                        genl_set_err(&dp_flow_genl_family, sock_net(skb->sk), 0,
                                     GROUP_ID(&ovs_dp_flow_multicast_group), PTR_ERR(reply));
                }
        }

        ovs_flow_free(flow, true);
        return 0;
unlock:
        ovs_unlock();
        return err;
}

static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct nlattr *a[__OVS_FLOW_ATTR_MAX];
        struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
        struct table_instance *ti;
        struct datapath *dp;
        u32 ufid_flags;
        int err;

        err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a,
                            OVS_FLOW_ATTR_MAX, flow_policy);
        if (err)
                return err;
        ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);

        rcu_read_lock();
        dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
        if (!dp) {
                rcu_read_unlock();
                return -ENODEV;
        }

        ti = rcu_dereference(dp->table.ti);
        for (;;) {
                struct sw_flow *flow;
                u32 bucket, obj;

                bucket = cb->args[0];
                obj = cb->args[1];
                flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj);
                if (!flow)
                        break;

                if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
                                           NETLINK_CB(cb->skb).portid,
                                           cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                           OVS_FLOW_CMD_NEW, ufid_flags) < 0)
                        break;

                cb->args[0] = bucket;
                cb->args[1] = obj;
        }
        rcu_read_unlock();
        return skb->len;
}

static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
        [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
        [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED },
        [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
        [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
        [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG },
        [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 },
        [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 },
};

static struct genl_ops dp_flow_genl_ops[] = {
        { .cmd = OVS_FLOW_CMD_NEW,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN in the netns' user namespace. */
          .policy = flow_policy,
          .doit = ovs_flow_cmd_new
        },
        { .cmd = OVS_FLOW_CMD_DEL,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN in the netns' user namespace. */
          .policy = flow_policy,
          .doit = ovs_flow_cmd_del
        },
        { .cmd = OVS_FLOW_CMD_GET,
          .flags = 0,               /* OK for unprivileged users. */
          .policy = flow_policy,
          .doit = ovs_flow_cmd_get,
          .dumpit = ovs_flow_cmd_dump
        },
        { .cmd = OVS_FLOW_CMD_SET,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN in the netns' user namespace. */
          .policy = flow_policy,
          .doit = ovs_flow_cmd_set,
        },
};

static struct genl_family dp_flow_genl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = sizeof(struct ovs_header),
        .name = OVS_FLOW_FAMILY,
        .version = OVS_FLOW_VERSION,
        .maxattr = OVS_FLOW_ATTR_MAX,
        .netnsok = true,
        .parallel_ops = true,
        .ops = dp_flow_genl_ops,
        .n_ops = ARRAY_SIZE(dp_flow_genl_ops),
        .mcgrps = &ovs_dp_flow_multicast_group,
        .n_mcgrps = 1,
};
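
/* (Added commentary) With .parallel_ops = true, Generic Netlink does not
 * serialize these handlers under its global lock, so mutual exclusion
 * comes entirely from ovs_lock() and RCU as described in DOC: Locking.
 */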
1461
1462 static size_t ovs_dp_cmd_msg_size(void)
1463 {
1464         size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
1465
1466         msgsize += nla_total_size(IFNAMSIZ);
1467         msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
1468         msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats));
1469         msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */
1470
1471         return msgsize;
1472 }

/* Called with ovs_mutex. */
static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
                                u32 portid, u32 seq, u32 flags, u8 cmd)
{
        struct ovs_header *ovs_header;
        struct ovs_dp_stats dp_stats;
        struct ovs_dp_megaflow_stats dp_megaflow_stats;
        int err;

        ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
                                   flags, cmd);
        if (!ovs_header)
                goto error;

        ovs_header->dp_ifindex = get_dpifindex(dp);

        err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
        if (err)
                goto nla_put_failure;

        get_dp_stats(dp, &dp_stats, &dp_megaflow_stats);
        if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
                        &dp_stats))
                goto nla_put_failure;

        if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
                        sizeof(struct ovs_dp_megaflow_stats),
                        &dp_megaflow_stats))
                goto nla_put_failure;

        if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features))
                goto nla_put_failure;

        genlmsg_end(skb, ovs_header);
        return 0;

nla_put_failure:
        genlmsg_cancel(skb, ovs_header);
error:
        return -EMSGSIZE;
}

static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info)
{
        return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL);
}

/* Called with rcu_read_lock or ovs_mutex. */
static struct datapath *lookup_datapath(struct net *net,
                                        const struct ovs_header *ovs_header,
                                        struct nlattr *a[OVS_DP_ATTR_MAX + 1])
{
        struct datapath *dp;

        if (!a[OVS_DP_ATTR_NAME])
                dp = get_dp(net, ovs_header->dp_ifindex);
        else {
                struct vport *vport;

                vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME]));
                dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
        }
        return dp ? dp : ERR_PTR(-ENODEV);
}

static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info)
{
        struct datapath *dp;

        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
        if (IS_ERR(dp))
                return;

        WARN(dp->user_features, "Dropping previously announced user features\n");
        dp->user_features = 0;
}

static void ovs_dp_change(struct datapath *dp, struct nlattr *a[])
{
        if (a[OVS_DP_ATTR_USER_FEATURES])
                dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
}

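/* Creates a new datapath together with its OVSP_LOCAL internal vport. The
 * reply skb and the datapath itself are allocated before ovs_mutex is
 * taken, so once the datapath becomes visible the only remaining failure
 * mode is vport creation, which is unwound completely.
 */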
static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct vport_parms parms;
        struct sk_buff *reply;
        struct datapath *dp;
        struct vport *vport;
        struct ovs_net *ovs_net;
        int err, i;

        err = -EINVAL;
        if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
                goto err;

        reply = ovs_dp_cmd_alloc_info(info);
        if (!reply)
                return -ENOMEM;

        err = -ENOMEM;
        dp = kzalloc(sizeof(*dp), GFP_KERNEL);
        if (dp == NULL)
                goto err_free_reply;

        ovs_dp_set_net(dp, sock_net(skb->sk));

        /* Allocate table. */
        err = ovs_flow_tbl_init(&dp->table);
        if (err)
                goto err_free_dp;

        dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu);
        if (!dp->stats_percpu) {
                err = -ENOMEM;
                goto err_destroy_table;
        }

        dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
                            GFP_KERNEL);
        if (!dp->ports) {
                err = -ENOMEM;
                goto err_destroy_percpu;
        }

        for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
                INIT_HLIST_HEAD(&dp->ports[i]);

        /* Set up our datapath device. */
        parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
        parms.type = OVS_VPORT_TYPE_INTERNAL;
        parms.options = NULL;
        parms.dp = dp;
        parms.port_no = OVSP_LOCAL;
        parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID];

        ovs_dp_change(dp, a);

        /* So far only local changes have been made, now need the lock. */
        ovs_lock();

        vport = new_vport(&parms);
        if (IS_ERR(vport)) {
                err = PTR_ERR(vport);
                if (err == -EBUSY)
                        err = -EEXIST;

                if (err == -EEXIST) {
                        /* An outdated user space instance that does not understand
                         * the concept of user_features has attempted to create a new
                         * datapath and is likely to reuse it. Drop all user features.
                         */
                        if (info->genlhdr->version < OVS_DP_VER_FEATURES)
                                ovs_dp_reset_user_features(skb, info);
                }

                goto err_destroy_ports_array;
        }

        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
                                   info->snd_seq, 0, OVS_DP_CMD_NEW);
        BUG_ON(err < 0);

        ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id);
        list_add_tail_rcu(&dp->list_node, &ovs_net->dps);

        ovs_unlock();

        ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info);
        return 0;

err_destroy_ports_array:
        ovs_unlock();
        kfree(dp->ports);
err_destroy_percpu:
        free_percpu(dp->stats_percpu);
err_destroy_table:
        ovs_flow_tbl_destroy(&dp->table);
err_free_dp:
        kfree(dp);
err_free_reply:
        kfree_skb(reply);
err:
        return err;
}

/* Called with ovs_mutex. */
static void __dp_destroy(struct datapath *dp)
{
        int i;

        for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
                struct vport *vport;
                struct hlist_node *n;

                hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node)
                        if (vport->port_no != OVSP_LOCAL)
                                ovs_dp_detach_port(vport);
        }

        list_del_rcu(&dp->list_node);

        /* OVSP_LOCAL is the datapath's internal port. Make sure all other
         * ports have been destroyed before it is detached and the datapath
         * itself is freed.
         */
        ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));

        /* Free the flow table only after an RCU grace period, since readers
         * may still be walking it.
         */
        call_rcu(&dp->rcu, destroy_dp_rcu);
}

static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct sk_buff *reply;
        struct datapath *dp;
        int err;

        reply = ovs_dp_cmd_alloc_info(info);
        if (!reply)
                return -ENOMEM;

        ovs_lock();
        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
        err = PTR_ERR(dp);
        if (IS_ERR(dp))
                goto err_unlock_free;

        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
                                   info->snd_seq, 0, OVS_DP_CMD_DEL);
        BUG_ON(err < 0);

        __dp_destroy(dp);
        ovs_unlock();

        ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info);
        return 0;

err_unlock_free:
        ovs_unlock();
        kfree_skb(reply);
        return err;
}

static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
        struct sk_buff *reply;
        struct datapath *dp;
        int err;

        reply = ovs_dp_cmd_alloc_info(info);
        if (!reply)
                return -ENOMEM;

        ovs_lock();
        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
        err = PTR_ERR(dp);
        if (IS_ERR(dp))
                goto err_unlock_free;

        ovs_dp_change(dp, info->attrs);

        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
                                   info->snd_seq, 0, OVS_DP_CMD_NEW);
        BUG_ON(err < 0);

        ovs_unlock();

        ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info);
        return 0;

err_unlock_free:
        ovs_unlock();
        kfree_skb(reply);
        return err;
}

static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct sk_buff *reply;
        struct datapath *dp;
        int err;

        reply = ovs_dp_cmd_alloc_info(info);
        if (!reply)
                return -ENOMEM;

        ovs_lock();
        dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
        if (IS_ERR(dp)) {
                err = PTR_ERR(dp);
                goto err_unlock_free;
        }
        err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
                                   info->snd_seq, 0, OVS_DP_CMD_NEW);
        BUG_ON(err < 0);
        ovs_unlock();

        return genlmsg_reply(reply, info);

err_unlock_free:
        ovs_unlock();
        kfree_skb(reply);
        return err;
}

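/* Dump callback: cb->args[0] holds the number of datapaths already dumped;
 * each invocation restarts from the head of the per-net list and skips that
 * many entries, so a list that changes mid-dump may cause entries to be
 * skipped or repeated (the usual netlink dump caveat).
 */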
static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
        struct datapath *dp;
        int skip = cb->args[0];
        int i = 0;

        ovs_lock();
        list_for_each_entry(dp, &ovs_net->dps, list_node) {
                if (i >= skip &&
                    ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
                                         cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                         OVS_DP_CMD_NEW) < 0)
                        break;
                i++;
        }
        ovs_unlock();

        cb->args[0] = i;

        return skb->len;
}

static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
        [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
        [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
        [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
};

static struct genl_ops dp_datapath_genl_ops[] = {
        { .cmd = OVS_DP_CMD_NEW,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN in the owning user namespace. */
          .policy = datapath_policy,
          .doit = ovs_dp_cmd_new
        },
        { .cmd = OVS_DP_CMD_DEL,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN in the owning user namespace. */
          .policy = datapath_policy,
          .doit = ovs_dp_cmd_del
        },
        { .cmd = OVS_DP_CMD_GET,
          .flags = 0,               /* OK for unprivileged users. */
          .policy = datapath_policy,
          .doit = ovs_dp_cmd_get,
          .dumpit = ovs_dp_cmd_dump
        },
        { .cmd = OVS_DP_CMD_SET,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN in the owning user namespace. */
          .policy = datapath_policy,
          .doit = ovs_dp_cmd_set,
        },
};

static struct genl_family dp_datapath_genl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = sizeof(struct ovs_header),
        .name = OVS_DATAPATH_FAMILY,
        .version = OVS_DATAPATH_VERSION,
        .maxattr = OVS_DP_ATTR_MAX,
        .netnsok = true,
        .parallel_ops = true,
        .ops = dp_datapath_genl_ops,
        .n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
        .mcgrps = &ovs_dp_datapath_multicast_group,
        .n_mcgrps = 1,
};

/* Called with ovs_mutex or RCU read lock. */
static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
                                   u32 portid, u32 seq, u32 flags, u8 cmd)
{
        struct ovs_header *ovs_header;
        struct ovs_vport_stats vport_stats;
        int err;

        ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family,
                                 flags, cmd);
        if (!ovs_header)
                return -EMSGSIZE;

        ovs_header->dp_ifindex = get_dpifindex(vport->dp);

        if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
            nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
            nla_put_string(skb, OVS_VPORT_ATTR_NAME,
                           ovs_vport_name(vport)))
                goto nla_put_failure;

        ovs_vport_get_stats(vport, &vport_stats);
        if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
                    &vport_stats))
                goto nla_put_failure;

        if (ovs_vport_get_upcall_portids(vport, skb))
                goto nla_put_failure;

        err = ovs_vport_get_options(vport, skb);
        if (err == -EMSGSIZE)
                goto error;

        genlmsg_end(skb, ovs_header);
        return 0;

nla_put_failure:
        err = -EMSGSIZE;
error:
        genlmsg_cancel(skb, ovs_header);
        return err;
}

static struct sk_buff *ovs_vport_cmd_alloc_info(void)
{
        return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
}

/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid,
                                         u32 seq, u8 cmd)
{
        struct sk_buff *skb;
        int retval;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        if (!skb)
                return ERR_PTR(-ENOMEM);

        retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd);
        BUG_ON(retval < 0);

        return skb;
}

/* Called with ovs_mutex or RCU read lock. */
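/* Resolves a vport either by name (OVS_VPORT_ATTR_NAME, which is unique
 * within the net namespace; dp_ifindex, if nonzero, must then agree) or by
 * port number, which additionally requires a valid dp_ifindex.
 */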
static struct vport *lookup_vport(struct net *net,
                                  const struct ovs_header *ovs_header,
                                  struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
{
        struct datapath *dp;
        struct vport *vport;

        if (a[OVS_VPORT_ATTR_NAME]) {
                vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME]));
                if (!vport)
                        return ERR_PTR(-ENODEV);
                if (ovs_header->dp_ifindex &&
                    ovs_header->dp_ifindex != get_dpifindex(vport->dp))
                        return ERR_PTR(-ENODEV);
                return vport;
        } else if (a[OVS_VPORT_ATTR_PORT_NO]) {
                u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);

                if (port_no >= DP_MAX_PORTS)
                        return ERR_PTR(-EFBIG);

                dp = get_dp(net, ovs_header->dp_ifindex);
                if (!dp)
                        return ERR_PTR(-ENODEV);

                vport = ovs_vport_ovsl_rcu(dp, port_no);
                if (!vport)
                        return ERR_PTR(-ENODEV);
                return vport;
        } else
                return ERR_PTR(-EINVAL);
}

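/* If no OVS_VPORT_ATTR_PORT_NO is given, the lowest free port number
 * starting from 1 is assigned. The restart label handles new_vport()
 * returning -EAGAIN, which the vport layer uses after loading a vport
 * module to ask for the whole operation to be retried.
 */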
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct vport_parms parms;
        struct sk_buff *reply;
        struct vport *vport;
        struct datapath *dp;
        u32 port_no;
        int err;

        if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
            !a[OVS_VPORT_ATTR_UPCALL_PID])
                return -EINVAL;

        port_no = a[OVS_VPORT_ATTR_PORT_NO]
                ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0;
        if (port_no >= DP_MAX_PORTS)
                return -EFBIG;

        reply = ovs_vport_cmd_alloc_info();
        if (!reply)
                return -ENOMEM;

        ovs_lock();
restart:
        dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
        err = -ENODEV;
        if (!dp)
                goto exit_unlock_free;

        if (port_no) {
                vport = ovs_vport_ovsl(dp, port_no);
                err = -EBUSY;
                if (vport)
                        goto exit_unlock_free;
        } else {
                for (port_no = 1; ; port_no++) {
                        if (port_no >= DP_MAX_PORTS) {
                                err = -EFBIG;
                                goto exit_unlock_free;
                        }
                        vport = ovs_vport_ovsl(dp, port_no);
                        if (!vport)
                                break;
                }
        }

        parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
        parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
        parms.options = a[OVS_VPORT_ATTR_OPTIONS];
        parms.dp = dp;
        parms.port_no = port_no;
        parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID];

        vport = new_vport(&parms);
        err = PTR_ERR(vport);
        if (IS_ERR(vport)) {
                if (err == -EAGAIN)
                        goto restart;
                goto exit_unlock_free;
        }

        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
                                      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
        BUG_ON(err < 0);
        ovs_unlock();

        ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info);
        return 0;

exit_unlock_free:
        ovs_unlock();
        kfree_skb(reply);
        return err;
}

static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct sk_buff *reply;
        struct vport *vport;
        int err;

        reply = ovs_vport_cmd_alloc_info();
        if (!reply)
                return -ENOMEM;

        ovs_lock();
        vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
        err = PTR_ERR(vport);
        if (IS_ERR(vport))
                goto exit_unlock_free;

        if (a[OVS_VPORT_ATTR_TYPE] &&
            nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) {
                err = -EINVAL;
                goto exit_unlock_free;
        }

        if (a[OVS_VPORT_ATTR_OPTIONS]) {
                err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
                if (err)
                        goto exit_unlock_free;
        }

        if (a[OVS_VPORT_ATTR_UPCALL_PID]) {
                struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID];

                err = ovs_vport_set_upcall_portids(vport, ids);
                if (err)
                        goto exit_unlock_free;
        }

        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
                                      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
        BUG_ON(err < 0);
        ovs_unlock();

        ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info);
        return 0;

exit_unlock_free:
        ovs_unlock();
        kfree_skb(reply);
        return err;
}

static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct sk_buff *reply;
        struct vport *vport;
        int err;

        reply = ovs_vport_cmd_alloc_info();
        if (!reply)
                return -ENOMEM;

        ovs_lock();
        vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
        err = PTR_ERR(vport);
        if (IS_ERR(vport))
                goto exit_unlock_free;

        if (vport->port_no == OVSP_LOCAL) {
                err = -EINVAL;
                goto exit_unlock_free;
        }

        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
                                      info->snd_seq, 0, OVS_VPORT_CMD_DEL);
        BUG_ON(err < 0);
        ovs_dp_detach_port(vport);
        ovs_unlock();

        ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info);
        return 0;

exit_unlock_free:
        ovs_unlock();
        kfree_skb(reply);
        return err;
}

static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct sk_buff *reply;
        struct vport *vport;
        int err;

        reply = ovs_vport_cmd_alloc_info();
        if (!reply)
                return -ENOMEM;

        rcu_read_lock();
        vport = lookup_vport(sock_net(skb->sk), ovs_header, a);
        err = PTR_ERR(vport);
        if (IS_ERR(vport))
                goto exit_unlock_free;
        err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
                                      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
        BUG_ON(err < 0);
        rcu_read_unlock();

        return genlmsg_reply(reply, info);

exit_unlock_free:
        rcu_read_unlock();
        kfree_skb(reply);
        return err;
}

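/* Dump cursor: cb->args[0] is the hash bucket to resume from and
 * cb->args[1] the number of vports already dumped within that bucket, so a
 * dump can be resumed across multiple netlink messages.
 */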
static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
        struct datapath *dp;
        int bucket = cb->args[0], skip = cb->args[1];
        int i, j = 0;

        rcu_read_lock();
        dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
        if (!dp) {
                rcu_read_unlock();
                return -ENODEV;
        }
        for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) {
                struct vport *vport;

                j = 0;
                hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
                        if (j >= skip &&
                            ovs_vport_cmd_fill_info(vport, skb,
                                                    NETLINK_CB(cb->skb).portid,
                                                    cb->nlh->nlmsg_seq,
                                                    NLM_F_MULTI,
                                                    OVS_VPORT_CMD_NEW) < 0)
                                goto out;

                        j++;
                }
                skip = 0;
        }
out:
        rcu_read_unlock();

        cb->args[0] = i;
        cb->args[1] = j;

        return skb->len;
}

static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
        [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
        [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
        [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
        [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
        [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
        [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
};

static struct genl_ops dp_vport_genl_ops[] = {
        { .cmd = OVS_VPORT_CMD_NEW,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN in the owning user namespace. */
          .policy = vport_policy,
          .doit = ovs_vport_cmd_new
        },
        { .cmd = OVS_VPORT_CMD_DEL,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN in the owning user namespace. */
          .policy = vport_policy,
          .doit = ovs_vport_cmd_del
        },
        { .cmd = OVS_VPORT_CMD_GET,
          .flags = 0,               /* OK for unprivileged users. */
          .policy = vport_policy,
          .doit = ovs_vport_cmd_get,
          .dumpit = ovs_vport_cmd_dump
        },
        { .cmd = OVS_VPORT_CMD_SET,
          .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN in the owning user namespace. */
          .policy = vport_policy,
          .doit = ovs_vport_cmd_set,
        },
};

struct genl_family dp_vport_genl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = sizeof(struct ovs_header),
        .name = OVS_VPORT_FAMILY,
        .version = OVS_VPORT_VERSION,
        .maxattr = OVS_VPORT_ATTR_MAX,
        .netnsok = true,
        .parallel_ops = true,
        .ops = dp_vport_genl_ops,
        .n_ops = ARRAY_SIZE(dp_vport_genl_ops),
        .mcgrps = &ovs_dp_vport_multicast_group,
        .n_mcgrps = 1,
};

static struct genl_family *dp_genl_families[] = {
        &dp_datapath_genl_family,
        &dp_vport_genl_family,
        &dp_flow_genl_family,
        &dp_packet_genl_family,
};

static void dp_unregister_genl(int n_families)
{
        int i;

        for (i = 0; i < n_families; i++)
                genl_unregister_family(dp_genl_families[i]);
}

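/* Registers all four families; on failure, dp_unregister_genl(i) unwinds
 * exactly the i families that were registered before the one that failed.
 */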
static int dp_register_genl(void)
{
        int err;
        int i;

        for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {

                err = genl_register_family(dp_genl_families[i]);
                if (err)
                        goto error;
        }

        return 0;

error:
        dp_unregister_genl(i);
        return err;
}

static int __net_init ovs_init_net(struct net *net)
{
        struct ovs_net *ovs_net = net_generic(net, ovs_net_id);

        INIT_LIST_HEAD(&ovs_net->dps);
        INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
        ovs_ct_init(net);
        return 0;
}

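/* An internal vport's net device can be moved into another namespace while
 * its datapath stays behind; scan every datapath in @net and collect the
 * internal vports whose device now lives in the dying namespace @dnet, so
 * ovs_exit_net() can detach them before that namespace disappears.
 */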
static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
                                            struct list_head *head)
{
        struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
        struct datapath *dp;

        list_for_each_entry(dp, &ovs_net->dps, list_node) {
                int i;

                for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
                        struct vport *vport;

                        hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) {

                                if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL)
                                        continue;

                                if (dev_net(vport->dev) == dnet)
                                        list_add(&vport->detach_list, head);
                        }
                }
        }
}

static void __net_exit ovs_exit_net(struct net *dnet)
{
        struct datapath *dp, *dp_next;
        struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id);
        struct vport *vport, *vport_next;
        struct net *net;
        LIST_HEAD(head);

        ovs_ct_exit(dnet);
        ovs_lock();
        list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
                __dp_destroy(dp);

        rtnl_lock();
        for_each_net(net)
                list_vports_from_net(net, dnet, &head);
        rtnl_unlock();

        /* Detach all vports whose net device lives in the dying namespace. */
        list_for_each_entry_safe(vport, vport_next, &head, detach_list) {
                list_del(&vport->detach_list);
                ovs_dp_detach_port(vport);
        }

        ovs_unlock();

        cancel_work_sync(&ovs_net->dp_notify_work);
}

static struct pernet_operations ovs_net_ops = {
        .init = ovs_init_net,
        .exit = ovs_exit_net,
        .id   = &ovs_net_id,
        .size = sizeof(struct ovs_net),
};

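/* Module init: each registration step is paired with an unwind label below,
 * in strict reverse order, so a failure at any point leaves nothing behind.
 */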
static int __init dp_init(void)
{
        int err;

        BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));

        pr_info("Open vSwitch switching datapath %s\n", VERSION);

        err = compat_init();
        if (err)
                goto error;

        err = action_fifos_init();
        if (err)
                goto error_compat_exit;

        err = ovs_internal_dev_rtnl_link_register();
        if (err)
                goto error_action_fifos_exit;

        err = ovs_flow_init();
        if (err)
                goto error_unreg_rtnl_link;

        err = ovs_vport_init();
        if (err)
                goto error_flow_exit;

        err = register_pernet_device(&ovs_net_ops);
        if (err)
                goto error_vport_exit;

        err = register_netdevice_notifier(&ovs_dp_device_notifier);
        if (err)
                goto error_netns_exit;

        err = ovs_netdev_init();
        if (err)
                goto error_unreg_notifier;

        err = dp_register_genl();
        if (err < 0)
                goto error_unreg_netdev;

        return 0;

error_unreg_netdev:
        ovs_netdev_exit();
error_unreg_notifier:
        unregister_netdevice_notifier(&ovs_dp_device_notifier);
error_netns_exit:
        unregister_pernet_device(&ovs_net_ops);
error_vport_exit:
        ovs_vport_exit();
error_flow_exit:
        ovs_flow_exit();
error_unreg_rtnl_link:
        ovs_internal_dev_rtnl_link_unregister();
error_action_fifos_exit:
        action_fifos_exit();
error_compat_exit:
        compat_exit();
error:
        return err;
}

static void dp_cleanup(void)
{
        dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
        ovs_netdev_exit();
        unregister_netdevice_notifier(&ovs_dp_device_notifier);
        unregister_pernet_device(&ovs_net_ops);
        rcu_barrier();  /* Wait for outstanding call_rcu() callbacks, e.g.
                         * destroy_dp_rcu(), before tearing down the caches
                         * they rely on. */
        ovs_vport_exit();
        ovs_flow_exit();
        ovs_internal_dev_rtnl_link_unregister();
        action_fifos_exit();
        compat_exit();
}

module_init(dp_init);
module_exit(dp_cleanup);

MODULE_DESCRIPTION("Open vSwitch switching datapath");
MODULE_LICENSE("GPL");
MODULE_VERSION(VERSION);