soreuseport: setsockopt SO_ATTACH_REUSEPORT_[CE]BPF

[cascardo/linux.git] / net / core / filter.c
diff --git a/net/core/filter.c b/net/core/filter.c

index bb18c36..35e6fed 100644 (file)
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -49,16 +49,18 @@
  #include <net/sch_generic.h>
  #include <net/cls_cgroup.h>
  #include <net/dst_metadata.h>
+#include <net/dst.h>
+#include <net/sock_reuseport.h>
  
  /**
   *     sk_filter - run a packet through a socket filter
   *     @sk: sock associated with &sk_buff
   *     @skb: buffer to filter
   *
- * Run the filter code and then cut skb->data to correct size returned by
- * SK_RUN_FILTER. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * Run the eBPF program and then cut skb->data to correct size returned by
+ * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
   * than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to SK_RUN_FILTER. It returns 0 if the packet should
+ * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
   * be accepted or -EPERM if the packet should be tossed.
   *
   */
@@ -82,7 +84,7 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
         rcu_read_lock();
         filter = rcu_dereference(sk->sk_filter);
         if (filter) {
-               unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
+               unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
  
                 err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
         }
@@ -148,12 +150,6 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
         return raw_smp_processor_id();
  }
  
-/* note that this only generates 32-bit random numbers */
-static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
-{
-       return prandom_u32();
-}
-
  static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
                               struct bpf_insn *insn_buf)
  {
@@ -312,7 +308,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
                         *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
                         break;
                 case SKF_AD_OFF + SKF_AD_RANDOM:
-                       *insn = BPF_EMIT_CALL(__get_random_u32);
+                       *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
+                       bpf_user_rnd_init_once();
                         break;
                 }
                 break;
@@ -352,12 +349,6 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
   *    jump offsets, 2nd pass remapping:
   *   new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
   *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
- *
- * User BPF's register A is mapped to our BPF register 6, user BPF
- * register X is mapped to BPF register 7; frame pointer is always
- * register 10; Context 'void *ctx' is stored in register 1, that is,
- * for socket filters: ctx == 'struct sk_buff *', for seccomp:
- * ctx == 'struct seccomp_data *'.
   */
  static int bpf_convert_filter(struct sock_filter *prog, int len,
                               struct bpf_insn *new_prog, int *new_len)
@@ -385,9 +376,22 @@ do_pass:
         new_insn = new_prog;
         fp = prog;
  
-       if (new_insn)
-               *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
-       new_insn++;
+       /* Classic BPF related prologue emission. */
+       if (new_insn) {
+               /* Classic BPF expects A and X to be reset first. These need
+                * to be guaranteed to be the first two instructions.
+                */
+               *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+               *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
+
+               /* All programs must keep CTX in callee-saved BPF_REG_CTX.
+                * In the eBPF case this is done by the compiler; here we need
+                * to do it ourselves. Initial CTX is present in BPF_REG_ARG1.
+                */
+               *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+       } else {
+               new_insn += 3;
+       }
  
         for (i = 0; i < len; fp++, i++) {
                 struct bpf_insn tmp_insns[6] = { };
@@ -1001,7 +1005,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
         int err;
  
         fp->bpf_func = NULL;
-       fp->jited = false;
+       fp->jited = 0;
  
         err = bpf_check_classic(fp->insns, fp->len);
         if (err) {
@@ -1083,16 +1087,18 @@ EXPORT_SYMBOL_GPL(bpf_prog_create);
   *     @pfp: the unattached filter that is created
   *     @fprog: the filter program
   *     @trans: post-classic verifier transformation handler
+ *     @save_orig: save classic BPF program
   *
   * This function effectively does the same as bpf_prog_create(), only
   * that it builds up its insns buffer from user space provided buffer.
   * It also allows for passing a bpf_aux_classic_check_t handler.
   */
  int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
-                             bpf_aux_classic_check_t trans)
+                             bpf_aux_classic_check_t trans, bool save_orig)
  {
         unsigned int fsize = bpf_classic_proglen(fprog);
         struct bpf_prog *fp;
+       int err;
  
         /* Make sure new filter is there and in the right amounts. */
         if (fprog->filter == NULL)
@@ -1108,12 +1114,16 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
         }
  
         fp->len = fprog->len;
-       /* Since unattached filters are not copied back to user
-        * space through sk_get_filter(), we do not need to hold
-        * a copy here, and can spare us the work.
-        */
         fp->orig_prog = NULL;
  
+       if (save_orig) {
+               err = bpf_prog_store_orig_filter(fp, fprog);
+               if (err) {
+                       __bpf_prog_free(fp);
+                       return -ENOMEM;
+               }
+       }
+
         /* bpf_prepare_filter() already takes care of freeing
          * memory in case something goes wrong.
          */
@@ -1158,17 +1168,32 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
         return 0;
  }
  
-/**
- *     sk_attach_filter - attach a socket filter
- *     @fprog: the filter program
- *     @sk: the socket to use
- *
- * Attach the user's filter code. We first run some sanity checks on
- * it to make sure it does not explode on us later. If an error
- * occurs or there is insufficient memory for the filter a negative
- * errno code is returned. On success the return is zero.
- */
-int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
+{      /* Hand @prog to reuseport_attach_prog() for @sk; returns 0 or a negative errno. */
+       struct bpf_prog *old_prog;      /* return value of reuseport_attach_prog() */
+       int err;
+
+       if (bpf_prog_size(prog->len) > sysctl_optmem_max) /* program larger than the optmem limit */
+               return -ENOMEM;
+
+       if (sk_unhashed(sk)) {
+               err = reuseport_alloc(sk); /* presumably sets up sk->sk_reuseport_cb - TODO confirm */
+               if (err)
+                       return err;
+       } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
+               /* The socket wasn't bound with SO_REUSEPORT */
+               return -EINVAL;
+       }
+
+       old_prog = reuseport_attach_prog(sk, prog);
+       if (old_prog) /* presumably the previously attached program - confirm */
+               bpf_prog_destroy(old_prog);
+
+       return 0;
+}
+
+static
+struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
  {
         unsigned int fsize = bpf_classic_proglen(fprog);
         unsigned int bpf_fsize = bpf_prog_size(fprog->len);
@@ -1176,19 +1201,19 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
         int err;
  
         if (sock_flag(sk, SOCK_FILTER_LOCKED))
-               return -EPERM;
+               return ERR_PTR(-EPERM);
  
         /* Make sure new filter is there and in the right amounts. */
         if (fprog->filter == NULL)
-               return -EINVAL;
+               return ERR_PTR(-EINVAL);
  
         prog = bpf_prog_alloc(bpf_fsize, 0);
         if (!prog)
-               return -ENOMEM;
+               return ERR_PTR(-ENOMEM);
  
         if (copy_from_user(prog->insns, fprog->filter, fsize)) {
                 __bpf_prog_free(prog);
-               return -EFAULT;
+               return ERR_PTR(-EFAULT);
         }
  
         prog->len = fprog->len;
@@ -1196,13 +1221,30 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
         err = bpf_prog_store_orig_filter(prog, fprog);
         if (err) {
                 __bpf_prog_free(prog);
-               return -ENOMEM;
+               return ERR_PTR(-ENOMEM);
         }
  
         /* bpf_prepare_filter() already takes care of freeing
          * memory in case something goes wrong.
          */
-       prog = bpf_prepare_filter(prog, NULL);
+       return bpf_prepare_filter(prog, NULL);
+}
+
+/**
+ *     sk_attach_filter - attach a socket filter
+ *     @fprog: the filter program
+ *     @sk: the socket to use
+ *
+ * Attach the user's filter code. We first run some sanity checks on
+ * it to make sure it does not explode on us later. If an error
+ * occurs or there is insufficient memory for the filter a negative
+ * errno code is returned. On success the return is zero.
+ */
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+       struct bpf_prog *prog = __get_filter(fprog, sk);
+       int err;
+
         if (IS_ERR(prog))
                 return PTR_ERR(prog);
  
@@ -1216,23 +1258,50 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
  }
  EXPORT_SYMBOL_GPL(sk_attach_filter);
  
-int sk_attach_bpf(u32 ufd, struct sock *sk)
+int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) /* 0 or negative errno */
  {
-       struct bpf_prog *prog;
+       struct bpf_prog *prog = __get_filter(fprog, sk); /* program built from @fprog, or an ERR_PTR() */
         int err;
  
+       if (IS_ERR(prog))
+               return PTR_ERR(prog);
+
+       err = __reuseport_attach_prog(prog, sk); /* reuseport attach, not the plain socket-filter one */
+       if (err < 0) {
+               __bpf_prog_release(prog); /* attach failed: release the program obtained above */
+               return err;
+       }
+
+       return 0;
+}
+
+static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
+{      /* Resolve @ufd to a BPF_PROG_TYPE_SOCKET_FILTER program; returns it or an ERR_PTR(). */
+       struct bpf_prog *prog;
+
         if (sock_flag(sk, SOCK_FILTER_LOCKED))
-               return -EPERM;
+               return ERR_PTR(-EPERM); /* SOCK_FILTER_LOCKED is set on @sk */
  
         prog = bpf_prog_get(ufd);
         if (IS_ERR(prog))
-               return PTR_ERR(prog);
+               return prog; /* already an ERR_PTR(); pass it through unchanged */
  
         if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
                 bpf_prog_put(prog);
-               return -EINVAL;
+               return ERR_PTR(-EINVAL); /* wrong program type; bpf_prog_put() was called above */
         }
  
+       return prog;
+}
+
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+       struct bpf_prog *prog = __get_bpf(ufd, sk);
+       int err;
+
+       if (IS_ERR(prog))
+               return PTR_ERR(prog);
+
         err = __sk_attach_prog(prog, sk);
         if (err < 0) {
                 bpf_prog_put(prog);
@@ -1242,7 +1311,25 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
         return 0;
  }
  
+int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) /* returns 0 or a negative errno */
+{
+       struct bpf_prog *prog = __get_bpf(ufd, sk); /* program for @ufd, or an ERR_PTR() */
+       int err;
+
+       if (IS_ERR(prog))
+               return PTR_ERR(prog);
+
+       err = __reuseport_attach_prog(prog, sk); /* reuseport attach, not the plain socket-filter one */
+       if (err < 0) {
+               bpf_prog_put(prog); /* attach failed: put the program obtained by __get_bpf() */
+               return err;
+       }
+
+       return 0;
+}
+
  #define BPF_RECOMPUTE_CSUM(flags)      ((flags) & 1)
+#define BPF_LDST_LEN                   16U
  
  static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
  {
@@ -1250,7 +1337,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
         int offset = (int) r2;
         void *from = (void *) (long) r3;
         unsigned int len = (unsigned int) r4;
-       char buf[16];
+       char buf[BPF_LDST_LEN];
         void *ptr;
  
         /* bpf verifier guarantees that:
@@ -1297,6 +1384,36 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = {
         .arg5_type      = ARG_ANYTHING,
  };
  
+static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{      /* Copy r4 bytes at skb offset r2 into the buffer at r3; returns 0 or -EFAULT. */
+       const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
+       int offset = (int) r2;
+       void *to = (void *)(unsigned long) r3;
+       unsigned int len = (unsigned int) r4;
+       void *ptr;
+
+       if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN)) /* u32 cast also rejects offset < 0 */
+               return -EFAULT;
+
+       ptr = skb_header_pointer(skb, offset, len, to);
+       if (unlikely(!ptr))
+               return -EFAULT;
+       if (ptr != to) /* returned pointer is not @to: copy the bytes over */
+               memcpy(to, ptr, len);
+
+       return 0;
+}
+
+const struct bpf_func_proto bpf_skb_load_bytes_proto = { /* describes bpf_skb_load_bytes() */
+       .func           = bpf_skb_load_bytes,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX, /* r1: cast to the skb by the helper */
+       .arg2_type      = ARG_ANYTHING, /* r2: offset */
+       .arg3_type      = ARG_PTR_TO_STACK, /* r3: destination buffer */
+       .arg4_type      = ARG_CONST_STACK_SIZE, /* r4: number of bytes */
+};
+
  #define BPF_HEADER_FIELD_SIZE(flags)   ((flags) & 0x0f)
  #define BPF_IS_PSEUDO_HEADER(flags)    ((flags) & 0x10)
  
@@ -1404,9 +1521,6 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
         if (unlikely(!dev))
                 return -EINVAL;
  
-       if (unlikely(!(dev->flags & IFF_UP)))
-               return -EINVAL;
-
         skb2 = skb_clone(skb, GFP_ATOMIC);
         if (unlikely(!skb2))
                 return -ENOMEM;
@@ -1428,6 +1542,49 @@ const struct bpf_func_proto bpf_clone_redirect_proto = {
         .arg3_type      = ARG_ANYTHING,
  };
  
+struct redirect_info { /* values passed from bpf_redirect() to skb_do_redirect() */
+       u32 ifindex; /* device index to look up; zeroed again by skb_do_redirect() */
+       u32 flags; /* tested with BPF_IS_REDIRECT_INGRESS() in skb_do_redirect() */
+};
+
+static DEFINE_PER_CPU(struct redirect_info, redirect_info); /* one instance per CPU */
+static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
+{      /* Record the target in this CPU's redirect_info; always returns TC_ACT_REDIRECT. */
+       struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+       ri->ifindex = ifindex; /* u64 arguments are narrowed to the u32 fields */
+       ri->flags = flags;
+       return TC_ACT_REDIRECT;
+}
+
+int skb_do_redirect(struct sk_buff *skb)
+{      /* Send @skb to the device recorded in this CPU's redirect_info. */
+       struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+       struct net_device *dev;
+
+       dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); /* NOTE(review): _rcu lookup, presumably under rcu_read_lock - confirm */
+       ri->ifindex = 0; /* cleared whether or not the lookup succeeded */
+       if (unlikely(!dev)) {
+               kfree_skb(skb); /* no such device: the skb is freed here */
+               return -EINVAL;
+       }
+
+       if (BPF_IS_REDIRECT_INGRESS(ri->flags))
+               return dev_forward_skb(dev, skb); /* ingress flag set in ri->flags */
+
+       skb->dev = dev; /* otherwise retarget the skb and queue it for transmit */
+       skb_sender_cpu_clear(skb);
+       return dev_queue_xmit(skb);
+}
+
+const struct bpf_func_proto bpf_redirect_proto = { /* describes bpf_redirect() */
+       .func           = bpf_redirect,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_ANYTHING, /* ifindex */
+       .arg2_type      = ARG_ANYTHING, /* flags */
+};
+
  static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
  {
         return task_get_classid((struct sk_buff *) (unsigned long) r1);
@@ -1440,6 +1597,25 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
         .arg1_type      = ARG_PTR_TO_CTX,
  };
  
+static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{      /* Return tclassid of the skb's dst entry, or 0 when it is not available. */
+#ifdef CONFIG_IP_ROUTE_CLASSID
+       const struct dst_entry *dst;
+
+       dst = skb_dst((struct sk_buff *) (unsigned long) r1); /* r1 carries the skb pointer */
+       if (dst)
+               return dst->tclassid;
+#endif /* CONFIG_IP_ROUTE_CLASSID */
+       return 0; /* no dst, or CONFIG_IP_ROUTE_CLASSID is not set */
+}
+
+static const struct bpf_func_proto bpf_get_route_realm_proto = { /* describes bpf_get_route_realm() */
+       .func           = bpf_get_route_realm,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX, /* r1: cast to the skb by the helper */
+};
+
  static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
  {
         struct sk_buff *skb = (struct sk_buff *) (long) r1;
@@ -1580,7 +1756,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
         case BPF_FUNC_ktime_get_ns:
                 return &bpf_ktime_get_ns_proto;
         case BPF_FUNC_trace_printk:
-               return bpf_get_trace_printk_proto();
+               if (capable(CAP_SYS_ADMIN))
+                       return bpf_get_trace_printk_proto();
         default:
                 return NULL;
         }
@@ -1592,6 +1769,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
         switch (func_id) {
         case BPF_FUNC_skb_store_bytes:
                 return &bpf_skb_store_bytes_proto;
+       case BPF_FUNC_skb_load_bytes:
+               return &bpf_skb_load_bytes_proto;
         case BPF_FUNC_l3_csum_replace:
                 return &bpf_l3_csum_replace_proto;
         case BPF_FUNC_l4_csum_replace:
@@ -1608,6 +1787,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
                 return &bpf_skb_get_tunnel_key_proto;
         case BPF_FUNC_skb_set_tunnel_key:
                 return bpf_get_skb_set_tunnel_key_proto();
+       case BPF_FUNC_redirect:
+               return &bpf_redirect_proto;
+       case BPF_FUNC_get_route_realm:
+               return &bpf_get_route_realm_proto;
         default:
                 return sk_filter_func_proto(func_id);
         }
@@ -1633,6 +1816,9 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type)
  static bool sk_filter_is_valid_access(int off, int size,
                                       enum bpf_access_type type)
  {
+       if (off == offsetof(struct __sk_buff, tc_classid))
+               return false;
+
         if (type == BPF_WRITE) {
                 switch (off) {
                 case offsetof(struct __sk_buff, cb[0]) ...
@@ -1649,10 +1835,14 @@ static bool sk_filter_is_valid_access(int off, int size,
  static bool tc_cls_act_is_valid_access(int off, int size,
                                        enum bpf_access_type type)
  {
+       if (off == offsetof(struct __sk_buff, tc_classid))
+               return type == BPF_WRITE ? true : false;
+
         if (type == BPF_WRITE) {
                 switch (off) {
                 case offsetof(struct __sk_buff, mark):
                 case offsetof(struct __sk_buff, tc_index):
+               case offsetof(struct __sk_buff, priority):
                 case offsetof(struct __sk_buff, cb[0]) ...
                         offsetof(struct __sk_buff, cb[4]):
                         break;
@@ -1665,7 +1855,8 @@ static bool tc_cls_act_is_valid_access(int off, int size,
  
  static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
                                       int src_reg, int ctx_off,
-                                     struct bpf_insn *insn_buf)
+                                     struct bpf_insn *insn_buf,
+                                     struct bpf_prog *prog)
  {
         struct bpf_insn *insn = insn_buf;
  
@@ -1694,8 +1885,12 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
         case offsetof(struct __sk_buff, priority):
                 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
  
-               *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
-                                     offsetof(struct sk_buff, priority));
+               if (type == BPF_WRITE)
+                       *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+                                             offsetof(struct sk_buff, priority));
+               else
+                       *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+                                             offsetof(struct sk_buff, priority));
                 break;
  
         case offsetof(struct __sk_buff, ingress_ifindex):
@@ -1752,6 +1947,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
                 offsetof(struct __sk_buff, cb[4]):
                 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
  
+               prog->cb_access = 1;
                 ctx_off -= offsetof(struct __sk_buff, cb[0]);
                 ctx_off += offsetof(struct sk_buff, cb);
                 ctx_off += offsetof(struct qdisc_skb_cb, data);
@@ -1761,6 +1957,14 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
                         *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
                 break;
  
+       case offsetof(struct __sk_buff, tc_classid):
+               ctx_off -= offsetof(struct __sk_buff, tc_classid);
+               ctx_off += offsetof(struct sk_buff, cb);
+               ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
+               WARN_ON(type != BPF_WRITE);
+               *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+               break;
+
         case offsetof(struct __sk_buff, tc_index):
  #ifdef CONFIG_NET_SCHED
                 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);