IB/ipoib: Avoid flushing the workqueue from worker context
[cascardo/linux.git] drivers/infiniband/ulp/ipoib/ipoib_main.c
1 /*
2  * Copyright (c) 2004 Topspin Communications.  All rights reserved.
3  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
4  * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenIB.org BSD license below:
11  *
12  *     Redistribution and use in source and binary forms, with or
13  *     without modification, are permitted provided that the following
14  *     conditions are met:
15  *
16  *      - Redistributions of source code must retain the above
17  *        copyright notice, this list of conditions and the following
18  *        disclaimer.
19  *
20  *      - Redistributions in binary form must reproduce the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer in the documentation and/or other materials
23  *        provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  */
34
35 #include "ipoib.h"
36
37 #include <linux/module.h>
38
39 #include <linux/init.h>
40 #include <linux/slab.h>
41 #include <linux/kernel.h>
42 #include <linux/vmalloc.h>
43
44 #include <linux/if_arp.h>       /* For ARPHRD_xxx */
45
46 #include <linux/ip.h>
47 #include <linux/in.h>
48
49 #include <linux/jhash.h>
50 #include <net/arp.h>
51
52 #define DRV_VERSION "1.0.0"
53
54 const char ipoib_driver_version[] = DRV_VERSION;
55
56 MODULE_AUTHOR("Roland Dreier");
57 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
58 MODULE_LICENSE("Dual BSD/GPL");
59 MODULE_VERSION(DRV_VERSION);
60
61 int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
62 int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
63
64 module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
65 MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
66 module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
67 MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
68
69 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
70 int ipoib_debug_level;
71
72 module_param_named(debug_level, ipoib_debug_level, int, 0644);
73 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
74 #endif
75
76 struct ipoib_path_iter {
77         struct net_device *dev;
78         struct ipoib_path  path;
79 };
80
81 static const u8 ipv4_bcast_addr[] = {
82         0x00, 0xff, 0xff, 0xff,
83         0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
84         0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
85 };
86
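/*
 * Single driver-wide workqueue for deferred IPoIB work: multicast joins
 * and restarts, device flushes, and neigh garbage collection all run here
 * (see the queue_work()/queue_delayed_work() calls below).
 */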
87 struct workqueue_struct *ipoib_workqueue;
88
89 struct ib_sa_client ipoib_sa_client;
90
91 static void ipoib_add_one(struct ib_device *device);
92 static void ipoib_remove_one(struct ib_device *device);
93 static void ipoib_neigh_reclaim(struct rcu_head *rp);
94
95 static struct ib_client ipoib_client = {
96         .name   = "ipoib",
97         .add    = ipoib_add_one,
98         .remove = ipoib_remove_one
99 };
100
101 int ipoib_open(struct net_device *dev)
102 {
103         struct ipoib_dev_priv *priv = netdev_priv(dev);
104
105         ipoib_dbg(priv, "bringing up interface\n");
106
107         netif_carrier_off(dev);
108
109         set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
110
111
112         ipoib_pkey_dev_check_presence(dev);
113
114         if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
115                 return 0;
116
117         if (ipoib_ib_dev_open(dev, 1))
118                 goto err_disable;
119
120         if (ipoib_ib_dev_up(dev))
121                 goto err_stop;
122
123         if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
124                 struct ipoib_dev_priv *cpriv;
125
126                 /* Bring up any child interfaces too */
127                 down_read(&priv->vlan_rwsem);
128                 list_for_each_entry(cpriv, &priv->child_intfs, list) {
129                         int flags;
130
131                         flags = cpriv->dev->flags;
132                         if (flags & IFF_UP)
133                                 continue;
134
135                         dev_change_flags(cpriv->dev, flags | IFF_UP);
136                 }
137                 up_read(&priv->vlan_rwsem);
138         }
139
140         netif_start_queue(dev);
141
142         return 0;
143
144 err_stop:
145         ipoib_ib_dev_stop(dev, 1);
146
147 err_disable:
148         clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
149
150         return -EINVAL;
151 }
152
153 static int ipoib_stop(struct net_device *dev)
154 {
155         struct ipoib_dev_priv *priv = netdev_priv(dev);
156
157         ipoib_dbg(priv, "stopping interface\n");
158
159         clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
160
161         netif_stop_queue(dev);
162
163         ipoib_ib_dev_down(dev, 1);
164         ipoib_ib_dev_stop(dev, 0);
165
166         if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
167                 struct ipoib_dev_priv *cpriv;
168
169                 /* Bring down any child interfaces too */
170                 down_read(&priv->vlan_rwsem);
171                 list_for_each_entry(cpriv, &priv->child_intfs, list) {
172                         int flags;
173
174                         flags = cpriv->dev->flags;
175                         if (!(flags & IFF_UP))
176                                 continue;
177
178                         dev_change_flags(cpriv->dev, flags & ~IFF_UP);
179                 }
180                 up_read(&priv->vlan_rwsem);
181         }
182
183         return 0;
184 }
185
186 static void ipoib_uninit(struct net_device *dev)
187 {
188         ipoib_dev_cleanup(dev);
189 }
190
191 static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
192 {
193         struct ipoib_dev_priv *priv = netdev_priv(dev);
194
195         if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
196                 features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
197
198         return features;
199 }
200
201 static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
202 {
203         struct ipoib_dev_priv *priv = netdev_priv(dev);
204
205         /* dev->mtu > 2K ==> connected mode */
206         if (ipoib_cm_admin_enabled(dev)) {
207                 if (new_mtu > ipoib_cm_max_mtu(dev))
208                         return -EINVAL;
209
210                 if (new_mtu > priv->mcast_mtu)
211                         ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
212                                    priv->mcast_mtu);
213
214                 dev->mtu = new_mtu;
215                 return 0;
216         }
217
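        /*
         * In datagram (UD) mode the ceiling is the IB MTU minus the 4-byte
         * IPoIB encapsulation header, e.g. IPOIB_UD_MTU(2048) == 2044.
         */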
218         if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
219                 return -EINVAL;
220
221         priv->admin_mtu = new_mtu;
222
223         dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
224
225         return 0;
226 }
227
228 int ipoib_set_mode(struct net_device *dev, const char *buf)
229 {
230         struct ipoib_dev_priv *priv = netdev_priv(dev);
231
232         /* flush paths if we switch modes so that connections are restarted */
233         if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
234                 set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
235                 ipoib_warn(priv, "enabling connected mode "
236                            "will cause multicast packet drops\n");
237                 netdev_update_features(dev);
238                 rtnl_unlock();
239                 priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
240
241                 ipoib_flush_paths(dev);
242                 rtnl_lock();
243                 return 0;
244         }
245
246         if (!strcmp(buf, "datagram\n")) {
247                 clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
248                 netdev_update_features(dev);
249                 dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
250                 rtnl_unlock();
251                 ipoib_flush_paths(dev);
252                 rtnl_lock();
253                 return 0;
254         }
255
256         return -EINVAL;
257 }
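/*
 * ipoib_set_mode() backs the per-interface sysfs "mode" attribute
 * (e.g. "echo connected > /sys/class/net/ib0/mode").  It is entered with
 * the RTNL lock held, which is why RTNL is dropped and retaken around
 * ipoib_flush_paths(), which sleeps in wait_for_completion().
 */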
258
259 static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
260 {
261         struct ipoib_dev_priv *priv = netdev_priv(dev);
262         struct rb_node *n = priv->path_tree.rb_node;
263         struct ipoib_path *path;
264         int ret;
265
266         while (n) {
267                 path = rb_entry(n, struct ipoib_path, rb_node);
268
269                 ret = memcmp(gid, path->pathrec.dgid.raw,
270                              sizeof (union ib_gid));
271
272                 if (ret < 0)
273                         n = n->rb_left;
274                 else if (ret > 0)
275                         n = n->rb_right;
276                 else
277                         return path;
278         }
279
280         return NULL;
281 }
282
283 static int __path_add(struct net_device *dev, struct ipoib_path *path)
284 {
285         struct ipoib_dev_priv *priv = netdev_priv(dev);
286         struct rb_node **n = &priv->path_tree.rb_node;
287         struct rb_node *pn = NULL;
288         struct ipoib_path *tpath;
289         int ret;
290
291         while (*n) {
292                 pn = *n;
293                 tpath = rb_entry(pn, struct ipoib_path, rb_node);
294
295                 ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
296                              sizeof (union ib_gid));
297                 if (ret < 0)
298                         n = &pn->rb_left;
299                 else if (ret > 0)
300                         n = &pn->rb_right;
301                 else
302                         return -EEXIST;
303         }
304
305         rb_link_node(&path->rb_node, pn, n);
306         rb_insert_color(&path->rb_node, &priv->path_tree);
307
308         list_add_tail(&path->list, &priv->path_list);
309
310         return 0;
311 }
312
313 static void path_free(struct net_device *dev, struct ipoib_path *path)
314 {
315         struct sk_buff *skb;
316
317         while ((skb = __skb_dequeue(&path->queue)))
318                 dev_kfree_skb_irq(skb);
319
320         ipoib_dbg(netdev_priv(dev), "path_free\n");
321
322         /* remove all neigh connected to this path */
323         ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
324
325         if (path->ah)
326                 ipoib_put_ah(path->ah);
327
328         kfree(path);
329 }
330
331 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
332
333 struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
334 {
335         struct ipoib_path_iter *iter;
336
337         iter = kmalloc(sizeof *iter, GFP_KERNEL);
338         if (!iter)
339                 return NULL;
340
341         iter->dev = dev;
342         memset(iter->path.pathrec.dgid.raw, 0, 16);
343
344         if (ipoib_path_iter_next(iter)) {
345                 kfree(iter);
346                 return NULL;
347         }
348
349         return iter;
350 }
351
352 int ipoib_path_iter_next(struct ipoib_path_iter *iter)
353 {
354         struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
355         struct rb_node *n;
356         struct ipoib_path *path;
357         int ret = 1;
358
359         spin_lock_irq(&priv->lock);
360
361         n = rb_first(&priv->path_tree);
362
363         while (n) {
364                 path = rb_entry(n, struct ipoib_path, rb_node);
365
366                 if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
367                            sizeof (union ib_gid)) < 0) {
368                         iter->path = *path;
369                         ret = 0;
370                         break;
371                 }
372
373                 n = rb_next(n);
374         }
375
376         spin_unlock_irq(&priv->lock);
377
378         return ret;
379 }
380
381 void ipoib_path_iter_read(struct ipoib_path_iter *iter,
382                           struct ipoib_path *path)
383 {
384         *path = iter->path;
385 }
386
387 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
388
389 void ipoib_mark_paths_invalid(struct net_device *dev)
390 {
391         struct ipoib_dev_priv *priv = netdev_priv(dev);
392         struct ipoib_path *path, *tp;
393
394         spin_lock_irq(&priv->lock);
395
396         list_for_each_entry_safe(path, tp, &priv->path_list, list) {
397                 ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
398                         be16_to_cpu(path->pathrec.dlid),
399                         path->pathrec.dgid.raw);
400                 path->valid = 0;
401         }
402
403         spin_unlock_irq(&priv->lock);
404 }
405
406 void ipoib_flush_paths(struct net_device *dev)
407 {
408         struct ipoib_dev_priv *priv = netdev_priv(dev);
409         struct ipoib_path *path, *tp;
410         LIST_HEAD(remove_list);
411         unsigned long flags;
412
413         netif_tx_lock_bh(dev);
414         spin_lock_irqsave(&priv->lock, flags);
415
416         list_splice_init(&priv->path_list, &remove_list);
417
418         list_for_each_entry(path, &remove_list, list)
419                 rb_erase(&path->rb_node, &priv->path_tree);
420
421         list_for_each_entry_safe(path, tp, &remove_list, list) {
422                 if (path->query)
423                         ib_sa_cancel_query(path->query_id, path->query);
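                /*
                 * path_rec_completion() takes priv->lock and may call
                 * dev_queue_xmit(), so drop both locks before waiting
                 * for it to signal path->done.
                 */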
424                 spin_unlock_irqrestore(&priv->lock, flags);
425                 netif_tx_unlock_bh(dev);
426                 wait_for_completion(&path->done);
427                 path_free(dev, path);
428                 netif_tx_lock_bh(dev);
429                 spin_lock_irqsave(&priv->lock, flags);
430         }
431
432         spin_unlock_irqrestore(&priv->lock, flags);
433         netif_tx_unlock_bh(dev);
434 }
435
436 static void path_rec_completion(int status,
437                                 struct ib_sa_path_rec *pathrec,
438                                 void *path_ptr)
439 {
440         struct ipoib_path *path = path_ptr;
441         struct net_device *dev = path->dev;
442         struct ipoib_dev_priv *priv = netdev_priv(dev);
443         struct ipoib_ah *ah = NULL;
444         struct ipoib_ah *old_ah = NULL;
445         struct ipoib_neigh *neigh, *tn;
446         struct sk_buff_head skqueue;
447         struct sk_buff *skb;
448         unsigned long flags;
449
450         if (!status)
451                 ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
452                           be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
453         else
454                 ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
455                           status, path->pathrec.dgid.raw);
456
457         skb_queue_head_init(&skqueue);
458
459         if (!status) {
460                 struct ib_ah_attr av;
461
462                 if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
463                         ah = ipoib_create_ah(dev, priv->pd, &av);
464         }
465
466         spin_lock_irqsave(&priv->lock, flags);
467
468         if (!IS_ERR_OR_NULL(ah)) {
469                 path->pathrec = *pathrec;
470
471                 old_ah   = path->ah;
472                 path->ah = ah;
473
474                 ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
475                           ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
476
477                 while ((skb = __skb_dequeue(&path->queue)))
478                         __skb_queue_tail(&skqueue, skb);
479
480                 list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
481                         if (neigh->ah) {
482                                 WARN_ON(neigh->ah != old_ah);
483                                 /*
484                                  * Dropping the ah reference inside
485                                  * priv->lock is safe here, because we
486                                  * will hold one more reference from
487                                  * the original value of path->ah (ie
488                                  * old_ah).
489                                  */
490                                 ipoib_put_ah(neigh->ah);
491                         }
492                         kref_get(&path->ah->ref);
493                         neigh->ah = path->ah;
494
495                         if (ipoib_cm_enabled(dev, neigh->daddr)) {
496                                 if (!ipoib_cm_get(neigh))
497                                         ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
498                                                                                path,
499                                                                                neigh));
500                                 if (!ipoib_cm_get(neigh)) {
501                                         ipoib_neigh_free(neigh);
502                                         continue;
503                                 }
504                         }
505
506                         while ((skb = __skb_dequeue(&neigh->queue)))
507                                 __skb_queue_tail(&skqueue, skb);
508                 }
509                 path->valid = 1;
510         }
511
512         path->query = NULL;
513         complete(&path->done);
514
515         spin_unlock_irqrestore(&priv->lock, flags);
516
517         if (IS_ERR_OR_NULL(ah))
518                 ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
519
520         if (old_ah)
521                 ipoib_put_ah(old_ah);
522
523         while ((skb = __skb_dequeue(&skqueue))) {
524                 skb->dev = dev;
525                 if (dev_queue_xmit(skb))
526                         ipoib_warn(priv, "dev_queue_xmit failed "
527                                    "to requeue packet\n");
528         }
529 }
530
531 static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
532 {
533         struct ipoib_dev_priv *priv = netdev_priv(dev);
534         struct ipoib_path *path;
535
536         if (!priv->broadcast)
537                 return NULL;
538
539         path = kzalloc(sizeof *path, GFP_ATOMIC);
540         if (!path)
541                 return NULL;
542
543         path->dev = dev;
544
545         skb_queue_head_init(&path->queue);
546
547         INIT_LIST_HEAD(&path->neigh_list);
548
549         memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
550         path->pathrec.sgid          = priv->local_gid;
551         path->pathrec.pkey          = cpu_to_be16(priv->pkey);
552         path->pathrec.numb_path     = 1;
553         path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
554
555         return path;
556 }
557
558 static int path_rec_start(struct net_device *dev,
559                           struct ipoib_path *path)
560 {
561         struct ipoib_dev_priv *priv = netdev_priv(dev);
562
563         ipoib_dbg(priv, "Start path record lookup for %pI6\n",
564                   path->pathrec.dgid.raw);
565
566         init_completion(&path->done);
567
568         path->query_id =
569                 ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
570                                    &path->pathrec,
571                                    IB_SA_PATH_REC_DGID          |
572                                    IB_SA_PATH_REC_SGID          |
573                                    IB_SA_PATH_REC_NUMB_PATH     |
574                                    IB_SA_PATH_REC_TRAFFIC_CLASS |
575                                    IB_SA_PATH_REC_PKEY,
576                                    1000, GFP_ATOMIC,
577                                    path_rec_completion,
578                                    path, &path->query);
579         if (path->query_id < 0) {
580                 ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
581                 path->query = NULL;
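                /* signal ->done so ipoib_flush_paths() never waits forever */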
582                 complete(&path->done);
583                 return path->query_id;
584         }
585
586         return 0;
587 }
588
589 static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
590                            struct net_device *dev)
591 {
592         struct ipoib_dev_priv *priv = netdev_priv(dev);
593         struct ipoib_path *path;
594         struct ipoib_neigh *neigh;
595         unsigned long flags;
596
597         spin_lock_irqsave(&priv->lock, flags);
598         neigh = ipoib_neigh_alloc(daddr, dev);
599         if (!neigh) {
600                 spin_unlock_irqrestore(&priv->lock, flags);
601                 ++dev->stats.tx_dropped;
602                 dev_kfree_skb_any(skb);
603                 return;
604         }
605
606         path = __path_find(dev, daddr + 4);
607         if (!path) {
608                 path = path_rec_create(dev, daddr + 4);
609                 if (!path)
610                         goto err_path;
611
612                 __path_add(dev, path);
613         }
614
615         list_add_tail(&neigh->list, &path->neigh_list);
616
617         if (path->ah) {
618                 kref_get(&path->ah->ref);
619                 neigh->ah = path->ah;
620
621                 if (ipoib_cm_enabled(dev, neigh->daddr)) {
622                         if (!ipoib_cm_get(neigh))
623                                 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
624                         if (!ipoib_cm_get(neigh)) {
625                                 ipoib_neigh_free(neigh);
626                                 goto err_drop;
627                         }
628                         if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
629                                 __skb_queue_tail(&neigh->queue, skb);
630                         else {
631                                 ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
632                                            skb_queue_len(&neigh->queue));
633                                 goto err_drop;
634                         }
635                 } else {
636                         spin_unlock_irqrestore(&priv->lock, flags);
637                         ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
638                         ipoib_neigh_put(neigh);
639                         return;
640                 }
641         } else {
642                 neigh->ah  = NULL;
643
644                 if (!path->query && path_rec_start(dev, path))
645                         goto err_path;
646
647                 __skb_queue_tail(&neigh->queue, skb);
648         }
649
650         spin_unlock_irqrestore(&priv->lock, flags);
651         ipoib_neigh_put(neigh);
652         return;
653
654 err_path:
655         ipoib_neigh_free(neigh);
656 err_drop:
657         ++dev->stats.tx_dropped;
658         dev_kfree_skb_any(skb);
659
660         spin_unlock_irqrestore(&priv->lock, flags);
661         ipoib_neigh_put(neigh);
662 }
663
664 static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
665                              struct ipoib_cb *cb)
666 {
667         struct ipoib_dev_priv *priv = netdev_priv(dev);
668         struct ipoib_path *path;
669         unsigned long flags;
670
671         spin_lock_irqsave(&priv->lock, flags);
672
673         path = __path_find(dev, cb->hwaddr + 4);
674         if (!path || !path->valid) {
675                 int new_path = 0;
676
677                 if (!path) {
678                         path = path_rec_create(dev, cb->hwaddr + 4);
679                         new_path = 1;
680                 }
681                 if (path) {
682                         __skb_queue_tail(&path->queue, skb);
683
684                         if (!path->query && path_rec_start(dev, path)) {
685                                 spin_unlock_irqrestore(&priv->lock, flags);
686                                 if (new_path)
687                                         path_free(dev, path);
688                                 return;
689                         } else
690                                 __path_add(dev, path);
691                 } else {
692                         ++dev->stats.tx_dropped;
693                         dev_kfree_skb_any(skb);
694                 }
695
696                 spin_unlock_irqrestore(&priv->lock, flags);
697                 return;
698         }
699
700         if (path->ah) {
701                 ipoib_dbg(priv, "Send unicast ARP to %04x\n",
702                           be16_to_cpu(path->pathrec.dlid));
703
704                 spin_unlock_irqrestore(&priv->lock, flags);
705                 ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
706                 return;
707         } else if ((path->query || !path_rec_start(dev, path)) &&
708                    skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
709                 __skb_queue_tail(&path->queue, skb);
710         } else {
711                 ++dev->stats.tx_dropped;
712                 dev_kfree_skb_any(skb);
713         }
714
715         spin_unlock_irqrestore(&priv->lock, flags);
716 }
717
718 static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
719 {
720         struct ipoib_dev_priv *priv = netdev_priv(dev);
721         struct ipoib_neigh *neigh;
722         struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
723         struct ipoib_header *header;
724         unsigned long flags;
725
726         header = (struct ipoib_header *) skb->data;
727
728         if (unlikely(cb->hwaddr[4] == 0xff)) {
729                 /* multicast, arrange "if" according to probability */
730                 if ((header->proto != htons(ETH_P_IP)) &&
731                     (header->proto != htons(ETH_P_IPV6)) &&
732                     (header->proto != htons(ETH_P_ARP)) &&
733                     (header->proto != htons(ETH_P_RARP)) &&
734                     (header->proto != htons(ETH_P_TIPC))) {
735                         /* ethertype not supported by IPoIB */
736                         ++dev->stats.tx_dropped;
737                         dev_kfree_skb_any(skb);
738                         return NETDEV_TX_OK;
739                 }
740                 /* Add in the P_Key for multicast */
741                 cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
742                 cb->hwaddr[9] = priv->pkey & 0xff;
743
744                 neigh = ipoib_neigh_get(dev, cb->hwaddr);
745                 if (likely(neigh))
746                         goto send_using_neigh;
747                 ipoib_mcast_send(dev, cb->hwaddr, skb);
748                 return NETDEV_TX_OK;
749         }
750
751         /* unicast, arrange "switch" according to probability */
752         switch (header->proto) {
753         case htons(ETH_P_IP):
754         case htons(ETH_P_IPV6):
755         case htons(ETH_P_TIPC):
756                 neigh = ipoib_neigh_get(dev, cb->hwaddr);
757                 if (unlikely(!neigh)) {
758                         neigh_add_path(skb, cb->hwaddr, dev);
759                         return NETDEV_TX_OK;
760                 }
761                 break;
762         case htons(ETH_P_ARP):
763         case htons(ETH_P_RARP):
764                 /* for unicast ARP and RARP, always perform a path lookup */
765                 unicast_arp_send(skb, dev, cb);
766                 return NETDEV_TX_OK;
767         default:
768                 /* ethertype not supported by IPoIB */
769                 ++dev->stats.tx_dropped;
770                 dev_kfree_skb_any(skb);
771                 return NETDEV_TX_OK;
772         }
773
774 send_using_neigh:
775         /* note we now hold a ref to neigh */
776         if (ipoib_cm_get(neigh)) {
777                 if (ipoib_cm_up(neigh)) {
778                         ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
779                         goto unref;
780                 }
781         } else if (neigh->ah) {
782                 ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
783                 goto unref;
784         }
785
786         if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
787                 spin_lock_irqsave(&priv->lock, flags);
788                 __skb_queue_tail(&neigh->queue, skb);
789                 spin_unlock_irqrestore(&priv->lock, flags);
790         } else {
791                 ++dev->stats.tx_dropped;
792                 dev_kfree_skb_any(skb);
793         }
794
795 unref:
796         ipoib_neigh_put(neigh);
797
798         return NETDEV_TX_OK;
799 }
800
801 static void ipoib_timeout(struct net_device *dev)
802 {
803         struct ipoib_dev_priv *priv = netdev_priv(dev);
804
805         ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
806                    jiffies_to_msecs(jiffies - dev->trans_start));
807         ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
808                    netif_queue_stopped(dev),
809                    priv->tx_head, priv->tx_tail);
810         /* XXX reset QP, etc. */
811 }
812
813 static int ipoib_hard_header(struct sk_buff *skb,
814                              struct net_device *dev,
815                              unsigned short type,
816                              const void *daddr, const void *saddr, unsigned len)
817 {
818         struct ipoib_header *header;
819         struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
820
821         header = (struct ipoib_header *) skb_push(skb, sizeof *header);
822
823         header->proto = htons(type);
824         header->reserved = 0;
825
826         /*
827          * We don't rely on the dst_entry structure; always stuff the
828          * destination address into skb->cb so we can figure out where
829          * to send the packet later.
830          */
831         memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
832
833         return sizeof *header;
834 }
835
836 static void ipoib_set_mcast_list(struct net_device *dev)
837 {
838         struct ipoib_dev_priv *priv = netdev_priv(dev);
839
840         if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
841                 ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
842                 return;
843         }
844
845         queue_work(ipoib_workqueue, &priv->restart_task);
846 }
847
848 static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
849 {
850         /*
851          * Use only the address parts that contribute to spreading.
852          * The subnet prefix is not used, as one cannot connect to the
853          * same remote port (GUID) using the same remote QPN via two
854          * different subnets.
855          */
856          /* qpn octets[1:4) & port GUID octets[12:20) */
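         /*
          * The 20-byte IPoIB hardware address is flags+QPN in octets 0..3
          * followed by the 16-byte GID; d32[0] masked with IPOIB_QPN_MASK
          * picks the QPN, and d32[3]/d32[4] cover the interface-ID (port
          * GUID) half of the GID.
          */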
857         u32 *d32 = (u32 *) daddr;
858         u32 hv;
859
860         hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);
861         return hv & htbl->mask;
862 }
863
864 struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
865 {
866         struct ipoib_dev_priv *priv = netdev_priv(dev);
867         struct ipoib_neigh_table *ntbl = &priv->ntbl;
868         struct ipoib_neigh_hash *htbl;
869         struct ipoib_neigh *neigh = NULL;
870         u32 hash_val;
871
872         rcu_read_lock_bh();
873
874         htbl = rcu_dereference_bh(ntbl->htbl);
875
876         if (!htbl)
877                 goto out_unlock;
878
879         hash_val = ipoib_addr_hash(htbl, daddr);
880         for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
881              neigh != NULL;
882              neigh = rcu_dereference_bh(neigh->hnext)) {
883                 if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
884                         /* found, take one ref on behalf of the caller */
885                         if (!atomic_inc_not_zero(&neigh->refcnt)) {
886                                 /* deleted */
887                                 neigh = NULL;
888                                 goto out_unlock;
889                         }
890                         neigh->alive = jiffies;
891                         goto out_unlock;
892                 }
893         }
894
895 out_unlock:
896         rcu_read_unlock_bh();
897         return neigh;
898 }
899
900 static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
901 {
902         struct ipoib_neigh_table *ntbl = &priv->ntbl;
903         struct ipoib_neigh_hash *htbl;
904         unsigned long neigh_obsolete;
905         unsigned long dt;
906         unsigned long flags;
907         int i;
908
909         if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
910                 return;
911
912         spin_lock_irqsave(&priv->lock, flags);
913
914         htbl = rcu_dereference_protected(ntbl->htbl,
915                                          lockdep_is_held(&priv->lock));
916
917         if (!htbl)
918                 goto out_unlock;
919
920         /* neigh is obsolete if it was idle for two GC periods */
921         dt = 2 * arp_tbl.gc_interval;
922         neigh_obsolete = jiffies - dt;
923         /* handle a possible race with a concurrent GC stop */
924         if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
925                 goto out_unlock;
926
927         for (i = 0; i < htbl->size; i++) {
928                 struct ipoib_neigh *neigh;
929                 struct ipoib_neigh __rcu **np = &htbl->buckets[i];
930
931                 while ((neigh = rcu_dereference_protected(*np,
932                                                           lockdep_is_held(&priv->lock))) != NULL) {
933                         /* was the neigh idle for two GC periods? */
934                         if (time_after(neigh_obsolete, neigh->alive)) {
935                                 rcu_assign_pointer(*np,
936                                                    rcu_dereference_protected(neigh->hnext,
937                                                                              lockdep_is_held(&priv->lock)));
938                                 /* remove from path/mc list */
939                                 list_del(&neigh->list);
940                                 call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
941                         } else {
942                                 np = &neigh->hnext;
943                         }
944
945                 }
946         }
947
948 out_unlock:
949         spin_unlock_irqrestore(&priv->lock, flags);
950 }
951
952 static void ipoib_reap_neigh(struct work_struct *work)
953 {
954         struct ipoib_dev_priv *priv =
955                 container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);
956
957         __ipoib_reap_neigh(priv);
958
959         if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
960                 queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
961                                    arp_tbl.gc_interval);
962 }
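/*
 * The reap task re-arms itself on ipoib_workqueue, so nothing running on
 * that workqueue may flush it: the flush would wait on the very work
 * item that issued it.
 */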
963
964
965 static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
966                                       struct net_device *dev)
967 {
968         struct ipoib_neigh *neigh;
969
970         neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
971         if (!neigh)
972                 return NULL;
973
974         neigh->dev = dev;
975         memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
976         skb_queue_head_init(&neigh->queue);
977         INIT_LIST_HEAD(&neigh->list);
978         ipoib_cm_set(neigh, NULL);
979         /* one ref on behalf of the caller */
980         atomic_set(&neigh->refcnt, 1);
981
982         return neigh;
983 }
984
985 struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
986                                       struct net_device *dev)
987 {
988         struct ipoib_dev_priv *priv = netdev_priv(dev);
989         struct ipoib_neigh_table *ntbl = &priv->ntbl;
990         struct ipoib_neigh_hash *htbl;
991         struct ipoib_neigh *neigh;
992         u32 hash_val;
993
994         htbl = rcu_dereference_protected(ntbl->htbl,
995                                          lockdep_is_held(&priv->lock));
996         if (!htbl) {
997                 neigh = NULL;
998                 goto out_unlock;
999         }
1000
1001         /* We need to add a new neigh, but maybe some other thread succeeded;
1002          * recalculate the hash (a resize may have occurred) and search again
1003          */
1004         hash_val = ipoib_addr_hash(htbl, daddr);
1005         for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
1006                                                lockdep_is_held(&priv->lock));
1007              neigh != NULL;
1008              neigh = rcu_dereference_protected(neigh->hnext,
1009                                                lockdep_is_held(&priv->lock))) {
1010                 if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
1011                         /* found, take one ref on behalf of the caller */
1012                         if (!atomic_inc_not_zero(&neigh->refcnt)) {
1013                                 /* deleted */
1014                                 neigh = NULL;
1015                                 break;
1016                         }
1017                         neigh->alive = jiffies;
1018                         goto out_unlock;
1019                 }
1020         }
1021
1022         neigh = ipoib_neigh_ctor(daddr, dev);
1023         if (!neigh)
1024                 goto out_unlock;
1025
1026         /* one ref on behalf of the hash table */
1027         atomic_inc(&neigh->refcnt);
1028         neigh->alive = jiffies;
1029         /* put in hash */
1030         rcu_assign_pointer(neigh->hnext,
1031                            rcu_dereference_protected(htbl->buckets[hash_val],
1032                                                      lockdep_is_held(&priv->lock)));
1033         rcu_assign_pointer(htbl->buckets[hash_val], neigh);
1034         atomic_inc(&ntbl->entries);
1035
1036 out_unlock:
1037
1038         return neigh;
1039 }
1040
1041 void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
1042 {
1043         /* neigh reference count was dropped to zero */
1044         struct net_device *dev = neigh->dev;
1045         struct ipoib_dev_priv *priv = netdev_priv(dev);
1046         struct sk_buff *skb;
1047         if (neigh->ah)
1048                 ipoib_put_ah(neigh->ah);
1049         while ((skb = __skb_dequeue(&neigh->queue))) {
1050                 ++dev->stats.tx_dropped;
1051                 dev_kfree_skb_any(skb);
1052         }
1053         if (ipoib_cm_get(neigh))
1054                 ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
1055         ipoib_dbg(netdev_priv(dev),
1056                   "neigh free for %06x %pI6\n",
1057                   IPOIB_QPN(neigh->daddr),
1058                   neigh->daddr + 4);
1059         kfree(neigh);
1060         if (atomic_dec_and_test(&priv->ntbl.entries)) {
1061                 if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
1062                         complete(&priv->ntbl.flushed);
1063         }
1064 }
1065
1066 static void ipoib_neigh_reclaim(struct rcu_head *rp)
1067 {
1068         /* Called as a result of removal from hash table */
1069         struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
1070         /* note TX context may hold another ref */
1071         ipoib_neigh_put(neigh);
1072 }
1073
1074 void ipoib_neigh_free(struct ipoib_neigh *neigh)
1075 {
1076         struct net_device *dev = neigh->dev;
1077         struct ipoib_dev_priv *priv = netdev_priv(dev);
1078         struct ipoib_neigh_table *ntbl = &priv->ntbl;
1079         struct ipoib_neigh_hash *htbl;
1080         struct ipoib_neigh __rcu **np;
1081         struct ipoib_neigh *n;
1082         u32 hash_val;
1083
1084         htbl = rcu_dereference_protected(ntbl->htbl,
1085                                         lockdep_is_held(&priv->lock));
1086         if (!htbl)
1087                 return;
1088
1089         hash_val = ipoib_addr_hash(htbl, neigh->daddr);
1090         np = &htbl->buckets[hash_val];
1091         for (n = rcu_dereference_protected(*np,
1092                                             lockdep_is_held(&priv->lock));
1093              n != NULL;
1094              n = rcu_dereference_protected(*np,
1095                                         lockdep_is_held(&priv->lock))) {
1096                 if (n == neigh) {
1097                         /* found */
1098                         rcu_assign_pointer(*np,
1099                                            rcu_dereference_protected(neigh->hnext,
1100                                                                      lockdep_is_held(&priv->lock)));
1101                         /* remove from parent list */
1102                         list_del(&neigh->list);
1103                         call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1104                         return;
1105                 } else {
1106                         np = &n->hnext;
1107                 }
1108         }
1109 }
1110
1111 static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
1112 {
1113         struct ipoib_neigh_table *ntbl = &priv->ntbl;
1114         struct ipoib_neigh_hash *htbl;
1115         struct ipoib_neigh **buckets;
1116         u32 size;
1117
1118         clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1119         ntbl->htbl = NULL;
1120         htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
1121         if (!htbl)
1122                 return -ENOMEM;
1123         set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1124         size = roundup_pow_of_two(arp_tbl.gc_thresh3);
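        /* power-of-two size lets ipoib_addr_hash() mask instead of mod */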
1125         buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
1126         if (!buckets) {
1127                 kfree(htbl);
1128                 return -ENOMEM;
1129         }
1130         htbl->size = size;
1131         htbl->mask = (size - 1);
1132         htbl->buckets = buckets;
1133         ntbl->htbl = htbl;
1134         htbl->ntbl = ntbl;
1135         atomic_set(&ntbl->entries, 0);
1136
1137         /* start garbage collection */
1138         clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1139         queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
1140                            arp_tbl.gc_interval);
1141
1142         return 0;
1143 }
1144
1145 static void neigh_hash_free_rcu(struct rcu_head *head)
1146 {
1147         struct ipoib_neigh_hash *htbl = container_of(head,
1148                                                     struct ipoib_neigh_hash,
1149                                                     rcu);
1150         struct ipoib_neigh __rcu **buckets = htbl->buckets;
1151         struct ipoib_neigh_table *ntbl = htbl->ntbl;
1152
1153         kfree(buckets);
1154         kfree(htbl);
1155         complete(&ntbl->deleted);
1156 }
1157
1158 void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
1159 {
1160         struct ipoib_dev_priv *priv = netdev_priv(dev);
1161         struct ipoib_neigh_table *ntbl = &priv->ntbl;
1162         struct ipoib_neigh_hash *htbl;
1163         unsigned long flags;
1164         int i;
1165
1166         /* remove all neigh connected to a given path or mcast */
1167         spin_lock_irqsave(&priv->lock, flags);
1168
1169         htbl = rcu_dereference_protected(ntbl->htbl,
1170                                          lockdep_is_held(&priv->lock));
1171
1172         if (!htbl)
1173                 goto out_unlock;
1174
1175         for (i = 0; i < htbl->size; i++) {
1176                 struct ipoib_neigh *neigh;
1177                 struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1178
1179                 while ((neigh = rcu_dereference_protected(*np,
1180                                                           lockdep_is_held(&priv->lock))) != NULL) {
1181                         /* delete neighs belonging to this parent */
1182                         if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
1183                                 rcu_assign_pointer(*np,
1184                                                    rcu_dereference_protected(neigh->hnext,
1185                                                                              lockdep_is_held(&priv->lock)));
1186                                 /* remove from parent list */
1187                                 list_del(&neigh->list);
1188                                 call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1189                         } else {
1190                                 np = &neigh->hnext;
1191                         }
1192
1193                 }
1194         }
1195 out_unlock:
1196         spin_unlock_irqrestore(&priv->lock, flags);
1197 }
1198
1199 static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
1200 {
1201         struct ipoib_neigh_table *ntbl = &priv->ntbl;
1202         struct ipoib_neigh_hash *htbl;
1203         unsigned long flags;
1204         int i, wait_flushed = 0;
1205
1206         init_completion(&priv->ntbl.flushed);
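        /*
         * ipoib_neigh_dtor() completes ntbl.flushed when the last table
         * entry is freed, which is what the wait at the bottom relies on.
         */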
1207
1208         spin_lock_irqsave(&priv->lock, flags);
1209
1210         htbl = rcu_dereference_protected(ntbl->htbl,
1211                                         lockdep_is_held(&priv->lock));
1212         if (!htbl)
1213                 goto out_unlock;
1214
1215         wait_flushed = atomic_read(&priv->ntbl.entries);
1216         if (!wait_flushed)
1217                 goto free_htbl;
1218
1219         for (i = 0; i < htbl->size; i++) {
1220                 struct ipoib_neigh *neigh;
1221                 struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1222
1223                 while ((neigh = rcu_dereference_protected(*np,
1224                                        lockdep_is_held(&priv->lock))) != NULL) {
1225                         rcu_assign_pointer(*np,
1226                                            rcu_dereference_protected(neigh->hnext,
1227                                                                      lockdep_is_held(&priv->lock)));
1228                         /* remove from path/mc list */
1229                         list_del(&neigh->list);
1230                         call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1231                 }
1232         }
1233
1234 free_htbl:
1235         rcu_assign_pointer(ntbl->htbl, NULL);
1236         call_rcu(&htbl->rcu, neigh_hash_free_rcu);
1237
1238 out_unlock:
1239         spin_unlock_irqrestore(&priv->lock, flags);
1240         if (wait_flushed)
1241                 wait_for_completion(&priv->ntbl.flushed);
1242 }
1243
1244 static void ipoib_neigh_hash_uninit(struct net_device *dev)
1245 {
1246         struct ipoib_dev_priv *priv = netdev_priv(dev);
1247         int stopped;
1248
1249         ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
1250         init_completion(&priv->ntbl.deleted);
1251         set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1252
1253         /* Stop GC; if called due to an init failure, the reap work must be cancelled */
1254         stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1255         if (!stopped)
1256                 cancel_delayed_work(&priv->neigh_reap_task);
1257
1258         ipoib_flush_neighs(priv);
1259
1260         wait_for_completion(&priv->ntbl.deleted);
1261 }
1262
1263
1264 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
1265 {
1266         struct ipoib_dev_priv *priv = netdev_priv(dev);
1267
1268         if (ipoib_neigh_hash_init(priv) < 0)
1269                 goto out;
1270         /* Allocate RX/TX "rings" to hold queued skbs */
1271         priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
1272                                 GFP_KERNEL);
1273         if (!priv->rx_ring) {
1274                 printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
1275                        ca->name, ipoib_recvq_size);
1276                 goto out_neigh_hash_cleanup;
1277         }
1278
1279         priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
1280         if (!priv->tx_ring) {
1281                 printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
1282                        ca->name, ipoib_sendq_size);
1283                 goto out_rx_ring_cleanup;
1284         }
1285
1286         /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
1287
1288         if (ipoib_ib_dev_init(dev, ca, port))
1289                 goto out_tx_ring_cleanup;
1290
1291         return 0;
1292
1293 out_tx_ring_cleanup:
1294         vfree(priv->tx_ring);
1295
1296 out_rx_ring_cleanup:
1297         kfree(priv->rx_ring);
1298
1299 out_neigh_hash_cleanup:
1300         ipoib_neigh_hash_uninit(dev);
1301 out:
1302         return -ENOMEM;
1303 }
1304
1305 void ipoib_dev_cleanup(struct net_device *dev)
1306 {
1307         struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
1308         LIST_HEAD(head);
1309
1310         ASSERT_RTNL();
1311
1312         ipoib_delete_debug_files(dev);
1313
1314         /* Delete any child interfaces first */
1315         list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
1316                 /* Stop GC on child */
1317                 set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
1318                 cancel_delayed_work(&cpriv->neigh_reap_task);
1319                 unregister_netdevice_queue(cpriv->dev, &head);
1320         }
1321         unregister_netdevice_many(&head);
1322
1323         ipoib_ib_dev_cleanup(dev);
1324
1325         kfree(priv->rx_ring);
1326         vfree(priv->tx_ring);
1327
1328         priv->rx_ring = NULL;
1329         priv->tx_ring = NULL;
1330
1331         ipoib_neigh_hash_uninit(dev);
1332 }
1333
1334 static const struct header_ops ipoib_header_ops = {
1335         .create = ipoib_hard_header,
1336 };
1337
1338 static const struct net_device_ops ipoib_netdev_ops = {
1339         .ndo_uninit              = ipoib_uninit,
1340         .ndo_open                = ipoib_open,
1341         .ndo_stop                = ipoib_stop,
1342         .ndo_change_mtu          = ipoib_change_mtu,
1343         .ndo_fix_features        = ipoib_fix_features,
1344         .ndo_start_xmit          = ipoib_start_xmit,
1345         .ndo_tx_timeout          = ipoib_timeout,
1346         .ndo_set_rx_mode         = ipoib_set_mcast_list,
1347 };
1348
1349 void ipoib_setup(struct net_device *dev)
1350 {
1351         struct ipoib_dev_priv *priv = netdev_priv(dev);
1352
1353         dev->netdev_ops          = &ipoib_netdev_ops;
1354         dev->header_ops          = &ipoib_header_ops;
1355
1356         ipoib_set_ethtool_ops(dev);
1357
1358         netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT);
1359
1360         dev->watchdog_timeo      = HZ;
1361
1362         dev->flags              |= IFF_BROADCAST | IFF_MULTICAST;
1363
1364         dev->hard_header_len     = IPOIB_ENCAP_LEN;
1365         dev->addr_len            = INFINIBAND_ALEN;
1366         dev->type                = ARPHRD_INFINIBAND;
1367         dev->tx_queue_len        = ipoib_sendq_size * 2;
1368         dev->features            = (NETIF_F_VLAN_CHALLENGED     |
1369                                     NETIF_F_HIGHDMA);
1370         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1371
1372         memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
1373
1374         priv->dev = dev;
1375
1376         spin_lock_init(&priv->lock);
1377
1378         init_rwsem(&priv->vlan_rwsem);
1379
1380         INIT_LIST_HEAD(&priv->path_list);
1381         INIT_LIST_HEAD(&priv->child_intfs);
1382         INIT_LIST_HEAD(&priv->dead_ahs);
1383         INIT_LIST_HEAD(&priv->multicast_list);
1384
1385         INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
1386         INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
1387         INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
1388         INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
1389         INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
1390         INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
1391         INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
1392         INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
1393 }
1394
1395 struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
1396 {
1397         struct net_device *dev;
1398
1399         dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
1400                            ipoib_setup);
1401         if (!dev)
1402                 return NULL;
1403
1404         return netdev_priv(dev);
1405 }
1406
1407 static ssize_t show_pkey(struct device *dev,
1408                          struct device_attribute *attr, char *buf)
1409 {
1410         struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
1411
1412         return sprintf(buf, "0x%04x\n", priv->pkey);
1413 }
1414 static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
1415
1416 static ssize_t show_umcast(struct device *dev,
1417                            struct device_attribute *attr, char *buf)
1418 {
1419         struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
1420
1421         return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
1422 }
1423
1424 void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
1425 {
1426         struct ipoib_dev_priv *priv = netdev_priv(ndev);
1427
1428         if (umcast_val > 0) {
1429                 set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1430                 ipoib_warn(priv, "ignoring multicast groups joined directly "
1431                                 "by userspace\n");
1432         } else
1433                 clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1434 }
1435
1436 static ssize_t set_umcast(struct device *dev,
1437                           struct device_attribute *attr,
1438                           const char *buf, size_t count)
1439 {
1440         unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
1441
1442         ipoib_set_umcast(to_net_dev(dev), umcast_val);
1443
1444         return count;
1445 }
1446 static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
1447
1448 int ipoib_add_umcast_attr(struct net_device *dev)
1449 {
1450         return device_create_file(&dev->dev, &dev_attr_umcast);
1451 }
1452
1453 static ssize_t create_child(struct device *dev,
1454                             struct device_attribute *attr,
1455                             const char *buf, size_t count)
1456 {
1457         int pkey;
1458         int ret;
1459
1460         if (sscanf(buf, "%i", &pkey) != 1)
1461                 return -EINVAL;
1462
1463         if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
1464                 return -EINVAL;
1465
1466         /*
1467          * Set the full membership bit, so that we join the right
1468          * broadcast group, etc.
1469          */
1470         pkey |= 0x8000;
1471
1472         ret = ipoib_vlan_add(to_net_dev(dev), pkey);
1473
1474         return ret ? ret : count;
1475 }
1476 static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
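/*
 * Writing a P_Key here creates a child interface, e.g.
 * "echo 0x8001 > /sys/class/net/ib0/create_child" creates ib0.8001;
 * delete_child below removes it again.
 */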
1477
1478 static ssize_t delete_child(struct device *dev,
1479                             struct device_attribute *attr,
1480                             const char *buf, size_t count)
1481 {
1482         int pkey;
1483         int ret;
1484
1485         if (sscanf(buf, "%i", &pkey) != 1)
1486                 return -EINVAL;
1487
1488         if (pkey < 0 || pkey > 0xffff)
1489                 return -EINVAL;
1490
1491         ret = ipoib_vlan_delete(to_net_dev(dev), pkey);
1492
1493         return ret ? ret : count;
1494
1495 }
1496 static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
1497
1498 int ipoib_add_pkey_attr(struct net_device *dev)
1499 {
1500         return device_create_file(&dev->dev, &dev_attr_pkey);
1501 }
1502
1503 int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
1504 {
1505         struct ib_device_attr *device_attr;
1506         int result = -ENOMEM;
1507
1508         device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
1509         if (!device_attr) {
1510                 printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
1511                        hca->name, sizeof *device_attr);
1512                 return result;
1513         }
1514
1515         result = ib_query_device(hca, device_attr);
1516         if (result) {
1517                 printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
1518                        hca->name, result);
1519                 kfree(device_attr);
1520                 return result;
1521         }
1522         priv->hca_caps = device_attr->device_cap_flags;
1523
1524         kfree(device_attr);
1525
1526         if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
1527                 priv->dev->hw_features = NETIF_F_SG |
1528                         NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
1529
1530                 if (priv->hca_caps & IB_DEVICE_UD_TSO)
1531                         priv->dev->hw_features |= NETIF_F_TSO;
1532
1533                 priv->dev->features |= priv->dev->hw_features;
1534         }
1535
1536         return 0;
1537 }
1538
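/*
 * Bring up one ipoib interface for a single HCA port: query the port,
 * pkey and GID, initialize the device, register the IB event handler
 * and the netdev, and create the sysfs attributes.  Returns the new
 * net_device or an ERR_PTR() on failure.
 */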
1539 static struct net_device *ipoib_add_port(const char *format,
1540                                          struct ib_device *hca, u8 port)
1541 {
1542         struct ipoib_dev_priv *priv;
1543         struct ib_port_attr attr;
1544         int result = -ENOMEM;
1545
1546         priv = ipoib_intf_alloc(format);
1547         if (!priv)
1548                 goto alloc_mem_failed;
1549
1550         SET_NETDEV_DEV(priv->dev, hca->dma_device);
1551         priv->dev->dev_id = port - 1;
1552
        result = ib_query_port(hca, port, &attr);
        if (result) {
                printk(KERN_WARNING "%s: ib_query_port %d failed (ret = %d)\n",
                       hca->name, port, result);
                goto device_init_failed;
        }
        priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
1560
1561         /* MTU will be reset when mcast join happens */
1562         priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
1563         priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
1564
1565         priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);
1566
1567         result = ib_query_pkey(hca, port, 0, &priv->pkey);
1568         if (result) {
1569                 printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
1570                        hca->name, port, result);
1571                 goto device_init_failed;
1572         }
1573
        /* capture the error so we do not return ERR_PTR(0) on failure */
        result = ipoib_set_dev_features(priv, hca);
        if (result)
                goto device_init_failed;
1576
1577         /*
1578          * Set the full membership bit, so that we join the right
1579          * broadcast group, etc.
1580          */
1581         priv->pkey |= 0x8000;
1582
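        /* bytes 8 and 9 of the broadcast MGID carry the partition key */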
1583         priv->dev->broadcast[8] = priv->pkey >> 8;
1584         priv->dev->broadcast[9] = priv->pkey & 0xff;
1585
        result = ib_query_gid(hca, port, 0, &priv->local_gid);
        if (result) {
                printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
                       hca->name, port, result);
                goto device_init_failed;
        }
        memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof(union ib_gid));
1593
1594         result = ipoib_dev_init(priv->dev, hca, port);
1595         if (result < 0) {
1596                 printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
1597                        hca->name, port, result);
1598                 goto device_init_failed;
1599         }
1600
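        /* Watch for IB events such as port state and pkey table changes */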
1601         INIT_IB_EVENT_HANDLER(&priv->event_handler,
1602                               priv->ca, ipoib_event);
1603         result = ib_register_event_handler(&priv->event_handler);
1604         if (result < 0) {
                printk(KERN_WARNING "%s: ib_register_event_handler failed for port %d (ret = %d)\n",
                       hca->name, port, result);
1608                 goto event_failed;
1609         }
1610
1611         result = register_netdev(priv->dev);
1612         if (result) {
1613                 printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
1614                        hca->name, port, result);
1615                 goto register_failed;
1616         }
1617
1618         ipoib_create_debug_files(priv->dev);
1619
1620         if (ipoib_cm_add_mode_attr(priv->dev))
1621                 goto sysfs_failed;
1622         if (ipoib_add_pkey_attr(priv->dev))
1623                 goto sysfs_failed;
1624         if (ipoib_add_umcast_attr(priv->dev))
1625                 goto sysfs_failed;
1626         if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
1627                 goto sysfs_failed;
1628         if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
1629                 goto sysfs_failed;
1630
1631         return priv->dev;
1632
1633 sysfs_failed:
1634         ipoib_delete_debug_files(priv->dev);
1635         unregister_netdev(priv->dev);
1636
1637 register_failed:
1638         ib_unregister_event_handler(&priv->event_handler);
        /*
         * ipoib_dev_init() started the neighbor GC; tell it to stop
         * rescheduling itself before the flush, since
         * cancel_delayed_work() does not wait for a running instance.
         */
        set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
        cancel_delayed_work(&priv->neigh_reap_task);
        flush_workqueue(ipoib_workqueue);
1643
1644 event_failed:
1645         ipoib_dev_cleanup(priv->dev);
1646
1647 device_init_failed:
1648         free_netdev(priv->dev);
1649
1650 alloc_mem_failed:
1651         return ERR_PTR(result);
1652 }
1653
1654 static void ipoib_add_one(struct ib_device *device)
1655 {
1656         struct list_head *dev_list;
1657         struct net_device *dev;
1658         struct ipoib_dev_priv *priv;
1659         int s, e, p;
1660
1661         if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
1662                 return;
1663
1664         dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
1665         if (!dev_list)
1666                 return;
1667
1668         INIT_LIST_HEAD(dev_list);
1669
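        /*
         * A switch exposes only the management port 0; HCAs number
         * their physical ports from 1 to phys_port_cnt.
         */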
1670         if (device->node_type == RDMA_NODE_IB_SWITCH) {
1671                 s = 0;
1672                 e = 0;
1673         } else {
1674                 s = 1;
1675                 e = device->phys_port_cnt;
1676         }
1677
1678         for (p = s; p <= e; ++p) {
1679                 if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
1680                         continue;
1681                 dev = ipoib_add_port("ib%d", device, p);
1682                 if (!IS_ERR(dev)) {
1683                         priv = netdev_priv(dev);
1684                         list_add_tail(&priv->list, dev_list);
1685                 }
1686         }
1687
1688         ib_set_client_data(device, &ipoib_client, dev_list);
1689 }
1690
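/*
 * Undo ipoib_add_one() for every interface on this device: stop event
 * delivery and the neighbor GC, flush the workqueue, then unregister
 * and free each netdev.
 */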
1691 static void ipoib_remove_one(struct ib_device *device)
1692 {
1693         struct ipoib_dev_priv *priv, *tmp;
1694         struct list_head *dev_list;
1695
1696         if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
1697                 return;
1698
1699         dev_list = ib_get_client_data(device, &ipoib_client);
1700         if (!dev_list)
1701                 return;
1702
1703         list_for_each_entry_safe(priv, tmp, dev_list, list) {
1704                 ib_unregister_event_handler(&priv->event_handler);
1705
1706                 rtnl_lock();
1707                 dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
1708                 rtnl_unlock();
1709
                /* Stop the GC so it cannot requeue itself, then flush */
                set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
                cancel_delayed_work(&priv->neigh_reap_task);
                flush_workqueue(ipoib_workqueue);
1714
1715                 unregister_netdev(priv->dev);
1716                 free_netdev(priv->dev);
1717         }
1718
1719         kfree(dev_list);
1720 }
1721
1722 static int __init ipoib_init_module(void)
1723 {
1724         int ret;
1725
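        /*
         * Round both ring sizes to a power of two and clamp them to
         * [IPOIB_MIN_QUEUE_SIZE, IPOIB_MAX_QUEUE_SIZE]; the send ring
         * must also hold at least 2 * MAX_SEND_CQE entries.
         */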
1726         ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
1727         ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
1728         ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
1729
1730         ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
1731         ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
1732         ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
1733 #ifdef CONFIG_INFINIBAND_IPOIB_CM
1734         ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
1735 #endif
1736
1737         /*
1738          * When copying small received packets, we only copy from the
1739          * linear data part of the SKB, so we rely on this condition.
1740          */
1741         BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
1742
1743         ret = ipoib_register_debugfs();
1744         if (ret)
1745                 return ret;
1746
1747         /*
1748          * We create our own workqueue mainly because we want to be
1749          * able to flush it when devices are being removed.  We can't
1750          * use schedule_work()/flush_scheduled_work() because both
1751          * unregister_netdev() and linkwatch_event take the rtnl lock,
1752          * so flush_scheduled_work() can deadlock during device
1753          * removal.
1754          */
1755         ipoib_workqueue = create_singlethread_workqueue("ipoib");
1756         if (!ipoib_workqueue) {
1757                 ret = -ENOMEM;
1758                 goto err_fs;
1759         }
1760
1761         ib_sa_register_client(&ipoib_sa_client);
1762
1763         ret = ib_register_client(&ipoib_client);
1764         if (ret)
1765                 goto err_sa;
1766
1767         ret = ipoib_netlink_init();
1768         if (ret)
1769                 goto err_client;
1770
1771         return 0;
1772
1773 err_client:
1774         ib_unregister_client(&ipoib_client);
1775
1776 err_sa:
1777         ib_sa_unregister_client(&ipoib_sa_client);
1778         destroy_workqueue(ipoib_workqueue);
1779
1780 err_fs:
1781         ipoib_unregister_debugfs();
1782
1783         return ret;
1784 }
1785
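/* Unwind everything ipoib_init_module() set up */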
1786 static void __exit ipoib_cleanup_module(void)
1787 {
1788         ipoib_netlink_fini();
1789         ib_unregister_client(&ipoib_client);
1790         ib_sa_unregister_client(&ipoib_sa_client);
1791         ipoib_unregister_debugfs();
1792         destroy_workqueue(ipoib_workqueue);
1793 }
1794
1795 module_init(ipoib_init_module);
1796 module_exit(ipoib_cleanup_module);