writeback: clean up wb_dirty_limit()
mm/backing-dev.c
1
2 #include <linux/wait.h>
3 #include <linux/backing-dev.h>
4 #include <linux/kthread.h>
5 #include <linux/freezer.h>
6 #include <linux/fs.h>
7 #include <linux/pagemap.h>
8 #include <linux/mm.h>
9 #include <linux/sched.h>
10 #include <linux/module.h>
11 #include <linux/writeback.h>
12 #include <linux/device.h>
13 #include <trace/events/writeback.h>
14
15 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
16
17 struct backing_dev_info noop_backing_dev_info = {
18         .name           = "noop",
19         .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
20 };
21 EXPORT_SYMBOL_GPL(noop_backing_dev_info);
22
23 static struct class *bdi_class;
24
25 /*
26  * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
27  * locking.
28  */
29 DEFINE_SPINLOCK(bdi_lock);
30 LIST_HEAD(bdi_list);
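/*
 * Illustrative sketch, not part of the original file: readers walk bdi_list
 * under rcu_read_lock() while updaters hold bdi_lock, roughly:
 *
 *	struct backing_dev_info *bdi;
 *
 *	rcu_read_lock();
 *	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 *		... inspect @bdi, pin it if it must outlive the RCU section ...
 *	}
 *	rcu_read_unlock();
 */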
31
32 /* bdi_wq serves all asynchronous writeback tasks */
33 struct workqueue_struct *bdi_wq;
34
35 #ifdef CONFIG_DEBUG_FS
36 #include <linux/debugfs.h>
37 #include <linux/seq_file.h>
38
39 static struct dentry *bdi_debug_root;
40
41 static void bdi_debug_init(void)
42 {
43         bdi_debug_root = debugfs_create_dir("bdi", NULL);
44 }
45
46 static int bdi_debug_stats_show(struct seq_file *m, void *v)
47 {
48         struct backing_dev_info *bdi = m->private;
49         struct bdi_writeback *wb = &bdi->wb;
50         unsigned long background_thresh;
51         unsigned long dirty_thresh;
52         unsigned long wb_thresh;
53         unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
54         struct inode *inode;
55
56         nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
57         spin_lock(&wb->list_lock);
58         list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
59                 nr_dirty++;
60         list_for_each_entry(inode, &wb->b_io, i_wb_list)
61                 nr_io++;
62         list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
63                 nr_more_io++;
64         list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
65                 if (inode->i_state & I_DIRTY_TIME)
66                         nr_dirty_time++;
67         spin_unlock(&wb->list_lock);
68
69         global_dirty_limits(&background_thresh, &dirty_thresh);
70         wb_thresh = wb_calc_thresh(wb, dirty_thresh);
71
72 #define K(x) ((x) << (PAGE_SHIFT - 10))
73         seq_printf(m,
74                    "BdiWriteback:       %10lu kB\n"
75                    "BdiReclaimable:     %10lu kB\n"
76                    "BdiDirtyThresh:     %10lu kB\n"
77                    "DirtyThresh:        %10lu kB\n"
78                    "BackgroundThresh:   %10lu kB\n"
79                    "BdiDirtied:         %10lu kB\n"
80                    "BdiWritten:         %10lu kB\n"
81                    "BdiWriteBandwidth:  %10lu kBps\n"
82                    "b_dirty:            %10lu\n"
83                    "b_io:               %10lu\n"
84                    "b_more_io:          %10lu\n"
85                    "b_dirty_time:       %10lu\n"
86                    "bdi_list:           %10u\n"
87                    "state:              %10lx\n",
88                    (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
89                    (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
90                    K(wb_thresh),
91                    K(dirty_thresh),
92                    K(background_thresh),
93                    (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
94                    (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
95                    (unsigned long) K(wb->write_bandwidth),
96                    nr_dirty,
97                    nr_io,
98                    nr_more_io,
99                    nr_dirty_time,
100                    !list_empty(&bdi->bdi_list), bdi->wb.state);
101 #undef K
102
103         return 0;
104 }
105
106 static int bdi_debug_stats_open(struct inode *inode, struct file *file)
107 {
108         return single_open(file, bdi_debug_stats_show, inode->i_private);
109 }
110
111 static const struct file_operations bdi_debug_stats_fops = {
112         .open           = bdi_debug_stats_open,
113         .read           = seq_read,
114         .llseek         = seq_lseek,
115         .release        = single_release,
116 };
117
118 static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
119 {
120         bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
121         bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
122                                                bdi, &bdi_debug_stats_fops);
123 }
124
125 static void bdi_debug_unregister(struct backing_dev_info *bdi)
126 {
127         debugfs_remove(bdi->debug_stats);
128         debugfs_remove(bdi->debug_dir);
129 }
130 #else
131 static inline void bdi_debug_init(void)
132 {
133 }
134 static inline void bdi_debug_register(struct backing_dev_info *bdi,
135                                       const char *name)
136 {
137 }
138 static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
139 {
140 }
141 #endif
142
143 static ssize_t read_ahead_kb_store(struct device *dev,
144                                   struct device_attribute *attr,
145                                   const char *buf, size_t count)
146 {
147         struct backing_dev_info *bdi = dev_get_drvdata(dev);
148         unsigned long read_ahead_kb;
149         ssize_t ret;
150
151         ret = kstrtoul(buf, 10, &read_ahead_kb);
152         if (ret < 0)
153                 return ret;
154
155         bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
156
157         return count;
158 }
159
160 #define K(pages) ((pages) << (PAGE_SHIFT - 10))
161
162 #define BDI_SHOW(name, expr)                                            \
163 static ssize_t name##_show(struct device *dev,                          \
164                            struct device_attribute *attr, char *page)   \
165 {                                                                       \
166         struct backing_dev_info *bdi = dev_get_drvdata(dev);            \
167                                                                         \
168         return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr);  \
169 }                                                                       \
170 static DEVICE_ATTR_RW(name);
171
172 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
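/*
 * For reference only (the real code is generated by the preprocessor): the
 * BDI_SHOW() invocation above expands to roughly
 *
 *	static ssize_t read_ahead_kb_show(struct device *dev,
 *					  struct device_attribute *attr,
 *					  char *page)
 *	{
 *		struct backing_dev_info *bdi = dev_get_drvdata(dev);
 *
 *		return snprintf(page, PAGE_SIZE-1, "%lld\n",
 *				(long long)K(bdi->ra_pages));
 *	}
 *	static DEVICE_ATTR_RW(read_ahead_kb);
 *
 * DEVICE_ATTR_RW() pairs this _show with the read_ahead_kb_store() handler
 * defined above to form dev_attr_read_ahead_kb.
 */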
173
174 static ssize_t min_ratio_store(struct device *dev,
175                 struct device_attribute *attr, const char *buf, size_t count)
176 {
177         struct backing_dev_info *bdi = dev_get_drvdata(dev);
178         unsigned int ratio;
179         ssize_t ret;
180
181         ret = kstrtouint(buf, 10, &ratio);
182         if (ret < 0)
183                 return ret;
184
185         ret = bdi_set_min_ratio(bdi, ratio);
186         if (!ret)
187                 ret = count;
188
189         return ret;
190 }
191 BDI_SHOW(min_ratio, bdi->min_ratio)
192
193 static ssize_t max_ratio_store(struct device *dev,
194                 struct device_attribute *attr, const char *buf, size_t count)
195 {
196         struct backing_dev_info *bdi = dev_get_drvdata(dev);
197         unsigned int ratio;
198         ssize_t ret;
199
200         ret = kstrtouint(buf, 10, &ratio);
201         if (ret < 0)
202                 return ret;
203
204         ret = bdi_set_max_ratio(bdi, ratio);
205         if (!ret)
206                 ret = count;
207
208         return ret;
209 }
210 BDI_SHOW(max_ratio, bdi->max_ratio)
211
212 static ssize_t stable_pages_required_show(struct device *dev,
213                                           struct device_attribute *attr,
214                                           char *page)
215 {
216         struct backing_dev_info *bdi = dev_get_drvdata(dev);
217
218         return snprintf(page, PAGE_SIZE-1, "%d\n",
219                         bdi_cap_stable_pages_required(bdi) ? 1 : 0);
220 }
221 static DEVICE_ATTR_RO(stable_pages_required);
222
223 static struct attribute *bdi_dev_attrs[] = {
224         &dev_attr_read_ahead_kb.attr,
225         &dev_attr_min_ratio.attr,
226         &dev_attr_max_ratio.attr,
227         &dev_attr_stable_pages_required.attr,
228         NULL,
229 };
230 ATTRIBUTE_GROUPS(bdi_dev);
231
232 static __init int bdi_class_init(void)
233 {
234         bdi_class = class_create(THIS_MODULE, "bdi");
235         if (IS_ERR(bdi_class))
236                 return PTR_ERR(bdi_class);
237
238         bdi_class->dev_groups = bdi_dev_groups;
239         bdi_debug_init();
240         return 0;
241 }
242 postcore_initcall(bdi_class_init);
243
244 static int __init default_bdi_init(void)
245 {
246         int err;
247
248         bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
249                                               WQ_UNBOUND | WQ_SYSFS, 0);
250         if (!bdi_wq)
251                 return -ENOMEM;
252
253         err = bdi_init(&noop_backing_dev_info);
254
255         return err;
256 }
257 subsys_initcall(default_bdi_init);
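/*
 * Descriptive note (not from the original source): the bdi_wq flags matter
 * here.  WQ_MEM_RECLAIM guarantees a rescuer thread so writeback can make
 * forward progress under memory pressure, WQ_FREEZABLE quiesces writeback
 * across system suspend, WQ_UNBOUND lets work items run on any CPU, and
 * WQ_SYSFS exposes the workqueue attributes under /sys/bus/workqueue/.
 */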
258
259 /*
260  * This function is used when the first inode for this wb is marked dirty. It
261  * wakes up the corresponding bdi thread which should then take care of the
262  * periodic background write-out of dirty inodes. Since the write-out would
263  * start only 'dirty_writeback_interval' centisecs from now anyway, we just
264  * set up a timer which wakes the bdi thread up later.
265  *
266  * Note, we wouldn't bother setting up the timer, but this function is on the
267  * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
268  * by delaying the wake-up.
269  *
270  * We have to be careful not to postpone flush work if it is scheduled for
271  * earlier. Thus we use queue_delayed_work().
272  */
273 void wb_wakeup_delayed(struct bdi_writeback *wb)
274 {
275         unsigned long timeout;
276
277         timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
278         spin_lock_bh(&wb->work_lock);
279         if (test_bit(WB_registered, &wb->state))
280                 queue_delayed_work(bdi_wq, &wb->dwork, timeout);
281         spin_unlock_bh(&wb->work_lock);
282 }
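/*
 * Unit sketch (illustration): dirty_writeback_interval is in centisecs, so
 * with the default of 500 the computation above is
 *
 *	timeout = msecs_to_jiffies(500 * 10);	i.e. 5 seconds
 *
 * queue_delayed_work() is a no-op when @dwork is already pending, which is
 * what keeps an earlier-scheduled flush from being postponed; using
 * mod_delayed_work() here would instead reset the existing timer.
 */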
283
284 /*
285  * Initial write bandwidth: 100 MB/s
286  */
287 #define INIT_BW         (100 << (20 - PAGE_SHIFT))
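/*
 * Worked example (illustration): 100 MB/s is 100 << 20 bytes per second;
 * converting to pages per second divides by PAGE_SIZE (1 << PAGE_SHIFT):
 *
 *	INIT_BW = (100 << 20) >> PAGE_SHIFT = 100 << (20 - PAGE_SHIFT)
 *
 * which is 25600 pages/s with 4K pages.
 */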
288
289 static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
290                    gfp_t gfp)
291 {
292         int i, err;
293
294         memset(wb, 0, sizeof(*wb));
295
296         wb->bdi = bdi;
297         wb->last_old_flush = jiffies;
298         INIT_LIST_HEAD(&wb->b_dirty);
299         INIT_LIST_HEAD(&wb->b_io);
300         INIT_LIST_HEAD(&wb->b_more_io);
301         INIT_LIST_HEAD(&wb->b_dirty_time);
302         spin_lock_init(&wb->list_lock);
303
304         wb->bw_time_stamp = jiffies;
305         wb->balanced_dirty_ratelimit = INIT_BW;
306         wb->dirty_ratelimit = INIT_BW;
307         wb->write_bandwidth = INIT_BW;
308         wb->avg_write_bandwidth = INIT_BW;
309
310         spin_lock_init(&wb->work_lock);
311         INIT_LIST_HEAD(&wb->work_list);
312         INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
313
314         err = fprop_local_init_percpu(&wb->completions, gfp);
315         if (err)
316                 return err;
317
318         for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
319                 err = percpu_counter_init(&wb->stat[i], 0, gfp);
320                 if (err) {
321                         while (i--)
322                                 percpu_counter_destroy(&wb->stat[i]);
323                         fprop_local_destroy_percpu(&wb->completions);
324                         return err;
325                 }
326         }
327
328         return 0;
329 }
330
331 /*
332  * Remove bdi from the global list and shutdown any threads we have running
333  */
334 static void wb_shutdown(struct bdi_writeback *wb)
335 {
336         /* Make sure nobody queues further work */
337         spin_lock_bh(&wb->work_lock);
338         if (!test_and_clear_bit(WB_registered, &wb->state)) {
339                 spin_unlock_bh(&wb->work_lock);
340                 return;
341         }
342         spin_unlock_bh(&wb->work_lock);
343
344         /*
345          * Drain work list and shutdown the delayed_work.  !WB_registered
346          * tells wb_workfn() that @wb is dying and its work_list needs to
347          * be drained no matter what.
348          */
349         mod_delayed_work(bdi_wq, &wb->dwork, 0);
350         flush_delayed_work(&wb->dwork);
351         WARN_ON(!list_empty(&wb->work_list));
352 }
353
354 static void wb_exit(struct bdi_writeback *wb)
355 {
356         int i;
357
358         WARN_ON(delayed_work_pending(&wb->dwork));
359
360         for (i = 0; i < NR_WB_STAT_ITEMS; i++)
361                 percpu_counter_destroy(&wb->stat[i]);
362
363         fprop_local_destroy_percpu(&wb->completions);
364 }
365
366 #ifdef CONFIG_CGROUP_WRITEBACK
367
368 #include <linux/memcontrol.h>
369
370 /*
371  * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
372  * blkcg->cgwb_list, and memcg->cgwb_list.  bdi->cgwb_tree is also RCU
373  * protected.  cgwb_release_wait is used to wait for the completion of cgwb
374  * releases from the bdi destruction path.
375  */
376 static DEFINE_SPINLOCK(cgwb_lock);
377 static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
378
379 /**
380  * wb_congested_get_create - get or create a wb_congested
381  * @bdi: associated bdi
382  * @blkcg_id: ID of the associated blkcg
383  * @gfp: allocation mask
384  *
385  * Look up the wb_congested for @blkcg_id on @bdi.  If missing, create one.
386  * The returned wb_congested has its reference count incremented.  Returns
387  * NULL on failure.
388  */
389 struct bdi_writeback_congested *
390 wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
391 {
392         struct bdi_writeback_congested *new_congested = NULL, *congested;
393         struct rb_node **node, *parent;
394         unsigned long flags;
395
396         if (blkcg_id == 1)
397                 return &bdi->wb_congested;
398 retry:
399         spin_lock_irqsave(&cgwb_lock, flags);
400
401         node = &bdi->cgwb_congested_tree.rb_node;
402         parent = NULL;
403
404         while (*node != NULL) {
405                 parent = *node;
406                 congested = container_of(parent, struct bdi_writeback_congested,
407                                          rb_node);
408                 if (congested->blkcg_id < blkcg_id)
409                         node = &parent->rb_left;
410                 else if (congested->blkcg_id > blkcg_id)
411                         node = &parent->rb_right;
412                 else
413                         goto found;
414         }
415
416         if (new_congested) {
417                 /* !found and storage for new one already allocated, insert */
418                 congested = new_congested;
419                 new_congested = NULL;
420                 rb_link_node(&congested->rb_node, parent, node);
421                 rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
422                 atomic_inc(&bdi->usage_cnt);
423                 goto found;
424         }
425
426         spin_unlock_irqrestore(&cgwb_lock, flags);
427
428         /* allocate storage for new one and retry */
429         new_congested = kzalloc(sizeof(*new_congested), gfp);
430         if (!new_congested)
431                 return NULL;
432
433         atomic_set(&new_congested->refcnt, 0);
434         new_congested->bdi = bdi;
435         new_congested->blkcg_id = blkcg_id;
436         goto retry;
437
438 found:
439         atomic_inc(&congested->refcnt);
440         spin_unlock_irqrestore(&cgwb_lock, flags);
441         kfree(new_congested);
442         return congested;
443 }
444
445 /**
446  * wb_congested_put - put a wb_congested
447  * @congested: wb_congested to put
448  *
449  * Put @congested and destroy it if the refcnt reaches zero.
450  */
451 void wb_congested_put(struct bdi_writeback_congested *congested)
452 {
453         struct backing_dev_info *bdi = congested->bdi;
454         unsigned long flags;
455
456         if (congested->blkcg_id == 1)
457                 return;
458
459         local_irq_save(flags);
460         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
461                 local_irq_restore(flags);
462                 return;
463         }
464
465         rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree);
466         spin_unlock_irqrestore(&cgwb_lock, flags);
467         kfree(congested);
468
469         if (atomic_dec_and_test(&bdi->usage_cnt))
470                 wake_up_all(&cgwb_release_wait);
471 }
472
473 static void cgwb_release_workfn(struct work_struct *work)
474 {
475         struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
476                                                 release_work);
477         struct backing_dev_info *bdi = wb->bdi;
478
479         wb_shutdown(wb);
480
481         css_put(wb->memcg_css);
482         css_put(wb->blkcg_css);
483         wb_congested_put(wb->congested);
484
485         percpu_ref_exit(&wb->refcnt);
486         wb_exit(wb);
487         kfree_rcu(wb, rcu);
488
489         if (atomic_dec_and_test(&bdi->usage_cnt))
490                 wake_up_all(&cgwb_release_wait);
491 }
492
493 static void cgwb_release(struct percpu_ref *refcnt)
494 {
495         struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
496                                                 refcnt);
497         schedule_work(&wb->release_work);
498 }
499
500 static void cgwb_kill(struct bdi_writeback *wb)
501 {
502         lockdep_assert_held(&cgwb_lock);
503
504         WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
505         list_del(&wb->memcg_node);
506         list_del(&wb->blkcg_node);
507         percpu_ref_kill(&wb->refcnt);
508 }
509
510 static int cgwb_create(struct backing_dev_info *bdi,
511                        struct cgroup_subsys_state *memcg_css, gfp_t gfp)
512 {
513         struct mem_cgroup *memcg;
514         struct cgroup_subsys_state *blkcg_css;
515         struct blkcg *blkcg;
516         struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
517         struct bdi_writeback *wb;
518         unsigned long flags;
519         int ret = 0;
520
521         memcg = mem_cgroup_from_css(memcg_css);
522         blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
523         blkcg = css_to_blkcg(blkcg_css);
524         memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
525         blkcg_cgwb_list = &blkcg->cgwb_list;
526
527         /* look up again under lock and discard on blkcg mismatch */
528         spin_lock_irqsave(&cgwb_lock, flags);
529         wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
530         if (wb && wb->blkcg_css != blkcg_css) {
531                 cgwb_kill(wb);
532                 wb = NULL;
533         }
534         spin_unlock_irqrestore(&cgwb_lock, flags);
535         if (wb)
536                 goto out_put;
537
538         /* need to create a new one */
539         wb = kmalloc(sizeof(*wb), gfp);
540         if (!wb)
541                 return -ENOMEM;
542
543         ret = wb_init(wb, bdi, gfp);
544         if (ret)
545                 goto err_free;
546
547         ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
548         if (ret)
549                 goto err_wb_exit;
550
551         wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
552         if (!wb->congested)
553                 goto err_ref_exit;
554
555         wb->memcg_css = memcg_css;
556         wb->blkcg_css = blkcg_css;
557         INIT_WORK(&wb->release_work, cgwb_release_workfn);
558         set_bit(WB_registered, &wb->state);
559
560         /*
561          * The root wb determines the registered state of the whole bdi and
562          * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
563          * whether they're still online.  Don't link @wb if any is dead.
564          * See wb_memcg_offline() and wb_blkcg_offline().
565          */
566         ret = -ENODEV;
567         spin_lock_irqsave(&cgwb_lock, flags);
568         if (test_bit(WB_registered, &bdi->wb.state) &&
569             blkcg_cgwb_list->next && memcg_cgwb_list->next) {
570                 /* we might have raced another instance of this function */
571                 ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
572                 if (!ret) {
573                         atomic_inc(&bdi->usage_cnt);
574                         list_add(&wb->memcg_node, memcg_cgwb_list);
575                         list_add(&wb->blkcg_node, blkcg_cgwb_list);
576                         css_get(memcg_css);
577                         css_get(blkcg_css);
578                 }
579         }
580         spin_unlock_irqrestore(&cgwb_lock, flags);
581         if (ret) {
582                 if (ret == -EEXIST)
583                         ret = 0;
584                 goto err_put_congested;
585         }
586         goto out_put;
587
588 err_put_congested:
589         wb_congested_put(wb->congested);
590 err_ref_exit:
591         percpu_ref_exit(&wb->refcnt);
592 err_wb_exit:
593         wb_exit(wb);
594 err_free:
595         kfree(wb);
596 out_put:
597         css_put(blkcg_css);
598         return ret;
599 }
600
601 /**
602  * wb_get_create - get wb for a given memcg, create if necessary
603  * @bdi: target bdi
604  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
605  * @gfp: allocation mask to use
606  *
607  * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
608  * create one.  The returned wb has its refcount incremented.
609  *
610  * This function uses css_get() on @memcg_css and thus expects its refcnt
611  * to be positive on invocation.  IOW, rcu_read_lock() protection on
612  * @memcg_css isn't enough.  try_get it before calling this function.
613  *
614  * A wb is keyed by its associated memcg.  As blkcg implicitly enables
615  * memcg on the default hierarchy, memcg association is guaranteed to be
616  * more specific (equal to or a descendant of the associated blkcg) and thus can
617  * identify both the memcg and blkcg associations.
618  *
619  * Because the blkcg associated with a memcg may change as blkcg is enabled
620  * and disabled closer to root in the hierarchy, each wb keeps track of
621  * both the memcg and blkcg associated with it and verifies the blkcg on
622  * each lookup.  On mismatch, the existing wb is discarded and a new one is
623  * created.
624  */
625 struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
626                                     struct cgroup_subsys_state *memcg_css,
627                                     gfp_t gfp)
628 {
629         struct bdi_writeback *wb;
630
631         might_sleep_if(gfp & __GFP_WAIT);
632
633         if (!memcg_css->parent)
634                 return &bdi->wb;
635
636         do {
637                 rcu_read_lock();
638                 wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
639                 if (wb) {
640                         struct cgroup_subsys_state *blkcg_css;
641
642                         /* see whether the blkcg association has changed */
643                         blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
644                                                      &blkio_cgrp_subsys);
645                         if (unlikely(wb->blkcg_css != blkcg_css ||
646                                      !wb_tryget(wb)))
647                                 wb = NULL;
648                         css_put(blkcg_css);
649                 }
650                 rcu_read_unlock();
651         } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
652
653         return wb;
654 }
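/*
 * Hypothetical caller sketch (illustration only; see __inode_attach_wb()
 * below for a real user): the memcg css must be pinned across the call and
 * the returned wb reference dropped with wb_put() when no longer needed:
 *
 *	memcg_css = task_get_css(current, memory_cgrp_id);
 *	wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
 *	css_put(memcg_css);
 *	if (wb) {
 *		... use @wb ...
 *		wb_put(wb);
 *	}
 */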
655
656 void __inode_attach_wb(struct inode *inode, struct page *page)
657 {
658         struct backing_dev_info *bdi = inode_to_bdi(inode);
659         struct bdi_writeback *wb = NULL;
660
661         if (inode_cgwb_enabled(inode)) {
662                 struct cgroup_subsys_state *memcg_css;
663
664                 if (page) {
665                         memcg_css = mem_cgroup_css_from_page(page);
666                         wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
667                 } else {
668                         /* must pin memcg_css, see wb_get_create() */
669                         memcg_css = task_get_css(current, memory_cgrp_id);
670                         wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
671                         css_put(memcg_css);
672                 }
673         }
674
675         if (!wb)
676                 wb = &bdi->wb;
677
678         /*
679          * There may be multiple instances of this function racing to
680          * update the same inode.  Use cmpxchg() to tell the winner.
681          */
682         if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
683                 wb_put(wb);
684 }
685
686 static void cgwb_bdi_init(struct backing_dev_info *bdi)
687 {
688         bdi->wb.memcg_css = mem_cgroup_root_css;
689         bdi->wb.blkcg_css = blkcg_root_css;
690         bdi->wb_congested.blkcg_id = 1;
691         INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
692         bdi->cgwb_congested_tree = RB_ROOT;
693         atomic_set(&bdi->usage_cnt, 1);
694 }
695
696 static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
697 {
698         struct radix_tree_iter iter;
699         void **slot;
700
701         WARN_ON(test_bit(WB_registered, &bdi->wb.state));
702
703         spin_lock_irq(&cgwb_lock);
704         radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
705                 cgwb_kill(*slot);
706         spin_unlock_irq(&cgwb_lock);
707
708         /*
709  * All cgwb's and their congested states must be shut down and
710          * released before returning.  Drain the usage counter to wait for
711          * all cgwb's and cgwb_congested's ever created on @bdi.
712          */
713         atomic_dec(&bdi->usage_cnt);
714         wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
715 }
716
717 /**
718  * wb_memcg_offline - kill all wb's associated with a memcg being offlined
719  * @memcg: memcg being offlined
720  *
721  * Also prevents creation of any new wb's associated with @memcg.
722  */
723 void wb_memcg_offline(struct mem_cgroup *memcg)
724 {
725         LIST_HEAD(to_destroy);
726         struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
727         struct bdi_writeback *wb, *next;
728
729         spin_lock_irq(&cgwb_lock);
730         list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
731                 cgwb_kill(wb);
732         memcg_cgwb_list->next = NULL;   /* prevent new wb's */
733         spin_unlock_irq(&cgwb_lock);
734 }
735
736 /**
737  * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
738  * @blkcg: blkcg being offlined
739  *
740  * Also prevents creation of any new wb's associated with @blkcg.
741  */
742 void wb_blkcg_offline(struct blkcg *blkcg)
743 {
744         LIST_HEAD(to_destroy);
745         struct bdi_writeback *wb, *next;
746
747         spin_lock_irq(&cgwb_lock);
748         list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
749                 cgwb_kill(wb);
750         blkcg->cgwb_list.next = NULL;   /* prevent new wb's */
751         spin_unlock_irq(&cgwb_lock);
752 }
753
754 #else   /* CONFIG_CGROUP_WRITEBACK */
755
756 static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
757 static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
758
759 #endif  /* CONFIG_CGROUP_WRITEBACK */
760
761 int bdi_init(struct backing_dev_info *bdi)
762 {
763         int err;
764
765         bdi->dev = NULL;
766
767         bdi->min_ratio = 0;
768         bdi->max_ratio = 100;
769         bdi->max_prop_frac = FPROP_FRAC_BASE;
770         INIT_LIST_HEAD(&bdi->bdi_list);
771         init_waitqueue_head(&bdi->wb_waitq);
772
773         err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
774         if (err)
775                 return err;
776
777         bdi->wb_congested.state = 0;
778         bdi->wb.congested = &bdi->wb_congested;
779
780         cgwb_bdi_init(bdi);
781         return 0;
782 }
783 EXPORT_SYMBOL(bdi_init);
784
785 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
786                 const char *fmt, ...)
787 {
788         va_list args;
789         struct device *dev;
790
791         if (bdi->dev)   /* The driver needs to use separate queues per device */
792                 return 0;
793
794         va_start(args, fmt);
795         dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
796         va_end(args);
797         if (IS_ERR(dev))
798                 return PTR_ERR(dev);
799
800         bdi->dev = dev;
801
802         bdi_debug_register(bdi, dev_name(dev));
803         set_bit(WB_registered, &bdi->wb.state);
804
805         spin_lock_bh(&bdi_lock);
806         list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
807         spin_unlock_bh(&bdi_lock);
808
809         trace_writeback_bdi_register(bdi);
810         return 0;
811 }
812 EXPORT_SYMBOL(bdi_register);
813
814 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
815 {
816         return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
817 }
818 EXPORT_SYMBOL(bdi_register_dev);
819
820 /*
821  * Remove bdi from bdi_list, and ensure that it is no longer visible
822  */
823 static void bdi_remove_from_list(struct backing_dev_info *bdi)
824 {
825         spin_lock_bh(&bdi_lock);
826         list_del_rcu(&bdi->bdi_list);
827         spin_unlock_bh(&bdi_lock);
828
829         synchronize_rcu_expedited();
830 }
831
832 /*
833  * Called when the device behind @bdi has been removed or ejected.
834  *
835  * We can't really do much here except for reducing the dirty ratio at
836  * the moment.  In the future we should be able to set a flag so that
837  * the filesystem can handle errors at mark_inode_dirty time instead
838  * of only at writeback time.
839  */
840 void bdi_unregister(struct backing_dev_info *bdi)
841 {
842         if (WARN_ON_ONCE(!bdi->dev))
843                 return;
844
845         bdi_set_min_ratio(bdi, 0);
846 }
847 EXPORT_SYMBOL(bdi_unregister);
848
849 void bdi_destroy(struct backing_dev_info *bdi)
850 {
851         /* make sure nobody finds us on the bdi_list anymore */
852         bdi_remove_from_list(bdi);
853         wb_shutdown(&bdi->wb);
854         cgwb_bdi_destroy(bdi);
855
856         if (bdi->dev) {
857                 bdi_debug_unregister(bdi);
858                 device_unregister(bdi->dev);
859                 bdi->dev = NULL;
860         }
861
862         wb_exit(&bdi->wb);
863 }
864 EXPORT_SYMBOL(bdi_destroy);
865
866 /*
867  * For use from filesystems to quickly init and register a bdi associated
868  * with dirty writeback
869  */
870 int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
871 {
872         int err;
873
874         bdi->name = name;
875         bdi->capabilities = 0;
876         err = bdi_init(bdi);
877         if (err)
878                 return err;
879
880         err = bdi_register(bdi, NULL, "%.28s-%ld", name,
881                            atomic_long_inc_return(&bdi_seq));
882         if (err) {
883                 bdi_destroy(bdi);
884                 return err;
885         }
886
887         return 0;
888 }
889 EXPORT_SYMBOL(bdi_setup_and_register);
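/*
 * Hedged usage sketch (hypothetical filesystem, not from this file): a
 * filesystem typically embeds a bdi in its per-superblock info and does
 *
 *	err = bdi_setup_and_register(&sbi->bdi, "examplefs");
 *	if (err)
 *		goto out_free_sbi;
 *	sb->s_bdi = &sbi->bdi;
 *
 * in its fill_super path, with bdi_destroy() in the matching teardown path.
 */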
890
891 static wait_queue_head_t congestion_wqh[2] = {
892                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
893                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
894         };
895 static atomic_t nr_wb_congested[2];
896
897 void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
898 {
899         wait_queue_head_t *wqh = &congestion_wqh[sync];
900         enum wb_state bit;
901
902         bit = sync ? WB_sync_congested : WB_async_congested;
903         if (test_and_clear_bit(bit, &congested->state))
904                 atomic_dec(&nr_wb_congested[sync]);
905         smp_mb__after_atomic();
906         if (waitqueue_active(wqh))
907                 wake_up(wqh);
908 }
909 EXPORT_SYMBOL(clear_wb_congested);
910
911 void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
912 {
913         enum wb_state bit;
914
915         bit = sync ? WB_sync_congested : WB_async_congested;
916         if (!test_and_set_bit(bit, &congested->state))
917                 atomic_inc(&nr_wb_congested[sync]);
918 }
919 EXPORT_SYMBOL(set_wb_congested);
920
921 /**
922  * congestion_wait - wait for a backing_dev to become uncongested
923  * @sync: SYNC or ASYNC IO
924  * @timeout: timeout in jiffies
925  *
926  * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
927  * write congestion.  If no backing_devs are congested then just wait for the
928  * next write to be completed.
929  */
930 long congestion_wait(int sync, long timeout)
931 {
932         long ret;
933         unsigned long start = jiffies;
934         DEFINE_WAIT(wait);
935         wait_queue_head_t *wqh = &congestion_wqh[sync];
936
937         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
938         ret = io_schedule_timeout(timeout);
939         finish_wait(wqh, &wait);
940
941         trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
942                                         jiffies_to_usecs(jiffies - start));
943
944         return ret;
945 }
946 EXPORT_SYMBOL(congestion_wait);
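/*
 * Usage sketch (illustration; the callers live elsewhere in mm/): memory
 * reclaim throttles itself on writeback congestion with something like
 *
 *	congestion_wait(BLK_RW_ASYNC, HZ/50);
 *
 * where BLK_RW_SYNC/BLK_RW_ASYNC select the @sync queue and the second
 * argument is the timeout in jiffies.
 */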
947
948 /**
949  * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
950  * @zone: A zone to check if it is heavily congested
951  * @sync: SYNC or ASYNC IO
952  * @timeout: timeout in jiffies
953  *
954  * If there is a congested backing_dev (any backing_dev) and the given
955  * @zone has experienced recent congestion, this waits for up to @timeout
956  * jiffies for either a BDI to exit congestion of the given @sync queue
957  * or a write to complete.
958  *
959  * In the absence of zone congestion, this function yields the processor
960  * via cond_resched() if necessary but otherwise does not sleep.
961  *
962  * The return value is 0 if the sleep is for the full timeout. Otherwise,
963  * it is the number of jiffies that were still remaining when the function
964  * returned. return_value == timeout implies the function did not sleep.
965  */
966 long wait_iff_congested(struct zone *zone, int sync, long timeout)
967 {
968         long ret;
969         unsigned long start = jiffies;
970         DEFINE_WAIT(wait);
971         wait_queue_head_t *wqh = &congestion_wqh[sync];
972
973         /*
974          * If there is no congestion, or heavy congestion is not being
975          * encountered in the current zone, yield if necessary instead
976          * of sleeping on the congestion queue
977          */
978         if (atomic_read(&nr_wb_congested[sync]) == 0 ||
979             !test_bit(ZONE_CONGESTED, &zone->flags)) {
980                 cond_resched();
981
982                 /* In case we scheduled, work out time remaining */
983                 ret = timeout - (jiffies - start);
984                 if (ret < 0)
985                         ret = 0;
986
987                 goto out;
988         }
989
990         /* Sleep until uncongested or a write happens */
991         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
992         ret = io_schedule_timeout(timeout);
993         finish_wait(wqh, &wait);
994
995 out:
996         trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
997                                         jiffies_to_usecs(jiffies - start));
998
999         return ret;
1000 }
1001 EXPORT_SYMBOL(wait_iff_congested);
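/*
 * Usage sketch (illustration; the real caller is direct reclaim in
 * mm/vmscan.c): when a zone has been flagged ZONE_CONGESTED, reclaim backs
 * off with something like
 *
 *	wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
 *
 * which degrades to a cond_resched() when nothing is actually congested.
 */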
1002
1003 int pdflush_proc_obsolete(struct ctl_table *table, int write,
1004                         void __user *buffer, size_t *lenp, loff_t *ppos)
1005 {
1006         char kbuf[] = "0\n";
1007
1008         if (*ppos || *lenp < sizeof(kbuf)) {
1009                 *lenp = 0;
1010                 return 0;
1011         }
1012
1013         if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
1014                 return -EFAULT;
1015         printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
1016                         table->procname);
1017
1018         *lenp = 2;
1019         *ppos += *lenp;
1020         return 2;
1021 }
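/*
 * Wiring sketch (assumption about the caller, see kernel/sysctl.c): the
 * obsolete handler above is hooked up to the old knob roughly as
 *
 *	{
 *		.procname	= "nr_pdflush_threads",
 *		.mode		= 0444,
 *		.proc_handler	= pdflush_proc_obsolete,
 *	},
 *
 * so reads always return "0\n" and print the removal warning once.
 */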