4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * libcfs/libcfs/workitem.c
38 * Author: Isaac Huang <isaac@clusterfs.com>
39 * Liang Zhen <zhen.liang@sun.com>
42 #define DEBUG_SUBSYSTEM S_LNET
44 #include "../../include/linux/libcfs/libcfs.h"
46 #define CFS_WS_NAME_LEN 16
49 /* chain on global list */
50 struct list_head ws_list;
51 /** serialised workitems */
53 /** where schedulers sleep */
54 wait_queue_head_t ws_waitq;
55 /** concurrent workitems */
56 struct list_head ws_runq;
58 * rescheduled running-workitems, a workitem can be rescheduled
59 * while running in wi_action(), but we don't to execute it again
60 * unless it returns from wi_action(), so we put it on ws_rerunq
61 * while rescheduling, and move it to runq after it returns
64 struct list_head ws_rerunq;
65 /** CPT-table for this scheduler */
66 struct cfs_cpt_table *ws_cptab;
67 /** CPT id for affinity */
69 /** number of scheduled workitems */
71 /** started scheduler thread, protected by cfs_wi_data::wi_glock */
72 unsigned int ws_nthreads:30;
73 /** shutting down, protected by cfs_wi_data::wi_glock */
74 unsigned int ws_stopping:1;
75 /** serialize starting thread, protected by cfs_wi_data::wi_glock */
76 unsigned int ws_starting:1;
78 char ws_name[CFS_WS_NAME_LEN];
81 static struct cfs_workitem_data {
84 /** list of all schedulers */
85 struct list_head wi_scheds;
86 /** WI module is initialized */
88 /** shutting down the whole WI module */
93 cfs_wi_sched_cansleep(struct cfs_wi_sched *sched)
95 spin_lock(&sched->ws_lock);
96 if (sched->ws_stopping) {
97 spin_unlock(&sched->ws_lock);
101 if (!list_empty(&sched->ws_runq)) {
102 spin_unlock(&sched->ws_lock);
105 spin_unlock(&sched->ws_lock);
110 * 0. it only works when called from wi->wi_action.
111 * 1. when it returns no one shall try to schedule the workitem.
114 cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
116 LASSERT(!in_interrupt()); /* because we use plain spinlock */
117 LASSERT(!sched->ws_stopping);
119 spin_lock(&sched->ws_lock);
121 LASSERT(wi->wi_running);
122 if (wi->wi_scheduled) { /* cancel pending schedules */
123 LASSERT(!list_empty(&wi->wi_list));
124 list_del_init(&wi->wi_list);
126 LASSERT(sched->ws_nscheduled > 0);
127 sched->ws_nscheduled--;
130 LASSERT(list_empty(&wi->wi_list));
132 wi->wi_scheduled = 1; /* LBUG future schedule attempts */
133 spin_unlock(&sched->ws_lock);
135 EXPORT_SYMBOL(cfs_wi_exit);
138 * cancel schedule request of workitem \a wi
141 cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
145 LASSERT(!in_interrupt()); /* because we use plain spinlock */
146 LASSERT(!sched->ws_stopping);
149 * return 0 if it's running already, otherwise return 1, which
150 * means the workitem will not be scheduled and will not have
151 * any race with wi_action.
153 spin_lock(&sched->ws_lock);
155 rc = !(wi->wi_running);
157 if (wi->wi_scheduled) { /* cancel pending schedules */
158 LASSERT(!list_empty(&wi->wi_list));
159 list_del_init(&wi->wi_list);
161 LASSERT(sched->ws_nscheduled > 0);
162 sched->ws_nscheduled--;
164 wi->wi_scheduled = 0;
167 LASSERT(list_empty(&wi->wi_list));
169 spin_unlock(&sched->ws_lock);
172 EXPORT_SYMBOL(cfs_wi_deschedule);
175 * Workitem scheduled with (serial == 1) is strictly serialised not only with
176 * itself, but also with others scheduled this way.
178 * Now there's only one static serialised queue, but in the future more might
179 * be added, and even dynamic creation of serialised queues might be supported.
182 cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
184 LASSERT(!in_interrupt()); /* because we use plain spinlock */
185 LASSERT(!sched->ws_stopping);
187 spin_lock(&sched->ws_lock);
189 if (!wi->wi_scheduled) {
190 LASSERT(list_empty(&wi->wi_list));
192 wi->wi_scheduled = 1;
193 sched->ws_nscheduled++;
194 if (!wi->wi_running) {
195 list_add_tail(&wi->wi_list, &sched->ws_runq);
196 wake_up(&sched->ws_waitq);
198 list_add(&wi->wi_list, &sched->ws_rerunq);
202 LASSERT(!list_empty(&wi->wi_list));
203 spin_unlock(&sched->ws_lock);
205 EXPORT_SYMBOL(cfs_wi_schedule);
/**
 * Thread function of a workitem scheduler; \a arg is the
 * struct cfs_wi_sched being served.
 *
 * The thread tries to bind to the CPT of the scheduler (a failure is only
 * warned about), then registers itself under cfs_wi_data.wi_glock by
 * clearing ws_starting and bumping ws_nthreads. Until ws_stopping is seen
 * it takes workitems from the head of ws_runq and calls their wi_action
 * with ws_lock dropped. A non-zero return from wi_action means the
 * workitem must not be touched again. A workitem that got rescheduled
 * while its action ran is moved to the tail of ws_runq afterwards. With
 * an empty run queue the thread sleeps on ws_waitq until
 * cfs_wi_sched_cansleep() returns 0. On the way out it drops ws_nthreads
 * under wi_glock.
 *
 * NOTE(review): this listing has lost short lines of the function. Not
 * visible here: the declarations of nloops and rc, any update of nloops
 * and of wi->wi_running, the statements guarded by the two if-tests that
 * follow the wi_action call, whatever sits between the unlock/lock pair
 * in the run-queue-not-empty branch, all closing braces and the return
 * statement. Restore them from a pristine copy before changing the logic.
 */
207 static int cfs_wi_scheduler(void *arg)
209 struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg;
213 /* CPT affinity scheduler? */
215 if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
216 CWARN("Failed to bind %s on CPT %d\n",
217 sched->ws_name, sched->ws_cpt);
/* tell cfs_wi_sched_create() that this thread is up */
219 spin_lock(&cfs_wi_data.wi_glock);
221 LASSERT(sched->ws_starting == 1);
222 sched->ws_starting--;
223 sched->ws_nthreads++;
225 spin_unlock(&cfs_wi_data.wi_glock);
227 spin_lock(&sched->ws_lock);
229 while (!sched->ws_stopping) {
232 struct cfs_workitem *wi;
/* serve a bounded batch so the outer loop gets to re-check ws_stopping */
234 while (!list_empty(&sched->ws_runq) &&
235 nloops < CFS_WI_RESCHED) {
236 wi = list_entry(sched->ws_runq.next,
237 struct cfs_workitem, wi_list);
238 LASSERT(wi->wi_scheduled && !wi->wi_running);
240 list_del_init(&wi->wi_list);
242 LASSERT(sched->ws_nscheduled > 0);
243 sched->ws_nscheduled--;
246 wi->wi_scheduled = 0;
/* the action runs without ws_lock held */
248 spin_unlock(&sched->ws_lock);
251 rc = (*wi->wi_action) (wi);
253 spin_lock(&sched->ws_lock);
254 if (rc != 0) /* WI should be dead, even be freed! */
258 if (list_empty(&wi->wi_list))
261 LASSERT(wi->wi_scheduled);
262 /* wi is rescheduled, should be on rerunq now, we
263 * move it to runq so it can run action now
265 list_move_tail(&wi->wi_list, &sched->ws_runq);
268 if (!list_empty(&sched->ws_runq)) {
269 spin_unlock(&sched->ws_lock);
270 /* don't sleep because some workitems still
271 * expect me to come back soon
274 spin_lock(&sched->ws_lock);
278 spin_unlock(&sched->ws_lock);
279 rc = wait_event_interruptible_exclusive(sched->ws_waitq,
280 !cfs_wi_sched_cansleep(sched));
281 spin_lock(&sched->ws_lock);
284 spin_unlock(&sched->ws_lock);
286 spin_lock(&cfs_wi_data.wi_glock);
287 sched->ws_nthreads--;
288 spin_unlock(&cfs_wi_data.wi_glock);
/**
 * Stop the threads of scheduler \a sched, unlink it from the global list
 * and free it.
 *
 * The WI module must be initialized and not shutting down. If \a sched is
 * already marked as stopping the call only logs that fact and releases
 * wi_glock. Otherwise ws_stopping is set under wi_glock, every sleeper on
 * ws_waitq is woken, and the caller polls ws_nthreads until it reaches
 * zero. All workitems must have been consumed or cancelled by then, as
 * ws_nscheduled is asserted to be 0 before the memory is released.
 *
 * NOTE(review): short lines are missing from this listing, among them the
 * return type, the declaration and initial value of i, the last argument
 * of the first CDEBUG() and the statement that ends the already-stopping
 * branch. Confirm against a pristine copy.
 */
294 cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
298 LASSERT(cfs_wi_data.wi_init);
299 LASSERT(!cfs_wi_data.wi_stopping);
301 spin_lock(&cfs_wi_data.wi_glock);
302 if (sched->ws_stopping) {
303 CDEBUG(D_INFO, "%s is in progress of stopping\n",
305 spin_unlock(&cfs_wi_data.wi_glock);
/* the scheduler must be on the global list, the list_del() below relies on it */
309 LASSERT(!list_empty(&sched->ws_list));
310 sched->ws_stopping = 1;
312 spin_unlock(&cfs_wi_data.wi_glock);
/* wake every thread so each one notices ws_stopping */
315 wake_up_all(&sched->ws_waitq);
317 spin_lock(&cfs_wi_data.wi_glock);
318 while (sched->ws_nthreads > 0) {
/* message is raised to D_WARNING only when the counter hits a power of two */
319 CDEBUG(is_power_of_2(++i) ? D_WARNING : D_NET,
320 "waiting for %d threads of WI sched[%s] to terminate\n",
321 sched->ws_nthreads, sched->ws_name);
/* sleep for cfs_time_seconds(1) / 20 with wi_glock released, then look again */
323 spin_unlock(&cfs_wi_data.wi_glock);
324 set_current_state(TASK_UNINTERRUPTIBLE);
325 schedule_timeout(cfs_time_seconds(1) / 20);
326 spin_lock(&cfs_wi_data.wi_glock);
329 list_del(&sched->ws_list);
331 spin_unlock(&cfs_wi_data.wi_glock);
332 LASSERT(sched->ws_nscheduled == 0);
334 LIBCFS_FREE(sched, sizeof(*sched));
336 EXPORT_SYMBOL(cfs_wi_sched_destroy);
/**
 * Allocate a workitem scheduler called \a name and start threads for it.
 *
 * \a name has to fit into ws_name together with its terminating NUL,
 * otherwise the fresh allocation is released again. \a cptab must be NULL,
 * or \a cpt must be CFS_CPT_ANY or a valid index into \a cptab. Threads
 * are started one at a time: ws_starting is taken under wi_glock before
 * kthread_run() and handed back by the new thread itself. On success the
 * scheduler is linked on cfs_wi_data.wi_scheds.
 *
 * If a thread cannot be created the scheduler is put on the global list
 * and passed to cfs_wi_sched_destroy(), which waits for the threads that
 * did start and frees it.
 *
 * NOTE(review): short lines are missing from this listing: the return
 * type and every return statement (so the error codes cannot be read
 * here), the allocation-failure check, the store of \a cpt, the loop over
 * \a nthrs, the body of the wait-for-ws_starting loop, the IS_ERR handling
 * around kthread_run() and presumably the store through \a sched_pp.
 * Confirm against a pristine copy.
 */
339 cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
340 int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
342 struct cfs_wi_sched *sched;
345 LASSERT(cfs_wi_data.wi_init);
346 LASSERT(!cfs_wi_data.wi_stopping);
347 LASSERT(!cptab || cpt == CFS_CPT_ANY ||
348 (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
350 LIBCFS_ALLOC(sched, sizeof(*sched));
/* reject names that would not leave room for the terminating NUL */
354 if (strlen(name) > sizeof(sched->ws_name) - 1) {
355 LIBCFS_FREE(sched, sizeof(*sched));
/* the length check above guarantees that strncpy() NUL-terminates ws_name */
358 strncpy(sched->ws_name, name, sizeof(sched->ws_name));
360 sched->ws_cptab = cptab;
363 spin_lock_init(&sched->ws_lock);
364 init_waitqueue_head(&sched->ws_waitq);
365 INIT_LIST_HEAD(&sched->ws_runq);
366 INIT_LIST_HEAD(&sched->ws_rerunq);
367 INIT_LIST_HEAD(&sched->ws_list);
372 struct task_struct *task;
/* wait until the previously started thread has cleared ws_starting */
374 spin_lock(&cfs_wi_data.wi_glock);
375 while (sched->ws_starting > 0) {
376 spin_unlock(&cfs_wi_data.wi_glock);
378 spin_lock(&cfs_wi_data.wi_glock);
381 sched->ws_starting++;
382 spin_unlock(&cfs_wi_data.wi_glock);
/*
 * Thread name is ws_name plus the CPT id (when bound to one) plus the
 * current thread count.
 * NOTE(review): sizeof(name) is only a buffer size if a local array called
 * name is declared on one of the missing lines; for the char pointer
 * parameter it would be the size of a pointer. Confirm.
 */
384 if (sched->ws_cptab && sched->ws_cpt >= 0) {
385 snprintf(name, sizeof(name), "%s_%02d_%02u",
386 sched->ws_name, sched->ws_cpt,
389 snprintf(name, sizeof(name), "%s_%02u",
390 sched->ws_name, sched->ws_nthreads);
393 task = kthread_run(cfs_wi_scheduler, sched, "%s", name);
400 CERROR("Failed to create thread for WI scheduler %s: %d\n",
403 spin_lock(&cfs_wi_data.wi_glock);
405 /* make up for cfs_wi_sched_destroy */
406 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
407 sched->ws_starting--;
409 spin_unlock(&cfs_wi_data.wi_glock);
411 cfs_wi_sched_destroy(sched);
/* all threads started: publish the scheduler on the global list */
414 spin_lock(&cfs_wi_data.wi_glock);
415 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
416 spin_unlock(&cfs_wi_data.wi_glock);
421 EXPORT_SYMBOL(cfs_wi_sched_create);
/**
 * Body of the WI module initialisation: cfs_wi_data is zeroed, its lock
 * and scheduler list are set up, and wi_init is raised so that the
 * LASSERT(cfs_wi_data.wi_init) checks in the create/destroy paths pass.
 *
 * NOTE(review): the signature, braces and return statement of this
 * function are missing from this listing. It is likely cfs_wi_startup(),
 * by analogy with cfs_wi_shutdown() -- confirm name and return type
 * against a pristine copy.
 */
426 memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));
428 spin_lock_init(&cfs_wi_data.wi_glock);
429 INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
430 cfs_wi_data.wi_init = 1;
/**
 * Shut the whole WI module down.
 *
 * wi_stopping is raised under wi_glock, then three passes are made over
 * cfs_wi_data.wi_scheds: mark every scheduler as stopping and wake its
 * threads, wait for ws_nthreads of each scheduler to drop to zero, and
 * finally unlink and free every scheduler. wi_stopping and wi_init are
 * cleared at the end.
 *
 * NOTE(review): ws_stopping is documented as protected by wi_glock but is
 * written here without it; the code relies on nobody else using the list
 * at this point. The return type, the braces and anything after the last
 * statement are not visible in this listing.
 */
436 cfs_wi_shutdown(void)
438 struct cfs_wi_sched *sched;
439 struct cfs_wi_sched *temp;
441 spin_lock(&cfs_wi_data.wi_glock);
442 cfs_wi_data.wi_stopping = 1;
443 spin_unlock(&cfs_wi_data.wi_glock);
445 /* nobody should contend on this list */
446 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
447 sched->ws_stopping = 1;
448 wake_up_all(&sched->ws_waitq);
/* poll each scheduler until its threads have left, sleeping with wi_glock released */
451 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
452 spin_lock(&cfs_wi_data.wi_glock);
454 while (sched->ws_nthreads != 0) {
455 spin_unlock(&cfs_wi_data.wi_glock);
456 set_current_state(TASK_UNINTERRUPTIBLE);
457 schedule_timeout(cfs_time_seconds(1) / 20);
458 spin_lock(&cfs_wi_data.wi_glock);
460 spin_unlock(&cfs_wi_data.wi_glock);
/* the _safe iterator is needed because entries are freed while walking */
462 list_for_each_entry_safe(sched, temp, &cfs_wi_data.wi_scheds, ws_list) {
463 list_del(&sched->ws_list);
464 LIBCFS_FREE(sched, sizeof(*sched));
467 cfs_wi_data.wi_stopping = 0;
468 cfs_wi_data.wi_init = 0;