2 * linux/fs/jbd2/commit.c
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
16 #include <linux/time.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <linux/bitops.h>
30 #include <trace/events/jbd2.h>
33 * IO end handler for temporary buffer_heads handling writes to the journal.
/*
 * Completion callback for buffers written to the journal; it is installed as
 * bh->b_end_io by journal_submit_commit_record() and by the submit loop in
 * jbd2_journal_commit_transaction().
 *
 * NOTE(review): this listing omits several original lines (braces, the test
 * on `uptodate`, any guard on orig_bh and whatever follows the wake-up), so
 * only the visible statements are described.
 */
35 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
/* b_private is read as the buffer head paired with the one that completed. */
37 struct buffer_head *orig_bh = bh->b_private;
/* Presumably exactly one of the next two calls runs, chosen by `uptodate` -- the condition is not visible; confirm. */
41 set_buffer_uptodate(bh);
43 clear_buffer_uptodate(bh);
/* Drop BH_Shadow on the paired buffer with unlock semantics, then wake any waiter on that bit. */
45 clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
46 smp_mb__after_atomic();
47 wake_up_bit(&orig_bh->b_state, BH_Shadow);
53 * When an ext4 file is truncated, it is possible that some pages are not
54 * successfully freed, because they are attached to a committing transaction.
55 * After the transaction commits, these pages are left on the LRU, with no
56 * ->mapping, and with attached buffers. These pages are trivially reclaimable
57 * by the VM, but their apparent absence upsets the VM accounting, and it makes
58 * the numbers in /proc/meminfo look odd.
60 * So here, we have a buffer which has just come off the forget list. Look to
61 * see if we can strip all buffers from the backing page.
63 * Called under lock_journal(), and possibly under journal_datalist_lock. The
64 * caller provided us with a ref against the buffer, and we drop that here.
/*
 * Try to strip the buffers from the page backing @bh.
 *
 * NOTE(review): the declaration of `page`, the bodies of both `if` tests and
 * several other statements are missing from this listing; the comments below
 * cover only the visible lines.
 */
66 static void release_buffer_page(struct buffer_head *bh)
/* Only proceed when ours is the sole reference; the action taken otherwise is not visible. */
72 if (atomic_read(&bh->b_count) != 1)
80 /* OK, it's a truncated page */
/* Non-blocking lock attempt; the failure path is not visible. */
81 if (!trylock_page(page))
86 try_to_free_buffers(page);
/* Releases a page reference -- presumably one taken on a line not shown here; confirm. */
88 page_cache_release(page);
/*
 * Compute the checksum of the commit block held in @bh and store it in the
 * block's own header.
 *
 * NOTE(review): the declaration of `csum` and the statement under the
 * feature test are missing from this listing; presumably the function
 * returns early when v2/v3 checksums are off -- confirm.
 */
95 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
97 struct commit_header *h;
100 if (!jbd2_journal_has_csum_v2or3(j))
103 h = (struct commit_header *)(bh->b_data);
/* Zero the checksum type and size fields before checksumming the block. */
104 h->h_chksum_type = 0;
105 h->h_chksum_size = 0;
/* Checksum covers j_blocksize bytes of the buffer, seeded with j_csum_seed. */
107 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
108 h->h_chksum[0] = cpu_to_be32(csum);
112 * Done it all: now submit the commit record. We should have
113 * cleaned up our previous buffers by now, so if we are in abort
114 * mode we can now just skip the rest of the journal write
117 * Returns 1 if the journal needs to be aborted or 0 on success
/*
 * Build the commit block for @commit_transaction and submit it for writing.
 *
 * Visible steps: test whether the journal is aborted, obtain a descriptor
 * buffer, fill in the commit header (magic, block type, sequence taken from
 * t_tid, commit time from current_kernel_time()), store crc32_sum when the
 * checksum feature is set, apply jbd2_commit_block_csum_set(), and submit
 * the buffer with journal_end_buffer_io_sync as its completion handler.
 *
 * NOTE(review): lines are missing from this listing, including the last
 * parameter (crc32_sum is used below but its declaration is not visible),
 * the declaration of `ret`, every return path, any use of @cbh and the
 * `else` between the two submit_bh() calls.
 */
119 static int journal_submit_commit_record(journal_t *journal,
120 transaction_t *commit_transaction,
121 struct buffer_head **cbh,
124 struct commit_header *tmp;
125 struct buffer_head *bh;
127 struct timespec now = current_kernel_time();
/* The statement executed for an aborted journal is not visible. */
131 if (is_journal_aborted(journal))
134 bh = jbd2_journal_get_descriptor_buffer(journal);
/* All multi-byte header fields are stored big-endian. */
138 tmp = (struct commit_header *)bh->b_data;
139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
141 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
142 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
143 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
/* With the checksum feature, record the CRC32 passed in as crc32_sum. */
145 if (jbd2_has_feature_checksum(journal)) {
146 tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
147 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
148 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
150 jbd2_commit_block_csum_set(journal, bh);
152 BUFFER_TRACE(bh, "submit commit block");
154 clear_buffer_dirty(bh);
155 set_buffer_uptodate(bh);
156 bh->b_end_io = journal_end_buffer_io_sync;
/*
 * With barriers on and no async-commit feature the write carries
 * WRITE_FLUSH_FUA; the second submit_bh() is presumably the else branch.
 */
158 if (journal->j_flags & JBD2_BARRIER &&
159 !jbd2_has_feature_async_commit(journal))
160 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
162 ret = submit_bh(WRITE_SYNC, bh);
169 * This function, together with journal_submit_commit_record(),
170 * allows the commit record to be written asynchronously.
/*
 * Finish with the commit block @bh; the result is stored in `err` by
 * jbd2_journal_commit_transaction().
 *
 * NOTE(review): the declaration of the return value, any wait on the buffer,
 * the statement under the uptodate test and the return are missing from this
 * listing.
 */
172 static int journal_wait_on_commit_record(journal_t *journal,
173 struct buffer_head *bh)
177 clear_buffer_dirty(bh);
/* A buffer that is not uptodate here is the unlikely case; its handling is not visible. */
180 if (unlikely(!buffer_uptodate(bh)))
182 put_bh(bh); /* One for getblk() */
188 * write the filemap data using writepage() address_space_operations.
189 * We don't do block allocation here even for delalloc. We don't
190 * use writepages() because with delayed allocation we may be doing
191 * block allocation in writepages().
/*
 * Write out the pages of @mapping through generic_writepages().
 *
 * NOTE(review): the declaration of `ret`, one initializer line of `wbc`
 * (original line 199) and the return are missing from this listing.
 */
193 static int journal_submit_inode_data_buffers(struct address_space *mapping)
/* WB_SYNC_ALL writeback, budgeted at twice the mapping's page count, ending at the inode size. */
196 struct writeback_control wbc = {
197 .sync_mode = WB_SYNC_ALL,
198 .nr_to_write = mapping->nrpages * 2,
200 .range_end = i_size_read(mapping->host),
203 ret = generic_writepages(mapping, &wbc);
208 * Submit all the data buffers of inode associated with the transaction to
211 * We are in a committing transaction. Therefore no new inode can be added to
212 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
213 * operate on from being released while we write out pages.
/*
 * Submit data writeout for every inode on @commit_transaction's
 * t_inode_list.
 *
 * NOTE(review): the declarations of `err` and the return value, the
 * handling of `err` after each submission and the final return are missing
 * from this listing.
 */
215 static int journal_submit_data_buffers(journal_t *journal,
216 transaction_t *commit_transaction)
218 struct jbd2_inode *jinode;
220 struct address_space *mapping;
/* j_list_lock is held while walking the list and dropped around each submission. */
222 spin_lock(&journal->j_list_lock);
223 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
224 mapping = jinode->i_vfs_inode->i_mapping;
/* Flag the inode as in use by commit before the lock is released. */
225 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
226 spin_unlock(&journal->j_list_lock);
228 * submit the inode data buffers. We use writepage
229 * instead of writepages. Because writepages can do
230 * block allocation with delalloc. We need to write
231 * only allocated blocks here.
233 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
234 err = journal_submit_inode_data_buffers(mapping);
/* Retake the lock, check the inode still belongs to this transaction, then clear the flag and wake waiters. */
237 spin_lock(&journal->j_list_lock);
238 J_ASSERT(jinode->i_transaction == commit_transaction);
239 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
240 smp_mb__after_atomic();
241 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
243 spin_unlock(&journal->j_list_lock);
248 * Wait for data submitted for writeout, refile inodes to proper
249 * transaction if needed.
/*
 * Wait for data writeout on every inode of @commit_transaction, then move
 * each inode to its next transaction or detach it.
 *
 * NOTE(review): the declarations of `err` and the return value, the test on
 * `err`, the call whose last argument appears on original line 271, the
 * `else` before original line 292 and the return are missing from this
 * listing.
 */
252 static int journal_finish_inode_data_buffers(journal_t *journal,
253 transaction_t *commit_transaction)
255 struct jbd2_inode *jinode, *next_i;
258 /* For locking, see the comment in journal_submit_data_buffers() */
259 spin_lock(&journal->j_list_lock);
260 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
/* Flag the inode for the duration of the unlocked wait. */
261 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
262 spin_unlock(&journal->j_list_lock);
263 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
266 * Because AS_EIO is cleared by
267 * filemap_fdatawait_range(), set it again so
268 * that user process can get -EIO from fsync().
271 &jinode->i_vfs_inode->i_mapping->flags);
276 spin_lock(&journal->j_list_lock);
277 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
278 smp_mb__after_atomic();
279 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
282 /* Now refile inode to proper lists */
/* The _safe iterator is used because each entry is unlinked inside the loop. */
283 list_for_each_entry_safe(jinode, next_i,
284 &commit_transaction->t_inode_list, i_list) {
285 list_del(&jinode->i_list);
/* An inode with a pending next transaction is moved onto that transaction's list. */
286 if (jinode->i_next_transaction) {
287 jinode->i_transaction = jinode->i_next_transaction;
288 jinode->i_next_transaction = NULL;
289 list_add(&jinode->i_list,
290 &jinode->i_transaction->t_inode_list);
/* Presumably the else branch: the inode is left attached to no transaction. */
292 jinode->i_transaction = NULL;
295 spin_unlock(&journal->j_list_lock);
/*
 * Fold the contents of @bh into the running CRC32 @crc32_sum.
 *
 * NOTE(review): the declarations of `addr` and `checksum`, any unmap of the
 * page and the return are missing from this listing.
 */
300 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
302 struct page *page = bh->b_page;
/* Map the page, then checksum b_size bytes starting at the buffer's offset within it. */
306 addr = kmap_atomic(page);
307 checksum = crc32_be(crc32_sum,
308 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
/*
 * Store the block number @block in @tag in big-endian form.
 */
314 static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
315 unsigned long long block)
/* The low 32 bits are always written. */
317 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
/* With the 64-bit feature the high 32 bits are written too; the 32-bit shift is done in two steps. */
318 if (jbd2_has_feature_64bit(j))
319 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
/*
 * Compute the checksum of the descriptor block in @bh and store it in the
 * block tail.
 *
 * NOTE(review): the declaration of `csum` and the statement under the
 * feature test are missing from this listing; presumably the function
 * returns early when v2/v3 checksums are off -- confirm.
 */
322 static void jbd2_descr_block_csum_set(journal_t *j,
323 struct buffer_head *bh)
325 struct jbd2_journal_block_tail *tail;
328 if (!jbd2_journal_has_csum_v2or3(j))
/* The tail structure occupies the last bytes of the j_blocksize-sized block. */
331 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
332 sizeof(struct jbd2_journal_block_tail));
/* The field is zeroed first so the checksum is computed over a zero t_checksum. */
333 tail->t_checksum = 0;
334 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
335 tail->t_checksum = cpu_to_be32(csum);
/*
 * Compute the checksum for one journalled block and store it in its tag.
 *
 * NOTE(review): the declarations of `addr`, `csum32` and `seq`, the
 * statement under the first feature test, the length argument of the second
 * jbd2_chksum() call, any unmap of the page and the `else` before original
 * line 360 are missing from this listing.
 */
338 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
339 struct buffer_head *bh, __u32 sequence)
/* The same tag memory viewed through the tag3 layout. */
341 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
342 struct page *page = bh->b_page;
347 if (!jbd2_journal_has_csum_v2or3(j))
/* Checksum the big-endian sequence number first, then continue over the buffer data. */
350 seq = cpu_to_be32(sequence);
351 addr = kmap_atomic(page);
352 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
353 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
/* With csum3 the 32-bit value is stored via tag3; the other store (presumably the else branch) goes through cpu_to_be16(). */
357 if (jbd2_has_feature_csum3(j))
358 tag3->t_checksum = cpu_to_be32(csum32);
360 tag->t_checksum = cpu_to_be16(csum32);
363 * jbd2_journal_commit_transaction
365 * The primary function for committing a transaction to the log. This
366 * function is called by the journal thread to begin a complete commit.
/*
 * jbd2_journal_commit_transaction() - commit the journal's running
 * transaction to the log.
 *
 * NOTE(review): this listing omits many original lines (braces, several
 * declarations, `else`/`goto`/label lines and a number of statements), so
 * the outline below covers only what the visible lines show.
 *
 * Visible sequence:
 *  - If JBD2_FLUSHED is set, rewrite the superblock log tail under
 *    j_checkpoint_mutex.
 *  - Take j_running_transaction as commit_transaction, set it to T_LOCKED
 *    under j_state_lock, record run statistics, and loop on j_wait_updates
 *    while t_updates is non-zero.
 *  - Refile every buffer on t_reserved_list (freeing b_committed_data
 *    first) and clean the checkpoint lists.
 *  - "commit phase 1": clear revoked flags, switch the revoke table,
 *    subtract j_reserved_credits from t_outstanding_credits, set T_FLUSH and
 *    publish the transaction as j_committing_transaction.
 *  - "commit phase 2a": submit inode data and write revoke records under a
 *    block plug.
 *  - "commit phase 2b": set T_COMMIT and walk t_buffers. Each buffer gets a
 *    log block, a temporary IO buffer in wbuf[] and a tag in the current
 *    descriptor block; batches go out via submit_bh(WRITE_SYNC, ...).
 *  - Wait for inode data, fetch the log tail, set T_COMMIT_DFLUSH, flush
 *    j_fs_dev when it differs from j_dev and barriers are on, and submit the
 *    commit record here when the async-commit feature is set.
 *  - "commit phase 3" / "commit phase 4": drain io_bufs (refiling shadowed
 *    buffers onto BJ_Forget) and then log_bufs.
 *  - "commit phase 5": set T_COMMIT_JFLUSH, submit the commit record when
 *    not async, wait for it, flush j_dev for async commits with barriers,
 *    and call jbd2_update_log_tail() (its guard is not visible).
 *  - "commit phase 6": process t_forget under j_list_lock, then link the
 *    transaction into j_checkpoint_transactions.
 *  - "commit phase 7": file statistics, update j_commit_sequence and
 *    j_average_commit_time, run j_commit_callback, mark T_FINISHED, drop the
 *    transaction when both checkpoint lists are empty, wake
 *    j_wait_done_commit and accumulate the totals into j_stats.
 */
368 void jbd2_journal_commit_transaction(journal_t *journal)
370 struct transaction_stats_s stats;
371 transaction_t *commit_transaction;
372 struct journal_head *jh;
373 struct buffer_head *descriptor;
/* wbuf is the journal's j_wbuf array; the metadata loop fills it with buffers to submit. */
374 struct buffer_head **wbuf = journal->j_wbuf;
378 unsigned long long blocknr;
382 journal_header_t *header;
383 journal_block_tag_t *tag = NULL;
388 int tag_bytes = journal_tag_bytes(journal);
389 struct buffer_head *cbh = NULL; /* For transactional checksums */
390 __u32 crc32_sum = ~0;
391 struct blk_plug plug;
392 /* Tail of the journal */
393 unsigned long first_block;
/* csum_size is the room reserved for the block tail when v2/v3 checksums are on. */
400 if (jbd2_journal_has_csum_v2or3(journal))
401 csum_size = sizeof(struct jbd2_journal_block_tail);
404 * First job: lock down the current transaction and wait for
405 * all outstanding updates to complete.
408 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
409 if (journal->j_flags & JBD2_FLUSHED) {
410 jbd_debug(3, "super block updated\n");
411 mutex_lock(&journal->j_checkpoint_mutex);
413 * We hold j_checkpoint_mutex so tail cannot change under us.
414 * We don't need any special data guarantees for writing sb
415 * since journal is empty and it is ok for write to be
416 * flushed only with transaction commit.
418 jbd2_journal_update_sb_log_tail(journal,
419 journal->j_tail_sequence,
422 mutex_unlock(&journal->j_checkpoint_mutex);
424 jbd_debug(3, "superblock not updated\n");
427 J_ASSERT(journal->j_running_transaction != NULL);
428 J_ASSERT(journal->j_committing_transaction == NULL);
430 commit_transaction = journal->j_running_transaction;
432 trace_jbd2_start_commit(journal, commit_transaction);
433 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
434 commit_transaction->t_tid);
436 write_lock(&journal->j_state_lock);
437 J_ASSERT(commit_transaction->t_state == T_RUNNING);
438 commit_transaction->t_state = T_LOCKED;
440 trace_jbd2_commit_locking(journal, commit_transaction);
441 stats.run.rs_wait = commit_transaction->t_max_wait;
442 stats.run.rs_request_delay = 0;
443 stats.run.rs_locked = jiffies;
444 if (commit_transaction->t_requested)
445 stats.run.rs_request_delay =
446 jbd2_time_diff(commit_transaction->t_requested,
447 stats.run.rs_locked);
448 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
449 stats.run.rs_locked);
451 spin_lock(&commit_transaction->t_handle_lock);
452 while (atomic_read(&commit_transaction->t_updates)) {
455 prepare_to_wait(&journal->j_wait_updates, &wait,
456 TASK_UNINTERRUPTIBLE);
457 if (atomic_read(&commit_transaction->t_updates)) {
458 spin_unlock(&commit_transaction->t_handle_lock);
459 write_unlock(&journal->j_state_lock);
461 write_lock(&journal->j_state_lock);
462 spin_lock(&commit_transaction->t_handle_lock);
464 finish_wait(&journal->j_wait_updates, &wait);
466 spin_unlock(&commit_transaction->t_handle_lock);
468 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
469 journal->j_max_transaction_buffers);
472 * First thing we are allowed to do is to discard any remaining
473 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
474 * that there are no such buffers: if a large filesystem
475 * operation like a truncate needs to split itself over multiple
476 * transactions, then it may try to do a jbd2_journal_restart() while
477 * there are still BJ_Reserved buffers outstanding. These must
478 * be released cleanly from the current transaction.
480 * In this case, the filesystem must still reserve write access
481 * again before modifying the buffer in the new transaction, but
482 * we do not require it to remember exactly which old buffers it
483 * has reserved. This is consistent with the existing behaviour
484 * that multiple jbd2_journal_get_write_access() calls to the same
485 * buffer are perfectly permissible.
487 while (commit_transaction->t_reserved_list) {
488 jh = commit_transaction->t_reserved_list;
489 JBUFFER_TRACE(jh, "reserved, unused: refile");
491 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
492 * leave undo-committed data.
494 if (jh->b_committed_data) {
495 struct buffer_head *bh = jh2bh(jh);
497 jbd_lock_bh_state(bh);
498 jbd2_free(jh->b_committed_data, bh->b_size);
499 jh->b_committed_data = NULL;
500 jbd_unlock_bh_state(bh);
502 jbd2_journal_refile_buffer(journal, jh);
506 * Now try to drop any written-back buffers from the journal's
507 * checkpoint lists. We do this *before* commit because it potentially
510 spin_lock(&journal->j_list_lock);
511 __jbd2_journal_clean_checkpoint_list(journal, false);
512 spin_unlock(&journal->j_list_lock);
514 jbd_debug(3, "JBD2: commit phase 1\n");
517 * Clear revoked flag to reflect there is no revoked buffers
518 * in the next transaction which is going to be started.
520 jbd2_clear_buffer_revoked_flags(journal);
523 * Switch to a new revoke table.
525 jbd2_journal_switch_revoke_table(journal);
528 * Reserved credits cannot be claimed anymore, free them
530 atomic_sub(atomic_read(&journal->j_reserved_credits),
531 &commit_transaction->t_outstanding_credits);
533 trace_jbd2_commit_flushing(journal, commit_transaction);
534 stats.run.rs_flushing = jiffies;
535 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
536 stats.run.rs_flushing);
538 commit_transaction->t_state = T_FLUSH;
539 journal->j_committing_transaction = commit_transaction;
540 journal->j_running_transaction = NULL;
541 start_time = ktime_get();
542 commit_transaction->t_log_start = journal->j_head;
543 wake_up(&journal->j_wait_transaction_locked);
544 write_unlock(&journal->j_state_lock);
546 jbd_debug(3, "JBD2: commit phase 2a\n");
549 * Now start flushing things to disk, in the order they appear
550 * on the transaction lists. Data blocks go first.
552 err = journal_submit_data_buffers(journal, commit_transaction);
554 jbd2_journal_abort(journal, err);
556 blk_start_plug(&plug);
557 jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
559 jbd_debug(3, "JBD2: commit phase 2b\n");
562 * Way to go: we have now written out all of the data for a
563 * transaction! Now comes the tricky part: we need to write out
564 * metadata. Loop over the transaction's entire buffer list:
566 write_lock(&journal->j_state_lock);
567 commit_transaction->t_state = T_COMMIT;
568 write_unlock(&journal->j_state_lock);
570 trace_jbd2_commit_logging(journal, commit_transaction);
571 stats.run.rs_logging = jiffies;
572 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
573 stats.run.rs_logging);
574 stats.run.rs_blocks =
575 atomic_read(&commit_transaction->t_outstanding_credits);
576 stats.run.rs_blocks_logged = 0;
578 J_ASSERT(commit_transaction->t_nr_buffers <=
579 atomic_read(&commit_transaction->t_outstanding_credits));
584 while (commit_transaction->t_buffers) {
586 /* Find the next buffer to be journaled... */
588 jh = commit_transaction->t_buffers;
590 /* If we're in abort mode, we just un-journal the buffer and
593 if (is_journal_aborted(journal)) {
594 clear_buffer_jbddirty(jh2bh(jh));
595 JBUFFER_TRACE(jh, "journal is aborting: refile");
596 jbd2_buffer_abort_trigger(jh,
598 jh->b_frozen_triggers :
600 jbd2_journal_refile_buffer(journal, jh);
601 /* If that was the last one, we need to clean up
602 * any descriptor buffers which may have been
603 * already allocated, even if we are now
605 if (!commit_transaction->t_buffers)
606 goto start_journal_io;
610 /* Make sure we have a descriptor block in which to
611 record the metadata buffer. */
614 J_ASSERT (bufs == 0);
616 jbd_debug(4, "JBD2: get descriptor\n");
618 descriptor = jbd2_journal_get_descriptor_buffer(journal);
620 jbd2_journal_abort(journal, -EIO);
624 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
625 (unsigned long long)descriptor->b_blocknr,
627 header = (journal_header_t *)descriptor->b_data;
628 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
629 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
630 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
632 tagp = &descriptor->b_data[sizeof(journal_header_t)];
633 space_left = descriptor->b_size -
634 sizeof(journal_header_t);
636 set_buffer_jwrite(descriptor);
637 set_buffer_dirty(descriptor);
638 wbuf[bufs++] = descriptor;
640 /* Record it so that we can wait for IO
642 BUFFER_TRACE(descriptor, "ph3: file as descriptor");
643 jbd2_file_log_bh(&log_bufs, descriptor);
646 /* Where is the buffer to be written? */
648 err = jbd2_journal_next_log_block(journal, &blocknr);
649 /* If the block mapping failed, just abandon the buffer
650 and repeat this loop: we'll fall into the
651 refile-on-abort condition above. */
653 jbd2_journal_abort(journal, err);
658 * start_this_handle() uses t_outstanding_credits to determine
659 * the free space in the log, but this counter is changed
660 * by jbd2_journal_next_log_block() also.
662 atomic_dec(&commit_transaction->t_outstanding_credits);
664 /* Bump b_count to prevent truncate from stumbling over
665 the shadowed buffer! @@@ This can go if we ever get
666 rid of the shadow pairing of buffers. */
667 atomic_inc(&jh2bh(jh)->b_count);
670 * Make a temporary IO buffer with which to write it out
671 * (this will requeue the metadata buffer to BJ_Shadow).
673 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
674 JBUFFER_TRACE(jh, "ph3: write metadata");
675 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
676 jh, &wbuf[bufs], blocknr);
678 jbd2_journal_abort(journal, flags);
681 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
683 /* Record the new block's tag in the current descriptor
688 tag_flag |= JBD2_FLAG_ESCAPE;
690 tag_flag |= JBD2_FLAG_SAME_UUID;
692 tag = (journal_block_tag_t *) tagp;
693 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
694 tag->t_flags = cpu_to_be16(tag_flag);
695 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
696 commit_transaction->t_tid);
698 space_left -= tag_bytes;
702 memcpy (tagp, journal->j_uuid, 16);
708 /* If there's no more to do, or if the descriptor is full,
711 if (bufs == journal->j_wbufsize ||
712 commit_transaction->t_buffers == NULL ||
713 space_left < tag_bytes + 16 + csum_size) {
715 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
717 /* Write an end-of-descriptor marker before
718 submitting the IOs. "tag" still points to
719 the last tag we set up. */
721 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
723 jbd2_descr_block_csum_set(journal, descriptor);
725 for (i = 0; i < bufs; i++) {
726 struct buffer_head *bh = wbuf[i];
730 if (jbd2_has_feature_checksum(journal)) {
732 jbd2_checksum_data(crc32_sum, bh);
736 clear_buffer_dirty(bh);
737 set_buffer_uptodate(bh);
738 bh->b_end_io = journal_end_buffer_io_sync;
739 submit_bh(WRITE_SYNC, bh);
742 stats.run.rs_blocks_logged += bufs;
744 /* Force a new descriptor to be generated next
745 time round the loop. */
751 err = journal_finish_inode_data_buffers(journal, commit_transaction);
754 "JBD2: Detected IO errors while flushing file data "
755 "on %s\n", journal->j_devname);
756 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
757 jbd2_journal_abort(journal, err);
762 * Get current oldest transaction in the log before we issue flush
763 * to the filesystem device. After the flush we can be sure that
764 * blocks of all older transactions are checkpointed to persistent
765 * storage and we will be safe to update journal start in the
766 * superblock with the numbers we get here.
769 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
771 write_lock(&journal->j_state_lock);
773 long freed = first_block - journal->j_tail;
775 if (first_block < journal->j_tail)
776 freed += journal->j_last - journal->j_first;
777 /* Update tail only if we free significant amount of space */
778 if (freed < journal->j_maxlen / 4)
781 J_ASSERT(commit_transaction->t_state == T_COMMIT);
782 commit_transaction->t_state = T_COMMIT_DFLUSH;
783 write_unlock(&journal->j_state_lock);
786 * If the journal is not located on the file system device,
787 * then we must flush the file system device before we issue
790 if (commit_transaction->t_need_data_flush &&
791 (journal->j_fs_dev != journal->j_dev) &&
792 (journal->j_flags & JBD2_BARRIER))
793 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
795 /* Done it all: now write the commit record asynchronously. */
796 if (jbd2_has_feature_async_commit(journal)) {
797 err = journal_submit_commit_record(journal, commit_transaction,
800 __jbd2_journal_abort_hard(journal);
803 blk_finish_plug(&plug);
805 /* Lo and behold: we have just managed to send a transaction to
806 the log. Before we can commit it, wait for the IO so far to
807 complete. Control buffers being written are on the
808 transaction's t_log_list queue, and metadata buffers are on
811 Wait for the buffers in reverse order. That way we are
812 less likely to be woken up until all IOs have completed, and
813 so we incur less scheduling load.
816 jbd_debug(3, "JBD2: commit phase 3\n");
818 while (!list_empty(&io_bufs)) {
819 struct buffer_head *bh = list_entry(io_bufs.prev,
826 if (unlikely(!buffer_uptodate(bh)))
828 jbd2_unfile_log_bh(bh);
831 * The list contains temporary buffer heads created by
832 * jbd2_journal_write_metadata_buffer().
834 BUFFER_TRACE(bh, "dumping temporary bh");
836 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
837 free_buffer_head(bh);
839 /* We also have to refile the corresponding shadowed buffer */
840 jh = commit_transaction->t_shadow_list->b_tprev;
842 clear_buffer_jwrite(bh);
843 J_ASSERT_BH(bh, buffer_jbddirty(bh));
844 J_ASSERT_BH(bh, !buffer_shadow(bh));
846 /* The metadata is now released for reuse, but we need
847 to remember it against this transaction so that when
848 we finally commit, we can do any checkpointing
850 JBUFFER_TRACE(jh, "file as BJ_Forget");
851 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
852 JBUFFER_TRACE(jh, "brelse shadowed buffer");
856 J_ASSERT (commit_transaction->t_shadow_list == NULL);
858 jbd_debug(3, "JBD2: commit phase 4\n");
860 /* Here we wait for the revoke record and descriptor record buffers */
861 while (!list_empty(&log_bufs)) {
862 struct buffer_head *bh;
864 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
868 if (unlikely(!buffer_uptodate(bh)))
871 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
872 clear_buffer_jwrite(bh);
873 jbd2_unfile_log_bh(bh);
874 __brelse(bh); /* One for getblk */
875 /* AKPM: bforget here */
879 jbd2_journal_abort(journal, err);
881 jbd_debug(3, "JBD2: commit phase 5\n");
882 write_lock(&journal->j_state_lock);
883 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
884 commit_transaction->t_state = T_COMMIT_JFLUSH;
885 write_unlock(&journal->j_state_lock);
887 if (!jbd2_has_feature_async_commit(journal)) {
888 err = journal_submit_commit_record(journal, commit_transaction,
891 __jbd2_journal_abort_hard(journal);
894 err = journal_wait_on_commit_record(journal, cbh);
895 if (jbd2_has_feature_async_commit(journal) &&
896 journal->j_flags & JBD2_BARRIER) {
897 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
901 jbd2_journal_abort(journal, err);
904 * Now disk caches for filesystem device are flushed so we are safe to
905 * erase checkpointed transactions from the log by updating journal
909 jbd2_update_log_tail(journal, first_tid, first_block);
911 /* End of a transaction! Finally, we can do checkpoint
912 processing: any buffers committed as a result of this
913 transaction can be removed from any checkpoint list it was on
916 jbd_debug(3, "JBD2: commit phase 6\n");
918 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
919 J_ASSERT(commit_transaction->t_buffers == NULL);
920 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
921 J_ASSERT(commit_transaction->t_shadow_list == NULL);
925 * As there are other places (journal_unmap_buffer()) adding buffers
926 * to this list we have to be careful and hold the j_list_lock.
928 spin_lock(&journal->j_list_lock);
929 while (commit_transaction->t_forget) {
930 transaction_t *cp_transaction;
931 struct buffer_head *bh;
934 jh = commit_transaction->t_forget;
935 spin_unlock(&journal->j_list_lock);
938 * Get a reference so that bh cannot be freed before we are
942 jbd_lock_bh_state(bh);
943 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
946 * If there is undo-protected committed data against
947 * this buffer, then we can remove it now. If it is a
948 * buffer needing such protection, the old frozen_data
949 * field now points to a committed version of the
950 * buffer, so rotate that field to the new committed
953 * Otherwise, we can just throw away the frozen data now.
955 * We also know that the frozen data has already fired
956 * its triggers if they exist, so we can clear that too.
958 if (jh->b_committed_data) {
959 jbd2_free(jh->b_committed_data, bh->b_size);
960 jh->b_committed_data = NULL;
961 if (jh->b_frozen_data) {
962 jh->b_committed_data = jh->b_frozen_data;
963 jh->b_frozen_data = NULL;
964 jh->b_frozen_triggers = NULL;
966 } else if (jh->b_frozen_data) {
967 jbd2_free(jh->b_frozen_data, bh->b_size);
968 jh->b_frozen_data = NULL;
969 jh->b_frozen_triggers = NULL;
972 spin_lock(&journal->j_list_lock);
973 cp_transaction = jh->b_cp_transaction;
974 if (cp_transaction) {
975 JBUFFER_TRACE(jh, "remove from old cp transaction");
976 cp_transaction->t_chp_stats.cs_dropped++;
977 __jbd2_journal_remove_checkpoint(jh);
980 /* Only re-checkpoint the buffer_head if it is marked
981 * dirty. If the buffer was added to the BJ_Forget list
982 * by jbd2_journal_forget, it may no longer be dirty and
983 * there's no point in keeping a checkpoint record for
987 * A buffer which has been freed while still being journaled by
988 * a previous transaction.
990 if (buffer_freed(bh)) {
992 * If the running transaction is the one containing
993 * "add to orphan" operation (b_next_transaction !=
994 * NULL), we have to wait for that transaction to
995 * commit before we can really get rid of the buffer.
996 * So just clear b_modified to not confuse transaction
997 * credit accounting and refile the buffer to
998 * BJ_Forget of the running transaction. If the just
999 * committed transaction contains "add to orphan"
1000 * operation, we can completely invalidate the buffer
1001 * now. We are rather through in that since the
1002 * buffer may be still accessible when blocksize <
1003 * pagesize and it is attached to the last partial
1007 if (!jh->b_next_transaction) {
1008 clear_buffer_freed(bh);
1009 clear_buffer_jbddirty(bh);
1010 clear_buffer_mapped(bh);
1011 clear_buffer_new(bh);
1012 clear_buffer_req(bh);
1017 if (buffer_jbddirty(bh)) {
1018 JBUFFER_TRACE(jh, "add to new checkpointing trans");
1019 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
1020 if (is_journal_aborted(journal))
1021 clear_buffer_jbddirty(bh);
1023 J_ASSERT_BH(bh, !buffer_dirty(bh));
1025 * The buffer on BJ_Forget list and not jbddirty means
1026 * it has been freed by this transaction and hence it
1027 * could not have been reallocated until this
1028 * transaction has committed. *BUT* it could be
1029 * reallocated once we have written all the data to
1030 * disk and before we process the buffer on BJ_Forget
1033 if (!jh->b_next_transaction)
1036 JBUFFER_TRACE(jh, "refile or unfile buffer");
1037 __jbd2_journal_refile_buffer(jh);
1038 jbd_unlock_bh_state(bh);
1040 release_buffer_page(bh); /* Drops bh reference */
1043 cond_resched_lock(&journal->j_list_lock);
1045 spin_unlock(&journal->j_list_lock);
1047 * This is a bit sleazy. We use j_list_lock to protect transition
1048 * of a transaction into T_FINISHED state and calling
1049 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1050 * other checkpointing code processing the transaction...
1052 write_lock(&journal->j_state_lock);
1053 spin_lock(&journal->j_list_lock);
1055 * Now recheck if some buffers did not get attached to the transaction
1056 * while the lock was dropped...
1058 if (commit_transaction->t_forget) {
1059 spin_unlock(&journal->j_list_lock);
1060 write_unlock(&journal->j_state_lock);
1064 /* Add the transaction to the checkpoint list
1065 * __journal_remove_checkpoint() can not destroy transaction
1066 * under us because it is not marked as T_FINISHED yet */
1067 if (journal->j_checkpoint_transactions == NULL) {
1068 journal->j_checkpoint_transactions = commit_transaction;
1069 commit_transaction->t_cpnext = commit_transaction;
1070 commit_transaction->t_cpprev = commit_transaction;
1072 commit_transaction->t_cpnext =
1073 journal->j_checkpoint_transactions;
1074 commit_transaction->t_cpprev =
1075 commit_transaction->t_cpnext->t_cpprev;
1076 commit_transaction->t_cpnext->t_cpprev =
1078 commit_transaction->t_cpprev->t_cpnext =
1081 spin_unlock(&journal->j_list_lock);
1083 /* Done with this transaction! */
1085 jbd_debug(3, "JBD2: commit phase 7\n");
1087 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1089 commit_transaction->t_start = jiffies;
1090 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1091 commit_transaction->t_start);
1094 * File the transaction statistics
1096 stats.ts_tid = commit_transaction->t_tid;
1097 stats.run.rs_handle_count =
1098 atomic_read(&commit_transaction->t_handle_count);
1099 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1100 commit_transaction->t_tid, &stats.run);
1101 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1103 commit_transaction->t_state = T_COMMIT_CALLBACK;
1104 J_ASSERT(commit_transaction == journal->j_committing_transaction);
1105 journal->j_commit_sequence = commit_transaction->t_tid;
1106 journal->j_committing_transaction = NULL;
1107 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1110 * weight the commit time higher than the average time so we don't
1111 * react too strongly to vast changes in the commit time
1113 if (likely(journal->j_average_commit_time))
1114 journal->j_average_commit_time = (commit_time +
1115 journal->j_average_commit_time*3) / 4;
1117 journal->j_average_commit_time = commit_time;
1119 write_unlock(&journal->j_state_lock);
1121 if (journal->j_commit_callback)
1122 journal->j_commit_callback(journal, commit_transaction);
1124 trace_jbd2_end_commit(journal, commit_transaction);
1125 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1126 journal->j_commit_sequence, journal->j_tail_sequence);
1128 write_lock(&journal->j_state_lock);
1129 spin_lock(&journal->j_list_lock);
1130 commit_transaction->t_state = T_FINISHED;
1131 /* Check if the transaction can be dropped now that we are finished */
1132 if (commit_transaction->t_checkpoint_list == NULL &&
1133 commit_transaction->t_checkpoint_io_list == NULL) {
1134 __jbd2_journal_drop_transaction(journal, commit_transaction);
1135 jbd2_journal_free_transaction(commit_transaction);
1137 spin_unlock(&journal->j_list_lock);
1138 write_unlock(&journal->j_state_lock);
1139 wake_up(&journal->j_wait_done_commit);
1142 * Calculate overall stats
1144 spin_lock(&journal->j_history_lock);
1145 journal->j_stats.ts_tid++;
1146 journal->j_stats.ts_requested += stats.ts_requested;
1147 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1148 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1149 journal->j_stats.run.rs_running += stats.run.rs_running;
1150 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1151 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1152 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1153 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1154 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1155 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1156 spin_unlock(&journal->j_history_lock);