From 8b712cd58adfe6aeeb0be4ecc011dc35620719e7 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 7 Jul 2009 17:22:12 -0400 Subject: ocfs2: Fixup orphan scan cleanup after failed mount If the mount fails for any reason, ocfs2_dismount_volume calls ocfs2_orphan_scan_stop. It requires that ocfs2_orphan_scan_init be called to setup the mutex and work queues, but that doesn't happen if the mount has failed and we oops accessing an uninitialized work queue. This patch splits the init and startup of the orphan scan, eliminating the oops. Signed-off-by: Jeff Mahoney Signed-off-by: Joel Becker diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f033760..c48b93a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1954,10 +1954,16 @@ void ocfs2_orphan_scan_init(struct ocfs2_super *osb) os->os_osb = osb; os->os_count = 0; os->os_seqno = 0; - os->os_scantime = CURRENT_TIME; mutex_init(&os->os_lock); INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work); +} +void ocfs2_orphan_scan_start(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_scantime = CURRENT_TIME; if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 5432c7f..81e8abc 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -145,6 +145,7 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, /* Exported only for the journal struct init code in super.c. Do not call. */ void ocfs2_orphan_scan_init(struct ocfs2_super *osb); +void ocfs2_orphan_scan_start(struct ocfs2_super *osb); void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 7efb349..63af2e3 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1182,7 +1182,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) wake_up(&osb->osb_mount_event); /* Start this when the mount is almost sure of being successful */ - ocfs2_orphan_scan_init(osb); + ocfs2_orphan_scan_start(osb); mlog_exit(status); return status; @@ -1981,6 +1981,8 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + ocfs2_orphan_scan_init(osb); + status = ocfs2_recovery_init(osb); if (status) { mlog(ML_ERROR, "Unable to initialize recovery state\n"); -- cgit v0.10.2 From 17ae26b669886efe237b77439e43eb390fceb119 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 7 Jul 2009 15:51:40 +0800 Subject: ocfs2: trivial fix for s/migrate/migration/ in dlmrecovery.c logging in dlmrecovery.c:1121, replace 'migrate' to 'migration' to keep the consistency by comparing to other lines with the similar log info in the same file. Signed-off-by: Jeff Liu Signed-off-by: Joel Becker diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index bcb9260..43e6e32 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", dlm->name, res->lockname.len, res->lockname.name, - orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery", + orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery", send_to); /* send it */ -- cgit v0.10.2 From 812e7a6a43fc34bc8f70c2b80db4ea5997d66ea8 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 10 Jul 2009 13:26:04 +0800 Subject: ocfs2: log the actual return value of ocfs2_file_aio_write() in ocfs2_file_aio_write(), log_exit() could don't log the value which is really returned. this patch fixes it. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 62442e4..a49fa44 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1918,8 +1918,10 @@ out_sems: mutex_unlock(&inode->i_mutex); + if (written) + ret = written; mlog_exit(ret); - return written ? written : ret; + return ret; } static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, -- cgit v0.10.2 From cefcb800fa9536bb6f29485c53e6c82a65b0c022 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Sat, 11 Jul 2009 10:57:27 -0500 Subject: ocfs2: Initialize count in aio_write before generic_write_checks generic_write_checks() expects count to be initialized to the size of the write. Writes to files open with O_DIRECT|O_LARGEFILE write 0 bytes because count is uninitialized. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Joel Becker diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a49fa44..aa501d3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1851,6 +1851,7 @@ relock: if (ret) goto out_dio; + count = ocount; ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); if (ret) -- cgit v0.10.2 From 44d8e4e1e9b941065eb7578669fe51eb65c73e63 Mon Sep 17 00:00:00 2001 From: Subrata Modak Date: Tue, 14 Jul 2009 01:19:31 +0530 Subject: ocfs2: Fix compilation warning for fs/ocfs2/xattr.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc 4.4.1 generates the following build warning on i386: CC [M] fs/ocfs2/xattr.o fs/ocfs2/xattr.c: In function ‘ocfs2_xattr_block_get’: fs/ocfs2/xattr.c:1055: warning: ‘block_off’ may be used uninitialized in this function The following fix is based on a similar approach by David Howells few days back: http://lkml.org/lkml/2009/7/9/109, Signed-off-by: Subrata Modak, Signed-off-by: Joel Becker diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ba320e2..d1a27cd 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode, struct ocfs2_xattr_block *xb; struct ocfs2_xattr_value_root *xv; size_t size; - int ret = -ENODATA, name_offset, name_len, block_off, i; + int ret = -ENODATA, name_offset, name_len, i; + int uninitialized_var(block_off); xs->bucket = ocfs2_xattr_bucket_new(inode); if (!xs->bucket) { -- cgit v0.10.2 From cbfa9639aea5007313b75ec43b689d5f9e0c037d Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Mon, 13 Jul 2009 11:38:23 +0800 Subject: ocfs2: Fix error return in ocfs2_write_cluster() A typo caused ocfs2_write_cluster() to return 0 in some error cases. Fix it. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b2c52b3..25aced3 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1301,7 +1301,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, if (tmpret) { mlog_errno(tmpret); if (ret == 0) - tmpret = ret; + ret = tmpret; } } -- cgit v0.10.2 From 1f4cea3790bf44137192f441ee84af256e3bf809 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Mon, 13 Jul 2009 11:38:58 +0800 Subject: ocfs2: Fail ocfs2_get_block() immediately when a block needs allocation ocfs2_get_block() does no allocation. Hole filling for writes should have happened farther up in the call chain. We detect this case and print an error, but we then continue with the function. We should be exiting immediately. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 25aced3..e511df1 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, (unsigned long long)OCFS2_I(inode)->ip_blkno); mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); dump_stack(); + goto bail; } past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); -- cgit v0.10.2 From 3c5e10683e684ef45614c9071847e48f633d9806 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 21 Jul 2009 15:42:05 +0800 Subject: ocfs2: Add extra credits and access the modified bh in update_edge_lengths. In normal tree rotation left process, we will never touch the tree branch above subtree_index and ocfs2_extend_rotate_transaction doesn't reserve the credits for them either. But when we want to delete the rightmost extent block, we have to update the rightmost records for all the rightmost branch(See ocfs2_update_edge_lengths), so we have to allocate extra credits for them. What's more, we have to access them also. Signed-off-by: Tao Ma Signed-off-by: Joel Becker diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9edcde4..11085af 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2476,15 +2476,37 @@ out_ret_path: return ret; } -static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, - struct ocfs2_path *path) +static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, + int subtree_index, struct ocfs2_path *path) { - int i, idx; + int i, idx, ret; struct ocfs2_extent_rec *rec; struct ocfs2_extent_list *el; struct ocfs2_extent_block *eb; u32 range; + /* + * In normal tree rotation process, we will never touch the + * tree branch above subtree_index and ocfs2_extend_rotate_transaction + * doesn't reserve the credits for them either. + * + * But we do have a special case here which will update the rightmost + * records for all the bh in the path. + * So we have to allocate extra credits and access them. + */ + ret = ocfs2_extend_trans(handle, + handle->h_buffer_credits + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + /* Path should always be rightmost. */ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; BUG_ON(eb->h_next_leaf_blk != 0ULL); @@ -2505,6 +2527,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, path->p_node[i].bh); } +out: + return ret; } static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, @@ -2717,7 +2741,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, if (del_right_subtree) { ocfs2_unlink_subtree(inode, handle, left_path, right_path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); @@ -3034,7 +3063,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, ocfs2_unlink_subtree(inode, handle, left_path, path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); -- cgit v0.10.2 From f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 20 Jul 2009 12:12:36 +0200 Subject: ocfs2: Fix deadlock on umount In commit ea455f8ab68338ba69f5d3362b342c115bea8e13, we moved the dentry lock put process into ocfs2_wq. This causes problems during umount because ocfs2_wq can drop references to inodes while they are being invalidated by invalidate_inodes() causing all sorts of nasty things (invalidate_inodes() ending in an infinite loop, "Busy inodes after umount" messages etc.). We fix the problem by stopping ocfs2_wq from doing any further releasing of inode references on the superblock being unmounted, wait until it finishes the current round of releasing and finally cleaning up all the references in dentry_lock_list from ocfs2_put_super(). The issue was tracked down by Tao Ma . Signed-off-by: Jan Kara Signed-off-by: Joel Becker diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index b574431..2f28b7d 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -310,22 +310,19 @@ out_attach: return ret; } -static DEFINE_SPINLOCK(dentry_list_lock); +DEFINE_SPINLOCK(dentry_list_lock); /* We limit the number of dentry locks to drop in one go. We have * this limit so that we don't starve other users of ocfs2_wq. */ #define DL_INODE_DROP_COUNT 64 /* Drop inode references from dentry locks */ -void ocfs2_drop_dl_inodes(struct work_struct *work) +static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) { - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); struct ocfs2_dentry_lock *dl; - int drop_count = DL_INODE_DROP_COUNT; spin_lock(&dentry_list_lock); - while (osb->dentry_lock_list && drop_count--) { + while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { dl = osb->dentry_lock_list; osb->dentry_lock_list = dl->dl_next; spin_unlock(&dentry_list_lock); @@ -333,11 +330,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work) kfree(dl); spin_lock(&dentry_list_lock); } - if (osb->dentry_lock_list) + spin_unlock(&dentry_list_lock); +} + +void ocfs2_drop_dl_inodes(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dentry_lock_work); + + __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); + /* + * Don't queue dropping if umount is in progress. We flush the + * list in ocfs2_dismount_volume + */ + spin_lock(&dentry_list_lock); + if (osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) queue_work(ocfs2_wq, &osb->dentry_lock_work); spin_unlock(&dentry_list_lock); } +/* Flush the whole work queue */ +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) +{ + __ocfs2_drop_dl_inodes(osb, -1); +} + /* * ocfs2_dentry_iput() and friends. * @@ -368,7 +386,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, /* We leave dropping of inode reference to ocfs2_wq as that can * possibly lead to inode deletion which gets tricky */ spin_lock(&dentry_list_lock); - if (!osb->dentry_lock_list) + if (!osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) queue_work(ocfs2_wq, &osb->dentry_lock_work); dl->dl_next = osb->dentry_lock_list; osb->dentry_lock_list = dl; diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index faa12e7..f5dd178 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -49,10 +49,13 @@ struct ocfs2_dentry_lock { int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, u64 parent_blkno); +extern spinlock_t dentry_list_lock; + void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); void ocfs2_drop_dl_inodes(struct work_struct *work); +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c9345eb..39e1d5a 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -224,10 +224,12 @@ enum ocfs2_mount_options OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ }; -#define OCFS2_OSB_SOFT_RO 0x0001 -#define OCFS2_OSB_HARD_RO 0x0002 -#define OCFS2_OSB_ERROR_FS 0x0004 -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 + +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; struct ocfs2_slot_info; @@ -490,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } + +static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + unsigned long ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & flag; + spin_unlock(&osb->osb_lock); + return ret; +} + static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 63af2e3..f289387 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1213,14 +1213,27 @@ static int ocfs2_get_sb(struct file_system_type *fs_type, mnt); } +static void ocfs2_kill_sb(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + /* Prevent further queueing of inode drop events */ + spin_lock(&dentry_list_lock); + ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); + spin_unlock(&dentry_list_lock); + /* Wait for work to finish and/or remove it */ + cancel_work_sync(&osb->dentry_lock_work); + + kill_block_super(sb); +} + static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", .get_sb = ocfs2_get_sb, /* is this called when we mount * the fs? */ - .kill_sb = kill_block_super, /* set to the generic one - * right now, but do we - * need to change that? */ + .kill_sb = ocfs2_kill_sb, + .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; @@ -1819,6 +1832,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) debugfs_remove(osb->osb_ctxt); + /* + * Flush inode dropping work queue so that deletes are + * performed while the filesystem is still working + */ + ocfs2_drop_all_dl_inodes(osb); + /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); -- cgit v0.10.2 From 82e12644cf5227dab15201fbcaf0ca6330ebd70f Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 23 Jul 2009 08:12:58 +0800 Subject: ocfs2: Use ocfs2_rec_clusters in ocfs2_adjust_adjacent_records. In ocfs2_adjust_adjacent_records, we will adjust adjacent records according to the extent_list in the lower level. But actually the lower level tree will either be a leaf or a branch. If we only use ocfs2_is_empty_extent we will meet with some problem if the lower tree is a branch (tree_depth > 1). So use !ocfs2_rec_clusters instead. And actually only the leaf record can have holes. So add a BUG_ON for non-leaf branch. Signed-off-by: Tao Ma Signed-off-by: Joel Becker diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 11085af..f9a3e89 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1914,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, * immediately to their right. */ left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); - if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { + if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) { + BUG_ON(right_child_el->l_tree_depth); BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); } -- cgit v0.10.2 From b57ac2c43e66cb4aa76c9d2d0e9e0e04f19cc5a6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:15 +0200 Subject: ocfs2: Make global quota files blocksize aligned Change i_size of global quota files so that it always remains aligned to block size. This is mainly because the end of quota block may contain checksum (if checksumming is enabled) and it's a bit awkward for it to be "outside" of quota file (and it makes life harder for ocfs2-tools). Signed-off-by: Jan Kara Signed-off-by: Joel Becker diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index edfa60c..a66cb82 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -211,14 +211,17 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); if (gqinode->i_size < off + len) { + loff_t rounded_end = + ocfs2_align_bytes_to_blocks(sb, off + len); + down_write(&OCFS2_I(gqinode)->ip_alloc_sem); - err = ocfs2_extend_no_holes(gqinode, off + len, off); + err = ocfs2_extend_no_holes(gqinode, rounded_end, off); up_write(&OCFS2_I(gqinode)->ip_alloc_sem); if (err < 0) goto out; err = ocfs2_simple_size_update(gqinode, oinfo->dqi_gqi_bh, - off + len); + rounded_end); if (err < 0) goto out; new = 1; -- cgit v0.10.2 From 4b3fa1904c0d192461ebba692e4940a334b74353 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:16 +0200 Subject: ocfs2: Mark buffer uptodate before calling ocfs2_journal_access_dq() In a code path extending local quota files we marked new header buffer uptodate only after calling ocfs2_journal_access_dq() which triggers a bug. Fix it and also call ocfs2 variant of the function marking buffer uptodate. Signed-off-by: Jan Kara Signed-off-by: Joel Becker diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 5a460fa..b624f9b 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -20,6 +20,7 @@ #include "sysfile.h" #include "dlmglue.h" #include "quota.h" +#include "uptodate.h" /* Number of local quota structures per block */ static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) @@ -979,6 +980,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out; } + ocfs2_set_new_buffer_uptodate(lqinode, bh); + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; handle = ocfs2_start_trans(OCFS2_SB(sb), 2); @@ -999,7 +1002,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( memset(dchunk->dqc_bitmap, 0, sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - OCFS2_QBLK_RESERVED_SPACE); - set_buffer_uptodate(bh); unlock_buffer(bh); status = ocfs2_journal_dirty(handle, bh); if (status < 0) { -- cgit v0.10.2 From 0e7f387bf386c99e8ee322649fe4fc23b9cccc6c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:17 +0200 Subject: ocfs2: Initialize blocks allocated to local quota file When we extend local quota file, we should initialize data in newly allocated block. Firstly because on recovery we could parse bogus data, secondly so that block checksums are properly computed. Signed-off-by: Jan Kara Signed-off-by: Joel Becker diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index b624f9b..9d2993d 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -941,7 +941,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( struct ocfs2_local_disk_chunk *dchunk; int status; handle_t *handle; - struct buffer_head *bh = NULL; + struct buffer_head *bh = NULL, *dbh = NULL; u64 p_blkno; /* We are protected by dqio_sem so no locking needed */ @@ -965,34 +965,32 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out; } + handle = ocfs2_start_trans(OCFS2_SB(sb), 3); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + /* Initialize chunk header */ down_read(&OCFS2_I(lqinode)->ip_alloc_sem); status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, &p_blkno, NULL, NULL); up_read(&OCFS2_I(lqinode)->ip_alloc_sem); if (status < 0) { mlog_errno(status); - goto out; + goto out_trans; } bh = sb_getblk(sb, p_blkno); if (!bh) { status = -ENOMEM; mlog_errno(status); - goto out; + goto out_trans; } - ocfs2_set_new_buffer_uptodate(lqinode, bh); - dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; - - handle = ocfs2_start_trans(OCFS2_SB(sb), 2); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out; - } - + ocfs2_set_new_buffer_uptodate(lqinode, bh); status = ocfs2_journal_access_dq(handle, lqinode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto out_trans; @@ -1009,6 +1007,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( goto out_trans; } + /* Initialize new block with structures */ + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + dbh = sb_getblk(sb, p_blkno); + if (!dbh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + ocfs2_set_new_buffer_uptodate(lqinode, dbh); + status = ocfs2_journal_access_dq(handle, lqinode, dbh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(dbh); + memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); + unlock_buffer(dbh); + status = ocfs2_journal_dirty(handle, dbh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + /* Update local quotafile info */ oinfo->dqi_blocks += 2; oinfo->dqi_chunks++; status = ocfs2_local_write_info(sb, type); @@ -1033,6 +1063,7 @@ out_trans: ocfs2_commit_trans(OCFS2_SB(sb), handle); out: brelse(bh); + brelse(dbh); kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); return ERR_PTR(status); } @@ -1050,6 +1081,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( struct ocfs2_local_disk_chunk *dchunk; int epb = ol_quota_entries_per_block(sb); unsigned int chunk_blocks; + struct buffer_head *bh; + u64 p_blkno; int status; handle_t *handle; @@ -1077,12 +1110,46 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( mlog_errno(status); goto out; } - handle = ocfs2_start_trans(OCFS2_SB(sb), 2); + + /* Get buffer from the just added block */ + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + ocfs2_set_new_buffer_uptodate(lqinode, bh); + + handle = ocfs2_start_trans(OCFS2_SB(sb), 3); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto out; } + /* Zero created block */ + status = ocfs2_journal_access_dq(handle, lqinode, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + unlock_buffer(bh); + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + /* Update chunk header */ status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { @@ -1099,6 +1166,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( mlog_errno(status); goto out_trans; } + /* Update file header */ oinfo->dqi_blocks++; status = ocfs2_local_write_info(sb, type); if (status < 0) { -- cgit v0.10.2 From 7669f54c55df225cbb2db939a6c9f111445ffc1c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:18 +0200 Subject: ocfs2: Zero out padding of on disk dquot structure Padding fields of on-disk dquot structure were not zeroed. Zero them so that it's easier to use them later. Signed-off-by: Jan Kara Signed-off-by: Joel Becker diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index a66cb82..7248db6 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -69,6 +69,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) d->dqb_curspace = cpu_to_le64(m->dqb_curspace); d->dqb_btime = cpu_to_le64(m->dqb_btime); d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_pad1 = d->dqb_pad2 = 0; } static int ocfs2_global_is_id(void *dp, struct dquot *dquot) -- cgit v0.10.2 From 1c1d9793ff6720531c0125a28d321f283716e32f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:19 +0200 Subject: ocfs2: Fix initialization of blockcheck stats We just set blockcheck stats to zeros but we should also properly initialize the spinlock there. Signed-off-by: Jan Kara Signed-off-by: Joel Becker diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f289387..b0ee0fd 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -777,6 +777,7 @@ static int ocfs2_sb_probe(struct super_block *sb, } di = (struct ocfs2_dinode *) (*bh)->b_data; memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); + spin_lock_init(&stats->b_lock); status = ocfs2_verify_volume(di, *bh, blksize, stats); if (status >= 0) goto bail; -- cgit v0.10.2 From 4539f1df25bcd0fdf0d8a5e2c92de6bece83c7a0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:20 +0200 Subject: ocfs2: Remove syncjiff field from quota info syncjiff is just a converted value of syncms. Some places which are updating syncms forgot to update syncjiff as well. Since the conversion is just a simple division / multiplication and it does not happen frequently, just remove the syncjiff field to avoid forgotten conversions. Signed-off-by: Jan Kara Signed-off-by: Joel Becker diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 7365e2e..3fb96fcd 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo { unsigned int dqi_chunks; /* Number of chunks in local quota file */ unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ unsigned int dqi_syncms; /* How often should we sync with other nodes */ - unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */ struct list_head dqi_chunk; /* List of chunks */ struct inode *dqi_gqinode; /* Global quota file inode */ struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 7248db6..dab4546 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -346,7 +346,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type) info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); - oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms); oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); @@ -356,7 +355,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type) oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - oinfo->dqi_syncjiff); + msecs_to_jiffies(oinfo->dqi_syncms)); out_err: mlog_exit(status); @@ -611,7 +610,7 @@ static void qsync_work_fn(struct work_struct *work) dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - oinfo->dqi_syncjiff); + msecs_to_jiffies(oinfo->dqi_syncms)); } /* -- cgit v0.10.2 From 0584974a77796581eb3a64b6c5005edac4a95128 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:21 +0200 Subject: ocfs2: Define credit counts for quota operations Numbers of needed credits for some quota operations were written as raw numbers. Create appropriate defines instead. Signed-off-by: Jan Kara Signed-off-by: Joel Becker diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 81e8abc..4a4d3b5 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -330,20 +330,27 @@ int ocfs2_journal_dirty(handle_t *handle, /* extended attribute block update */ #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 +/* Update of a single quota block */ +#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1 + /* global quotafile inode update, data block */ -#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) +#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) +#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS /* * The two writes below can accidentally see global info dirty due * to set_info() quotactl so make them prepared for the writes. */ /* quota data block, global info */ /* Write to local quota file */ -#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1) +#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) /* global quota data block, local quota data block, global quota inode, * global quota info */ -#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3) +#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) static inline int ocfs2_quota_trans_credits(struct super_block *sb) { diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index dab4546..cc8785c 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -648,10 +648,15 @@ int ocfs2_calc_qdel_credits(struct super_block *sb, int type) return 0; oinfo = sb_dqinfo(sb, type)->dqi_priv; - /* We modify tree, leaf block, global info, local chunk header, - * global and local inode */ - return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 + - 2 * OCFS2_INODE_UPDATE_CREDITS; + /* + * We modify tree, leaf block, global info, local chunk header, + * global and local inode; OCFS2_QINFO_WRITE_CREDITS already + * accounts for inode update + */ + return oinfo->dqi_gi.dqi_qtree_depth + + OCFS2_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + + OCFS2_INODE_UPDATE_CREDITS; } static int ocfs2_release_dquot(struct dquot *dquot) @@ -701,7 +706,10 @@ int ocfs2_calc_qinit_credits(struct super_block *sb, int type) * global file we can modify info, tree and leaf block */ return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + - 3 + oinfo->dqi_gi.dqi_qtree_depth + 2; + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + + oinfo->dqi_gi.dqi_qtree_depth + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; } static int ocfs2_acquire_dquot(struct dquot *dquot) diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 9d2993d..bdb09cb 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -101,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, handle_t *handle; int status; - handle = ocfs2_start_trans(OCFS2_SB(sb), 1); + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -611,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, goto out_bh; /* Mark quota file as clean if we are recovering quota file of * some other node. */ - handle = ocfs2_start_trans(osb, 1); + handle = ocfs2_start_trans(osb, + OCFS2_LOCAL_QINFO_WRITE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -965,7 +967,10 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out; } - handle = ocfs2_start_trans(OCFS2_SB(sb), 3); + /* Local quota info and two new blocks we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -1128,7 +1133,10 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( } ocfs2_set_new_buffer_uptodate(lqinode, bh); - handle = ocfs2_start_trans(OCFS2_SB(sb), 3); + /* Local quota info, chunk header and the new block we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); -- cgit v0.10.2 From 7194e6f9c083e87171ddfc8b746f05e007f58132 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 24 Jul 2009 17:05:00 +0300 Subject: UBI: fix double free on error path If we fail in 'ubi_eba_init_scan()', we free 'ubi->volumes[i]->eba_tbl' in there, but also later free it in 'free_internal_volumes()'. Fix this by assigning NULL to 'ubi->volumes[i]->eba_tbl' after it is freed. Signed-off-by: Adrian Hunter Signed-off-by: Artem Bityutskiy diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index 0f2034c..e4d9ef0 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -1254,6 +1254,7 @@ out_free: if (!ubi->volumes[i]) continue; kfree(ubi->volumes[i]->eba_tbl); + ubi->volumes[i]->eba_tbl = NULL; } return err; } -- cgit v0.10.2 From 32bc4820287a1a03982979515949e8ea56eac641 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 24 Jul 2009 19:16:04 +0300 Subject: UBI: compatible fallback in absense of sequence numbers Fall back onto thinking everything's OK if either of the sequence numbers we are asked to compare is zero, which is what was used before sequence numbers were introduced. [ Artem: modified the patch to be applicable to upstream UBI, added big comment ] Signed-off-by: Adrian Hunter Signed-off-by: Phil Carmody Signed-off-by: Artem Bityutskiy diff --git a/drivers/mtd/ubi/scan.c b/drivers/mtd/ubi/scan.c index a423131..b847745 100644 --- a/drivers/mtd/ubi/scan.c +++ b/drivers/mtd/ubi/scan.c @@ -781,11 +781,22 @@ static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, return -EINVAL; } + /* + * Make sure that all PEBs have the same image sequence number. + * This allows us to detect situations when users flash UBI + * images incorrectly, so that the flash has the new UBI image + * and leftovers from the old one. This feature was added + * relatively recently, and the sequence number was always + * zero, because old UBI implementations always set it to zero. + * For this reasons, we do not panic if some PEBs have zero + * sequence number, while other PEBs have non-zero sequence + * number. + */ image_seq = be32_to_cpu(ech->image_seq); if (!si->image_seq_set) { ubi->image_seq = image_seq; si->image_seq_set = 1; - } else if (ubi->image_seq != image_seq) { + } else if (ubi->image_seq && ubi->image_seq != image_seq) { ubi_err("bad image sequence number %d in PEB %d, " "expected %d", image_seq, pnum, ubi->image_seq); ubi_dbg_dump_ec_hdr(ech); -- cgit v0.10.2 From e9956fae7dbce6ac770e7a00436c496ddbbd3215 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 30 Jul 2009 16:07:10 +0800 Subject: ocfs2/quota: Release lock for error in ocfs2_quota_write. ocfs2_quota_write needs to release the lock if it fails to read quota block. So use "goto out" instead of "return err". Signed-off-by: Tao Ma Acked-by: Jan Kara Signed-off-by: Joel Becker diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index cc8785c..d604a6a 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -238,7 +238,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, } if (err) { mlog_errno(err); - return err; + goto out; } lock_buffer(bh); if (new) -- cgit v0.10.2 From ab57a40827d99e2d8e59066a56b93bf6c844c916 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 31 Jul 2009 14:28:02 -0500 Subject: ocfs2: Remove redundant BUG_ON in __dlm_queue_ast() We BUG_ON() the same thing twice. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Joel Becker diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index d07ddbe..81eff8e 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) lock->ast_pending, lock->ml.type); BUG(); } - BUG_ON(!list_empty(&lock->ast_list)); if (lock->ast_pending) mlog(0, "lock has an ast getting flushed right now\n"); -- cgit v0.10.2 From 7d5b005652bc5ae3e1e0efc53fd0e25a643ec506 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Tue, 4 Aug 2009 15:34:22 -0700 Subject: x86: Fix VMI && stack protector With CONFIG_STACK_PROTECTOR turned on, VMI doesn't boot with more than one processor. The problem is with the gs value not being initialized correctly when registering the secondary processor for VMI's case. The patch below initializes the gs value for the AP to __KERNEL_STACK_CANARY. Without this the secondary processor keeps on taking a GP on every gs access. Signed-off-by: Alok N Kataria Cc: # for v2.6.30.x LKML-Reference: <1249425262.18955.40.camel@ank32.eng.vmware.com> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b263423..95a7289 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, ap.ds = __USER_DS; ap.es = __USER_DS; ap.fs = __KERNEL_PERCPU; - ap.gs = 0; + ap.gs = __KERNEL_STACK_CANARY; ap.eflags = 0; -- cgit v0.10.2 From e125e7b6944898831b56739a5448e705578bf7e2 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 2 Jul 2009 21:45:47 +0200 Subject: KVM: Fix KVM_GET_MSR_INDEX_LIST So far, KVM copied the emulated_msrs (only MSR_IA32_MISC_ENABLE) to a wrong address in user space due to broken pointer arithmetic. This caused subtle corruption up there (missing MSR_IA32_MISC_ENABLE had probably no practical relevance). Moreover, the size check for the user-provided kvm_msr_list forgot about emulated MSRs. Cc: stable@kernel.org Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe5474a..7bc3114 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1079,14 +1079,13 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) goto out; r = -E2BIG; - if (n < num_msrs_to_save) + if (n < msr_list.nmsrs) goto out; r = -EFAULT; if (copy_to_user(user_msr_list->indices, &msrs_to_save, num_msrs_to_save * sizeof(u32))) goto out; - if (copy_to_user(user_msr_list->indices - + num_msrs_to_save * sizeof(u32), + if (copy_to_user(user_msr_list->indices + num_msrs_to_save, &emulated_msrs, ARRAY_SIZE(emulated_msrs) * sizeof(u32))) goto out; -- cgit v0.10.2 From 0ff77873b1318fc2d77a85e70690d3cd6cafbd41 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 2 Jul 2009 20:02:15 -0300 Subject: KVM: PIT: fix kpit_elapsed division by zero Fix division by zero triggered by latch count command on uninitialized counter. Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4d6f0d2..21f68e0 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -104,6 +104,9 @@ static s64 __kpit_elapsed(struct kvm *kvm) ktime_t remaining; struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + if (!ps->pit_timer.period) + return 0; + /* * The Counter does not stop when it reaches zero. In * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to -- cgit v0.10.2 From d6289b9365c3f622a8cfe62c4fb054bb70b5061a Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 22 Jun 2009 15:27:56 -0300 Subject: KVM: x86: verify MTRR/PAT validity Do not allow invalid memory types in MTRR/PAT (generating a #GP otherwise). Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7bc3114..3d45290 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -704,11 +704,48 @@ static bool msr_mtrr_valid(unsigned msr) return false; } +static bool valid_pat_type(unsigned t) +{ + return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ +} + +static bool valid_mtrr_type(unsigned t) +{ + return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ +} + +static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int i; + + if (!msr_mtrr_valid(msr)) + return false; + + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return false; + return valid_mtrr_type(data & 0xff); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8 ; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } + + /* variable MTRRs */ + return valid_mtrr_type(data & 0xff); +} + static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - if (!msr_mtrr_valid(msr)) + if (!mtrr_valid(vcpu, msr, data)) return 1; if (msr == MSR_MTRRdefType) { -- cgit v0.10.2 From 4b656b1202498184a0ecef86b3b89ff613b9c6ab Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 21 Jul 2009 12:47:45 -0300 Subject: KVM: SVM: force new asid on vcpu migration If a migrated vcpu matches the asid_generation value of the target pcpu, there will be no TLB flush via TLB_CONTROL_FLUSH_ALL_ASID. The check for vcpu.cpu in pre_svm_run is meaningless since svm_vcpu_load already updated it on schedule in. Such vcpu will VMRUN with stale TLB entries. Based on original patch from Joerg Roedel (http://patchwork.kernel.org/patch/10021/) Signed-off-by: Marcelo Tosatti Acked-by: Joerg Roedel Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 71510e0..b1f658a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -711,6 +711,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; kvm_migrate_timers(vcpu); + svm->asid_generation = 0; } for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) @@ -1031,7 +1032,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; } - svm->vcpu.cpu = svm_data->cpu; svm->asid_generation = svm_data->asid_generation; svm->vmcb->control.asid = svm_data->next_asid++; } @@ -2300,8 +2300,8 @@ static void pre_svm_run(struct vcpu_svm *svm) struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; - if (svm->vcpu.cpu != cpu || - svm->asid_generation != svm_data->asid_generation) + /* FIXME: handle wraparound of asid_generation */ + if (svm->asid_generation != svm_data->asid_generation) new_asid(svm, svm_data); } -- cgit v0.10.2 From 025dbbf36a7680bffe54d9dcbf0a8bc01a7cbd10 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 22 Jul 2009 13:05:49 -0300 Subject: KVM: MMU: handle n_free_mmu_pages > n_alloc_mmu_pages in kvm_mmu_change_mmu_pages kvm_mmu_change_mmu_pages mishandles the case where n_alloc_mmu_pages is smaller then n_free_mmu_pages, by not checking if the result of the subtraction is negative. Its a valid condition which can happen if a large number of pages has been recently freed. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7030b5f..49a10d0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1407,24 +1407,25 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { + int used_pages; + + used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; + used_pages = max(0, used_pages); + /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ - if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > - kvm_nr_mmu_pages) { - int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; - - while (n_used_mmu_pages > kvm_nr_mmu_pages) { + if (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_zap_page(kvm, page); - n_used_mmu_pages--; + used_pages--; } kvm->arch.n_free_mmu_pages = 0; } -- cgit v0.10.2 From 34f0c1ad27a74bd5eb0f99ea43ab6a4658d6419d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 22 Jul 2009 23:53:26 +0200 Subject: KVM: VMX: Fix locking order in handle_invalid_guest_state Release and re-acquire preemption and IRQ lock in the same order as vcpu_enter_guest does. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 356a0ce..6bf58c0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3157,8 +3157,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; - preempt_enable(); local_irq_enable(); + preempt_enable(); while (!guest_state_valid(vcpu)) { err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); @@ -3177,8 +3177,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, schedule(); } - local_irq_disable(); preempt_disable(); + local_irq_disable(); vmx->invalid_state_emulation_result = err; } -- cgit v0.10.2 From 263799a3616242201e20fd2025fe84047b1379b1 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 21 Jul 2009 10:43:07 +0200 Subject: KVM: VMX: Fix locking imbalance on emulation failure We have to disable preemption and IRQs on every exit from handle_invalid_guest_state, otherwise we generate at least a preempt_disable imbalance. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6bf58c0..29f9129 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3168,7 +3168,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, if (err != EMULATE_DONE) { kvm_report_emulation_failure(vcpu, "emulation failure"); - return; + break; } if (signal_pending(current)) -- cgit v0.10.2 From d3bc2f91b4761a8d9f96bea167fef2f8c00dea54 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 16 Jul 2009 17:17:37 +0200 Subject: KVM: s390: fix wait_queue handling There are two waitqueues in kvm for wait handling: vcpu->wq for virt/kvm/kvm_main.c and vpcu->arch.local_int.wq for the s390 specific wait code. the wait handling in kvm_s390_handle_wait was broken by using different wait_queues for add_wait queue and remove_wait_queue. There are two options to fix the problem: o move all the s390 specific code to vcpu->wq and remove vcpu->arch.local_int.wq o move all the s390 specific code to vcpu->arch.local_int.wq This patch chooses the 2nd variant for two reasons: o s390 does not use kvm_vcpu_block but implements its own enabled wait handling. Having a separate wait_queue make it clear, that our wait mechanism is different o the patch is much smaller Report-by: Julia Lawall Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f04f530..4d61341 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -386,7 +386,7 @@ no_timer: } __unset_cpu_idle(vcpu); __set_current_state(TASK_RUNNING); - remove_wait_queue(&vcpu->wq, &wait); + remove_wait_queue(&vcpu->arch.local_int.wq, &wait); spin_unlock_bh(&vcpu->arch.local_int.lock); spin_unlock(&vcpu->arch.local_int.float_int->lock); hrtimer_try_to_cancel(&vcpu->arch.ckc_timer); -- cgit v0.10.2 From 5116d8f6b977970ebefc1932c0f313163a6ec91f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 26 Jul 2009 17:10:01 +0300 Subject: KVM: fix ack not being delivered when msi present kvm_notify_acked_irq does not check irq type, so that it sometimes interprets msi vector as irq. As a result, ack notifiers are not called, which typially hangs the guest. The fix is to track and check irq type. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 16713dc..3060bdc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -110,6 +110,7 @@ struct kvm_memory_slot { struct kvm_kernel_irq_routing_entry { u32 gsi; + u32 type; int (*set)(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int level); union { diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index a8bd466..ddc17f0 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -160,7 +160,8 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) unsigned gsi = pin; list_for_each_entry(e, &kvm->irq_routing, link) - if (e->irqchip.irqchip == irqchip && + if (e->type == KVM_IRQ_ROUTING_IRQCHIP && + e->irqchip.irqchip == irqchip && e->irqchip.pin == pin) { gsi = e->gsi; break; @@ -259,6 +260,7 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, int delta; e->gsi = ue->gsi; + e->type = ue->type; switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: delta = 0; -- cgit v0.10.2 From c428dcc9b9f967945992a2f8529e8c50a31d7913 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 17 Jun 2009 15:04:19 +1000 Subject: KVM: Make KVM_HPAGES_PER_HPAGE unsigned long to avoid build error on powerpc Eliminates this compiler warning: arch/powerpc/kvm/../../../virt/kvm/kvm_main.c:1178: error: integer overflow in expression Signed-off-by: Stephen Rothwell Signed-off-by: Avi Kivity diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index dfdf13c..fddc3ed 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -34,7 +34,7 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 /* We don't currently support large pages. */ -#define KVM_PAGES_PER_HPAGE (1<<31) +#define KVM_PAGES_PER_HPAGE (1UL << 31) struct kvm; struct kvm_run; -- cgit v0.10.2 From e9cbde8c158629cc96a26b2323c4a243536c1951 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 6 Jul 2009 12:49:39 +0300 Subject: KVM: ia64: fix build failures due to ia64/unsigned long mismatches Signed-off-by: Avi Kivity diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c index 21f63ff..9bf55af 100644 --- a/arch/ia64/kvm/mmio.c +++ b/arch/ia64/kvm/mmio.c @@ -247,7 +247,8 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma) vcpu_get_fpreg(vcpu, inst.M9.f2, &v); /* Write high word. FIXME: this is a kludge! */ v.u.bits[1] &= 0x3ffff; - mmio_access(vcpu, padr + 8, &v.u.bits[1], 8, ma, IOREQ_WRITE); + mmio_access(vcpu, padr + 8, (u64 *)&v.u.bits[1], 8, + ma, IOREQ_WRITE); data = v.u.bits[0]; size = 3; } else if (inst.M10.major == 7 && inst.M10.x6 == 0x3B) { @@ -265,7 +266,8 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma) /* Write high word.FIXME: this is a kludge! */ v.u.bits[1] &= 0x3ffff; - mmio_access(vcpu, padr + 8, &v.u.bits[1], 8, ma, IOREQ_WRITE); + mmio_access(vcpu, padr + 8, (u64 *)&v.u.bits[1], + 8, ma, IOREQ_WRITE); data = v.u.bits[0]; size = 3; } else if (inst.M10.major == 7 && inst.M10.x6 == 0x31) { diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c index 46b02cb..cc406d0 100644 --- a/arch/ia64/kvm/vcpu.c +++ b/arch/ia64/kvm/vcpu.c @@ -461,7 +461,7 @@ void setreg(unsigned long regnum, unsigned long val, u64 vcpu_get_gr(struct kvm_vcpu *vcpu, unsigned long reg) { struct kvm_pt_regs *regs = vcpu_regs(vcpu); - u64 val; + unsigned long val; if (!reg) return 0; @@ -469,7 +469,7 @@ u64 vcpu_get_gr(struct kvm_vcpu *vcpu, unsigned long reg) return val; } -void vcpu_set_gr(struct kvm_vcpu *vcpu, u64 reg, u64 value, int nat) +void vcpu_set_gr(struct kvm_vcpu *vcpu, unsigned long reg, u64 value, int nat) { struct kvm_pt_regs *regs = vcpu_regs(vcpu); long sof = (regs->cr_ifs) & 0x7f; @@ -1072,7 +1072,7 @@ void kvm_ttag(struct kvm_vcpu *vcpu, INST64 inst) vcpu_set_gr(vcpu, inst.M46.r1, tag, 0); } -int vcpu_tpa(struct kvm_vcpu *vcpu, u64 vadr, u64 *padr) +int vcpu_tpa(struct kvm_vcpu *vcpu, u64 vadr, unsigned long *padr) { struct thash_data *data; union ia64_isr visr, pt_isr; diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h index 042af92..360724d 100644 --- a/arch/ia64/kvm/vcpu.h +++ b/arch/ia64/kvm/vcpu.h @@ -686,14 +686,15 @@ static inline int highest_inservice_irq(struct kvm_vcpu *vcpu) return highest_bits((int *)&(VMX(vcpu, insvc[0]))); } -extern void vcpu_get_fpreg(struct kvm_vcpu *vcpu, u64 reg, +extern void vcpu_get_fpreg(struct kvm_vcpu *vcpu, unsigned long reg, struct ia64_fpreg *val); -extern void vcpu_set_fpreg(struct kvm_vcpu *vcpu, u64 reg, +extern void vcpu_set_fpreg(struct kvm_vcpu *vcpu, unsigned long reg, struct ia64_fpreg *val); -extern u64 vcpu_get_gr(struct kvm_vcpu *vcpu, u64 reg); -extern void vcpu_set_gr(struct kvm_vcpu *vcpu, u64 reg, u64 val, int nat); -extern u64 vcpu_get_psr(struct kvm_vcpu *vcpu); -extern void vcpu_set_psr(struct kvm_vcpu *vcpu, u64 val); +extern u64 vcpu_get_gr(struct kvm_vcpu *vcpu, unsigned long reg); +extern void vcpu_set_gr(struct kvm_vcpu *vcpu, unsigned long reg, + u64 val, int nat); +extern unsigned long vcpu_get_psr(struct kvm_vcpu *vcpu); +extern void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val); extern u64 vcpu_thash(struct kvm_vcpu *vcpu, u64 vadr); extern void vcpu_bsw0(struct kvm_vcpu *vcpu); extern void thash_vhpt_insert(struct kvm_vcpu *v, u64 pte, -- cgit v0.10.2 From 0f2541d299d233eddddee4345795e0c46264fd56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 12:02:48 -0400 Subject: ring-buffer: fix check of try_to_discard result The function ring_buffer_discard_commit inversed the code path of the result of try_to_discard. It should skip incrementing the entry counter if try_to_discard succeeded. But instead, it increments the entry conder if it succeeded to discard, and does not increment it if it fails. The result of this bug is that filtering will make the stat counters incorrect. Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7..2fd1752 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1785,7 +1785,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); - if (!rb_try_to_discard(cpu_buffer, event)) + if (rb_try_to_discard(cpu_buffer, event)) goto out; /* -- cgit v0.10.2 From 464e85eb0e63096bd52e4c3e2a6fb8357fb95828 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 15:26:37 -0400 Subject: ring-buffer: do not disable ring buffer on oops_in_progress The commit: commit e0fdace10e75dac67d906213b780ff1b1a4cc360 Author: David Miller Date: Fri Aug 1 01:11:22 2008 -0700 debug_locks: set oops_in_progress if we will log messages. Otherwise lock debugging messages on runqueue locks can deadlock the system due to the wakeups performed by printk(). Signed-off-by: David S. Miller Signed-off-by: Ingo Molnar Will permanently set oops_in_progress on any lockdep failure. When this triggers it will cause any read from the ring buffer to permanently disable the ring buffer (not to mention no locking of printk). This patch removes the check. It keeps the print in NMI which makes sense. This is probably OK, since the ring buffer should not cause something to set oops_in_progress anyway. Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2fd1752..2606cee 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2486,7 +2486,7 @@ static inline int rb_ok_to_lock(void) * buffer too. A one time deal is all you get from reading * the ring buffer from an NMI. */ - if (likely(!in_nmi() && !oops_in_progress)) + if (likely(!in_nmi())) return 1; tracing_off_permanent(); -- cgit v0.10.2 From 3f6e968ef4e1d8d93d8a8505461b0e50a9e97ad8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 22:00:14 -0400 Subject: tracing: do not use functions starting with .L in recordmcount.pl On Wed, 5 Aug 2009, Ingo Molnar wrote: > * Dave Airlie wrote: > > > Hey, > > > > So I spent 3-4 hrs today (I'm stupid yes) tracking down a .o > > breakage by blaming rawhide gcc/binutils as I was using make > > V=1and seeing only the compiler chain running, > > Hm, is this that powerpc related build bug you just reported? Well we tracked it down and it is powerpc64 specific. Seems that in drivers/hwmon/lm93.c there's a function called: LM93_IN_FROM_REG() But PPC64 has function descriptors and the real function names (the ones you see in objdump) start with a '.'. Thus this in objdump you have: Disassembly of section .text: 0000000000000000 <.LM93_IN_FROM_REG>: 0: 7c 08 02 a6 mflr r0 4: fb 81 ff e0 std r28,-32(r1) The function name used is .LM93_IN_FROM_REG. But gcc considers symbols that start with ".L" as a special symbol that is used inside the assembly stage. The nm passed into recordmcount uses the --synthetic option which shows the ".L" symbols (my runs outside of the build did not include the --synthetic option, so my older patch worked). We see the function as a local. Now to capture all the locations that use "mcount" we need to have a reference to link into the object file a list of mcount callers. We need a reference that will not disappear. We try to use a global function and if that does not work, we use a local function as a reference. But to relink the section back into the object, we need to make it global. In this case, we run objcopy using --globalize-symbol and --localize-symbol to convert the symbol into a global symbol, link the mcount list, then convert it back to a local symbol. This works great except for this case. .L* symbols can not be converted into a global symbol, and the mcount section referencing it will remain unresolved. Reported-by: Dave Airlie LKML-Reference: Signed-off-by: Steven Rostedt diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index d29baa2..4889c44 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -414,7 +414,10 @@ while () { $offset = hex $1; } else { # if we already have a function, and this is weak, skip it - if (!defined($ref_func) && !defined($weak{$text})) { + if (!defined($ref_func) && !defined($weak{$text}) && + # PPC64 can have symbols that start with .L and + # gcc considers these special. Don't use them! + $text !~ /^\.L/) { $ref_func = $text; $offset = hex $1; } -- cgit v0.10.2 From 1bbf20835c4e088667a090ce6523a0f70b62dc76 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 5 Aug 2009 12:05:21 -0700 Subject: rtmutex: Avoid deadlock in rt_mutex_start_proxy_lock() In the event of a lock steal or owner died, rt_mutex_start_proxy_lock() will give the rt_mutex to the waiting task, but it fails to release the wait_lock. This leads to subsequent deadlocks when other tasks try to acquire the rt_mutex. I also removed a few extra blank lines that really spaced this routine out. I must have been high on the \n when I wrote this originally... Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A79D7F1.4000405@us.ibm.com> Signed-off-by: Ingo Molnar diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index fcd107a..29bd4ba 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { /* We got the lock for task. */ debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, task, 0); - + spin_unlock(&lock->wait_lock); rt_mutex_deadlock_account_lock(lock, task); return 1; } ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - if (ret && !waiter->task) { /* * Reset the return value. We might have -- cgit v0.10.2 From 53a27b39ff4d2492f84b1fdc2f0047175f0b0b93 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 5 Aug 2009 15:43:58 -0300 Subject: KVM: MMU: limit rmap chain length Otherwise the host can spend too long traversing an rmap chain, which happens under a spinlock. Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 49a10d0..0ef5bb2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -489,16 +489,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) * * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc * containing more mappings. + * + * Returns the number of rmap entries before the spte was added or zero if + * the spte was not added. + * */ -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) +static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) { struct kvm_mmu_page *sp; struct kvm_rmap_desc *desc; unsigned long *rmapp; - int i; + int i, count = 0; if (!is_rmap_pte(*spte)) - return; + return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); sp->gfns[spte - sp->spt] = gfn; @@ -515,8 +519,10 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) + while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { desc = desc->more; + count += RMAP_EXT; + } if (desc->shadow_ptes[RMAP_EXT-1]) { desc->more = mmu_alloc_rmap_desc(vcpu); desc = desc->more; @@ -525,6 +531,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) ; desc->shadow_ptes[i] = spte; } + return count; } static void rmap_desc_remove_entry(unsigned long *rmapp, @@ -754,6 +761,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) return young; } +#define RMAP_RECYCLE_THRESHOLD 1000 + +static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) +{ + unsigned long *rmapp; + + gfn = unalias_gfn(vcpu->kvm, gfn); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + + kvm_unmap_rmapp(vcpu->kvm, rmapp); + kvm_flush_remote_tlbs(vcpu->kvm); +} + int kvm_age_hva(struct kvm *kvm, unsigned long hva) { return kvm_handle_hva(kvm, hva, kvm_age_rmapp); @@ -1741,6 +1761,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, { int was_rmapped = 0; int was_writeble = is_writeble_pte(*shadow_pte); + int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" " user_fault %d gfn %lx\n", @@ -1782,9 +1803,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, page_header_update_slot(vcpu->kvm, shadow_pte, gfn); if (!was_rmapped) { - rmap_add(vcpu, shadow_pte, gfn, largepage); + rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); if (!is_rmap_pte(*shadow_pte)) kvm_release_pfn_clean(pfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, gfn, largepage); } else { if (was_writeble) kvm_release_pfn_dirty(pfn); -- cgit v0.10.2 From 469535a598f28c13a2a42037e1b778f671af1d16 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 30 Jul 2009 19:19:18 +0200 Subject: ring-buffer: Fix advance of reader in rb_buffer_peek() When calling rb_buffer_peek() from ring_buffer_consume() and a padding event is returned, the function rb_advance_reader() is called twice. This may lead to missing samples or under high workloads to the warning below. This patch fixes this. If a padding event is returned by rb_buffer_peek() it will be consumed by the calling function now. Also, I simplified some code in ring_buffer_consume(). ------------[ cut here ]------------ WARNING: at /dev/shm/.source/linux/kernel/trace/ring_buffer.c:2289 rb_advance_reader+0x2e/0xc5() Hardware name: Anaheim Modules linked in: Pid: 29, comm: events/2 Tainted: G W 2.6.31-rc3-oprofile-x86_64-standard-00059-g5050dc2 #1 Call Trace: [] ? rb_advance_reader+0x2e/0xc5 [] warn_slowpath_common+0x77/0x8f [] warn_slowpath_null+0xf/0x11 [] rb_advance_reader+0x2e/0xc5 [] ring_buffer_consume+0xa0/0xd2 [] op_cpu_buffer_read_entry+0x21/0x9e [] ? __find_get_block+0x4b/0x165 [] sync_buffer+0xa5/0x401 [] ? __find_get_block+0x4b/0x165 [] ? wq_sync_buffer+0x0/0x78 [] wq_sync_buffer+0x5b/0x78 [] worker_thread+0x113/0x1ac [] ? autoremove_wake_function+0x0/0x38 [] ? worker_thread+0x0/0x1ac [] kthread+0x88/0x92 [] child_rip+0xa/0x20 [] ? kthread+0x0/0x92 [] ? child_rip+0x0/0x20 ---[ end trace f561c0a58fcc89bd ]--- Cc: Steven Rostedt Cc: Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2606cee..d4d3580 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2383,7 +2383,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) * the box. Return the padding, and we will release * the current locks, and try again. */ - rb_advance_reader(cpu_buffer); return event; case RINGBUF_TYPE_TIME_EXTEND: @@ -2519,6 +2518,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); + if (event && event->type_len == RINGBUF_TYPE_PADDING) + rb_advance_reader(cpu_buffer); if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); @@ -2590,12 +2591,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); - if (!event) - goto out_unlock; - - rb_advance_reader(cpu_buffer); + if (event) + rb_advance_reader(cpu_buffer); - out_unlock: if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); -- cgit v0.10.2 From 7dbdee2e9a2ac42ea5135801bcc9d1a8e3f672aa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 6 Aug 2009 19:53:18 -0400 Subject: tracing: Fix recordmcount.pl to handle sections with only weak functions Roland Dreier found that a section that contained only a weak function in one of the staging drivers and this caused recordmcount.pl to spit out a warning and fail. Although it is strange that a driver would have a weak function, and this function only be used in one place, it should not be something to make recordmcount.pl fail. This patch fixes the issue in a simple manner: if only weak functions exist in a section, then that section will not be recorded. Reported-by: Roland Dreier Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 4889c44..911ba7f 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -393,7 +393,7 @@ while () { $read_function = 0; } # print out any recorded offsets - update_funcs() if ($text_found); + update_funcs() if (defined($ref_func)); # reset all markers and arrays $text_found = 0; @@ -444,7 +444,7 @@ while () { } # dump out anymore offsets that may have been found -update_funcs() if ($text_found); +update_funcs() if (defined($ref_func)); # If we did not find any mcount callers, we are done (do nothing). if (!$opened) { -- cgit v0.10.2 From 9795447f71324d8f14c19ed68b43c883135c3f59 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 16:37:10 +0800 Subject: lockdep: Fix file mode of lock_stat /proc/lock_stat is writable. Signed-off-by: Li Zefan Cc: Peter Zijlstra LKML-Reference: <4A7BE7B6.10904@cn.fujitsu.com> Signed-off-by: Ingo Molnar diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d7135aa..e94caa6 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void) &proc_lockdep_stats_operations); #ifdef CONFIG_LOCK_STAT - proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); + proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, + &proc_lock_stat_operations); #endif return 0; -- cgit v0.10.2 From 0e692a94e378628b7d527260ad939894454bcca8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 15:10:54 +0800 Subject: lockdep: Fix typos in documentation s/head/held Signed-off-by: Li Zefan Cc: Peter Zijlstra LKML-Reference: <4A7BD37E.9060806@cn.fujitsu.com> Signed-off-by: Ingo Molnar diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt index e20d913..abf768c 100644 --- a/Documentation/lockdep-design.txt +++ b/Documentation/lockdep-design.txt @@ -30,9 +30,9 @@ State The validator tracks lock-class usage history into 4n + 1 separate state bits: - 'ever held in STATE context' -- 'ever head as readlock in STATE context' -- 'ever head with STATE enabled' -- 'ever head as readlock with STATE enabled' +- 'ever held as readlock in STATE context' +- 'ever held with STATE enabled' +- 'ever held as readlock with STATE enabled' Where STATE can be either one of (kernel/lockdep_states.h) - hardirq -- cgit v0.10.2 From afc5e65245255a268ab22a20477ed2c9f2cdfcd3 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 7 Aug 2009 16:33:53 +0200 Subject: ASoC: Add missing DRV_NAME definitions for fsl/* drivers Module builds are broken due to missing DRV_NAME for efika-audio-fabric and pcm030-audio-fabric. Signed-off-by: Takashi Iwai diff --git a/sound/soc/fsl/efika-audio-fabric.c b/sound/soc/fsl/efika-audio-fabric.c index 85b0e75..3326e2a 100644 --- a/sound/soc/fsl/efika-audio-fabric.c +++ b/sound/soc/fsl/efika-audio-fabric.c @@ -30,6 +30,8 @@ #include "mpc5200_psc_ac97.h" #include "../codecs/stac9766.h" +#define DRV_NAME "efika-audio-fabric" + static struct snd_soc_device device; static struct snd_soc_card card; diff --git a/sound/soc/fsl/pcm030-audio-fabric.c b/sound/soc/fsl/pcm030-audio-fabric.c index 8766f7a..b928ef7 100644 --- a/sound/soc/fsl/pcm030-audio-fabric.c +++ b/sound/soc/fsl/pcm030-audio-fabric.c @@ -30,6 +30,8 @@ #include "mpc5200_psc_ac97.h" #include "../codecs/wm9712.h" +#define DRV_NAME "pcm030-audio-fabric" + static struct snd_soc_device device; static struct snd_soc_card card; -- cgit v0.10.2 From bd3f02212d6a457267e0c9c02c426151c436d9d4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Aug 2009 12:49:29 +0200 Subject: ring-buffer: Fix memleak in ring_buffer_free() I noticed oprofile memleaked in linux-2.6 current tree, and tracked this ring-buffer leak. Signed-off-by: Eric Dumazet LKML-Reference: <4A7C06B9.2090302@gmail.com> Cc: stable@kernel.org Signed-off-by: Steven Rostedt diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4d3580..a330513 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer) put_online_cpus(); + kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); -- cgit v0.10.2 From d25f14389a65c7f95512b01415d8d4a8d62855ab Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Mon, 27 Jul 2009 12:05:38 +0900 Subject: PCI hotplug: SGI hotplug: fix build failure The commit bd3d99c17039fd05a29587db3f4a180c48da115a ("PCI: Remove untested Electromechanical Interlock (EMI) support in pciehp."), which removes the definition of "struct hotplug_slot_attr", broke SGI hotplug driver. By this commit, we get the following compile error. drivers/pci/hotplug/sgi_hotplug.c:106: error: variable 'sn_slot_path_attr' has initializer but incomplete type drivers/pci/hotplug/sgi_hotplug.c:106: error: unknown field 'attr' specified in initializer drivers/pci/hotplug/sgi_hotplug.c:106: error: extra brace group at end of initializer drivers/pci/hotplug/sgi_hotplug.c:106: error: (near initialization for 'sn_slot_path_attr') drivers/pci/hotplug/sgi_hotplug.c:106: warning: excess elements in struct initializer drivers/pci/hotplug/sgi_hotplug.c:106: warning: (near initialization for 'sn_slot_path_attr') drivers/pci/hotplug/sgi_hotplug.c:106: error: unknown field 'show' specified in initializer drivers/pci/hotplug/sgi_hotplug.c:106: warning: excess elements in struct initializer drivers/pci/hotplug/sgi_hotplug.c:106: warning: (near initialization for 'sn_slot_path_attr') drivers/pci/hotplug/sgi_hotplug.c: In function 'sn_hp_destroy': drivers/pci/hotplug/sgi_hotplug.c:203: error: invalid use of undefined type 'struct hotplug_slot_attribute' drivers/pci/hotplug/sgi_hotplug.c: In function 'sn_hotplug_slot_register': drivers/pci/hotplug/sgi_hotplug.c:655: error: invalid use of undefined type 'struct hotplug_slot_attribute' This patch fixes this regression by adding the definition of struct hotplug_slot_attr into sgi_hotplug.c. Tested-by: Mike Habeck Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes diff --git a/drivers/pci/hotplug/sgi_hotplug.c b/drivers/pci/hotplug/sgi_hotplug.c index a4494d7..0a724bb 100644 --- a/drivers/pci/hotplug/sgi_hotplug.c +++ b/drivers/pci/hotplug/sgi_hotplug.c @@ -103,6 +103,12 @@ static ssize_t path_show (struct hotplug_slot *bss_hotplug_slot, return retval; } +struct hotplug_slot_attribute { + struct attribute attr; + ssize_t (*show)(struct hotplug_slot *, char *); + ssize_t (*store)(struct hotplug_slot *, const char *, size_t); +}; + static struct hotplug_slot_attribute sn_slot_path_attr = __ATTR_RO(path); static int sn_pci_slot_valid(struct pci_bus *pci_bus, int device) -- cgit v0.10.2 From 94f81a47c4a7a2d7a16fcfdd6d81da381732c101 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Mon, 27 Jul 2009 12:06:46 +0900 Subject: PCI hotplug: SGI hotplug: do not use hotplug_slot_attr By the pci slot changes, callbacks of attributes under slot directory (/sys/bus/pci/slots) had been changed to get the pointer to struct pci_slot instead of struct hotplug_slot. So the path_show() that assumes the parameter is a pointer to struct hotplug_slot seems broken. Tested-by: Mike Habeck Signed-off-by: Kenji Kaneshige Signed-off-by: Jesse Barnes diff --git a/drivers/pci/hotplug/sgi_hotplug.c b/drivers/pci/hotplug/sgi_hotplug.c index 0a724bb..8aebe1e9 100644 --- a/drivers/pci/hotplug/sgi_hotplug.c +++ b/drivers/pci/hotplug/sgi_hotplug.c @@ -90,11 +90,10 @@ static struct hotplug_slot_ops sn_hotplug_slot_ops = { static DEFINE_MUTEX(sn_hotplug_mutex); -static ssize_t path_show (struct hotplug_slot *bss_hotplug_slot, - char *buf) +static ssize_t path_show(struct pci_slot *pci_slot, char *buf) { int retval = -ENOENT; - struct slot *slot = bss_hotplug_slot->private; + struct slot *slot = pci_slot->hotplug->private; if (!slot) return retval; @@ -103,13 +102,7 @@ static ssize_t path_show (struct hotplug_slot *bss_hotplug_slot, return retval; } -struct hotplug_slot_attribute { - struct attribute attr; - ssize_t (*show)(struct hotplug_slot *, char *); - ssize_t (*store)(struct hotplug_slot *, const char *, size_t); -}; - -static struct hotplug_slot_attribute sn_slot_path_attr = __ATTR_RO(path); +static struct pci_slot_attribute sn_slot_path_attr = __ATTR_RO(path); static int sn_pci_slot_valid(struct pci_bus *pci_bus, int device) { -- cgit v0.10.2 From e7432675f8ca868a4af365759a8d4c3779a3d922 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 6 Aug 2009 16:12:58 -0700 Subject: ocfs2: Initialize the cluster we're writing to in a non-sparse extend In a non-sparse extend, we correctly allocate (and zero) the clusters between the old_i_size and pos, but we don't zero the portions of the cluster we're writing to outside of pos<->len. It handles clustersize > pagesize and blocksize < pagesize. [Cleaned up by Joel Becker.] Signed-off-by: Sunil Mushran Signed-off-by: Joel Becker diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e511df1..b401654 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -895,18 +895,17 @@ struct ocfs2_write_cluster_desc { */ unsigned c_new; unsigned c_unwritten; + unsigned c_needs_zero; }; -static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) -{ - return d->c_new || d->c_unwritten; -} - struct ocfs2_write_ctxt { /* Logical cluster position / len of write */ u32 w_cpos; u32 w_clen; + /* First cluster allocated in a nonsparse extend */ + u32 w_first_new_cpos; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; /* @@ -984,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, return -ENOMEM; wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_first_new_cpos = UINT_MAX; cend = (pos + len - 1) >> osb->s_clustersize_bits; wc->w_clen = cend - wc->w_cpos + 1; get_bh(di_bh); @@ -1218,20 +1218,18 @@ out: */ static int ocfs2_write_cluster(struct address_space *mapping, u32 phys, unsigned int unwritten, + unsigned int should_zero, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, unsigned user_len) { - int ret, i, new, should_zero = 0; + int ret, i, new; u64 v_blkno, p_blkno; struct inode *inode = mapping->host; struct ocfs2_extent_tree et; new = phys == 0 ? 1 : 0; - if (new || unwritten) - should_zero = 1; - if (new) { u32 tmp_pos; @@ -1342,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, local_len = osb->s_clustersize - cluster_off; ret = ocfs2_write_cluster(mapping, desc->c_phys, - desc->c_unwritten, data_ac, meta_ac, + desc->c_unwritten, + desc->c_needs_zero, + data_ac, meta_ac, wc, desc->c_cpos, pos, local_len); if (ret) { mlog_errno(ret); @@ -1392,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, * newly allocated cluster. */ desc = &wc->w_desc[0]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, &wc->w_target_from, NULL); desc = &wc->w_desc[wc->w_clen - 1]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, NULL, @@ -1467,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode, phys++; } + /* + * If w_first_new_cpos is < UINT_MAX, we have a non-sparse + * file that got extended. w_first_new_cpos tells us + * where the newly allocated clusters are so we can + * zero them. + */ + if (desc->c_cpos >= wc->w_first_new_cpos) { + BUG_ON(phys == 0); + desc->c_needs_zero = 1; + } + desc->c_phys = phys; if (phys == 0) { desc->c_new = 1; + desc->c_needs_zero = 1; *clusters_to_alloc = *clusters_to_alloc + 1; } - if (ext_flags & OCFS2_EXT_UNWRITTEN) + + if (ext_flags & OCFS2_EXT_UNWRITTEN) { desc->c_unwritten = 1; + desc->c_needs_zero = 1; + } num_clusters--; } @@ -1633,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, if (newsize <= i_size_read(inode)) return 0; - ret = ocfs2_extend_no_holes(inode, newsize, newsize - len); + ret = ocfs2_extend_no_holes(inode, newsize, pos); if (ret) mlog_errno(ret); + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + return ret; } @@ -1645,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) { - int ret, credits = OCFS2_INODE_UPDATE_CREDITS; + int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; unsigned int clusters_to_alloc, extents_to_split; struct ocfs2_write_ctxt *wc; struct inode *inode = mapping->host; @@ -1723,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } - ocfs2_set_target_boundaries(osb, wc, pos, len, - clusters_to_alloc + extents_to_split); + /* + * We have to zero sparse allocated clusters, unwritten extent clusters, + * and non-sparse clusters we just extended. For non-sparse writes, + * we know zeros will only be needed in the first and/or last cluster. + */ + if (clusters_to_alloc || extents_to_split || + wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero) + cluster_of_pages = 1; + else + cluster_of_pages = 0; + + ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -1757,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * extent. */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, - clusters_to_alloc + extents_to_split, - mmap_page); + cluster_of_pages, mmap_page); if (ret) { mlog_errno(ret); goto out_quota; -- cgit v0.10.2 From 8a57a9dda760ea7845390f1cd36f3eb2a49391d8 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 6 Aug 2009 16:07:50 -0700 Subject: ocfs2: keep index within status_map[] Do not exceed array status_map[] Signed-off-by: Roel Kluin Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Joel Becker diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 3f66137..e49c410 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -17,6 +17,7 @@ * General Public License for more details. */ +#include #include #include @@ -153,7 +154,7 @@ static int status_map[] = { static int dlm_status_to_errno(enum dlm_status status) { - BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0]))); + BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map)); return status_map[status]; } -- cgit v0.10.2 From 087d7e56deffb611a098e7e257388a41edbeef1f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 4 Aug 2009 08:59:59 -0700 Subject: x86: Fix MSI-X initialization by using online_mask for x2apic target_cpus found a system where x2apic reports an MSI-X irq initialization failure: [ 302.859446] igbvf 0000:81:10.4: enabling device (0000 -> 0002) [ 302.874369] igbvf 0000:81:10.4: using 64bit DMA mask [ 302.879023] igbvf 0000:81:10.4: using 64bit consistent DMA mask [ 302.894386] igbvf 0000:81:10.4: enabling bus mastering [ 302.898171] igbvf 0000:81:10.4: setting latency timer to 64 [ 302.914050] reserve_memtype added 0xefb08000-0xefb0c000, track uncached-minus, req uncached-minus, ret uncached-minus [ 302.933839] reserve_memtype added 0xefb28000-0xefb29000, track uncached-minus, req uncached-minus, ret uncached-minus [ 302.940367] alloc irq_desc for 265 on node 4 [ 302.956874] alloc kstat_irqs on node 4 [ 302.959452] alloc irq_2_iommu on node 0 [ 302.974328] igbvf 0000:81:10.4: irq 265 for MSI/MSI-X [ 302.977778] alloc irq_desc for 266 on node 4 [ 302.980347] alloc kstat_irqs on node 4 [ 302.995312] free_memtype request 0xefb28000-0xefb29000 [ 302.998816] igbvf 0000:81:10.4: Failed to initialize MSI-X interrupts. ... it turns out that when trying to enable MSI-X, __assign_irq_vector(new, cfg_new, apic->target_cpus()) can not get vector because for x2apic target-cpus returns cpumask_of(0) Update that to online_mask like xapic. Signed-off-by: Yinghai Lu Acked-by: Suresh Siddha LKML-Reference: <4A785AFF.3050902@kernel.org> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 2ed4e2b..a5371ec 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled(); } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. + */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } /* diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 0b631c6..a8989aa 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. + */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -- cgit v0.10.2 From ad7d6c7a0654a4bbda3e109f56af713267e96274 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 4 Aug 2009 09:01:33 -0700 Subject: x86/irq: Fix move_irq_desc() for nodes without ram Don't move it if target node is -1. Signed-off-by: Yinghai Lu LKML-Reference: <4A785B5D.4070702@kernel.org> Signed-off-by: Ingo Molnar diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 2f69bee..3fd3019 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -107,8 +107,8 @@ out_unlock: struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - /* those all static, do move them */ - if (desc->irq < NR_IRQS_LEGACY) + /* those static or target node is -1, do not move them */ + if (desc->irq < NR_IRQS_LEGACY || node == -1) return desc; if (desc->node != node) -- cgit v0.10.2 From 498cdbfbcf98e0d2c90a26e6a02a82f043876e48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ozan=20=C3=87a=C4=9Flayan?= Date: Tue, 4 Aug 2009 19:39:31 +0300 Subject: x86: Add quirk to make Apple MacBookPro5,1 use reboot=pci MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MacBookPro5,1 is not able to reboot unless reboot=pci is set. This patch forces it through a DMI quirk specific to this device. Signed-off-by: Ozan Çağlayan LKML-Reference: <1249403971-6543-1-git-send-email-ozan@pardus.org.tr> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 834c9da..9eb8976 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -405,7 +405,7 @@ EXPORT_SYMBOL(machine_real_restart); #endif /* CONFIG_X86_32 */ /* - * Apple MacBook5,2 (2009 MacBook) needs reboot=p + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot */ static int __init set_pci_reboot(const struct dmi_system_id *d) { @@ -426,6 +426,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"), }, }, + { /* Handle problems with rebooting on Apple MacBookPro5,1 */ + .callback = set_pci_reboot, + .ident = "Apple MacBookPro5,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5,1"), + }, + }, { } }; -- cgit v0.10.2 From 96b2de313b1e0e02aea80ee47df6a2b5cbdf8e13 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:09 -0500 Subject: tracing/filters: Don't use pred on alloc failure Dan Carpenter sent me a fix to prevent pred from being used if it couldn't be allocated. I noticed the same problem also existed for the create_pred() case and added a fix for that. Reported-by: Dan Carpenter Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746549.6453.29.camel@tropicana> Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 936c621..1557148 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1029,6 +1029,8 @@ static int replace_preds(struct event_subsystem *system, if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); + if (!pred) + return -ENOMEM; if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); @@ -1048,6 +1050,8 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); + if (!pred) + return -ENOMEM; if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); -- cgit v0.10.2 From 26528e773ecc74fb1b61b7275f86f761cbb340ec Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:53 -0500 Subject: tracing/filters: Always free pred on filter_add_subsystem_pred() failure If filter_add_subsystem_pred() fails due to ENOSPC or ENOMEM, the pred doesn't get freed, while as a side effect it does for other errors. Make it so the caller always frees the pred for any error. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746593.6453.32.camel@tropicana> Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1557148..f32dc9d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, return -ENOSPC; } - filter->preds[filter->n_preds] = pred; - filter->n_preds++; - list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, } replace_filter_string(call->filter, filter_string); } + + filter->preds[filter->n_preds] = pred; + filter->n_preds++; out: return err; } @@ -1034,9 +1034,12 @@ static int replace_preds(struct event_subsystem *system, if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); - } else + } else { err = filter_add_subsystem_pred(ps, system, pred, filter_string); + if (err) + filter_free_pred(pred); + } if (err) return err; @@ -1055,9 +1058,12 @@ static int replace_preds(struct event_subsystem *system, if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); - } else + } else { err = filter_add_subsystem_pred(ps, system, pred, filter_string); + if (err) + filter_free_pred(pred); + } if (err) return err; -- cgit v0.10.2 From 17d42c1c497aa54952b9e58c1502a46f0df40315 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 6 Aug 2009 16:03:30 -0700 Subject: posix_cpu_timers_exit_group(): Do not use thread_group_cputimer() When the process exits we don't have to run new cputimer nor use running one (as it not accounts when tsk->exit_state != 0) to get process CPU times. As there is only one thread we can just use CPU times fields from task and signal structs. Signed-off-by: Stanislaw Gruszka Cc: Peter Zijlstra Cc: Roland McGrath Cc: Vitaly Mayatskikh Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bece7c0..e33a21c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk) } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - struct task_cputime cputime; + struct signal_struct *const sig = tsk->signal; - thread_group_cputimer(tsk, &cputime); cleanup_timers(tsk->signal->cpu_timers, - cputime.utime, cputime.stime, cputime.sum_exec_runtime); + cputime_add(tsk->utime, sig->utime), + cputime_add(tsk->stime, sig->stime), + tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) -- cgit v0.10.2 From 38d5487db7f289be1d56ac7df704ee49ed3213b9 Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Mon, 20 Jul 2009 14:49:17 -0700 Subject: drm: When adding probed modes, preserve duplicate mode types The code which takes probed modes and adds them to a connector eliminates duplicate modes by comparing them using drm_mode_equal. That function doesn't consider the type bits, which means that any modes which differ only in the type field will be lost. One of the bits in the mode->type field is the DRM_MODE_TYPE_PREFERRED bit. If the mode with that bit is lost, then higher level code will not know which mode to select, causing a random mode to be used instead. This patch simply merges the two mode type bits together; that seems reasonable to me, but perhaps only a subset of the bits should be used? None of these can be user defined as they all come from looking at just the hardware. Signed-off-by: Keith Packard Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c index 54f492a..7914097 100644 --- a/drivers/gpu/drm/drm_modes.c +++ b/drivers/gpu/drm/drm_modes.c @@ -566,6 +566,8 @@ void drm_mode_connector_list_update(struct drm_connector *connector) found_it = 1; /* if equal delete the probed mode */ mode->status = pmode->status; + /* Merge type bits together */ + mode->type |= pmode->type; list_del(&pmode->head); drm_mode_destroy(connector->dev, pmode); break; -- cgit v0.10.2 From 8d3457ec3198a569dd14dc9e3ae8b6163bcaa0b5 Mon Sep 17 00:00:00 2001 From: Paul Rolland Date: Sun, 9 Aug 2009 12:24:01 +1000 Subject: drm: silence pointless vblank warning. Some applications/hardware combinations are triggering the message "failed to acquire vblank counter" to be issued up to 20 times a second, which makes it both useless and dangerous, as this may hide other important messages. This changes makes it only appear when people are debugging. Signed-off-by: Paul Rolland Reviewed-by: Jesse Barnes Lost-twice-by: Dave Airlie Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c index b4a3dbc..f85aaf2 100644 --- a/drivers/gpu/drm/drm_irq.c +++ b/drivers/gpu/drm/drm_irq.c @@ -566,7 +566,7 @@ int drm_wait_vblank(struct drm_device *dev, void *data, ret = drm_vblank_get(dev, crtc); if (ret) { - DRM_ERROR("failed to acquire vblank counter, %d\n", ret); + DRM_DEBUG("failed to acquire vblank counter, %d\n", ret); return ret; } seq = drm_vblank_count(dev, crtc); -- cgit v0.10.2 From 6cb504c29b1338925c83e4430e42a51eaa43781e Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Sun, 9 Aug 2009 12:25:29 +1000 Subject: drm/i915: silence vblank warnings these errors are pretty pointless Reviewed-by: Jesse Barnes Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 83aee80..7ebc84c 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -190,7 +190,7 @@ u32 i915_get_vblank_counter(struct drm_device *dev, int pipe) low_frame = pipe ? PIPEBFRAMEPIXEL : PIPEAFRAMEPIXEL; if (!i915_pipe_enabled(dev, pipe)) { - DRM_ERROR("trying to get vblank count for disabled pipe %d\n", pipe); + DRM_DEBUG("trying to get vblank count for disabled pipe %d\n", pipe); return 0; } @@ -219,7 +219,7 @@ u32 gm45_get_vblank_counter(struct drm_device *dev, int pipe) int reg = pipe ? PIPEB_FRMCOUNT_GM45 : PIPEA_FRMCOUNT_GM45; if (!i915_pipe_enabled(dev, pipe)) { - DRM_ERROR("trying to get vblank count for disabled pipe %d\n", pipe); + DRM_DEBUG("trying to get vblank count for disabled pipe %d\n", pipe); return 0; } -- cgit v0.10.2 From fdb8a42742ac95606668f73481dfb2f760658fdd Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 6 Aug 2009 15:58:13 -0700 Subject: x86: fix buffer overflow in efi_init() If the vendor name (from c16) can be longer than 100 bytes (or missing a terminating null), then the null is written past the end of vendor[]. Found with Parfait, http://research.sun.com/projects/parfait/ Signed-off-by: Roel Kluin Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin Cc: Huang Ying diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 19ccf6d..fe26ba3 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -354,7 +354,7 @@ void __init efi_init(void) */ c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); if (c16) { - for (i = 0; i < sizeof(vendor) && *c16; ++i) + for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) vendor[i] = *c16++; vendor[i] = '\0'; } else -- cgit v0.10.2 From b4a2f5e723e4f7df46731106faf9e2405673c073 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Jul 2009 18:48:11 +0300 Subject: KVM: Avoid redelivery of edge interrupt before next edge The check for an edge is broken in current ioapic code. ioapic->irr is cleared on each edge interrupt by ioapic_service() and this makes old_irr != ioapic->irr condition in kvm_ioapic_set_irq() to be always true. The patch fixes the code to properly recognise edge. Some HW emulation calls set_irq() without level change. If each such call is propagated to an OS it may confuse a device driver. This is the case with keyboard device emulation and Windows XP x64 installer on SMP VM. Each keystroke produce two interrupts (down/up) one interrupt is submitted to CPU0 and another to CPU1. This confuses Windows somehow and it ignores keystrokes. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 1eddae9..1150c6d 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -95,8 +95,6 @@ static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) pent->fields.remote_irr = 1; } - if (!pent->fields.trig_mode) - ioapic->irr &= ~(1 << idx); return injected; } @@ -136,7 +134,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) mask_after = ioapic->redirtbl[index].fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); - if (ioapic->irr & (1 << index)) + if (ioapic->redirtbl[index].fields.trig_mode == IOAPIC_LEVEL_TRIG + && ioapic->irr & (1 << index)) ioapic_service(ioapic, index); break; } @@ -184,9 +183,10 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) if (!level) ioapic->irr &= ~mask; else { + int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); ioapic->irr |= mask; - if ((!entry.fields.trig_mode && old_irr != ioapic->irr) - || !entry.fields.remote_irr) + if ((edge && old_irr != ioapic->irr) || + (!edge && !entry.fields.remote_irr)) ret = ioapic_service(ioapic, irq); } } -- cgit v0.10.2 From 3a6593050fbd8bbcaed3a44d01c31d907315c86c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 17:34:57 +0200 Subject: perf_counter, ftrace: Fix perf_counter integration Adds possible second part to the assign argument of TP_EVENT(). TP_perf_assign( __perf_count(foo); __perf_addr(bar); ) Which, when specified make the swcounter increment with @foo instead of the usual 1, and report @bar for PERF_SAMPLE_ADDR (data address associated with the event) when this triggers a counter overflow. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Cc: Paul Mackerras Signed-off-by: Ingo Molnar diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1867553..fec71f8 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -144,6 +144,9 @@ #undef TP_fast_assign #define TP_fast_assign(args...) args +#undef TP_perf_assign +#define TP_perf_assign(args...) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ @@ -345,6 +348,88 @@ static inline int ftrace_get_offsets_##call( \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#ifdef CONFIG_EVENT_PROFILE + +/* + * Generate the functions needed for tracepoint perf_counter support. + * + * static void ftrace_profile_(proto) + * { + * extern void perf_tpcounter_event(int, u64, u64); + * u64 __addr = 0, __count = 1; + * + * <-- here we expand the TP_perf_assign() macro + * + * perf_tpcounter_event(event_.id, __addr, __count); + * } + * + * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) + * { + * int ret = 0; + * + * if (!atomic_inc_return(&event_call->profile_count)) + * ret = register_trace_(ftrace_profile_); + * + * return ret; + * } + * + * static void ftrace_profile_disable_(struct ftrace_event_call *event_call) + * { + * if (atomic_add_negative(-1, &event->call->profile_count)) + * unregister_trace_(ftrace_profile_); + * } + * + */ + +#undef TP_fast_assign +#define TP_fast_assign(args...) + +#undef TP_perf_assign +#define TP_perf_assign(args...) args + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ + \ +static void ftrace_profile_##call(proto) \ +{ \ + extern void perf_tpcounter_event(int, u64, u64); \ + u64 __addr = 0, __count = 1; \ + { assign; } \ + perf_tpcounter_event(event_##call.id, __addr, __count); \ +} \ + \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ +{ \ + int ret = 0; \ + \ + if (!atomic_inc_return(&event_call->profile_count)) \ + ret = register_trace_##call(ftrace_profile_##call); \ + \ + return ret; \ +} \ + \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ +{ \ + if (atomic_add_negative(-1, &event_call->profile_count)) \ + unregister_trace_##call(ftrace_profile_##call); \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TP_perf_assign +#define TP_perf_assign(args...) + +#endif + /* * Stage 4 of the trace events. * @@ -447,28 +532,6 @@ static inline int ftrace_get_offsets_##call( \ #define TP_FMT(fmt, args...) fmt "\n", ##args #ifdef CONFIG_EVENT_PROFILE -#define _TRACE_PROFILE(call, proto, args) \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int); \ - perf_tpcounter_event(event_##call.id); \ -} \ - \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ -{ \ - int ret = 0; \ - \ - if (!atomic_inc_return(&event_call->profile_count)) \ - ret = register_trace_##call(ftrace_profile_##call); \ - \ - return ret; \ -} \ - \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ -{ \ - if (atomic_add_negative(-1, &event_call->profile_count)) \ - unregister_trace_##call(ftrace_profile_##call); \ -} #define _TRACE_PROFILE_INIT(call) \ .profile_count = ATOMIC_INIT(-1), \ @@ -476,7 +539,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ .profile_disable = ftrace_profile_disable_##call, #else -#define _TRACE_PROFILE(call, proto, args) #define _TRACE_PROFILE_INIT(call) #endif @@ -502,7 +564,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ \ static struct ftrace_event_call event_##call; \ \ @@ -586,6 +647,5 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 673c1aa..52eb4b6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3703,17 +3703,17 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id) +void perf_tpcounter_event(int event_id, u64 addr, u64 count) { struct perf_sample_data data = { .regs = get_irq_regs(), - .addr = 0, + .addr = addr, }; if (!data.regs) data.regs = task_pt_regs(current); - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); -- cgit v0.10.2 From f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Aug 2009 01:25:54 +0200 Subject: perf_counter: Fix/complete ftrace event records sampling This patch implements the kernel side support for ftrace event record sampling. A new counter sampling attribute is added: PERF_SAMPLE_TP_RECORD which requests ftrace events record sampling. In this case if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint fires, we emit the tracepoint binary record to the perfcounter event buffer, as a sample. Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf record: perf record -f -F 1 -a -e workqueue:workqueue_execution perf report -D 0x21e18 [0x48]: event: 9 . . ... raw event: size 72 bytes . 0000: 09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff ......H........ . 0010: 0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00 ........!...... . 0020: 2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e +...........eve . 0030: 74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00 ts/1........... . 0040: e0 b1 31 81 ff ff ff ff ....... . 0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33 The raw ftrace binary record starts at offset 0020. Translation: struct trace_entry { type = 0x2b = 43; flags = 1; preempt_count = 2; pid = 0xa = 10; tgid = 0xa = 10; } thread_comm = "events/1" thread_pid = 0xa = 10; func = 0xffffffff8131b1e0 = flush_to_ldisc() What will come next? - Userspace support ('perf trace'), 'flight data recorder' mode for perf trace, etc. - The unconditional copy from the profiling callback brings some costs however if someone wants no such sampling to occur, and needs to be fixed in the future. For that we need to have an instant access to the perf counter attribute. This is a matter of a flag to add in the struct ftrace_event. - Take care of the events recursivity! Don't ever try to record a lock event for example, it seems some locking is used in the profiling fast path and lead to a tracing recursivity. That will be fixed using raw spinlock or recursivity protection. - [...] - Profit! :-) Signed-off-by: Frederic Weisbecker Cc: Li Zefan Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Steven Rostedt Cc: Paul Mackerras Cc: Pekka Enberg Cc: Gabriel Munteanu Cc: Lai Jiangshan Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d7cd193..a81170d 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -89,7 +89,9 @@ enum print_line_t { TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; - +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); struct ring_buffer_event * trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e604e6e..a67dd5c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,8 +121,9 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_TP_RECORD = 1U << 10, - PERF_SAMPLE_MAX = 1U << 10, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; /* @@ -413,6 +414,11 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; +struct perf_tracepoint_record { + int size; + char *record; +}; + struct task_struct; /** @@ -681,6 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; + void *private; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index fec71f8..7fb16d9 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -353,15 +353,7 @@ static inline int ftrace_get_offsets_##call( \ /* * Generate the functions needed for tracepoint perf_counter support. * - * static void ftrace_profile_(proto) - * { - * extern void perf_tpcounter_event(int, u64, u64); - * u64 __addr = 0, __count = 1; - * - * <-- here we expand the TP_perf_assign() macro - * - * perf_tpcounter_event(event_.id, __addr, __count); - * } + * NOTE: The insertion profile callback (ftrace_profile_) is defined later * * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) * { @@ -381,28 +373,10 @@ static inline int ftrace_get_offsets_##call( \ * */ -#undef TP_fast_assign -#define TP_fast_assign(args...) - -#undef TP_perf_assign -#define TP_perf_assign(args...) args - -#undef __perf_addr -#define __perf_addr(a) __addr = (a) - -#undef __perf_count -#define __perf_count(c) __count = (c) - #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int, u64, u64); \ - u64 __addr = 0, __count = 1; \ - { assign; } \ - perf_tpcounter_event(event_##call.id, __addr, __count); \ -} \ +static void ftrace_profile_##call(proto); \ \ static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ { \ @@ -422,12 +396,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TP_perf_assign -#define TP_perf_assign(args...) - #endif /* @@ -647,5 +615,99 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +/* + * Define the insertion callback to profile events + * + * The job is very similar to ftrace_raw_event_ except that we don't + * insert in the ring buffer but in a perf counter. + * + * static void ftrace_profile_(proto) + * { + * struct ftrace_data_offsets_ __maybe_unused __data_offsets; + * struct ftrace_event_call *event_call = &event_; + * extern void perf_tpcounter_event(int, u64, u64, void *, int); + * struct ftrace_raw_##call *entry; + * u64 __addr = 0, __count = 1; + * unsigned long irq_flags; + * int __entry_size; + * int __data_size; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * __data_size = ftrace_get_offsets_(&__data_offsets, args); + * __entry_size = __data_size + sizeof(*entry); + * + * do { + * char raw_data[__entry_size]; <- allocate our sample in the stack + * struct trace_entry *ent; + * + * entry = (struct ftrace_raw_ *)raw_data; + * ent = &entry->ent; + * tracing_generic_entry_update(ent, irq_flags, pc); + * ent->type = event_call->id; + * + * <- do some jobs with dynamic arrays + * + * <- affect our values + * + * perf_tpcounter_event(event_call->id, __addr, __count, entry, + * __entry_size); <- submit them to perf counter + * } while (0); + * + * } + */ + +#ifdef CONFIG_EVENT_PROFILE + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +static void ftrace_profile_##call(proto) \ +{ \ + struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ + struct ftrace_event_call *event_call = &event_##call; \ + extern void perf_tpcounter_event(int, u64, u64, void *, int); \ + struct ftrace_raw_##call *entry; \ + u64 __addr = 0, __count = 1; \ + unsigned long irq_flags; \ + int __entry_size; \ + int __data_size; \ + int pc; \ + \ + local_save_flags(irq_flags); \ + pc = preempt_count(); \ + \ + __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ + __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + \ + do { \ + char raw_data[__entry_size]; \ + struct trace_entry *ent; \ + \ + entry = (struct ftrace_raw_##call *)raw_data; \ + ent = &entry->ent; \ + tracing_generic_entry_update(ent, irq_flags, pc); \ + ent->type = event_call->id; \ + \ + tstruct \ + \ + { assign; } \ + \ + perf_tpcounter_event(event_call->id, __addr, __count, entry,\ + __entry_size); \ + } while (0); \ + \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#endif /* CONFIG_EVENT_PROFILE */ + #undef _TRACE_PROFILE_INIT diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52eb4b6..8681021 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; + struct perf_tracepoint_record *tp; int callchain_size = 0; u64 time; struct { @@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } + if (sample_type & PERF_SAMPLE_TP_RECORD) { + tp = data->private; + header.size += tp->size; + } + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } + if (sample_type & PERF_SAMPLE_TP_RECORD) + perf_output_copy(&handle, tp->record, tp->size); + perf_output_end(&handle); } @@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count) +void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) { + struct perf_tracepoint_record tp = { + .size = entry_size, + .record = record, + }; + struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, + .private = &tp, }; if (!data.regs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8930e39..c22b40f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); } +EXPORT_SYMBOL_GPL(tracing_generic_entry_update); struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, int type, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5..8b9f4f6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags, - int pc); - void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6da0992..90c9808 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -412,6 +412,7 @@ static void create_counter(int counter, int cpu, pid_t pid) if (call_graph) attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + attr->mmap = track; attr->comm = track; attr->inherit = (cpu < 0) && inherit; -- cgit v0.10.2 From 923c42c19944da214d697e312a040384a0e33e78 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 22 Jul 2009 20:36:03 +0200 Subject: perf_counter tools: Fix/resurrect perf top annotation in a simple interactive form perf top used to have annotation support, but it has bitrotted and removed. This patch restores that: it allows the user to select any symbol in kernel space for source level annotation on the fly, switch between event counters and alter display variables. When symbol details are being displayed, stopping annotation reverts to normal. known keys: [d] select display delay. [e] select display entries (lines). [E] select annotation event counter. [f] select normal display count filter. [F] select annotation display count filter (percentage). [qQ] quit. [s] select annotation symbol and start annotation. [S] stop annotation, revert to normal display. [z] toggle event count zeroing. Sample: ------------------------------------------------------------------------------ PerfTop: 16719 irqs/sec kernel:78.7% [cache-misses/cache-references/instructions/cycles], (all, 4 CPUs) ------------------------------------------------------------------------------ Showing cache-misses for e1000_clean_rx_irq Events Pcnt (>=3%) 0 0.0% /* adjust length to remove Ethernet CRC */ 0 0.0% if (!(adapter->flags2 & FLAG2_CRC_STRIPPING)) 0 0.0% length -= 4; 436 5.0% f039: 41 f6 84 24 5c 29 00 testb $0x1,0x295c(%r12) 0 0.0% f089: 8b 4d 84 mov -0x7c(%rbp),%ecx 0 0.0% f08c: 48 83 ef 02 sub $0x2,%rdi 0 0.0% f090: 48 83 ee 02 sub $0x2,%rsi 811 9.3% f094: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 0 0.0% 0 0.0% while (rx_desc->status & E1000_RXD_STAT_DD) { 0 0.0% f114: 41 f6 47 0c 01 testb $0x1,0xc(%r15) 7226 82.6% f119: 0f 85 24 fe ff ff jne ef43 Available events: 0 cache-misses 1 cache-references 2 instructions 3 cycles Enter details event counter: 2 ------------------------------------------------------------------------------ PerfTop: 15035 irqs/sec kernel:79.0% [cache-misses/cache-references/instructions/cycles], (all, 4 CPUs) ------------------------------------------------------------------------------ Showing instructions for e1000_clean_rx_irq Events Pcnt (>=3%) 0 0.0% int *work_done, int work_to_do) 0 0.0% { 175 0.9% eebf: 55 push %rbp 1898 9.8% eec0: 48 89 e5 mov %rsp,%rbp 0 0.0% 0 0.0% i = rx_ring->next_to_clean; 140 0.7% ef0a: 0f b7 41 1a movzwl 0x1a(%rcx),%eax 670 3.4% ef0e: 89 45 ac mov %eax,-0x54(%rbp) 0 0.0% { 0 0.0% memcpy(skb->data + offset, from, len); 91 0.5% f07b: 49 8b b6 e8 00 00 00 mov 0xe8(%r14),%rsi 1153 5.9% f082: 48 8b b8 e8 00 00 00 mov 0xe8(%rax),%rdi 42 0.2% f089: 8b 4d 84 mov -0x7c(%rbp),%ecx 14 0.1% f08c: 48 83 ef 02 sub $0x2,%rdi 0 0.0% f090: 48 83 ee 02 sub $0x2,%rsi 1618 8.3% f094: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 0 0.0% 0 0.0% /* return some buffers to hardware, one at a time is too slow */ 0 0.0% if (cleaned_count >= E1000_RX_BUFFER_WRITE) { 867 4.5% f0e7: 83 7d b0 0f cmpl $0xf,-0x50(%rbp) 0 0.0% 0 0.0% while (rx_desc->status & E1000_RXD_STAT_DD) { 37 0.2% f114: 41 f6 47 0c 01 testb $0x1,0xc(%r15) 4047 20.8% f119: 0f 85 24 fe ff ff jne ef43 Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index f139f1a..d587013 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -31,6 +31,8 @@ #include #include +#include +#include #include #include @@ -54,7 +56,7 @@ static int system_wide = 0; static int default_interval = 100000; -static u64 count_filter = 5; +static int count_filter = 5; static int print_entries = 15; static int target_pid = -1; @@ -69,15 +71,27 @@ static int freq = 0; static int verbose = 0; static char *vmlinux = NULL; -static char *sym_filter; -static unsigned long filter_start; -static unsigned long filter_end; - static int delay_secs = 2; static int zero; static int dump_symtab; /* + * Source + */ + +struct source_line { + u64 eip; + unsigned long count[MAX_COUNTERS]; + char *line; + struct source_line *next; +}; + +static char *sym_filter = NULL; +struct sym_entry *sym_filter_entry = NULL; +static int sym_pcnt_filter = 5; +static int sym_counter = 0; + +/* * Symbols */ @@ -91,9 +105,237 @@ struct sym_entry { unsigned long snap_count; double weight; int skip; + struct source_line *source; + struct source_line *lines; + struct source_line **lines_tail; + pthread_mutex_t source_lock; }; -struct sym_entry *sym_filter_entry; +/* + * Source functions + */ + +static void parse_source(struct sym_entry *syme) +{ + struct symbol *sym; + struct module *module; + struct section *section = NULL; + FILE *file; + char command[PATH_MAX*2], *path = vmlinux; + u64 start, end, len; + + if (!syme) + return; + + if (syme->lines) { + pthread_mutex_lock(&syme->source_lock); + goto out_assign; + } + + sym = (struct symbol *)(syme + 1); + module = sym->module; + + if (module) + path = module->path; + if (!path) + return; + + start = sym->obj_start; + if (!start) + start = sym->start; + + if (module) { + section = module->sections->find_section(module->sections, ".text"); + if (section) + start -= section->vma; + } + + end = start + sym->end - sym->start + 1; + len = sym->end - sym->start; + + sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", start, end, path); + + file = popen(command, "r"); + if (!file) + return; + + pthread_mutex_lock(&syme->source_lock); + syme->lines_tail = &syme->lines; + while (!feof(file)) { + struct source_line *src; + size_t dummy = 0; + char *c; + + src = malloc(sizeof(struct source_line)); + assert(src != NULL); + memset(src, 0, sizeof(struct source_line)); + + if (getline(&src->line, &dummy, file) < 0) + break; + if (!src->line) + break; + + c = strchr(src->line, '\n'); + if (c) + *c = 0; + + src->next = NULL; + *syme->lines_tail = src; + syme->lines_tail = &src->next; + + if (strlen(src->line)>8 && src->line[8] == ':') { + src->eip = strtoull(src->line, NULL, 16); + if (section) + src->eip += section->vma; + } + if (strlen(src->line)>8 && src->line[16] == ':') { + src->eip = strtoull(src->line, NULL, 16); + if (section) + src->eip += section->vma; + } + } + pclose(file); +out_assign: + sym_filter_entry = syme; + pthread_mutex_unlock(&syme->source_lock); +} + +static void __zero_source_counters(struct sym_entry *syme) +{ + int i; + struct source_line *line; + + line = syme->lines; + while (line) { + for (i = 0; i < nr_counters; i++) + line->count[i] = 0; + line = line->next; + } +} + +static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip) +{ + struct source_line *line; + + if (syme != sym_filter_entry) + return; + + if (pthread_mutex_trylock(&syme->source_lock)) + return; + + if (!syme->source) + goto out_unlock; + + for (line = syme->lines; line; line = line->next) { + if (line->eip == ip) { + line->count[counter]++; + break; + } + if (line->eip > ip) + break; + } +out_unlock: + pthread_mutex_unlock(&syme->source_lock); +} + +static void lookup_sym_source(struct sym_entry *syme) +{ + struct symbol *symbol = (struct symbol *)(syme + 1); + struct source_line *line; + char pattern[PATH_MAX]; + char *idx; + + sprintf(pattern, "<%s>:", symbol->name); + + if (symbol->module) { + idx = strstr(pattern, "\t"); + if (idx) + *idx = 0; + } + + pthread_mutex_lock(&syme->source_lock); + for (line = syme->lines; line; line = line->next) { + if (strstr(line->line, pattern)) { + syme->source = line; + break; + } + } + pthread_mutex_unlock(&syme->source_lock); +} + +static void show_lines(struct source_line *queue, int count, int total) +{ + int i; + struct source_line *line; + + line = queue; + for (i = 0; i < count; i++) { + float pcnt = 100.0*(float)line->count[sym_counter]/(float)total; + + printf("%8li %4.1f%%\t%s\n", line->count[sym_counter], pcnt, line->line); + line = line->next; + } +} + +#define TRACE_COUNT 3 + +static void show_details(struct sym_entry *syme) +{ + struct symbol *symbol; + struct source_line *line; + struct source_line *line_queue = NULL; + int displayed = 0; + int line_queue_count = 0, total = 0, more = 0; + + if (!syme) + return; + + if (!syme->source) + lookup_sym_source(syme); + + if (!syme->source) + return; + + symbol = (struct symbol *)(syme + 1); + printf("Showing %s for %s\n", event_name(sym_counter), symbol->name); + printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter); + + pthread_mutex_lock(&syme->source_lock); + line = syme->source; + while (line) { + total += line->count[sym_counter]; + line = line->next; + } + + line = syme->source; + while (line) { + float pcnt = 0.0; + + if (!line_queue_count) + line_queue = line; + line_queue_count++; + + if (line->count[sym_counter]) + pcnt = 100.0 * line->count[sym_counter] / (float)total; + if (pcnt >= (float)sym_pcnt_filter) { + if (displayed <= print_entries) + show_lines(line_queue, line_queue_count, total); + else more++; + displayed += line_queue_count; + line_queue_count = 0; + line_queue = NULL; + } else if (line_queue_count > TRACE_COUNT) { + line_queue = line_queue->next; + line_queue_count--; + } + + line->count[sym_counter] = zero ? 0 : line->count[sym_counter] * 7 / 8; + line = line->next; + } + pthread_mutex_unlock(&syme->source_lock); + if (more) + printf("%d lines not displayed, maybe increase display entries [e]\n", more); +} struct dso *kernel_dso; @@ -228,6 +470,11 @@ static void print_sym_table(void) printf("------------------------------------------------------------------------------\n\n"); + if (sym_filter_entry) { + show_details(sym_filter_entry); + return; + } + if (nr_counters == 1) printf(" samples pcnt"); else @@ -242,7 +489,7 @@ static void print_sym_table(void) struct symbol *sym = (struct symbol *)(syme + 1); double pcnt; - if (++printed > print_entries || syme->snap_count < count_filter) + if (++printed > print_entries || (int)syme->snap_count < count_filter) continue; pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) / @@ -261,19 +508,208 @@ static void print_sym_table(void) } } +static void prompt_integer(int *target, const char *msg) +{ + char *buf = malloc(0), *p; + size_t dummy = 0; + int tmp; + + fprintf(stdout, "\n%s: ", msg); + if (getline(&buf, &dummy, stdin) < 0) + return; + + p = strchr(buf, '\n'); + if (p) + *p = 0; + + p = buf; + while(*p) { + if (!isdigit(*p)) + goto out_free; + p++; + } + tmp = strtoul(buf, NULL, 10); + *target = tmp; +out_free: + free(buf); +} + +static void prompt_percent(int *target, const char *msg) +{ + int tmp = 0; + + prompt_integer(&tmp, msg); + if (tmp >= 0 && tmp <= 100) + *target = tmp; +} + +static void prompt_symbol(struct sym_entry **target, const char *msg) +{ + char *buf = malloc(0), *p; + struct sym_entry *syme = *target, *n, *found = NULL; + size_t dummy = 0; + + /* zero counters of active symbol */ + if (syme) { + pthread_mutex_lock(&syme->source_lock); + __zero_source_counters(syme); + *target = NULL; + pthread_mutex_unlock(&syme->source_lock); + } + + fprintf(stdout, "\n%s: ", msg); + if (getline(&buf, &dummy, stdin) < 0) + goto out_free; + + p = strchr(buf, '\n'); + if (p) + *p = 0; + + pthread_mutex_lock(&active_symbols_lock); + syme = list_entry(active_symbols.next, struct sym_entry, node); + pthread_mutex_unlock(&active_symbols_lock); + + list_for_each_entry_safe_from(syme, n, &active_symbols, node) { + struct symbol *sym = (struct symbol *)(syme + 1); + + if (!strcmp(buf, sym->name)) { + found = syme; + break; + } + } + + if (!found) { + fprintf(stderr, "Sorry, %s is not active.\n", sym_filter); + sleep(1); + return; + } else + parse_source(found); + +out_free: + free(buf); +} + +static void print_known_keys(void) +{ + fprintf(stdout, "\nknown keys:\n"); + fprintf(stdout, "\t[d] select display delay.\n"); + fprintf(stdout, "\t[e] select display entries (lines).\n"); + fprintf(stdout, "\t[E] select annotation event counter.\n"); + fprintf(stdout, "\t[f] select normal display count filter.\n"); + fprintf(stdout, "\t[F] select annotation display count filter (percentage).\n"); + fprintf(stdout, "\t[qQ] quit.\n"); + fprintf(stdout, "\t[s] select annotation symbol and start annotation.\n"); + fprintf(stdout, "\t[S] stop annotation, revert to normal display.\n"); + fprintf(stdout, "\t[z] toggle event count zeroing.\n"); +} + +static void handle_keypress(int c) +{ + int once = 0; +repeat: + switch (c) { + case 'd': + prompt_integer(&delay_secs, "Enter display delay"); + break; + case 'e': + prompt_integer(&print_entries, "Enter display entries (lines)"); + break; + case 'E': + if (nr_counters > 1) { + int i; + + fprintf(stderr, "\nAvailable events:"); + for (i = 0; i < nr_counters; i++) + fprintf(stderr, "\n\t%d %s", i, event_name(i)); + + prompt_integer(&sym_counter, "Enter details event counter"); + + if (sym_counter >= nr_counters) { + fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(0)); + sym_counter = 0; + sleep(1); + } + } else sym_counter = 0; + break; + case 'f': + prompt_integer(&count_filter, "Enter display event count filter"); + break; + case 'F': + prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)"); + break; + case 'q': + case 'Q': + printf("exiting.\n"); + exit(0); + case 's': + prompt_symbol(&sym_filter_entry, "Enter details symbol"); + break; + case 'S': + if (!sym_filter_entry) + break; + else { + struct sym_entry *syme = sym_filter_entry; + + pthread_mutex_lock(&syme->source_lock); + sym_filter_entry = NULL; + __zero_source_counters(syme); + pthread_mutex_unlock(&syme->source_lock); + } + break; + case 'z': + zero = ~zero; + break; + default: { + struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; + struct termios tc, save; + + if (!once) { + print_known_keys(); + once++; + } + + tcgetattr(0, &save); + tc = save; + tc.c_lflag &= ~(ICANON | ECHO); + tc.c_cc[VMIN] = 0; + tc.c_cc[VTIME] = 0; + tcsetattr(0, TCSANOW, &tc); + + poll(&stdin_poll, 1, -1); + c = getc(stdin); + + tcsetattr(0, TCSAFLUSH, &save); + goto repeat; + } + } +} + static void *display_thread(void *arg __used) { struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; - int delay_msecs = delay_secs * 1000; - - printf("PerfTop refresh period: %d seconds\n", delay_secs); + struct termios tc, save; + int delay_msecs, c; + + tcgetattr(0, &save); + tc = save; + tc.c_lflag &= ~(ICANON | ECHO); + tc.c_cc[VMIN] = 0; + tc.c_cc[VTIME] = 0; +repeat: + delay_msecs = delay_secs * 1000; + tcsetattr(0, TCSANOW, &tc); + /* trash return*/ + getc(stdin); do { print_sym_table(); } while (!poll(&stdin_poll, 1, delay_msecs) == 1); - printf("key pressed - exiting.\n"); - exit(0); + c = getc(stdin); + tcsetattr(0, TCSAFLUSH, &save); + + handle_keypress(c); + goto repeat; return NULL; } @@ -293,7 +729,6 @@ static const char *skip_symbols[] = { static int symbol_filter(struct dso *self, struct symbol *sym) { - static int filter_match; struct sym_entry *syme; const char *name = sym->name; int i; @@ -315,6 +750,10 @@ static int symbol_filter(struct dso *self, struct symbol *sym) return 1; syme = dso__sym_priv(self, sym); + pthread_mutex_init(&syme->source_lock, NULL); + if (!sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) + sym_filter_entry = syme; + for (i = 0; skip_symbols[i]; i++) { if (!strcmp(skip_symbols[i], name)) { syme->skip = 1; @@ -322,29 +761,6 @@ static int symbol_filter(struct dso *self, struct symbol *sym) } } - if (filter_match == 1) { - filter_end = sym->start; - filter_match = -1; - if (filter_end - filter_start > 10000) { - fprintf(stderr, - "hm, too large filter symbol <%s> - skipping.\n", - sym_filter); - fprintf(stderr, "symbol filter start: %016lx\n", - filter_start); - fprintf(stderr, " end: %016lx\n", - filter_end); - filter_end = filter_start = 0; - sym_filter = NULL; - sleep(1); - } - } - - if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) { - filter_match = 1; - filter_start = sym->start; - } - - return 0; } @@ -380,8 +796,6 @@ out_delete_dso: return -1; } -#define TRACE_COUNT 3 - /* * Binary search in the histogram table and record the hit: */ @@ -394,6 +808,7 @@ static void record_ip(u64 ip, int counter) if (!syme->skip) { syme->count[counter]++; + record_precise_ip(syme, counter, ip); pthread_mutex_lock(&active_symbols_lock); if (list_empty(&syme->node) || !syme->node.next) __list_insert_active_sym(syme); @@ -690,8 +1105,8 @@ static const struct option options[] = { "put the counters into a counter group"), OPT_BOOLEAN('i', "inherit", &inherit, "child tasks inherit counters"), - OPT_STRING('s', "sym-filter", &sym_filter, "pattern", - "only display symbols matchig this pattern"), + OPT_STRING('s', "sym-annotate", &sym_filter, "symbol name", + "symbol to annotate - requires -k option"), OPT_BOOLEAN('z', "zero", &zero, "zero history across updates"), OPT_INTEGER('F', "freq", &freq, @@ -734,6 +1149,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) delay_secs = 1; parse_symbols(); + parse_source(sym_filter_entry); /* * Fill in the ones not specifically initialized via -c: -- cgit v0.10.2 From 46ab976443c6c566c8fe6fc72a6733a55ba9fbea Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 24 Jul 2009 10:09:50 +0200 Subject: perf_counter tools: Allow perf top top users to switch between weighted and individual counter display Add [w]eighted hotkey. Pressing [w] toggles between displaying weighted total of all counters, and the counter selected via [E]vent select key. ------------------------------------------------------------------------------ PerfTop: 90395 irqs/sec kernel:16.1% [cache-misses/cache-references/instructions], (all, 4 CPUs) ------------------------------------------------------------------------------ weight samples pcnt RIP kernel function ______ _______ _____ ________________ _______________ 1275408.6 10881 - 5.3% - ffffffff81146f70 : copy_page_c 553683.4 43569 - 21.3% - ffffffff81146f20 : clear_page_c 74075.0 6768 - 3.3% - ffffffff81147190 : copy_user_generic_string 40602.9 7538 - 3.7% - ffffffff81284ba2 : _spin_lock 26882.1 965 - 0.5% - ffffffff8109d280 : file_ra_state_init [w] ------------------------------------------------------------------------------ PerfTop: 91221 irqs/sec kernel:14.5% [10000Hz cache-misses], (all, 4 CPUs) ------------------------------------------------------------------------------ weight samples pcnt RIP kernel function ______ _______ _____ ________________ _______________ 47320.00 - 22.3% - ffffffff81146f20 : clear_page_c 14261.00 - 6.7% - ffffffff810992f5 : __rmqueue 11046.00 - 5.2% - ffffffff81146f70 : copy_page_c 7842.00 - 3.7% - ffffffff81284ba2 : _spin_lock 7234.00 - 3.4% - ffffffff810aa1d6 : unmap_vmas Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index d587013..4eef346 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -90,6 +90,7 @@ static char *sym_filter = NULL; struct sym_entry *sym_filter_entry = NULL; static int sym_pcnt_filter = 5; static int sym_counter = 0; +static int display_weighted = -1; /* * Symbols @@ -354,6 +355,9 @@ static double sym_weight(const struct sym_entry *sym) double weight = sym->snap_count; int counter; + if (!display_weighted) + return weight; + for (counter = 1; counter < nr_counters-1; counter++) weight *= sym->count[counter]; @@ -401,7 +405,7 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se) static void print_sym_table(void) { int printed = 0, j; - int counter; + int counter, snap = !display_weighted ? sym_counter : 0; float samples_per_sec = samples/delay_secs; float ksamples_per_sec = (samples-userspace_samples)/delay_secs; float sum_ksamples = 0.0; @@ -417,7 +421,7 @@ static void print_sym_table(void) pthread_mutex_unlock(&active_symbols_lock); list_for_each_entry_safe_from(syme, n, &active_symbols, node) { - syme->snap_count = syme->count[0]; + syme->snap_count = syme->count[snap]; if (syme->snap_count != 0) { syme->weight = sym_weight(syme); rb_insert_active_sym(&tmp, syme); @@ -437,7 +441,7 @@ static void print_sym_table(void) samples_per_sec, 100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec))); - if (nr_counters == 1) { + if (nr_counters == 1 || !display_weighted) { printf("%Ld", (u64)attrs[0].sample_period); if (freq) printf("Hz "); @@ -445,7 +449,9 @@ static void print_sym_table(void) printf(" "); } - for (counter = 0; counter < nr_counters; counter++) { + if (!display_weighted) + printf("%s", event_name(sym_counter)); + else for (counter = 0; counter < nr_counters; counter++) { if (counter) printf("/"); @@ -495,7 +501,7 @@ static void print_sym_table(void) pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) / sum_ksamples)); - if (nr_counters == 1) + if (nr_counters == 1 || !display_weighted) printf("%20.2f - ", syme->weight); else printf("%9.1f %10ld - ", syme->weight, syme->snap_count); @@ -594,13 +600,14 @@ static void print_known_keys(void) fprintf(stdout, "\nknown keys:\n"); fprintf(stdout, "\t[d] select display delay.\n"); fprintf(stdout, "\t[e] select display entries (lines).\n"); - fprintf(stdout, "\t[E] select annotation event counter.\n"); + fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_counter)); fprintf(stdout, "\t[f] select normal display count filter.\n"); fprintf(stdout, "\t[F] select annotation display count filter (percentage).\n"); fprintf(stdout, "\t[qQ] quit.\n"); fprintf(stdout, "\t[s] select annotation symbol and start annotation.\n"); fprintf(stdout, "\t[S] stop annotation, revert to normal display.\n"); - fprintf(stdout, "\t[z] toggle event count zeroing.\n"); + fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0); + fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", zero ? 1 : 0); } static void handle_keypress(int c) @@ -656,6 +663,9 @@ repeat: pthread_mutex_unlock(&syme->source_lock); } break; + case 'w': + display_weighted = ~display_weighted; + break; case 'z': zero = ~zero; break; -- cgit v0.10.2 From 7b4b6658e152ed4568cfff48175d93645df081d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 09:29:32 +0200 Subject: perf_counter: Fix software counters for fast moving event sources Reimplement the software counters to deal with fast moving event sources (such as tracepoints). This means being able to generate multiple overflows from a single 'event' as well as support throttling. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8681021..615440a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3344,87 +3344,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, * Generic software counter infrastructure */ -static void perf_swcounter_update(struct perf_counter *counter) +/* + * We directly increment counter->count and keep a second value in + * counter->hw.period_left to count intervals. This period counter + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; - u64 prev, now; - s64 delta; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; again: - prev = atomic64_read(&hwc->prev_count); - now = atomic64_read(&hwc->count); - if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) - goto again; + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; - delta = now - prev; + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); + return nr; } -static void perf_swcounter_set_period(struct perf_counter *counter) +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct perf_sample_data *data) { struct hw_perf_counter *hwc = &counter->hw; - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; + u64 overflow; - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - } + data->period = counter->hw.last_period; + overflow = perf_swcounter_set_period(counter); - if (unlikely(left <= 0)) { - left += period; - atomic64_add(period, &hwc->period_left); - hwc->last_period = period; - } + if (hwc->interrupts == MAX_INTERRUPTS) + return; - atomic64_set(&hwc->prev_count, -left); - atomic64_set(&hwc->count, -left); + for (; overflow; overflow--) { + if (perf_counter_overflow(counter, nmi, data)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + } } -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +static void perf_swcounter_unthrottle(struct perf_counter *counter) { - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - data.regs = get_irq_regs(); /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. + * Nothing to do, we already reset hwc->interrupts. */ - if ((counter->attr.exclude_kernel || !data.regs) && - !counter->attr.exclude_user) - data.regs = task_pt_regs(current); +} - if (data.regs) { - if (perf_counter_overflow(counter, 0, &data)) - ret = HRTIMER_NORESTART; - } +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct perf_sample_data *data) +{ + struct hw_perf_counter *hwc = &counter->hw; - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + atomic64_add(nr, &counter->count); - return ret; -} + if (!hwc->sample_period) + return; -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data) -{ - data->period = counter->hw.last_period; + if (!data->regs) + return; - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, data)) - /* soft-disable the counter */ - ; + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swcounter_overflow(counter, nmi, data); } static int perf_swcounter_is_counting(struct perf_counter *counter) @@ -3488,15 +3482,6 @@ static int perf_swcounter_match(struct perf_counter *counter, return 1; } -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data) -{ - int neg = atomic64_add_negative(nr, &counter->hw.count); - - if (counter->hw.sample_period && !neg && data->regs) - perf_swcounter_overflow(counter, nmi, data); -} - static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_type_id type, u32 event, u64 nr, int nmi, @@ -3575,27 +3560,66 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi, static void perf_swcounter_read(struct perf_counter *counter) { - perf_swcounter_update(counter); } static int perf_swcounter_enable(struct perf_counter *counter) { - perf_swcounter_set_period(counter); + struct hw_perf_counter *hwc = &counter->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swcounter_set_period(counter); + } return 0; } static void perf_swcounter_disable(struct perf_counter *counter) { - perf_swcounter_update(counter); } static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, + .unthrottle = perf_swcounter_unthrottle, }; /* + * hrtimer based swcounter callback + */ + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct perf_counter *counter; + u64 period; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->pmu->read(counter); + + data.addr = 0; + data.regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. + */ + if ((counter->attr.exclude_kernel || !data.regs) && + !counter->attr.exclude_user) + data.regs = task_pt_regs(current); + + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, counter->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +/* * Software counter: cpu wall time clock */ -- cgit v0.10.2 From 091bd2e993fcb1094a23e36157285b62bc87afdf Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 4 Aug 2009 10:21:23 +0200 Subject: perf top: Improve interactive key handling Pressing any key which is not currently mapped to functionality, based on startup command line options, displays currently mapped keys, and prompts for input. Pressing any unmapped key at the prompt returns the user to display mode with variables unchanged. eg, pressing ? etc displays currently available keys, the value of the variable associated with that key, and prompts. Pressing same again aborts input. Signed-off-by: Mike Galbraith Cc: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 4eef346..7de28ce 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -595,25 +595,84 @@ out_free: free(buf); } -static void print_known_keys(void) +static void print_mapped_keys(void) { - fprintf(stdout, "\nknown keys:\n"); - fprintf(stdout, "\t[d] select display delay.\n"); - fprintf(stdout, "\t[e] select display entries (lines).\n"); - fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_counter)); - fprintf(stdout, "\t[f] select normal display count filter.\n"); - fprintf(stdout, "\t[F] select annotation display count filter (percentage).\n"); - fprintf(stdout, "\t[qQ] quit.\n"); - fprintf(stdout, "\t[s] select annotation symbol and start annotation.\n"); - fprintf(stdout, "\t[S] stop annotation, revert to normal display.\n"); - fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0); + char *name = NULL; + + if (sym_filter_entry) { + struct symbol *sym = (struct symbol *)(sym_filter_entry+1); + name = sym->name; + } + + fprintf(stdout, "\nMapped keys:\n"); + fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", delay_secs); + fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", print_entries); + + if (nr_counters > 1) + fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_counter)); + + fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", count_filter); + + if (vmlinux) { + fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter); + fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL"); + fprintf(stdout, "\t[S] stop annotation.\n"); + } + + if (nr_counters > 1) + fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0); + fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", zero ? 1 : 0); + fprintf(stdout, "\t[qQ] quit.\n"); +} + +static int key_mapped(int c) +{ + switch (c) { + case 'd': + case 'e': + case 'f': + case 'z': + case 'q': + case 'Q': + return 1; + case 'E': + case 'w': + return nr_counters > 1 ? 1 : 0; + case 'F': + case 's': + case 'S': + return vmlinux ? 1 : 0; + } + + return 0; } static void handle_keypress(int c) { - int once = 0; -repeat: + if (!key_mapped(c)) { + struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; + struct termios tc, save; + + print_mapped_keys(); + fprintf(stdout, "\nEnter selection, or unmapped key to continue: "); + fflush(stdout); + + tcgetattr(0, &save); + tc = save; + tc.c_lflag &= ~(ICANON | ECHO); + tc.c_cc[VMIN] = 0; + tc.c_cc[VTIME] = 0; + tcsetattr(0, TCSANOW, &tc); + + poll(&stdin_poll, 1, -1); + c = getc(stdin); + + tcsetattr(0, TCSAFLUSH, &save); + if (!key_mapped(c)) + return; + } + switch (c) { case 'd': prompt_integer(&delay_secs, "Enter display delay"); @@ -669,28 +728,6 @@ repeat: case 'z': zero = ~zero; break; - default: { - struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; - struct termios tc, save; - - if (!once) { - print_known_keys(); - once++; - } - - tcgetattr(0, &save); - tc = save; - tc.c_lflag &= ~(ICANON | ECHO); - tc.c_cc[VMIN] = 0; - tc.c_cc[VTIME] = 0; - tcsetattr(0, TCSANOW, &tc); - - poll(&stdin_poll, 1, -1); - c = getc(stdin); - - tcsetattr(0, TCSAFLUSH, &save); - goto repeat; - } } } @@ -705,6 +742,7 @@ static void *display_thread(void *arg __used) tc.c_lflag &= ~(ICANON | ECHO); tc.c_cc[VMIN] = 0; tc.c_cc[VTIME] = 0; + repeat: delay_msecs = delay_secs * 1000; tcsetattr(0, TCSANOW, &tc); -- cgit v0.10.2 From 836179834833272f89098c6d1e1b89e8e69797c2 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 4 Aug 2009 10:24:41 +0200 Subject: perf top: Update man page perf_counter tools: update perf top manual page to reflect current implementation. Signed-off-by: Mike Galbraith Cc: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 539d012..4a7d558 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -3,36 +3,122 @@ perf-top(1) NAME ---- -perf-top - Run a command and profile it +perf-top - System profiling tool. SYNOPSIS -------- [verse] -'perf top' [-e | --event=EVENT] [-l] [-a] +'perf top' [-e | --event=EVENT] [] DESCRIPTION ----------- -This command runs a command and gathers a performance counter profile -from it. +This command generates and displays a performance counter profile in realtime. OPTIONS ------- -...:: - Any command you can specify in a shell. +-a:: +--all-cpus:: + System-wide collection. (default) + +-c :: +--count=:: + Event period to sample. + +-C :: +--CPU=:: + CPU to profile. + +-d :: +--delay=:: + Number of seconds to delay between refreshes. --e:: ---event=:: +-e :: +--event=:: Select the PMU event. Selection can be a symbolic event name (use 'perf list' to list all events) or a raw PMU event (eventsel+umask) in the form of rNNN where NNN is a - hexadecimal event descriptor. + hexadecimal event descriptor. --a:: - system-wide collection +-E :: +--entries=:: + Display this many functions. + +-f :: +--count-filter=:: + Only display functions with more events than this. + +-F :: +--freq=:: + Profile at this frequency. + +-i:: +--inherit:: + Child tasks inherit counters, only makes sens with -p option. + +-k :: +--vmlinux=:: + Path to vmlinux. Required for annotation functionality. + +-m :: +--mmap-pages=:: + Number of mmapped data pages. + +-p :: +--pid=:: + Profile events on existing pid. + +-r :: +--realtime=:: + Collect data with this RT SCHED_FIFO priority. + +-s :: +--sym-annotate=:: + Annotate this symbol. Requires -k option. + +-v:: +--verbose:: + Be more verbose (show counter open errors, etc). + +-z:: +--zero:: + Zero history across display updates. + +INTERACTIVE PROMPTING KEYS +-------------------------- + +[d]:: + Display refresh delay. + +[e]:: + Number of entries to display. + +[E]:: + Event to display when multiple counters are active. + +[f]:: + Profile display filter (>= hit count). + +[F]:: + Annotation display filter (>= % of total). + +[s]:: + Annotate symbol. + +[S]:: + Stop annotation, return to full profile display. + +[w]:: + Toggle between weighted sum and individual count[E]r profile. + +[z]:: + Toggle event count zeroing across display updates. + +[qQ]:: + Quit. + +Pressing any unmapped key displays a menu, and prompts for input. --l:: - scale counter values SEE ALSO -------- -- cgit v0.10.2 From 1953287bfe8afcbbd235bd6c42c9df06d52438dc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Aug 2009 07:11:05 +0200 Subject: perf tools: Fix call-chain cumul hit based sub-total (fractal mode) The callchain fractal mode builds each new total hits in a new branch of profiling by using the parent's hits of the current branch plus the hits of the children. This is wrong, the total hits of a branch should be made of the sum of every children hits, we must ignore the parent hits in this scope. This patch also fixes another mistake with the hit counting. Now the rates are correct. Signed-off-by: Frederic Weisbecker Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Pekka Enberg Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 8cb58d6..da402e1 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -901,7 +901,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, int i; if (callchain_param.mode == CHAIN_GRAPH_REL) - new_total = self->cumul_hit; + new_total = self->children_hit; else new_total = total_samples; @@ -930,7 +930,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, ret += ipchain__fprintf_graph(fp, chain, depth, new_depth_mask, i++, new_total, - child->cumul_hit); + cumul_hits(child)); } ret += callchain__fprintf_graph(fp, child, new_total, depth + 1, diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 9d3c814..98c5627 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -26,10 +26,14 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct callchain_node *rnode; + u64 chain_cumul = cumul_hits(chain); while (*p) { + u64 rnode_cumul; + parent = *p; rnode = rb_entry(parent, struct callchain_node, rb_node); + rnode_cumul = cumul_hits(rnode); switch (mode) { case CHAIN_FLAT: @@ -40,7 +44,7 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, break; case CHAIN_GRAPH_ABS: /* Falldown */ case CHAIN_GRAPH_REL: - if (rnode->cumul_hit < chain->cumul_hit) + if (rnode_cumul < chain_cumul) p = &(*p)->rb_left; else p = &(*p)->rb_right; @@ -87,7 +91,7 @@ static void __sort_chain_graph_abs(struct callchain_node *node, chain_for_each_child(child, node) { __sort_chain_graph_abs(child, min_hit); - if (child->cumul_hit >= min_hit) + if (cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, CHAIN_GRAPH_ABS); } @@ -108,11 +112,11 @@ static void __sort_chain_graph_rel(struct callchain_node *node, u64 min_hit; node->rb_root = RB_ROOT; - min_hit = node->cumul_hit * min_percent / 100.0; + min_hit = node->children_hit * min_percent / 100.0; chain_for_each_child(child, node) { __sort_chain_graph_rel(child, min_percent); - if (child->cumul_hit >= min_hit) + if (cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, CHAIN_GRAPH_REL); } @@ -211,7 +215,8 @@ add_child(struct callchain_node *parent, struct ip_callchain *chain, new = create_child(parent, false); fill_node(new, chain, start, syms); - new->cumul_hit = new->hit = 1; + new->children_hit = 0; + new->hit = 1; } /* @@ -241,7 +246,8 @@ split_add_child(struct callchain_node *parent, struct ip_callchain *chain, /* split the hits */ new->hit = parent->hit; - new->cumul_hit = parent->cumul_hit; + new->children_hit = parent->children_hit; + parent->children_hit = cumul_hits(new); new->val_nr = parent->val_nr - idx_local; parent->val_nr = idx_local; @@ -249,6 +255,7 @@ split_add_child(struct callchain_node *parent, struct ip_callchain *chain, if (idx_total < chain->nr) { parent->hit = 0; add_child(parent, chain, idx_total, syms); + parent->children_hit++; } else { parent->hit = 1; } @@ -269,13 +276,13 @@ __append_chain_children(struct callchain_node *root, struct ip_callchain *chain, unsigned int ret = __append_chain(rnode, chain, start, syms); if (!ret) - goto cumul; + goto inc_children_hit; } /* nothing in children, add to the current node */ add_child(root, chain, start, syms); -cumul: - root->cumul_hit++; +inc_children_hit: + root->children_hit++; } static int @@ -317,8 +324,6 @@ __append_chain(struct callchain_node *root, struct ip_callchain *chain, /* we match 100% of the path, increment the hit */ if (i - start == root->val_nr && i == chain->nr) { root->hit++; - root->cumul_hit++; - return 0; } diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 7812122..b2d128e 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -21,7 +21,7 @@ struct callchain_node { struct rb_root rb_root; /* sorted tree of children */ unsigned int val_nr; u64 hit; - u64 cumul_hit; /* hit + hits of children */ + u64 children_hit; }; struct callchain_param; @@ -48,6 +48,11 @@ static inline void callchain_init(struct callchain_node *node) INIT_LIST_HEAD(&node->val); } +static inline u64 cumul_hits(struct callchain_node *node) +{ + return node->hit + node->children_hit; +} + int register_callchain_param(struct callchain_param *param); void append_chain(struct callchain_node *root, struct ip_callchain *chain, struct symbol **syms); -- cgit v0.10.2 From 1c222bce7dd0cb9578b4c5cd3874a74f1db497c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2009 20:57:41 +0200 Subject: perf tools: Fix multi-counter stat bug caused by incorrect reading of perf.data file header Brice Goglin reported that only the first result from a multi-counter perf record --stat run is accurate, the rest looks bogus. A silly mistake made us re-read the first attribute for every recorded attribute. Reported-by: Brice Goglin Signed-off-by: Peter Zijlstra Tested-by: Brice Goglin Cc: paulus@samba.org Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 450384b..95a44bc 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -213,9 +213,10 @@ struct perf_header *perf_header__read(int fd) for (i = 0; i < nr_attrs; i++) { struct perf_header_attr *attr; - off_t tmp = lseek(fd, 0, SEEK_CUR); + off_t tmp; do_read(fd, &f_attr, sizeof(f_attr)); + tmp = lseek(fd, 0, SEEK_CUR); attr = perf_header_attr__new(&f_attr.attr); -- cgit v0.10.2 From 8f18aec535b5ca513dd13b531730177d35175ffa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2009 19:40:28 +0200 Subject: perf report: Fix per task mult-counter stat reporting Brice Goglin reported: > I can easily sort them by thread id, but I don't know how to match > my 4 events with each group of 4 lines. Also report the counter id and the time running/enabled stats (in case the counter got time-shared). Reported-by: Brice Goglin Signed-off-by: Peter Zijlstra Tested-by: Brice Goglin Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index da402e1..8420546 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -112,7 +112,9 @@ struct read_event { struct perf_event_header header; u32 pid,tid; u64 value; - u64 format[3]; + u64 time_enabled; + u64 time_running; + u64 id; }; typedef union event_union { @@ -1690,14 +1692,37 @@ static void trace_event(event_t *event) dprintf(".\n"); } +static struct perf_header *header; + +static struct perf_counter_attr *perf_header__find_attr(u64 id) +{ + int i; + + for (i = 0; i < header->attrs; i++) { + struct perf_header_attr *attr = header->attr[i]; + int j; + + for (j = 0; j < attr->ids; j++) { + if (attr->id[j] == id) + return &attr->attr; + } + } + + return NULL; +} + static int process_read_event(event_t *event, unsigned long offset, unsigned long head) { - dprintf("%p [%p]: PERF_EVENT_READ: %d %d %Lu\n", + struct perf_counter_attr *attr = perf_header__find_attr(event->read.id); + + dprintf("%p [%p]: PERF_EVENT_READ: %d %d %s %Lu\n", (void *)(offset + head), (void *)(long)(event->header.size), event->read.pid, event->read.tid, + attr ? __event_name(attr->type, attr->config) + : "FAIL", event->read.value); return 0; @@ -1743,8 +1768,6 @@ process_event(event_t *event, unsigned long offset, unsigned long head) return 0; } -static struct perf_header *header; - static u64 perf_header__sample_type(void) { u64 sample_type = 0; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 7bdad8d..f77407b 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -223,9 +223,15 @@ char *event_name(int counter) { u64 config = attrs[counter].config; int type = attrs[counter].type; + + return __event_name(type, config); +} + +char *__event_name(int type, u64 config) +{ static char buf[32]; - if (attrs[counter].type == PERF_TYPE_RAW) { + if (type == PERF_TYPE_RAW) { sprintf(buf, "raw 0x%llx", config); return buf; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 1ea5d09..192a962 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -10,6 +10,7 @@ extern int nr_counters; extern struct perf_counter_attr attrs[MAX_COUNTERS]; extern char *event_name(int ctr); +extern char *__event_name(int type, u64 config); extern int parse_events(const struct option *opt, const char *str, int unset); -- cgit v0.10.2 From 94cb9e385d5b4d55a5ae389baa10ad2835ea39bb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 6 Aug 2009 14:43:17 -0300 Subject: perf report: Add debug help for the finding of symbol bugs - show the symtab origin (DSO, build-id, kernel, etc) Used with perf report --verbose: [acme@doppio linux-2.6-tip]$ perf report -v | head -16 5.17% firefox /usr/lib64/xulrunner-1.9.1/libxul.so 0x00000000005d8eee f [.] imgContainer::DrawFrameTo(gfxIImageFrame*, gfxIImageFrame*, nsRect&) 2.56% firefox /lib64/libpthread-2.10.1.so 0x0000000000008e02 d [.] __pthread_mutex_lock_internal 1.94% firefox /usr/lib64/xulrunner-1.9.1/libxul.so 0x0000000000d0af8f f [.] SearchTable 1.75% firefox [kernel] 0xffffffffff60013b k [.] vread_hpet 1.63% firefox /lib64/libpthread-2.10.1.so 0x000000000000a404 d [.] __pthread_mutex_unlock 1.47% firefox /usr/lib64/xulrunner-1.9.1/libmozjs.so 0x00000000000482ea f [.] js_Interpret 1.42% firefox /usr/lib64/xulrunner-1.9.1/libmozjs.so 0x000000000003eda3 f [.] JS_CallTracer 1.24% firefox [kernel] 0xffffffff8102ca4a k [k] read_hpet 1.16% firefox [kernel] 0xffffffff810f3dd4 k [k] fget_light 1.11% firefox /usr/lib64/xulrunner-1.9.1/libmozjs.so 0x00000000000567ff f [.] js_TraceObject 0.98% firefox /usr/lib64/firefox-3.5.2/firefox 0x000000000000dd23 b [.] arena_ralloc [acme@doppio linux-2.6-tip]$ The new field is just after the symbol address. To help in figuring out symbol resolution bugs. Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 8420546..a5e2f8d 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -700,7 +700,8 @@ sort__sym_print(FILE *fp, struct hist_entry *self, unsigned int width __used) size_t ret = 0; if (verbose) - ret += repsep_fprintf(fp, "%#018llx ", (u64)self->ip); + ret += repsep_fprintf(fp, "%#018llx %c ", (u64)self->ip, + dso__symtab_origin(self->dso)); ret += repsep_fprintf(fp, "[%c] ", self->level); if (self->sym) { diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 16ddca2..f1dcede 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -24,6 +24,16 @@ const char *sym_hist_filter; #define DMGL_ANSI (1 << 1) /* Include const, volatile, etc */ #endif +enum dso_origin { + DSO__ORIG_KERNEL = 0, + DSO__ORIG_JAVA_JIT, + DSO__ORIG_FEDORA, + DSO__ORIG_UBUNTU, + DSO__ORIG_BUILDID, + DSO__ORIG_DSO, + DSO__ORIG_NOT_FOUND, +}; + static struct symbol *symbol__new(u64 start, u64 len, const char *name, unsigned int priv_size, u64 obj_start, int verbose) @@ -81,6 +91,7 @@ struct dso *dso__new(const char *name, unsigned int sym_priv_size) self->sym_priv_size = sym_priv_size; self->find_symbol = dso__find_symbol; self->slen_calculated = 0; + self->origin = DSO__ORIG_NOT_FOUND; } return self; @@ -710,7 +721,7 @@ static char *dso__read_build_id(struct dso *self, int verbose) ++raw; bid += 2; } - if (verbose) + if (verbose >= 2) printf("%s(%s): %s\n", __func__, self->name, build_id); out_elf_end: elf_end(elf); @@ -720,11 +731,26 @@ out: return build_id; } +char dso__symtab_origin(const struct dso *self) +{ + static const char origin[] = { + [DSO__ORIG_KERNEL] = 'k', + [DSO__ORIG_JAVA_JIT] = 'j', + [DSO__ORIG_FEDORA] = 'f', + [DSO__ORIG_UBUNTU] = 'u', + [DSO__ORIG_BUILDID] = 'b', + [DSO__ORIG_DSO] = 'd', + }; + + if (self == NULL || self->origin == DSO__ORIG_NOT_FOUND) + return '!'; + return origin[self->origin]; +} + int dso__load(struct dso *self, symbol_filter_t filter, int verbose) { int size = PATH_MAX; char *name = malloc(size), *build_id = NULL; - int variant = 0; int ret = -1; int fd; @@ -733,19 +759,26 @@ int dso__load(struct dso *self, symbol_filter_t filter, int verbose) self->adjust_symbols = 0; - if (strncmp(self->name, "/tmp/perf-", 10) == 0) - return dso__load_perf_map(self, filter, verbose); + if (strncmp(self->name, "/tmp/perf-", 10) == 0) { + ret = dso__load_perf_map(self, filter, verbose); + self->origin = ret > 0 ? DSO__ORIG_JAVA_JIT : + DSO__ORIG_NOT_FOUND; + return ret; + } + + self->origin = DSO__ORIG_FEDORA - 1; more: do { - switch (variant) { - case 0: /* Fedora */ + self->origin++; + switch (self->origin) { + case DSO__ORIG_FEDORA: snprintf(name, size, "/usr/lib/debug%s.debug", self->name); break; - case 1: /* Ubuntu */ + case DSO__ORIG_UBUNTU: snprintf(name, size, "/usr/lib/debug%s", self->name); break; - case 2: + case DSO__ORIG_BUILDID: build_id = dso__read_build_id(self, verbose); if (build_id != NULL) { snprintf(name, size, @@ -754,16 +787,15 @@ more: free(build_id); break; } - variant++; + self->origin++; /* Fall thru */ - case 3: /* Sane people */ + case DSO__ORIG_DSO: snprintf(name, size, "%s", self->name); break; default: goto out; } - variant++; fd = open(name, O_RDONLY); } while (fd < 0); @@ -899,6 +931,9 @@ int dso__load_kernel(struct dso *self, const char *vmlinux, if (err <= 0) err = dso__load_kallsyms(self, filter, verbose); + if (err > 0) + self->origin = DSO__ORIG_KERNEL; + return err; } diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 2f92b21..1e003ec 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -26,6 +26,7 @@ struct dso { unsigned int sym_priv_size; unsigned char adjust_symbols; unsigned char slen_calculated; + unsigned char origin; char name[0]; }; @@ -49,6 +50,7 @@ int dso__load_modules(struct dso *self, symbol_filter_t filter, int verbose); int dso__load(struct dso *self, symbol_filter_t filter, int verbose); size_t dso__fprintf(struct dso *self, FILE *fp); +char dso__symtab_origin(const struct dso *self); void symbol__init(void); #endif /* _PERF_SYMBOL_ */ -- cgit v0.10.2 From b26bc5a7f81474937e427b0c855eabee5ad56f89 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Fri, 7 Aug 2009 10:18:39 +0200 Subject: perf stat: Fix tool option consistency: rename -S/--scale to -c/--scale We want to use a coherent flag for -S/--stat across all tools, so free up -S in perf stat. Signed-off-by: Brice Goglin Cc: Peter Zijlstra Cc: paulus@samba.org Signed-off-by: Ingo Molnar diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 0d74346..484080d 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -40,7 +40,7 @@ OPTIONS -a:: system-wide collection --S:: +-c:: scale counter values EXAMPLES diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f9510ee..b4b06c7 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -496,7 +496,7 @@ static const struct option options[] = { "stat events on existing pid"), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), - OPT_BOOLEAN('S', "scale", &scale, + OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"), OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), -- cgit v0.10.2 From f36a1a133a947973efb8e6a1fbdcc23e4a011437 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 7 Aug 2009 16:59:45 +1000 Subject: perf_counter/powerpc: Fix oops on cpus without perf_counter hardware support If we have the powerpc perf_counter backend compiled in, but the cpu we are running on is one where we don't support the PMU, we currently oops in hw_perf_group_sched_in if we try to use any counters, because ppmu is NULL in that case, and we unconditionally dereference ppmu. This fixes the problem by adding a check if ppmu is NULL at the beginning of hw_perf_group_sched_in, and also at the beginning of the other functions that get called from the perf_counter core, i.e. hw_perf_disable, hw_perf_enable, and hw_perf_counter_setup. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: benh@kernel.crashing.org Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 809fdf9..70e1f57 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -518,6 +518,8 @@ void hw_perf_disable(void) struct cpu_hw_counters *cpuhw; unsigned long flags; + if (!ppmu) + return; local_irq_save(flags); cpuhw = &__get_cpu_var(cpu_hw_counters); @@ -572,6 +574,8 @@ void hw_perf_enable(void) int n_lim; int idx; + if (!ppmu) + return; local_irq_save(flags); cpuhw = &__get_cpu_var(cpu_hw_counters); if (!cpuhw->disabled) { @@ -737,6 +741,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, long i, n, n0; struct perf_counter *sub; + if (!ppmu) + return 0; cpuhw = &__get_cpu_var(cpu_hw_counters); n0 = cpuhw->n_counters; n = collect_events(group_leader, ppmu->n_counter - n0, @@ -1281,6 +1287,8 @@ void hw_perf_counter_setup(int cpu) { struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu); + if (!ppmu) + return; memset(cpuhw, 0, sizeof(*cpuhw)); cpuhw->mmcr[0] = MMCR0_FC; } -- cgit v0.10.2 From ae07b63f4b6728e1f98aa5c5416cfc1280f59f51 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2009 16:48:54 +0200 Subject: perf list: Fix the output to not include tracepoints without an id Stop perf list from displaying tracepoints without an id file, those are special tracepoints that are not interfaced to perfcounters so listing them is erroneous and passing them as events will produce no output. Signed-off-by: Peter Zijlstra Acked-by: Jason Baron Cc: Steven Rostedt Cc: Chris Mason Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f77407b..4858d83 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -121,13 +121,29 @@ static unsigned long hw_cache_stat[C(MAX)] = { (strcmp(sys_dirent.d_name, ".")) && \ (strcmp(sys_dirent.d_name, ".."))) +static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir) +{ + char evt_path[MAXPATHLEN]; + int fd; + + snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path, + sys_dir->d_name, evt_dir->d_name); + fd = open(evt_path, O_RDONLY); + if (fd < 0) + return -EINVAL; + close(fd); + + return 0; +} + #define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next, file, st) \ while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next) \ if (snprintf(file, MAXPATHLEN, "%s/%s/%s", debugfs_path, \ sys_dirent.d_name, evt_dirent.d_name) && \ (!stat(file, &st)) && (S_ISDIR(st.st_mode)) && \ (strcmp(evt_dirent.d_name, ".")) && \ - (strcmp(evt_dirent.d_name, ".."))) + (strcmp(evt_dirent.d_name, "..")) && \ + (!tp_event_has_id(&sys_dirent, &evt_dirent))) #define MAX_EVENT_LENGTH 30 -- cgit v0.10.2 From 7eac7e9e726c1b136bd7e0ad6671ce315f48bb18 Mon Sep 17 00:00:00 2001 From: Pierre Habouzit Date: Fri, 7 Aug 2009 14:16:00 +0200 Subject: perf util: Fix do_read() to fail on EOF instead of busy-looping While toying with perf, I've noticed that perf record can easily enter a busy loop when doing something as silly as: $ perf record -A ls Yeah, do_read here really wants to read a known size, not being able to should die(), not busy-loop ;) That was the cause for the bug. Signed-off-by: Pierre Habouzit Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 95a44bc..b92a457 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -185,6 +185,8 @@ static void do_read(int fd, void *buf, size_t size) if (ret < 0) die("failed to read"); + if (ret == 0) + die("failed to read: missing data"); size -= ret; buf += ret; -- cgit v0.10.2 From 266e0e219888420a1a7cafc82e82891cf7b5a979 Mon Sep 17 00:00:00 2001 From: Pierre Habouzit Date: Fri, 7 Aug 2009 14:16:01 +0200 Subject: perf record: Fix the -A UI for empty or non-existent perf.data 1. Ignore the -A argument if there is no perf.data file 2. Treat an empty file like a non existent file. Else, perf will try to read the perf.data header, and fail with an error. Treating an empty file like a non-existent file makes sense, since an interupted (as in SIGKILLed) perf could leave such files around, and you don't want to annoy the user with errors for files with no data in it. Signed-off-by: Pierre Habouzit Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 90c9808..0345aad 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -525,10 +525,14 @@ static int __cmd_record(int argc, const char **argv) signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); - if (!stat(output_name, &st) && !force && !append_file) { - fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n", - output_name); - exit(-1); + if (!stat(output_name, &st) && st.st_size) { + if (!force && !append_file) { + fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n", + output_name); + exit(-1); + } + } else { + append_file = 0; } flags = O_CREAT|O_RDWR; -- cgit v0.10.2 From b0efe213f84f7fd5ccfe07053e3d9fb827b7c188 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 02:16:23 +0200 Subject: perf tools: callchain: Fix spurious 'perf report' warnings: ignore empty callchains When the callchain tree comes to insert an empty backtrace, it raises a spurious warning about the fact we are inserting an empty. This is spurious because the radix tree assumes it did something wrong to reach this state. But it didn't, we just met an empty callchain that has to be ignored. This happens occasionally with certain types of call-chain recordings. If it happens it's a big nuisance as perf report output starts with thousands of warning lines. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1249690585-9145-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 98c5627..a8e67aa 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -336,5 +336,7 @@ __append_chain(struct callchain_node *root, struct ip_callchain *chain, void append_chain(struct callchain_node *root, struct ip_callchain *chain, struct symbol **syms) { + if (!chain->nr) + return; __append_chain_children(root, chain, syms, 0); } -- cgit v0.10.2 From b1a88349c37624755b28ac3b3152b48f52c1f487 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 02:16:24 +0200 Subject: perf tools: callchain: Fix 'perf report' display to be callchain by default If we recorded with -g option to record the callchain, right now we require a -g option to perf report as well - and people reported this as unnecessary complication: the user already specified -g once, no need to require it a second time. So if the recording includes call-chains, display the callchain by default from perf report. ( The user can override this default using "-g none" option from perf report. ) Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1249690585-9145-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index a5e2f8d..c4a8e10 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -68,7 +68,7 @@ static int callchain; static struct callchain_param callchain_param = { - .mode = CHAIN_GRAPH_ABS, + .mode = CHAIN_GRAPH_REL, .min_percent = 0.5 }; @@ -1836,6 +1836,13 @@ static int __cmd_report(void) " -g?\n"); exit(-1); } + } else if (callchain_param.mode != CHAIN_NONE && !callchain) { + callchain = 1; + if (register_callchain_param(&callchain_param) < 0) { + fprintf(stderr, "Can't register callchain" + " params\n"); + exit(-1); + } } if (load_kernel() < 0) { @@ -1974,6 +1981,13 @@ parse_callchain_opt(const struct option *opt __used, const char *arg, else if (!strncmp(tok, "fractal", strlen(arg))) callchain_param.mode = CHAIN_GRAPH_REL; + else if (!strncmp(tok, "none", strlen(arg))) { + callchain_param.mode = CHAIN_NONE; + callchain = 0; + + return 0; + } + else return -1; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index b2d128e..a926ae4 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -7,6 +7,7 @@ #include "symbol.h" enum chain_mode { + CHAIN_NONE, CHAIN_FLAT, CHAIN_GRAPH_ABS, CHAIN_GRAPH_REL -- cgit v0.10.2 From 25446036cbfc2c89faacdb4fb4603943d2197dc6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 02:16:25 +0200 Subject: perf tools: callchain: Fix sum of percentages to be 100% by displaying amount of ignored chains in fractal mode When we filter the callchains below a given percentage, we ignore them and the end result only shows entries that have an upper percentage than the filter threshold. It seems to users then that we have an imbalance in the percentage, as if the sum inside a profiled branch doesn't reach 100%. Since in the past there have been real perf report bugs that showed the same sypmtom, it would be nice to assure the user that the data is perfect and trustable and it all sums up to 100.00%. So fix this by displaying the remaining hits that have been filtered but without more detail than their amount in each branches. Example while filtering below 50%: 7.73% [k] delay_tsc | |--98.22%-- __const_udelay | | | |--86.37%-- ath5k_hw_register_timeout | | ath5k_hw_noise_floor_calibration | | ath5k_hw_reset | | ath5k_reset | | ath5k_config | | ieee80211_hw_config | | | | | |--88.53%-- ieee80211_scan_work | | | worker_thread | | | kthread | | | child_rip | | --11.47%-- [...] | --13.63%-- [...] --1.78%-- [...] Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1249690585-9145-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index c4a8e10..99274ce 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -891,6 +891,21 @@ ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain, int depth, return ret; } +static struct symbol *rem_sq_bracket; +static struct callchain_list rem_hits; + +static void init_rem_hits(void) +{ + rem_sq_bracket = malloc(sizeof(*rem_sq_bracket) + 6); + if (!rem_sq_bracket) { + fprintf(stderr, "Not enough memory to display remaining hits\n"); + return; + } + + strcpy(rem_sq_bracket->name, "[...]"); + rem_hits.sym = rem_sq_bracket; +} + static size_t callchain__fprintf_graph(FILE *fp, struct callchain_node *self, u64 total_samples, int depth, int depth_mask) @@ -900,6 +915,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, struct callchain_list *chain; int new_depth_mask = depth_mask; u64 new_total; + u64 remaining; size_t ret = 0; int i; @@ -908,17 +924,25 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, else new_total = total_samples; + remaining = new_total; + node = rb_first(&self->rb_root); while (node) { + u64 cumul; + child = rb_entry(node, struct callchain_node, rb_node); + cumul = cumul_hits(child); + remaining -= cumul; /* * The depth mask manages the output of pipes that show * the depth. We don't want to keep the pipes of the current - * level for the last child of this depth + * level for the last child of this depth. + * Except if we have remaining filtered hits. They will + * supersede the last child */ next = rb_next(node); - if (!next) + if (!next && (callchain_param.mode != CHAIN_GRAPH_REL || !remaining)) new_depth_mask &= ~(1 << (depth - 1)); /* @@ -933,7 +957,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, ret += ipchain__fprintf_graph(fp, chain, depth, new_depth_mask, i++, new_total, - cumul_hits(child)); + cumul); } ret += callchain__fprintf_graph(fp, child, new_total, depth + 1, @@ -941,6 +965,19 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, node = next; } + if (callchain_param.mode == CHAIN_GRAPH_REL && + remaining && remaining != new_total) { + + if (!rem_sq_bracket) + return ret; + + new_depth_mask &= ~(1 << (depth - 1)); + + ret += ipchain__fprintf_graph(fp, &rem_hits, depth, + new_depth_mask, 0, new_total, + remaining); + } + return ret; } @@ -1361,6 +1398,8 @@ static size_t output__fprintf(FILE *fp, u64 total_samples) unsigned int width; char *col_width = col_width_list_str; + init_rem_hits(); + fprintf(fp, "# Samples: %Ld\n", (u64)total_samples); fprintf(fp, "#\n"); @@ -1432,6 +1471,8 @@ print_entries: } fprintf(fp, "\n"); + free(rem_sq_bracket); + return ret; } -- cgit v0.10.2 From 10b8e3066066708f304e0fc5cfe658e05abf943d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:35 +0200 Subject: perf_counter: Work around gcc warning by initializing tracepoint record unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite that the tracepoint record is always present when the PERF_SAMPLE_TP_RECORD flag is set, gcc raises a warning, thinking it might not be initialized: kernel/perf_counter.c: In function ‘perf_counter_output’: kernel/perf_counter.c:2650: warning: ‘tp’ may be used uninitialized in this function Then, initialize it to NULL and always check if it's not NULL before dereference it. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 615440a..117622c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp; + struct perf_tracepoint_record *tp = NULL; int callchain_size = 0; u64 time; struct { @@ -2717,7 +2717,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_TP_RECORD) { tp = data->private; - header.size += tp->size; + if (tp) + header.size += tp->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2783,7 +2784,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if (sample_type & PERF_SAMPLE_TP_RECORD) + if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) perf_output_copy(&handle, tp->record, tp->size); perf_output_end(&handle); -- cgit v0.10.2 From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:37 +0200 Subject: perf_counter: Fix tracepoint sampling to be part of generic sampling Based on Peter's comments, make tracepoint sampling generic just like all the other sampling bits are. This is a rename with no code changes: - PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW - struct perf_tracepoint_record to perf_raw_record We want the system in place that transport tracepoints raw samples events into the perf ring buffer to be generalized and usable by any type of counter. Reported-by; Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a67dd5c..2aabe43 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,7 +121,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_TP_RECORD = 1U << 10, + PERF_SAMPLE_RAW = 1U << 10, PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; @@ -414,9 +414,9 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; -struct perf_tracepoint_record { - int size; - char *record; +struct perf_raw_record { + u32 size; + void *data; }; struct task_struct; @@ -687,7 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; - void *private; + struct perf_raw_record *raw; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 117622c..0023105 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp = NULL; + struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2715,10 +2715,10 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_TP_RECORD) { - tp = data->private; - if (tp) - header.size += tp->size; + if (sample_type & PERF_SAMPLE_RAW) { + raw = data->raw; + if (raw) + header.size += raw->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2784,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) - perf_output_copy(&handle, tp->record, tp->size); + if ((sample_type & PERF_SAMPLE_RAW) && raw) + perf_output_copy(&handle, raw->data, raw->size); perf_output_end(&handle); } @@ -3740,15 +3740,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { - struct perf_tracepoint_record tp = { + struct perf_raw_record raw = { .size = entry_size, - .record = record, + .data = record, }; struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, - .private = &tp, + .raw = &raw, }; if (!data.regs) -- cgit v0.10.2 From 3a80b4a3539696f4b0574876326860323035a302 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Aug 2009 19:49:01 +0200 Subject: perf_counter: Fix a race on perf_counter_ctx While extending perfcounters with BTS hw-tracing, Markus Metzger managed to trigger this warning: [ 995.557128] WARNING: at kernel/perf_counter.c:1191 __perf_counter_task_sched_out+0x48/0x6b() triggers because commit 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 (perf_counter: Full task tracing) removed clearing of tsk->perf_counter_ctxp out from under ctx->lock which introduced a race (against perf_lock_task_context). Move it back and deal with the exit notification by explicitly passing along the former task context. Reported-by: Markus T Metzger Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1249667341.17467.5.camel@twins> Signed-off-by: Ingo Molnar diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0023105..546e62d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2850,7 +2850,8 @@ perf_counter_read_event(struct perf_counter *counter, */ struct perf_task_event { - struct task_struct *task; + struct task_struct *task; + struct perf_counter_context *task_ctx; struct { struct perf_event_header header; @@ -2910,24 +2911,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx, static void perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_counter_context *ctx = task_event->task_ctx; cpuctx = &get_cpu_var(perf_cpu_context); perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_counter_ctxp); if (ctx) perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } -static void perf_counter_task(struct task_struct *task, int new) +static void perf_counter_task(struct task_struct *task, + struct perf_counter_context *task_ctx, + int new) { struct perf_task_event task_event; @@ -2937,8 +2937,9 @@ static void perf_counter_task(struct task_struct *task, int new) return; task_event = (struct perf_task_event){ - .task = task, - .event = { + .task = task, + .task_ctx = task_ctx, + .event = { .header = { .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, .misc = 0, @@ -2956,7 +2957,7 @@ static void perf_counter_task(struct task_struct *task, int new) void perf_counter_fork(struct task_struct *task) { - perf_counter_task(task, 1); + perf_counter_task(task, NULL, 1); } /* @@ -4310,7 +4311,7 @@ void perf_counter_exit_task(struct task_struct *child) unsigned long flags; if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, 0); + perf_counter_task(child, NULL, 0); return; } @@ -4330,6 +4331,7 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); + child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -4343,9 +4345,7 @@ void perf_counter_exit_task(struct task_struct *child) * won't get any samples after PERF_EVENT_EXIT. We can however still * get a few PERF_EVENT_READ events. */ - perf_counter_task(child, 0); - - child->perf_counter_ctxp = NULL; + perf_counter_task(child, child_ctx, 0); /* * We can recurse on the same lock type through: -- cgit v0.10.2 From c24b513337f06712b8c81eb4f1413d44591dfee7 Mon Sep 17 00:00:00 2001 From: "Carlos R. Mafra" Date: Wed, 5 Aug 2009 20:53:34 +0200 Subject: perf: "Longum est iter per praecepta, breve et efficax per exempla" A few examples of how 'perf' can be used, from an e-mail by Ingo Molnar http://lkml.org/lkml/2009/8/4/346. Signed-off-by: Carlos R. Mafra Cc: Peter Zijlstra Cc: Valdis.Kletnieks@vt.edu LKML-Reference: <20090805185334.GA4535@Pilar.aei.mpg.de> Signed-off-by: Ingo Molnar diff --git a/tools/perf/Documentation/perf-examples.txt b/tools/perf/Documentation/perf-examples.txt new file mode 100644 index 0000000..8eb6c48 --- /dev/null +++ b/tools/perf/Documentation/perf-examples.txt @@ -0,0 +1,225 @@ + + ------------------------------ + ****** perf by examples ****** + ------------------------------ + +[ From an e-mail by Ingo Molnar, http://lkml.org/lkml/2009/8/4/346 ] + + +First, discovery/enumeration of available counters can be done via +'perf list': + +titan:~> perf list + [...] + kmem:kmalloc [Tracepoint event] + kmem:kmem_cache_alloc [Tracepoint event] + kmem:kmalloc_node [Tracepoint event] + kmem:kmem_cache_alloc_node [Tracepoint event] + kmem:kfree [Tracepoint event] + kmem:kmem_cache_free [Tracepoint event] + kmem:mm_page_free_direct [Tracepoint event] + kmem:mm_pagevec_free [Tracepoint event] + kmem:mm_page_alloc [Tracepoint event] + kmem:mm_page_alloc_zone_locked [Tracepoint event] + kmem:mm_page_pcpu_drain [Tracepoint event] + kmem:mm_page_alloc_extfrag [Tracepoint event] + +Then any (or all) of the above event sources can be activated and +measured. For example the page alloc/free properties of a 'hackbench +run' are: + + titan:~> perf stat -e kmem:mm_page_pcpu_drain -e kmem:mm_page_alloc + -e kmem:mm_pagevec_free -e kmem:mm_page_free_direct ./hackbench 10 + Time: 0.575 + + Performance counter stats for './hackbench 10': + + 13857 kmem:mm_page_pcpu_drain + 27576 kmem:mm_page_alloc + 6025 kmem:mm_pagevec_free + 20934 kmem:mm_page_free_direct + + 0.613972165 seconds time elapsed + +You can observe the statistical properties as well, by using the +'repeat the workload N times' feature of perf stat: + + titan:~> perf stat --repeat 5 -e kmem:mm_page_pcpu_drain -e + kmem:mm_page_alloc -e kmem:mm_pagevec_free -e + kmem:mm_page_free_direct ./hackbench 10 + Time: 0.627 + Time: 0.644 + Time: 0.564 + Time: 0.559 + Time: 0.626 + + Performance counter stats for './hackbench 10' (5 runs): + + 12920 kmem:mm_page_pcpu_drain ( +- 3.359% ) + 25035 kmem:mm_page_alloc ( +- 3.783% ) + 6104 kmem:mm_pagevec_free ( +- 0.934% ) + 18376 kmem:mm_page_free_direct ( +- 4.941% ) + + 0.643954516 seconds time elapsed ( +- 2.363% ) + +Furthermore, these tracepoints can be used to sample the workload as +well. For example the page allocations done by a 'git gc' can be +captured the following way: + + titan:~/git> perf record -f -e kmem:mm_page_alloc -c 1 ./git gc + Counting objects: 1148, done. + Delta compression using up to 2 threads. + Compressing objects: 100% (450/450), done. + Writing objects: 100% (1148/1148), done. + Total 1148 (delta 690), reused 1148 (delta 690) + [ perf record: Captured and wrote 0.267 MB perf.data (~11679 samples) ] + +To check which functions generated page allocations: + + titan:~/git> perf report + # Samples: 10646 + # + # Overhead Command Shared Object + # ........ ............... .......................... + # + 23.57% git-repack /lib64/libc-2.5.so + 21.81% git /lib64/libc-2.5.so + 14.59% git ./git + 11.79% git-repack ./git + 7.12% git /lib64/ld-2.5.so + 3.16% git-repack /lib64/libpthread-2.5.so + 2.09% git-repack /bin/bash + 1.97% rm /lib64/libc-2.5.so + 1.39% mv /lib64/ld-2.5.so + 1.37% mv /lib64/libc-2.5.so + 1.12% git-repack /lib64/ld-2.5.so + 0.95% rm /lib64/ld-2.5.so + 0.90% git-update-serv /lib64/libc-2.5.so + 0.73% git-update-serv /lib64/ld-2.5.so + 0.68% perf /lib64/libpthread-2.5.so + 0.64% git-repack /usr/lib64/libz.so.1.2.3 + +Or to see it on a more finegrained level: + +titan:~/git> perf report --sort comm,dso,symbol +# Samples: 10646 +# +# Overhead Command Shared Object Symbol +# ........ ............... .......................... ...... +# + 9.35% git-repack ./git [.] insert_obj_hash + 9.12% git ./git [.] insert_obj_hash + 7.31% git /lib64/libc-2.5.so [.] memcpy + 6.34% git-repack /lib64/libc-2.5.so [.] _int_malloc + 6.24% git-repack /lib64/libc-2.5.so [.] memcpy + 5.82% git-repack /lib64/libc-2.5.so [.] __GI___fork + 5.47% git /lib64/libc-2.5.so [.] _int_malloc + 2.99% git /lib64/libc-2.5.so [.] memset + +Furthermore, call-graph sampling can be done too, of page +allocations - to see precisely what kind of page allocations there +are: + + titan:~/git> perf record -f -g -e kmem:mm_page_alloc -c 1 ./git gc + Counting objects: 1148, done. + Delta compression using up to 2 threads. + Compressing objects: 100% (450/450), done. + Writing objects: 100% (1148/1148), done. + Total 1148 (delta 690), reused 1148 (delta 690) + [ perf record: Captured and wrote 0.963 MB perf.data (~42069 samples) ] + + titan:~/git> perf report -g + # Samples: 10686 + # + # Overhead Command Shared Object + # ........ ............... .......................... + # + 23.25% git-repack /lib64/libc-2.5.so + | + |--50.00%-- _int_free + | + |--37.50%-- __GI___fork + | make_child + | + |--12.50%-- ptmalloc_unlock_all2 + | make_child + | + --6.25%-- __GI_strcpy + 21.61% git /lib64/libc-2.5.so + | + |--30.00%-- __GI_read + | | + | --83.33%-- git_config_from_file + | git_config + | | + [...] + +Or you can observe the whole system's page allocations for 10 +seconds: + +titan:~/git> perf stat -a -e kmem:mm_page_pcpu_drain -e +kmem:mm_page_alloc -e kmem:mm_pagevec_free -e +kmem:mm_page_free_direct sleep 10 + + Performance counter stats for 'sleep 10': + + 171585 kmem:mm_page_pcpu_drain + 322114 kmem:mm_page_alloc + 73623 kmem:mm_pagevec_free + 254115 kmem:mm_page_free_direct + + 10.000591410 seconds time elapsed + +Or observe how fluctuating the page allocations are, via statistical +analysis done over ten 1-second intervals: + + titan:~/git> perf stat --repeat 10 -a -e kmem:mm_page_pcpu_drain -e + kmem:mm_page_alloc -e kmem:mm_pagevec_free -e + kmem:mm_page_free_direct sleep 1 + + Performance counter stats for 'sleep 1' (10 runs): + + 17254 kmem:mm_page_pcpu_drain ( +- 3.709% ) + 34394 kmem:mm_page_alloc ( +- 4.617% ) + 7509 kmem:mm_pagevec_free ( +- 4.820% ) + 25653 kmem:mm_page_free_direct ( +- 3.672% ) + + 1.058135029 seconds time elapsed ( +- 3.089% ) + +Or you can annotate the recorded 'git gc' run on a per symbol basis +and check which instructions/source-code generated page allocations: + + titan:~/git> perf annotate __GI___fork + ------------------------------------------------ + Percent | Source code & Disassembly of libc-2.5.so + ------------------------------------------------ + : + : + : Disassembly of section .plt: + : Disassembly of section .text: + : + : 00000031a2e95560 <__fork>: + [...] + 0.00 : 31a2e95602: b8 38 00 00 00 mov $0x38,%eax + 0.00 : 31a2e95607: 0f 05 syscall + 83.42 : 31a2e95609: 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax + 0.00 : 31a2e9560f: 0f 87 4d 01 00 00 ja 31a2e95762 <__fork+0x202> + 0.00 : 31a2e95615: 85 c0 test %eax,%eax + +( this shows that 83.42% of __GI___fork's page allocations come from + the 0x38 system call it performs. ) + +etc. etc. - a lot more is possible. I could list a dozen of +other different usecases straight away - neither of which is +possible via /proc/vmstat. + +/proc/vmstat is not in the same league really, in terms of +expressive power of system analysis and performance +analysis. + +All that the above results needed were those new tracepoints +in include/tracing/events/kmem.h. + + Ingo + + -- cgit v0.10.2 From 183f3b0887083d36c8a25cd5e3518906415d1889 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sat, 8 Aug 2009 14:14:15 +0200 Subject: perf_counter tools: Fix libbfd detection for systems with libz dependency Due to a libz dependency in some distro's binutils package, C++ demangle support isn't compiled in despite the necessary libraries being available. Fix this by adding a -lz link test to the dependency detection rules. Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: <1249733655.6929.5.camel@marge.simson.net> Signed-off-by: Ingo Molnar diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 1916e44..60411e9 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -387,10 +387,14 @@ else has_bfd_iberty := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y") + has_bfd_iberty_z := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y") + ifeq ($(has_bfd),y) EXTLIBS += -lbfd else ifeq ($(has_bfd_iberty),y) EXTLIBS += -lbfd -liberty + else ifeq ($(has_bfd_iberty_z),y) + EXTLIBS += -lbfd -liberty -lz else msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling) BASIC_CFLAGS += -DNO_DEMANGLE -- cgit v0.10.2 From c0a8865e32c8d1a562db38e06ef31ef23282f646 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 9 Aug 2009 04:19:15 +0200 Subject: perf tools: callchain: Fix bad rounding of minimum rate Sometimes we get callchain branches that have a rate under the limit given by the user. Say you launched: perf record -f -g -a ./hackbench 10 perf report -g fractal,10.0 And you got: 2.33% hackbench [kernel] [k] _spin_lock_irqsave | |--78.57%-- remove_wait_queue | poll_freewait | do_sys_poll | sys_poll | sysenter_dispatch | 0xf7ffa430 | 0x1ffadea3c | |--7.14%-- __up_read | up_read | do_page_fault | page_fault | 0xf7ffa430 | 0xa0df710000000a ... It is abnormal to get a 7.14% branch whereas we passed a 10% filter. The problem is that we round down the minimum threshold. This happens mostly when we have very low number of events. If the total amount of your branch is 4 and you have a subranch of 3 events, filtering to 90% will be computed like follows: limit = 4 * 0.9; The result is about 3.6, but the cast to integer will round down to 3. It means that our filter is actually of 75% We must then explicitly round up the minimum threshold. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: acme@redhat.com Cc: peterz@infradead.org Cc: efault@gmx.de LKML-Reference: <20090809024235.GA10146@nowhere> Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index a8e67aa..0114734 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "callchain.h" @@ -112,7 +113,7 @@ static void __sort_chain_graph_rel(struct callchain_node *node, u64 min_hit; node->rb_root = RB_ROOT; - min_hit = node->children_hit * min_percent / 100.0; + min_hit = ceil(node->children_hit * min_percent); chain_for_each_child(child, node) { __sort_chain_graph_rel(child, min_percent); @@ -126,7 +127,7 @@ static void sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_node *chain_root, u64 min_hit __used, struct callchain_param *param) { - __sort_chain_graph_rel(chain_root, param->min_percent); + __sort_chain_graph_rel(chain_root, param->min_percent / 100.0); rb_root->rb_node = chain_root->rb_root.rb_node; } -- cgit v0.10.2 From c8c00a6915a2e3d10416e8bdd3138429beb96210 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Aug 2009 12:50:52 +1000 Subject: Remove deadlock potential in md_open A recent commit: commit 449aad3e25358812c43afc60918c5ad3819488e7 introduced the possibility of an A-B/B-A deadlock between bd_mutex and reconfig_mutex. __blkdev_get holds bd_mutex while calling md_open which takes reconfig_mutex, do_md_run is always called with reconfig_mutex held, and it now takes bd_mutex in the call the revalidate_disk. This potential deadlock was not caught by lockdep due to the use of mutex_lock_interruptible_nexted which was introduced by commit d63a5a74dee87883fda6b7d170244acaac5b05e8 do avoid a warning of an impossible deadlock. It is quite possible to split reconfig_mutex in to two locks. One protects the array data structures while it is being reconfigured, the other ensures that an array is never even partially open while it is being deactivated. In particular, the second lock prevents an open from completing between the time when do_md_stop checks if there are any active opens, and the time when the array is either set read-only, or when ->pers is set to NULL. So we can be certain that no IO is in flight as the array is being destroyed. So create a new lock, open_mutex, just to ensure exclusion between 'open' and 'stop'. This avoids the deadlock and also avoids the lockdep warning mentioned in commit d63a5a74d Reported-by: "Mike Snitzer" Reported-by: "H. Peter Anvin" Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 5b98bea..5614500 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -359,6 +359,7 @@ static mddev_t * mddev_find(dev_t unit) else new->md_minor = MINOR(unit) >> MdpMinorShift; + mutex_init(&new->open_mutex); mutex_init(&new->reconfig_mutex); INIT_LIST_HEAD(&new->disks); INIT_LIST_HEAD(&new->all_mddevs); @@ -4304,12 +4305,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) struct gendisk *disk = mddev->gendisk; mdk_rdev_t *rdev; + mutex_lock(&mddev->open_mutex); if (atomic_read(&mddev->openers) > is_open) { printk("md: %s still in use.\n",mdname(mddev)); - return -EBUSY; - } - - if (mddev->pers) { + err = -EBUSY; + } else if (mddev->pers) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -4367,7 +4367,10 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) set_disk_ro(disk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - +out: + mutex_unlock(&mddev->open_mutex); + if (err) + return err; /* * Free resources if final stop */ @@ -4433,7 +4436,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) blk_integrity_unregister(disk); md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); -out: return err; } @@ -5518,12 +5520,12 @@ static int md_open(struct block_device *bdev, fmode_t mode) } BUG_ON(mddev != bdev->bd_disk->private_data); - if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) + if ((err = mutex_lock_interruptible(&mddev->open_mutex))) goto out; err = 0; atomic_inc(&mddev->openers); - mddev_unlock(mddev); + mutex_unlock(&mddev->open_mutex); check_disk_change(bdev); out: diff --git a/drivers/md/md.h b/drivers/md/md.h index 78f0316..f8fc188 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -223,6 +223,16 @@ struct mddev_s * so we don't loop trying */ int in_sync; /* know to not need resync */ + /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so + * that we are never stopping an array while it is open. + * 'reconfig_mutex' protects all other reconfiguration. + * These locks are separate due to conflicting interactions + * with bdev->bd_mutex. + * Lock ordering is: + * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk + * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open + */ + struct mutex open_mutex; struct mutex reconfig_mutex; atomic_t active; /* general refcount */ atomic_t openers; /* number of active opens */ -- cgit v0.10.2 From b2f2e8fee3d62f621e795f25b2fc0f51bbdb4af9 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 10 Aug 2009 16:36:38 +1000 Subject: powerpc/dma: pci_set_dma_mask() shouldn't fail if mask fits in RAM On an iMac G5, the b43 driver is failing to initialise because trying to set the dma mask to 30-bit fails. Even though there's only 512MiB of RAM in the machine anyway: https://bugzilla.redhat.com/show_bug.cgi?id=514787 We should probably let it succeed if the available RAM in the system doesn't exceed the requested limit. Signed-off-by: David Woodhouse Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 20a60d6..ccf129d 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -90,11 +91,10 @@ static void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sg, static int dma_direct_dma_supported(struct device *dev, u64 mask) { #ifdef CONFIG_PPC64 - /* Could be improved to check for memory though it better be - * done via some global so platforms can set the limit in case + /* Could be improved so platforms can set the limit in case * they have limited DMA windows */ - return mask >= DMA_BIT_MASK(32); + return mask >= (lmb_end_of_DRAM() - 1); #else return 1; #endif -- cgit v0.10.2 From beda2c7ea2c15ed01eef00a997d2b0496c3a502d Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Sun, 9 Aug 2009 15:34:39 -0700 Subject: futex: Update futex_q lock_ptr on requeue proxy lock futex_requeue() can acquire the lock on behalf of a waiter early on or during the requeue loop if it is uncontended or in the event of a lock steal or owner died. On wakeup, the waiter (in futex_wait_requeue_pi()) cleans up the pi_state owner using the lock_ptr to protect against concurrent access to the pi_state. The pi_state is hung off futex_q's on the requeue target futex hash bucket so the lock_ptr needs to be updated accordingly. The problem manifested by triggering the WARN_ON in lookup_pi_state() about the pid != pi_state->owner->pid. With this patch, the pi_state is properly guarded against concurrent access via the requeue target hb lock. The astute reviewer may notice that there is a window of time between when futex_requeue() unlocks the hb locks and when futex_wait_requeue_pi() will acquire hb2->lock. During this time the pi_state and uval are not in sync with the underlying rtmutex owner (but the uval does indicate there are waiters, so no atomic changes will occur in userspace). However, this is not a problem. Should a contending thread enter lookup_pi_state() and acquire hb2->lock before the ownership is fixed up, it will find the pi_state hung off a waiter's (possibly the pending owner's) futex_q and block on the rtmutex. Once futex_wait_requeue_pi() fixes up the owner, it will also move the pi_state from the old owner's task->pi_state_list to its own. v3: Fix plist lock name for application to mainline (rather than -rt) Compile tested against tip/v2.6.31-rc5. Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7F4EFF.6090903@us.ibm.com> Signed-off-by: Ingo Molnar diff --git a/kernel/futex.c b/kernel/futex.c index 0672ff8..8cc3ee1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1010,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * q: the futex_q * key: the key of the requeue target futex + * hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. Set the futex_q key * to the requeue target futex so the waiter can detect the wakeup on the right * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Must be called with the q->lock_ptr held. + * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock + * to protect access to the pi_state to fixup the owner later. Must be called + * with both q->lock_ptr and hb->lock held. */ static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, + struct futex_hash_bucket *hb) { drop_futex_key_refs(&q->key); get_futex_key_refs(key); @@ -1030,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; + q->lock_ptr = &hb->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb->lock; +#endif + wake_up_state(q->task, TASK_NORMAL); } @@ -1088,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); if (ret == 1) - requeue_pi_wake_futex(top_waiter, key2); + requeue_pi_wake_futex(top_waiter, key2, hb2); return ret; } @@ -1273,7 +1282,7 @@ retry_private: this->task, 1); if (ret == 1) { /* We got the lock. */ - requeue_pi_wake_futex(this, &key2); + requeue_pi_wake_futex(this, &key2, hb2); continue; } else if (ret) { /* -EDEADLK */ -- cgit v0.10.2 From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:16:52 +0200 Subject: perf_counter: Correct PERF_SAMPLE_RAW output PERF_SAMPLE_* output switches should unconditionally output the correct format, as they are the only way to unambiguously parse the PERF_EVENT_SAMPLE data. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896447.17467.74.camel@twins> Signed-off-by: Ingo Molnar diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2aabe43..a9d823a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -369,6 +369,8 @@ enum perf_event_type { * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW * }; */ PERF_EVENT_SAMPLE = 9, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 7fb16d9..7167b9b 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -685,7 +685,8 @@ static void ftrace_profile_##call(proto) \ pc = preempt_count(); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ - __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ + sizeof(u64)); \ \ do { \ char raw_data[__entry_size]; \ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 546e62d..5229d16 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2716,9 +2715,15 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } if (sample_type & PERF_SAMPLE_RAW) { - raw = data->raw; - if (raw) - header.size += raw->size; + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header.size += size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_RAW) && raw) - perf_output_copy(&handle, raw->data, raw->size); + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(&handle, data->raw->size); + perf_output_copy(&handle, data->raw->data, data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(&handle, raw); + } + } perf_output_end(&handle); } -- cgit v0.10.2 From a4e95fc2cbb31d70a65beffeaf8773f881328c34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:20:12 +0200 Subject: perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data Raw tracepoint data contains various kernel internals and data from other users, so restrict this to CAP_SYS_ADMIN. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896452.17467.75.camel@twins> Signed-off-by: Ingo Molnar diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5229d16..b0b20a0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3787,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (ftrace_profile_enable(counter->attr.config)) return NULL; -- cgit v0.10.2 From 100d5eb36ba20dc0b99a17ea2b9800c567bfc3d1 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 10 Aug 2009 11:55:51 +0200 Subject: ALSA: hda - Add missing vmaster initialization for ALC269 Without the initialization of vmaster NID, the dB information got confused for ALC269 codec. Reference: Novell bnc#527361 https://bugzilla.novell.com/show_bug.cgi?id=527361 Signed-off-by: Takashi Iwai Cc: diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 51c44fd..5cc927f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -13563,6 +13563,8 @@ static int patch_alc269(struct hda_codec *codec) set_capture_mixer(spec); set_beep_amp(spec, 0x0b, 0x04, HDA_INPUT); + spec->vmaster_nid = 0x02; + codec->patch_ops = alc_patch_ops; if (board_config == ALC269_AUTO) spec->init_hook = alc269_auto_init; -- cgit v0.10.2 From 13f0feafa6b8aead57a2a328e2fca6a5828bf286 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jun 2009 21:25:32 +0200 Subject: mm_for_maps: simplify, use ptrace_may_access() It would be nice to kill __ptrace_may_access(). It requires task_lock(), but this lock is only needed to read mm->flags in the middle. Convert mm_for_maps() to use ptrace_may_access(), this also simplifies the code a little bit. Also, we do not need to take ->mmap_sem in advance. In fact I think mm_for_maps() should not play with ->mmap_sem at all, the caller should take this lock. With or without this patch, without ->cred_guard_mutex held we can race with exec() and get the new ->mm but check old creds. Signed-off-by: Oleg Nesterov Reviewed-by: Serge Hallyn Signed-off-by: James Morris diff --git a/fs/proc/base.c b/fs/proc/base.c index 3ce5ae9..917f338 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -237,20 +237,19 @@ struct mm_struct *mm_for_maps(struct task_struct *task) struct mm_struct *mm = get_task_mm(task); if (!mm) return NULL; + if (mm != current->mm) { + /* + * task->mm can be changed before security check, + * in that case we must notice the change after. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ) || + mm != task->mm) { + mmput(mm); + return NULL; + } + } down_read(&mm->mmap_sem); - task_lock(task); - if (task->mm != mm) - goto out; - if (task->mm != current->mm && - __ptrace_may_access(task, PTRACE_MODE_READ) < 0) - goto out; - task_unlock(task); return mm; -out: - task_unlock(task); - up_read(&mm->mmap_sem); - mmput(mm); - return NULL; } static int proc_pid_cmdline(struct task_struct *task, char * buffer) -- cgit v0.10.2 From 00f89d218523b9bf6b522349c039d5ac80aa536d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 10 Jul 2009 03:27:38 +0200 Subject: mm_for_maps: shift down_read(mmap_sem) to the caller mm_for_maps() takes ->mmap_sem after security checks, this looks strange and obfuscates the locking rules. Move this lock to its single caller, m_start(). Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Signed-off-by: James Morris diff --git a/fs/proc/base.c b/fs/proc/base.c index 917f338..f3c2e40 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -235,9 +235,8 @@ static int check_mem_permission(struct task_struct *task) struct mm_struct *mm_for_maps(struct task_struct *task) { struct mm_struct *mm = get_task_mm(task); - if (!mm) - return NULL; - if (mm != current->mm) { + + if (mm && mm != current->mm) { /* * task->mm can be changed before security check, * in that case we must notice the change after. @@ -245,10 +244,9 @@ struct mm_struct *mm_for_maps(struct task_struct *task) if (!ptrace_may_access(task, PTRACE_MODE_READ) || mm != task->mm) { mmput(mm); - return NULL; + mm = NULL; } } - down_read(&mm->mmap_sem); return mm; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6f61b7c..9bd8be1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) mm = mm_for_maps(priv->task); if (!mm) return NULL; + down_read(&mm->mmap_sem); tail_vma = get_gate_vma(priv->task); priv->tail_vma = tail_vma; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 64a72e2..8f5c05d 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) priv->task = NULL; return NULL; } + down_read(&mm->mmap_sem); /* start from the Nth VMA */ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) -- cgit v0.10.2 From 704b836cbf19e885f8366bccb2e4b0474346c02d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 10 Jul 2009 03:27:40 +0200 Subject: mm_for_maps: take ->cred_guard_mutex to fix the race with exec The problem is minor, but without ->cred_guard_mutex held we can race with exec() and get the new ->mm but check old creds. Now we do not need to re-check task->mm after ptrace_may_access(), it can't be changed to the new mm under us. Strictly speaking, this also fixes another very minor problem. Unless security check fails or the task exits mm_for_maps() should never return NULL, the caller should get either old or new ->mm. Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Signed-off-by: James Morris diff --git a/fs/proc/base.c b/fs/proc/base.c index f3c2e40..175db25 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -234,19 +234,19 @@ static int check_mem_permission(struct task_struct *task) struct mm_struct *mm_for_maps(struct task_struct *task) { - struct mm_struct *mm = get_task_mm(task); + struct mm_struct *mm; - if (mm && mm != current->mm) { - /* - * task->mm can be changed before security check, - * in that case we must notice the change after. - */ - if (!ptrace_may_access(task, PTRACE_MODE_READ) || - mm != task->mm) { - mmput(mm); - mm = NULL; - } + if (mutex_lock_killable(&task->cred_guard_mutex)) + return NULL; + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, PTRACE_MODE_READ)) { + mmput(mm); + mm = NULL; } + mutex_unlock(&task->cred_guard_mutex); + return mm; } -- cgit v0.10.2 From 2fc391112fb6f3424435a3aa2fda887497b5f807 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 12:33:05 +0100 Subject: locking, sched: Give waitqueue spinlocks their own lockdep classes Give waitqueue spinlocks their own lockdep classes when they are initialised from init_waitqueue_head(). This means that struct wait_queue::func functions can operate other waitqueues. This is used by CacheFiles to catch the page from a backing fs being unlocked and to wake up another thread to take a copy of it. Signed-off-by: Peter Zijlstra Signed-off-by: David Howells Tested-by: Takashi Iwai Cc: linux-cachefs@redhat.com Cc: torvalds@osdl.org Cc: akpm@linux-foundation.org LKML-Reference: <20090810113305.17284.81508.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar diff --git a/include/linux/wait.h b/include/linux/wait.h index 6788e1a..cf3c2f5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -77,7 +77,14 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -extern void init_waitqueue_head(wait_queue_head_t *q); +extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); + +#define init_waitqueue_head(q) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_waitqueue_head((q), &__key); \ + } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ diff --git a/kernel/wait.c b/kernel/wait.c index ea7c3b4..c4bd3d8 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,13 +10,14 @@ #include #include -void init_waitqueue_head(wait_queue_head_t *q) +void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) { spin_lock_init(&q->lock); + lockdep_set_class(&q->lock, key); INIT_LIST_HEAD(&q->task_list); } -EXPORT_SYMBOL(init_waitqueue_head); +EXPORT_SYMBOL(__init_waitqueue_head); void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { -- cgit v0.10.2 From 4dc88029fd916b860ef063c40180aa604ce93494 Mon Sep 17 00:00:00 2001 From: Dinakar Guniguntala Date: Mon, 10 Aug 2009 18:31:42 +0530 Subject: futex: Fix compat_futex to be same as futex for REQUEUE_PI Need to add the REQUEUE_PI checks to the compat_sys_futex API as well to ensure 32 bit requeue's work fine on a 64 bit system. Patch is against latest tip Signed-off-by: Dinakar Guniguntala Cc: Darren Hart LKML-Reference: <20090810130142.GA23619@in.ibm.com> Signed-off-by: Ingo Molnar diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d607a5b..2357165 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (get_compat_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) @@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, t = ktime_add_safe(ktime_get(), t); tp = &t; } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (int) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -- cgit v0.10.2 From 304703aba31a87903b8c0db8f5e6890cac2d596d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 10 Aug 2009 16:11:32 +0200 Subject: perf_counter: Subtract the buffer size field from the event record size We compute the perf raw sample size by aligning the raw ftrace event size plus the buffer size field itself. We do that instead of aligning only the perf raw sample size, so that we might economize some in some cases. But this buffer size field is not stored in the perf raw sample, we must then substract its size from the buffer once we computed the alignment unless we may get a useless u32 field in the buffer. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <20090810141129.GA5124@nowhere> Signed-off-by: Ingo Molnar diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 7167b9b..a05524f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -637,7 +637,12 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * pc = preempt_count(); * * __data_size = ftrace_get_offsets_(&__data_offsets, args); - * __entry_size = __data_size + sizeof(*entry); + * + * // Below we want to get the aligned size by taking into account + * // the u32 field that will later store the buffer size + * __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32), + * sizeof(u64)); + * __entry_size -= sizeof(u32); * * do { * char raw_data[__entry_size]; <- allocate our sample in the stack @@ -687,6 +692,7 @@ static void ftrace_profile_##call(proto) \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ sizeof(u64)); \ + __entry_size -= sizeof(u32); \ \ do { \ char raw_data[__entry_size]; \ -- cgit v0.10.2 From 1853db0e02ae4088f102b0d8e59e83dc98f93f03 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 10 Aug 2009 16:38:36 +0200 Subject: perf_counter: Zero dead bytes from ftrace raw samples size alignment After aligning the ftrace raw samples, there are dead bytes storing random data from the stack. We don't want to leak these to userspace, then zero these out. Before: 0x2de88 [0x50]: event: 9 . . ... raw event: size 80 bytes . 0000: 09 00 00 00 01 00 50 00 d0 c7 00 81 ff ff ff ff ......P........ . 0010: 68 01 00 00 68 01 00 00 2c 00 00 00 00 00 00 00 h...h...,...... . 0020: 2c 00 00 00 2b 00 01 02 68 01 00 00 68 01 00 00 ,...+...h...h.. . 0030: 6b 6f 6e 64 65 6d 61 6e 64 2f 30 00 00 00 00 00 kondemand/0.... . 0040: 68 01 00 00 40 7f 46 81 ff ff ff ff 00 10 1b 7f h...@.F........ ^ ^ ^ ^ Leak After: 0x2d318 [0x50]: event: 9 . . ... raw event: size 80 bytes . 0000: 09 00 00 00 01 00 50 00 d0 c7 00 81 ff ff ff ff ......P........ . 0010: 68 01 00 00 68 01 00 00 68 14 00 00 00 00 00 00 h...h...h...... . 0020: 2c 00 00 00 2b 00 01 02 68 01 00 00 68 01 00 00 ,...+...h...h.. . 0030: 6b 6f 6e 64 65 6d 61 6e 64 2f 30 00 00 00 00 00 kondemand/0.... . 0040: 68 01 00 00 a0 80 46 81 ff ff ff ff 00 00 00 00 h.....F........ ^ ^ ^ ^ Fixed Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1249915116-5210-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a05524f..f64fbaa 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -648,6 +648,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * char raw_data[__entry_size]; <- allocate our sample in the stack * struct trace_entry *ent; * + * zero dead bytes from alignment to avoid stack leak to userspace: + * + * *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; * entry = (struct ftrace_raw_ *)raw_data; * ent = &entry->ent; * tracing_generic_entry_update(ent, irq_flags, pc); @@ -698,6 +701,7 @@ static void ftrace_profile_##call(proto) \ char raw_data[__entry_size]; \ struct trace_entry *ent; \ \ + *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \ entry = (struct ftrace_raw_##call *)raw_data; \ ent = &entry->ent; \ tracing_generic_entry_update(ent, irq_flags, pc); \ -- cgit v0.10.2 From 1392e3b33319fd1a2527bebfc56631c2f2d3c7c5 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 8 Aug 2009 17:52:50 +0900 Subject: documentation: register ioctl entry of nilfs2 This will register the ioctl range used by nilfs2 file system to the table listed in Documentation/ioctl/ioctl-number.txt. Signed-off-by: Ryusuke Konishi Signed-off-by: Linus Torvalds diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 7bb0d93..dbea4f9 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -139,6 +139,7 @@ Code Seq# Include File Comments 'm' all linux/synclink.h conflict! 'm' 00-1F net/irda/irmod.h conflict! 'n' 00-7F linux/ncp_fs.h +'n' 80-8F linux/nilfs2_fs.h NILFS2 'n' E0-FF video/matrox.h matroxfb 'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2 'o' 00-03 include/mtd/ubi-user.h conflict! (OCFS2 and UBI overlaps) -- cgit v0.10.2 From 5e2f89b5d5d87a7c3ba19fc85ba0c29adb65f639 Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Sat, 8 Aug 2009 21:01:22 +0800 Subject: mempool.c: clean up type-casting clean up type-casting twice. "size_t" is typedef as "unsigned long" in 64-bit system, and "unsigned int" in 32-bit system, and the intermediate cast to 'long' is pointless. Signed-off-by: Figo.zhang Signed-off-by: Linus Torvalds diff --git a/mm/mempool.c b/mm/mempool.c index a46eb1b..32e75d4 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab); */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t)(long)pool_data; + size_t size = (size_t)pool_data; return kmalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t) pool_data; + size_t size = (size_t)pool_data; return kzalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kzalloc); -- cgit v0.10.2 From 04e35357e2e3ff4e0cabd6468354cf3dbfeb4f27 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Mon, 10 Aug 2009 16:45:42 +0100 Subject: MN10300: includecheck fix: mn10300, pci.h Fix the following 'make includecheck' warning: arch/mn10300/include/asm/pci.h: linux/mm.h is included more than once. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: David Howells Signed-off-by: Linus Torvalds diff --git a/arch/mn10300/include/asm/pci.h b/arch/mn10300/include/asm/pci.h index 35d2ed6..19aecc9 100644 --- a/arch/mn10300/include/asm/pci.h +++ b/arch/mn10300/include/asm/pci.h @@ -59,7 +59,6 @@ void pcibios_penalize_isa_irq(int irq); #include #include #include -#include #include struct pci_dev; -- cgit v0.10.2 From b6e61eef4f9f94714ac3ee4a5c96862d9bcd1836 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 31 Jul 2009 12:45:41 -0700 Subject: x86: Fix serialization in pit_expect_msb() Wei Chong Tan reported a fast-PIT-calibration corner-case: | pit_expect_msb() is vulnerable to SMI disturbance corner case | in some platforms which causes /proc/cpuinfo to show wrong | CPU MHz value when quick_pit_calibrate() jumps to success | section. I think that the real issue isn't even an SMI - but the fact that in the very last iteration of the loop, there's no serializing instruction _after_ the last 'rdtsc'. So even in the absense of SMI's, we do have a situation where the cycle counter was read without proper serialization. The last check should be done outside the outer loop, since _inside_ the outer loop, we'll be testing that the PIT has the right MSB value has the right value in the next iteration. So only the _last_ iteration is special, because that's the one that will not check the PIT MSB value any more, and because the final 'get_cycles()' isn't serialized. In other words: - I'd like to move the PIT MSB check to after the last iteration, rather than in every iteration - I think we should comment on the fact that it's also a serializing instruction and so 'fences in' the TSC read. Here's a suggested replacement. Signed-off-by: Linus Torvalds Reported-by: "Tan, Wei Chong" Tested-by: "Tan, Wei Chong" LKML-Reference: Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6e1a368..71f4368 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -275,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequencty. */ +static inline int pit_verify_msb(unsigned char val) +{ + /* Ignore LSB */ + inb(0x42); + return inb(0x42) == val; +} + static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0; for (count = 0; count < 50000; count++) { - /* Ignore LSB */ - inb(0x42); - if (inb(0x42) != val) + if (!pit_verify_msb(val)) break; tsc = get_cycles(); } @@ -336,8 +341,7 @@ static unsigned long quick_pit_calibrate(void) * to do that is to just read back the 16-bit counter * once from the PIT. */ - inb(0x42); - inb(0x42); + pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { @@ -348,8 +352,19 @@ static unsigned long quick_pit_calibrate(void) * Iterate until the error is less than 500 ppm */ delta -= tsc; - if (d1+d2 < delta >> 11) - goto success; + if (d1+d2 >= delta >> 11) + continue; + + /* + * Check the PIT one more time to verify that + * all TSC reads were stable wrt the PIT. + * + * This also guarantees serialization of the + * last cycle read ('d2') in pit_expect_msb. + */ + if (!pit_verify_msb(0xfe - i)) + break; + goto success; } } printk("Fast TSC calibration failed\n"); -- cgit v0.10.2 From 392741e0a4e17c82e3978b7fcbf04291294dc0a1 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 7 Aug 2009 15:20:48 -0700 Subject: futex: Fix handling of bad requeue syscall pairing If futex_requeue(requeue_pi=1) finds a futex_q that was created by a call other the futex_wait_requeue_pi(), the q.rt_waiter may be null. If so, this will result in an oops from the following call graph: futex_requeue() rt_mutex_start_proxy_lock() task_blocks_on_rt_mutex() waiter->task dereference OOPS We currently WARN_ON() if this is detected, clearly this is inadequate. If we detect a mispairing in futex_requeue(), bail out, seding -EINVAL to user-space. V2: Fix parenthesis warnings. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Steven Rostedt Cc: John Kacur Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7CA8C0.7010809@us.ibm.com> Signed-off-by: Ingo Molnar diff --git a/kernel/futex.c b/kernel/futex.c index 8cc3ee1..e18cfbd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1256,8 +1256,15 @@ retry_private: if (!match_futex(&this->key, &key1)) continue; - WARN_ON(!requeue_pi && this->rt_waiter); - WARN_ON(requeue_pi && !this->rt_waiter); + /* + * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + */ + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter)) { + ret = -EINVAL; + break; + } /* * Wake nr_wake waiters. For requeue_pi, if we acquired the -- cgit v0.10.2 From 3e03bbeac541856aaaf1ce1ab0250b6a490e4099 Mon Sep 17 00:00:00 2001 From: Shunichi Fuji Date: Tue, 11 Aug 2009 03:34:40 +0900 Subject: x86: Add reboot quirk for every 5 series MacBook/Pro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reboot does not work on my MacBook Pro 13 inch (MacBookPro5,5) too. It seems all unibody MacBook and MacBookPro require PCI reboot handling, i guess. Following model/machine ID list shows unibody MacBook/Pro have the 5 series of model number: http://www.everymac.com/systems/by_capability/macs-by-machine-model-machine-id.html Signed-off-by: Shunichi Fuji Cc: Ozan Çağlayan LKML-Reference: <30046e3b0908101134p6487ddbftd8776e4ddef204be@mail.gmail.com> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9eb8976..a06e8d1 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -418,20 +418,20 @@ static int __init set_pci_reboot(const struct dmi_system_id *d) } static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { - { /* Handle problems with rebooting on Apple MacBook5,2 */ + { /* Handle problems with rebooting on Apple MacBook5 */ .callback = set_pci_reboot, - .ident = "Apple MacBook", + .ident = "Apple MacBook5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), }, }, - { /* Handle problems with rebooting on Apple MacBookPro5,1 */ + { /* Handle problems with rebooting on Apple MacBookPro5 */ .callback = set_pci_reboot, - .ident = "Apple MacBookPro5,1", + .ident = "Apple MacBookPro5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5,1"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), }, }, { } -- cgit v0.10.2 From b409d7a0ab46fe530efe52734984b4ed5d46c3eb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 6 Aug 2009 23:29:34 +0200 Subject: ocfs2: Fix possible deadlock when extending quota file In OCFS2, allocator locks rank above transaction start. Thus we cannot extend quota file from inside a transaction less we could deadlock. We solve the problem by starting transaction not already in ocfs2_acquire_dquot() but only in ocfs2_local_read_dquot() and ocfs2_global_read_dquot() and we allocate blocks to quota files before starting the transaction. In case we crash, quota files will just have a few blocks more but that's no problem since we just use them next time we extend the quota file. Signed-off-by: Jan Kara Signed-off-by: Joel Becker diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 4a4d3b5..2c3222a 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -363,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb) return credits; } -/* Number of credits needed for removing quota structure from file */ -int ocfs2_calc_qdel_credits(struct super_block *sb, int type); -/* Number of credits needed for initialization of new quota structure */ -int ocfs2_calc_qinit_credits(struct super_block *sb, int type); - /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index d604a6a..bf7742d 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -215,11 +215,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, loff_t rounded_end = ocfs2_align_bytes_to_blocks(sb, off + len); - down_write(&OCFS2_I(gqinode)->ip_alloc_sem); - err = ocfs2_extend_no_holes(gqinode, rounded_end, off); - up_write(&OCFS2_I(gqinode)->ip_alloc_sem); - if (err < 0) - goto out; + /* Space is already allocated in ocfs2_global_read_dquot() */ err = ocfs2_simple_size_update(gqinode, oinfo->dqi_gqi_bh, rounded_end); @@ -405,13 +401,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type) return err; } +static int ocfs2_global_qinit_alloc(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + /* + * We may need to allocate tree blocks and a leaf block but not the + * root block + */ + return oinfo->dqi_gi.dqi_qtree_depth; +} + +static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) +{ + /* We modify all the allocated blocks, tree root, and info block */ + return (ocfs2_global_qinit_alloc(sb, type) + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; +} + /* Read in information from global quota file and acquire a reference to it. * dquot_acquire() has already started the transaction and locked quota file */ int ocfs2_global_read_dquot(struct dquot *dquot) { int err, err2, ex = 0; - struct ocfs2_mem_dqinfo *info = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct inode *gqinode = info->dqi_gqinode; + int need_alloc = ocfs2_global_qinit_alloc(sb, type); + handle_t *handle = NULL; err = ocfs2_qinfo_lock(info, 0); if (err < 0) @@ -422,14 +441,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot) OCFS2_DQUOT(dquot)->dq_use_count++; OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + ocfs2_qinfo_unlock(info, 0); + if (!dquot->dq_off) { /* No real quota entry? */ - /* Upgrade to exclusive lock for allocation */ - ocfs2_qinfo_unlock(info, 0); - err = ocfs2_qinfo_lock(info, 1); - if (err < 0) - goto out_qlock; ex = 1; + /* + * Add blocks to quota file before we start a transaction since + * locking allocators ranks above a transaction start + */ + WARN_ON(journal_current_handle()); + down_write(&OCFS2_I(gqinode)->ip_alloc_sem); + err = ocfs2_extend_no_holes(gqinode, + gqinode->i_size + (need_alloc << sb->s_blocksize_bits), + gqinode->i_size); + up_write(&OCFS2_I(gqinode)->ip_alloc_sem); + if (err < 0) + goto out; + } + + handle = ocfs2_start_trans(osb, + ocfs2_calc_global_qinit_credits(sb, type)); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out; } + err = ocfs2_qinfo_lock(info, ex); + if (err < 0) + goto out_trans; err = qtree_write_dquot(&info->dqi_gi, dquot); if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); @@ -441,6 +479,9 @@ out_qlock: ocfs2_qinfo_unlock(info, 1); else ocfs2_qinfo_unlock(info, 0); +out_trans: + if (handle) + ocfs2_commit_trans(osb, handle); out: if (err < 0) mlog_errno(err); @@ -638,24 +679,17 @@ out: return status; } -int ocfs2_calc_qdel_credits(struct super_block *sb, int type) +static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) { - struct ocfs2_mem_dqinfo *oinfo; - int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) - return 0; - - oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; /* * We modify tree, leaf block, global info, local chunk header, * global and local inode; OCFS2_QINFO_WRITE_CREDITS already * accounts for inode update */ - return oinfo->dqi_gi.dqi_qtree_depth + + return (oinfo->dqi_gi.dqi_qtree_depth + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + OCFS2_QINFO_WRITE_CREDITS + - 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + OCFS2_INODE_UPDATE_CREDITS; } @@ -688,36 +722,10 @@ out: return status; } -int ocfs2_calc_qinit_credits(struct super_block *sb, int type) -{ - struct ocfs2_mem_dqinfo *oinfo; - int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; - struct ocfs2_dinode *lfe, *gfe; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) - return 0; - - oinfo = sb_dqinfo(sb, type)->dqi_priv; - gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data; - lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data; - /* We can extend local file + global file. In local file we - * can modify info, chunk header block and dquot block. In - * global file we can modify info, tree and leaf block */ - return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + - ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + - OCFS2_LOCAL_QINFO_WRITE_CREDITS + - 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + - oinfo->dqi_gi.dqi_qtree_depth + - 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; -} - static int ocfs2_acquire_dquot(struct dquot *dquot) { - handle_t *handle; struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; - struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); int status = 0; mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); @@ -726,16 +734,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; - handle = ocfs2_start_trans(osb, - ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type)); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_ilock; - } status = dquot_acquire(dquot); - ocfs2_commit_trans(osb, handle); -out_ilock: ocfs2_unlock_global_qf(oinfo, 1); out: mlog_exit(status); -- cgit v0.10.2 From 85dfd81dc57e8183a277ddd7a56aa65c96f3f487 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 10 Aug 2009 13:21:19 -0700 Subject: pty: fix data loss when stopped (^S/^Q) Commit d945cb9cc ("pty: Rework the pty layer to use the normal buffering logic") dropped the test for 'tty->stopped' in pty_write_room(), which then causes the n_tty line discipline thing to not throttle the data properly when the tty is stopped. So instead of pausing the write due to the tty being stopped, the ldisc layer would go ahead and push it down to the pty. The pty write() routine would then refuse to take the data (because it _did_ check 'stopped'), and the data wouldn't actually be written. This whole stopped test should eventually be moved into the tty ldisc layer rather than have low-level tty drivers care about these things, but right now the fix is to just re-instate the missing pty 'stopped' handling. Reported-and-tested-by: Artur Skawina Cc: Alan Cox Signed-off-by: Linus Torvalds diff --git a/drivers/char/pty.c b/drivers/char/pty.c index 6e6942c..d083c73 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -144,6 +144,8 @@ static int pty_write(struct tty_struct *tty, const unsigned char *buf, static int pty_write_room(struct tty_struct *tty) { + if (tty->stopped) + return 0; return pty_space(tty->link); } -- cgit v0.10.2 From 651b1f125c7e3806bbd635739d009433dc07372d Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 10 Aug 2009 23:41:18 +0200 Subject: PM / Driver Core: Kill dev_pm_ops platform warning for now Commit 783ea7d4eeefe895f2731fe73ac951e94418927b (Driver Core: Rework platform suspend/resume, print warning) added a warning message printed for platform drivers that use the legacy PM callbacks rather than struct dev_pm_ops. Unfortunately, this resulted in some confusion and made some people try to convert drivers by replacing the old callbacks with struct dev_pm_ops in automatic way, which generally is not a good idea. Remove the platform device runtime dev_pm_ops warning for now, because it's annoying to users and it's not really necessary right now. [rjw: Modified the changelog to be more informative.] Signed-off-by: Magnus Damm Acked-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 81cb01b..456594b 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -483,9 +483,6 @@ int platform_driver_register(struct platform_driver *drv) drv->driver.remove = platform_drv_remove; if (drv->shutdown) drv->driver.shutdown = platform_drv_shutdown; - if (drv->suspend || drv->resume) - pr_warning("Platform driver '%s' needs updating - please use " - "dev_pm_ops\n", drv->driver.name); return driver_register(&drv->driver); } -- cgit v0.10.2 From 314dabb83a547ec4da819e8cbc78fac9cec605cd Mon Sep 17 00:00:00 2001 From: James Morris Date: Mon, 10 Aug 2009 22:00:13 +1000 Subject: SELinux: fix memory leakage in /security/selinux/hooks.c Fix memory leakage in /security/selinux/hooks.c The buffer always needs to be freed here; we either error out or allocate more memory. Reported-by: iceberg Signed-off-by: James Morris Acked-by: Stephen Smalley diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 15c2a08..1e8cfc4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1285,6 +1285,8 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent rc = inode->i_op->getxattr(dentry, XATTR_NAME_SELINUX, context, len); if (rc == -ERANGE) { + kfree(context); + /* Need a larger buffer. Query for the right size. */ rc = inode->i_op->getxattr(dentry, XATTR_NAME_SELINUX, NULL, 0); @@ -1292,7 +1294,6 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent dput(dentry); goto out_unlock; } - kfree(context); len = rc; context = kmalloc(len+1, GFP_NOFS); if (!context) { -- cgit v0.10.2 From dd704698f56c1451fc9c5daadcd6e3a089de2c40 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 11 Aug 2009 08:45:11 +0200 Subject: ALSA: hda - Don't override ADC definitions for ALC codecs ALC269 and ALC861-VD parsers override the ADC definitions unconditionally without checking the spec definition. This causes the problem when any inconsistent ADC is set up in the device quirk (like ALC272 with digital-mic). This patch avoids the overriding by adding the proper checks. Reference: Novell bnc#529467 https://bugzilla.novell.com/show_bug.cgi?id=529467 Signed-off-by: Takashi Iwai diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 5cc927f..fea9767 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -15579,9 +15579,12 @@ static int patch_alc861vd(struct hda_codec *codec) spec->stream_digital_playback = &alc861vd_pcm_digital_playback; spec->stream_digital_capture = &alc861vd_pcm_digital_capture; - spec->adc_nids = alc861vd_adc_nids; - spec->num_adc_nids = ARRAY_SIZE(alc861vd_adc_nids); - spec->capsrc_nids = alc861vd_capsrc_nids; + if (!spec->adc_nids) { + spec->adc_nids = alc861vd_adc_nids; + spec->num_adc_nids = ARRAY_SIZE(alc861vd_adc_nids); + } + if (!spec->capsrc_nids) + spec->capsrc_nids = alc861vd_capsrc_nids; set_capture_mixer(spec); set_beep_amp(spec, 0x0b, 0x05, HDA_INPUT); @@ -17498,9 +17501,12 @@ static int patch_alc662(struct hda_codec *codec) spec->stream_digital_playback = &alc662_pcm_digital_playback; spec->stream_digital_capture = &alc662_pcm_digital_capture; - spec->adc_nids = alc662_adc_nids; - spec->num_adc_nids = ARRAY_SIZE(alc662_adc_nids); - spec->capsrc_nids = alc662_capsrc_nids; + if (!spec->adc_nids) { + spec->adc_nids = alc662_adc_nids; + spec->num_adc_nids = ARRAY_SIZE(alc662_adc_nids); + } + if (!spec->capsrc_nids) + spec->capsrc_nids = alc662_capsrc_nids; if (!spec->cap_mixer) set_capture_mixer(spec); -- cgit v0.10.2 From 0d01f31439c1e4d602bf9fdc924ab66f407f5e38 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 9 Aug 2009 21:44:49 -0700 Subject: x86, mce: therm_throt - change when we print messages My Latitude d630 seems to be handling thermal events in SMI by lowering the max frequency of the CPU till it cools down but still leaks the "everything is normal" events. This spams the console and with high priority printks. Adjust therm_throt driver to only print messages about the fact that temperatire returned back to normal when leaving the throttling state. Also lower the severity of "back to normal" message from KERN_CRIT to KERN_INFO. Signed-off-by: Dmitry Torokhov Acked-by: H. Peter Anvin LKML-Reference: <20090810051513.0558F526EC9@mailhub.coreip.homeip.net> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd1..8bc64cf 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -36,6 +36,7 @@ static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +static DEFINE_PER_CPU(bool, thermal_throttle_active); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -96,24 +97,27 @@ static int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); + bool was_throttled = __get_cpu_var(thermal_throttle_active); + bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; - if (curr) + if (is_throttled) __get_cpu_var(thermal_throttle_count)++; - if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + if (!(was_throttled ^ is_throttled) && + time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; /* if we just entered the thermal event */ - if (curr) { + if (is_throttled) { printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", cpu, - __get_cpu_var(thermal_throttle_count)); + "cpu clock throttled (total events = %lu)\n", + cpu, __get_cpu_var(thermal_throttle_count)); add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + } else if (was_throttled) { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); } return 1; -- cgit v0.10.2 From 3c581a7f94542341bf0da496a226b44ac63521a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:47:36 +0200 Subject: perf_counter, x86: Fix lapic printk message Instead of this garbled bootup on UP Pentium-M systems: [ 0.015048] Performance Counters: [ 0.016004] no Local APIC, try rebooting with lapicno PMU driver, software counters only. Print: [ 0.015050] Performance Counters: [ 0.016004] no APIC, boot with the "lapic" boot parameter to force-enable it. [ 0.017003] no PMU driver, software counters only. Cf: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a7aa8f9..40e233a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1590,7 +1590,7 @@ static int p6_pmu_init(void) } if (!cpu_has_apic) { - pr_info("no Local APIC, try rebooting with lapic"); + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); return -ENODEV; } -- cgit v0.10.2 From f64ccccb8afa43abdd63fcbd230f818d6ea0883f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:26:33 +0200 Subject: perf_counter, x86: Fix generic cache events on P6-mobile CPUs Johannes Stezenbach reported that 'perf stat' does not count cache-miss and cache-references events on his Pentium-M based laptop. This is because we left them blank in p6_perfmon_event_map[], fill them in. Reported-by: Johannes Stezenbach Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 40e233a..fffc126 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -72,8 +72,8 @@ static const u64 p6_perfmon_event_map[] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0000, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0000, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -- cgit v0.10.2 From fbd8b1819e80ac5a176d085fdddc3a34d1499318 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Mon, 10 Aug 2009 19:56:45 -0300 Subject: x86: Clear incorrectly forced X86_FEATURE_LAHF_LM flag Due to an erratum with certain AMD Athlon 64 processors, the BIOS may need to force enable the LAHF_LM capability. Unfortunately, in at least one case, the BIOS does this even for processors that do not support the functionality. Add a specific check that will clear the feature bit for processors known not to support the LAHF/SAHF instructions. Signed-off-by: Kevin Winchester Acked-by: Borislav Petkov LKML-Reference: <4A80A5AD.2000209@gmail.com> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e2485b0..63fddcd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -400,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) level = cpuid_eax(1); if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 + * revision D (model = 0x14) and later actually support it. + */ + if (c->x86_model < 0x14) + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); -- cgit v0.10.2 From 51b89f7a6615eca184aa0b85db5781d931e9c8d1 Mon Sep 17 00:00:00 2001 From: Fenghua Yu <[fenghua.yu@intel.com]> Date: Tue, 11 Aug 2009 14:52:10 -0700 Subject: Bug Fix arch/ia64/kernel/pci-dma.c: fix recursive dma_supported() call in iommu_dma_supported() In commit 160c1d8e40866edfeae7d68816b7005d70acf391, dma_ops->dma_supported = iommu_dma_supported; This dma_ops->dma_supported is first called in platform_dma_init() during kernel boot. Then dma_ops->dma_supported will be called recursively in iommu_dma_supported. Kernel can not boot because kernel can not get out of iommu_dma_supported until it runs out of stack memory. Signed-off-by: Fenghua Yu diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c index 0569596..f6b1ff0 100644 --- a/arch/ia64/kernel/pci-dma.c +++ b/arch/ia64/kernel/pci-dma.c @@ -69,11 +69,6 @@ iommu_dma_init(void) int iommu_dma_supported(struct device *dev, u64 mask) { - struct dma_map_ops *ops = platform_dma_get_ops(dev); - - if (ops->dma_supported) - return ops->dma_supported(dev, mask); - /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. The caller just has to use GFP_DMA in this case. */ -- cgit v0.10.2 From 8d6f9af91959256244878cd801c1c969e66cd093 Mon Sep 17 00:00:00 2001 From: Johannes Weiner <[hannes@cmpxchg.org]> Date: Tue, 11 Aug 2009 14:52:10 -0700 Subject: ia64: boolean __test_and_clear_bit __test_and_clear_bit() returns a bitfield with the tested-for bit set. Make it consistent with the other bitops - of ia64 but also every other architecture - and return a boolean value. Signed-off-by: Johannes Weiner Acked-by: Fenghua Yu diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h index e2ca800..57a2787 100644 --- a/arch/ia64/include/asm/bitops.h +++ b/arch/ia64/include/asm/bitops.h @@ -286,7 +286,7 @@ __test_and_clear_bit(int nr, volatile void * addr) { __u32 *p = (__u32 *) addr + (nr >> 5); __u32 m = 1 << (nr & 31); - int oldbitset = *p & m; + int oldbitset = (*p & m) != 0; *p &= ~m; return oldbitset; -- cgit v0.10.2 From cfa5f809e399c699974ba6018eefa022bbc2e16e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput <[jaswinder@kernel.org]> Date: Tue, 11 Aug 2009 14:52:10 -0700 Subject: IA64: includecheck fix: ia64, ia64_ksyms.c fix the following 'make includecheck' warning: arch/ia64/kernel/ia64_ksyms.c: asm/page.h is included more than once. Signed-off-by: Jaswinder Singh Rajput Acked-by: Fenghua Yu diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 2d31186..8ebccb5 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -21,6 +21,7 @@ EXPORT_SYMBOL(csum_ipv6_magic); #include EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(copy_page); #ifdef CONFIG_VIRTUAL_MEM_MAP #include @@ -60,9 +61,6 @@ EXPORT_SYMBOL(__udivdi3); EXPORT_SYMBOL(__moddi3); EXPORT_SYMBOL(__umoddi3); -#include -EXPORT_SYMBOL(copy_page); - #if defined(CONFIG_MD_RAID456) || defined(CONFIG_MD_RAID456_MODULE) extern void xor_ia64_2(void); extern void xor_ia64_3(void); -- cgit v0.10.2 From b5a8879347bbe68bd24c8870503bf6a0362da26b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput <[jaswinder@kernel.org]> Date: Tue, 11 Aug 2009 14:52:11 -0700 Subject: IA64: includecheck fix: ia64, pgtable.h fix the following 'make includecheck' warning: arch/ia64/include/asm/pgtable.h: asm/processor.h is included more than once. Signed-off-by: Jaswinder Singh Rajput Acked-by: Fenghua Yu diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 0a9cc73..8840a69 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -155,7 +155,6 @@ #include #include #include -#include /* * Next come the mappings that determine how mmap() protection bits -- cgit v0.10.2 From bf2a4c7270b9a22243a91ab5efcc47aaf997c66b Mon Sep 17 00:00:00 2001 From: Fenghua Yu <[fenghua.yu@intel.com]> Date: Tue, 11 Aug 2009 14:52:11 -0700 Subject: arch/ia64/Makefile: Remove -mtune=merced in IA64 kernel build Between GCC version 3.4.0 and 4.3.3 (including 3.4.0 and 4.3.3), -mtune=merced is implemented in GCC. Starting from 4.4.0, -mtune=merced is deprecated. Even implemented in versions between 3.4.0 and 4.3.3, the -mtune=merced feature has been broken in some of the versions. For example, GCC 4.1.2 reports interanl tuning function errors during kernel building with -mtune=merced. Or GCC Bugzilla 16130 reports another -mtune=merced issue on GCC 3.4.1. So I would remove the -mtune=merced from IA64 kernel build. Without this option, kernel on Merced will remain the same except losing an unstable and out-of-date performance tunning feature. Since GCC version 3.4.0, -mtune=mckinley has been implemented. The -mtune=mckinley option functions the same as mtune=itanium2. And mtune=itanium2 is the default option. So we don't need to add mtune=mckinley either since its been the default option in any GCC version which implements this option. Signed-off-by: Fenghua Yu diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 58a7e46a..e7cbaa0 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -41,11 +41,6 @@ $(error Sorry, you need a newer version of the assember, one that is built from ftp://ftp.hpl.hp.com/pub/linux-ia64/gas-030124.tar.gz) endif -ifeq ($(call cc-version),0304) - cflags-$(CONFIG_ITANIUM) += -mtune=merced - cflags-$(CONFIG_MCKINLEY) += -mtune=mckinley -endif - KBUILD_CFLAGS += $(cflags-y) head-y := arch/ia64/kernel/head.o arch/ia64/kernel/init_task.o -- cgit v0.10.2 From 5359dffd4396f281c5b77de1acbee6fb1b333b23 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 11 Aug 2009 14:52:11 -0700 Subject: ia64/topology.c: exit cache_add_dev when kobject_init_and_add fails Make cache_add_dev exit sysfs when kobject_init_and_add returns an error. Signed-off-by: Xiaotian Feng Signed-off-by: Fenghua Yu diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index bc80dff..8f06035 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -372,6 +372,10 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) retval = kobject_init_and_add(&all_cpu_cache_info[cpu].kobj, &cache_ktype_percpu_entry, &sys_dev->kobj, "%s", "cache"); + if (unlikely(retval < 0)) { + cpu_cache_sysfs_exit(cpu); + return retval; + } for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++) { this_object = LEAF_KOBJECT_PTR(cpu,i); @@ -385,7 +389,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } kobject_put(&all_cpu_cache_info[cpu].kobj); cpu_cache_sysfs_exit(cpu); - break; + return retval; } kobject_uevent(&(this_object->kobj), KOBJ_ADD); } -- cgit v0.10.2 From e7369e01eb85550ed60dd1b0e120b69dfb03dc23 Mon Sep 17 00:00:00 2001 From: Roel Kluin <[roel.kluin@gmail.com]> Date: Tue, 11 Aug 2009 14:52:11 -0700 Subject: arch/ia64/kernel/iosapic: missing test after ioremap() Missing test after ioremap() Signed-off-by: Roel Kluin Acked-by: Fenghua Yu diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index c48b03f..dab4d39 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -1072,6 +1072,10 @@ iosapic_init (unsigned long phys_addr, unsigned int gsi_base) } addr = ioremap(phys_addr, 0); + if (addr == NULL) { + spin_unlock_irqrestore(&iosapic_lock, flags); + return -ENOMEM; + } ver = iosapic_version(addr); if ((err = iosapic_check_gsi_range(gsi_base, ver))) { iounmap(addr); -- cgit v0.10.2 From 0cc6eee130b0c062feec8446d9cecdb17d2cfad3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:53 -0400 Subject: xfs: avoid memory allocation under m_peraglock in growfs code Allocate the memory for the larger m_perag array before taking the per-AG lock as the per-AG lock can be taken under the i_lock which can be taken from reclaim context. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cbd451b..2d0b3e1 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -167,17 +167,25 @@ xfs_growfs_data_private( new = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; if (nagcount > oagcount) { + void *new_perag, *old_perag; + xfs_filestream_flush(mp); + + new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount, + KM_MAYFAIL); + if (!new_perag) + return XFS_ERROR(ENOMEM); + down_write(&mp->m_peraglock); - mp->m_perag = kmem_realloc(mp->m_perag, - sizeof(xfs_perag_t) * nagcount, - sizeof(xfs_perag_t) * oagcount, - KM_SLEEP); - memset(&mp->m_perag[oagcount], 0, - (nagcount - oagcount) * sizeof(xfs_perag_t)); + memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount); + old_perag = mp->m_perag; + mp->m_perag = new_perag; + mp->m_flags |= XFS_MOUNT_32BITINODES; nagimax = xfs_initialize_perag(mp, nagcount); up_write(&mp->m_peraglock); + + kmem_free(old_perag); } tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); tp->t_flags |= XFS_TRANS_RESERVE; -- cgit v0.10.2 From ca35dcd6cae7d4a780c484c53f45548c4719f82c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:54 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_getbmap xfs_getbmap allocates memory with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 7928b99..8ee5b5a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -6009,7 +6009,7 @@ xfs_getbmap( */ error = ENOMEM; subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL); + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); if (!map) goto out_unlock_ilock; -- cgit v0.10.2 From f41d7fb9da05b604f8a69fb6cac2a0563c8ede4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:55 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_da_state_alloc xfs_da_state_alloc is always called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 9ff6e57..bd0bb6d 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ xfs_da_state_t * xfs_da_state_alloc(void) { - return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP); + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); } /* -- cgit v0.10.2 From 73195ed7864ae4a1fb0bea2ed9df59d19b4fde90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:56 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_da_buf_make i_lock is taken in the reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index bd0bb6d..2847bbc 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) int off; if (nbuf == 1) - dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP); + dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); else - dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP); + dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); dabuf->dirty = 0; #ifdef XFS_DABUF_DEBUG dabuf->ra = ra; -- cgit v0.10.2 From 3f52c2f0a07c23771909cc53f2e9451a7f1bf253 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:57 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_dir_cilookup_result xfs_dir_cilookup_result is always called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index c657bec..bb1d58e 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -256,7 +256,7 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return EEXIST; - args->value = kmem_alloc(len, KM_MAYFAIL); + args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); if (!args->value) return ENOMEM; -- cgit v0.10.2 From 36fae17a648e0aee5d9560514d08477ef48dc87f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:58 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_buf_associate_memory xfs_buf_associate_memory is used for setting up the spare buffer for the log wrap case in xlog_sync which can happen under i_lock when called from xfs_fsync. The i_lock mutex is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. There are a couple more uses of xfs_buf_associate_memory in the log recovery code that are also affected by this, but I'd rather keep the code simple than passing on a gfp_mask argument. Longer term we should just stop requiring the memoery allocation in xlog_sync by some smaller rework of the buffer layer. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 1418b91..178c20c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -770,7 +770,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, 0); + rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); if (rval) return rval; -- cgit v0.10.2 From 10746e47e722b5688fcd6eba9fbf9b2e64a248a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:59 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_attr_rmtval_set xfs_attr_rmtval_set is always called with i_lock held, and i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index db15feb..bfb5837 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2141,8 +2141,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK); + bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK); ASSERT(bp); ASSERT(!XFS_BUF_GETERROR(bp)); -- cgit v0.10.2 From 7b02ecb3031b192823bc732ae717febc0a59aa92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:15:00 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_readlink_bmap xfs_readlink_bmap is called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c4eca5e..492d75b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -538,7 +538,9 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); + bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt), + XBF_LOCK | XBF_MAPPED | + XBF_DONT_BLOCK); error = XFS_BUF_GETERROR(bp); if (error) { xfs_ioerror_alert("xfs_readlink", -- cgit v0.10.2 From ddd3a14e0f030f0f7b900621f67532285b8657ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:15:01 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_attr_rmtval_get xfs_attr_rmtval_get is always called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index bfb5837..4ece190 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2010,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK, &bp); + blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK, + &bp); if (error) return(error); -- cgit v0.10.2 From e0c222c411e22f086e929cd69fdcc89336164ec1 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 20 Jul 2009 10:52:15 -0500 Subject: use XFS_CORRUPTION_ERROR in xfs_btree_check_sblock In Red Hat Bug 512552 - Can't write to XFS mount during raid5 resync a user ran into corruption while resyncing a raid, and we failed a consistency test, but didn't get much more info; it'd be nice to call XFS_CORRUPTION_ERROR here so we can see the buffer contents. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Felix Blyakher diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e9df995..2671738 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -120,8 +120,8 @@ xfs_btree_check_sblock( XFS_RANDOM_BTREE_CHECK_SBLOCK))) { if (bp) xfs_buftrace("SBTREE ERROR", bp); - XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW, - cur->bc_mp); + XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", + XFS_ERRLEVEL_LOW, cur->bc_mp, block); return XFS_ERROR(EFSCORRUPTED); } return 0; -- cgit v0.10.2 From b89d4208de3de442c9025919c4261be0b38e79a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Aug 2009 11:32:18 -0300 Subject: xfs: check for dinode realtime flag corruption Ramon tested XFS with a modified version of fsfuzzer and hit a NULL pointer dereference in __xfs_get_blocks due to the RT device target pointer being NULL. To fix this reject inode with the realtime bit set on a a filesystem without an RT subvolume during inode read. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Reported-by: Ramon de Carvalho Valle Tested-by: Ramon de Carvalho Valle Signed-off-by: Felix Blyakher diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1f22d65..da428b3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -343,6 +343,16 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + switch (ip->i_d.di_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: -- cgit v0.10.2 From a8914f3a6d72c97328597a556a99daaf5cc288ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Aug 2009 11:32:44 -0300 Subject: xfs: fix spin_is_locked assert on uni-processor builds Without SMP or preemption spin_is_locked always returns false, so we can't do an assert with it. Instead use assert_spin_locked, which does the right thing on all builds. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reported-by: Johannes Engel Tested-by: Johannes Engel Signed-off-by: Felix Blyakher diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3750f04..9dbdff3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3180,7 +3180,7 @@ try_again: STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) { - ASSERT(spin_is_locked(&log->l_icloglock)); + assert_spin_locked(&log->l_icloglock); if (iclog->ic_state == XLOG_STATE_ACTIVE) { xlog_state_switch_iclogs(log, iclog, 0); -- cgit v0.10.2 From e8055139d996e85722984968472868d6dccb1490 Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Tue, 11 Aug 2009 20:00:11 +0200 Subject: x86: Fix oops in identify_cpu() on CPUs without CPUID Kernel is broken for x86 CPUs without CPUID since 2.6.28. It crashes with NULL pointer dereference in identify_cpu(): 766 generic_identify(c); 767 768--> if (this_cpu->c_identify) 769 this_cpu->c_identify(c); this_cpu is NULL. This is because it's only initialized in get_cpu_vendor() function, which is not called if the CPU has no CPUID instruction. Signed-off-by: Ondrej Zary LKML-Reference: <200908112000.15993.linux@rainbow-software.org> Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c0..5ce60a8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -static const struct cpu_dev *this_cpu __cpuinitdata; +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu) static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_64 - display_cacheinfo(c); -#else - /* Not much we can do here... */ - /* Check if at least it has cpuid */ - if (c->cpuid_level == -1) { - /* No cpuid. It must be an ancient CPU */ - if (c->x86 == 4) - strcpy(c->x86_model_id, "486"); - else if (c->x86 == 3) - strcpy(c->x86_model_id, "386"); - } -#endif -} - -static const struct cpu_dev __cpuinitconst default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", - .c_x86_vendor = X86_VENDOR_UNKNOWN, -}; - static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; -- cgit v0.10.2 From df9eba8c9febf53782ef896518e7177999d98188 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 7 Aug 2009 11:15:20 +0900 Subject: pata_at91: fix resource release Julias Lawall discovered that pata_at91 wasn't freeing a memory region allocated with kzalloc() on init failure paths. Upon review, pata_at91 also seems to be doing unnecessary explicit resource releases for managed resources too. Convert memory allocation to managed one and drop unnecessary explicit resource releases. Signed-off-by: Tejun Heo Cc: Julia Lawall Cc: Sergey Matyukevich Signed-off-by: Jeff Garzik diff --git a/drivers/ata/pata_at91.c b/drivers/ata/pata_at91.c index 5702aff..41c94b1 100644 --- a/drivers/ata/pata_at91.c +++ b/drivers/ata/pata_at91.c @@ -250,7 +250,7 @@ static int __devinit pata_at91_probe(struct platform_device *pdev) ata_port_desc(ap, "no IRQ, using PIO polling"); } - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL); if (!info) { dev_err(dev, "failed to allocate memory for private data\n"); @@ -275,7 +275,7 @@ static int __devinit pata_at91_probe(struct platform_device *pdev) if (!info->ide_addr) { dev_err(dev, "failed to map IO base\n"); ret = -ENOMEM; - goto err_ide_ioremap; + goto err_put; } info->alt_addr = devm_ioremap(dev, @@ -284,7 +284,7 @@ static int __devinit pata_at91_probe(struct platform_device *pdev) if (!info->alt_addr) { dev_err(dev, "failed to map CTL base\n"); ret = -ENOMEM; - goto err_alt_ioremap; + goto err_put; } ap->ioaddr.cmd_addr = info->ide_addr; @@ -303,13 +303,8 @@ static int __devinit pata_at91_probe(struct platform_device *pdev) irq ? ata_sff_interrupt : NULL, irq_flags, &pata_at91_sht); -err_alt_ioremap: - devm_iounmap(dev, info->ide_addr); - -err_ide_ioremap: +err_put: clk_put(info->mck); - kfree(info); - return ret; } @@ -317,7 +312,6 @@ static int __devexit pata_at91_remove(struct platform_device *pdev) { struct ata_host *host = dev_get_drvdata(&pdev->dev); struct at91_ide_info *info; - struct device *dev = &pdev->dev; if (!host) return 0; @@ -328,11 +322,8 @@ static int __devexit pata_at91_remove(struct platform_device *pdev) if (!info) return 0; - devm_iounmap(dev, info->ide_addr); - devm_iounmap(dev, info->alt_addr); clk_put(info->mck); - kfree(info); return 0; } -- cgit v0.10.2 From 1fd4bbec8c0d6db96b02141f324066afa2e77e89 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 6 Aug 2009 17:47:05 +0200 Subject: pata_atiixp: fix second channel support PIO and MWDMA timings are never programmed for the second channel because timing registers are treated as 16-bit long ones. The bug is an attixp -> pata_atiixp regression and goes back to: commit 669a5db411d85a14f86cd92bc16bf7ab5b8aa235 Author: Jeff Garzik Date: Tue Aug 29 18:12:40 2006 -0400 [libata] Add a bunch of PATA drivers. Cc: Krystian Juskowiak Cc: Andrew Morton Cc: Borislav Petkov Cc: Robert Hancock Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Jeff Garzik diff --git a/drivers/ata/pata_atiixp.c b/drivers/ata/pata_atiixp.c index bec0b8a..4591556 100644 --- a/drivers/ata/pata_atiixp.c +++ b/drivers/ata/pata_atiixp.c @@ -1,6 +1,7 @@ /* * pata_atiixp.c - ATI PATA for new ATA layer * (C) 2005 Red Hat Inc + * (C) 2009 Bartlomiej Zolnierkiewicz * * Based on * @@ -61,20 +62,19 @@ static void atiixp_set_pio_timing(struct ata_port *ap, struct ata_device *adev, struct pci_dev *pdev = to_pci_dev(ap->host->dev); int dn = 2 * ap->port_no + adev->devno; - - /* Check this is correct - the order is odd in both drivers */ int timing_shift = (16 * ap->port_no) + 8 * (adev->devno ^ 1); - u16 pio_mode_data, pio_timing_data; + u32 pio_timing_data; + u16 pio_mode_data; pci_read_config_word(pdev, ATIIXP_IDE_PIO_MODE, &pio_mode_data); pio_mode_data &= ~(0x7 << (4 * dn)); pio_mode_data |= pio << (4 * dn); pci_write_config_word(pdev, ATIIXP_IDE_PIO_MODE, pio_mode_data); - pci_read_config_word(pdev, ATIIXP_IDE_PIO_TIMING, &pio_timing_data); + pci_read_config_dword(pdev, ATIIXP_IDE_PIO_TIMING, &pio_timing_data); pio_timing_data &= ~(0xFF << timing_shift); pio_timing_data |= (pio_timings[pio] << timing_shift); - pci_write_config_word(pdev, ATIIXP_IDE_PIO_TIMING, pio_timing_data); + pci_write_config_dword(pdev, ATIIXP_IDE_PIO_TIMING, pio_timing_data); } /** @@ -119,16 +119,17 @@ static void atiixp_set_dmamode(struct ata_port *ap, struct ata_device *adev) udma_mode_data |= dma << (4 * dn); pci_write_config_word(pdev, ATIIXP_IDE_UDMA_MODE, udma_mode_data); } else { - u16 mwdma_timing_data; - /* Check this is correct - the order is odd in both drivers */ int timing_shift = (16 * ap->port_no) + 8 * (adev->devno ^ 1); + u32 mwdma_timing_data; dma -= XFER_MW_DMA_0; - pci_read_config_word(pdev, ATIIXP_IDE_MWDMA_TIMING, &mwdma_timing_data); + pci_read_config_dword(pdev, ATIIXP_IDE_MWDMA_TIMING, + &mwdma_timing_data); mwdma_timing_data &= ~(0xFF << timing_shift); mwdma_timing_data |= (mwdma_timings[dma] << timing_shift); - pci_write_config_word(pdev, ATIIXP_IDE_MWDMA_TIMING, mwdma_timing_data); + pci_write_config_dword(pdev, ATIIXP_IDE_MWDMA_TIMING, + mwdma_timing_data); } /* * We must now look at the PIO mode situation. We may need to -- cgit v0.10.2 From 7831387bda72af3059be48d39846d3eb6d8ce2f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 7 Aug 2009 01:59:15 +0900 Subject: libata: OCZ Vertex can't do HPA OCZ Vertex SSD can't do HPA and not in a usual way. It reports HPA, allows unlocking but then fails all IOs which fall in the unlocked area. Quirk it so that HPA unlocking is not used for the device. Reported by Daniel Perup in bnc#522414. https://bugzilla.novell.com/show_bug.cgi?id=522414 Signed-off-by: Tejun Heo Reported-by: Daniel Perup Signed-off-by: Jeff Garzik diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 8ac98ff..072ba5e 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4302,6 +4302,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "WDC WD2500JD-00HBB0", "WD-WMAL71490727", ATA_HORKAGE_BROKEN_HPA }, { "MAXTOR 6L080L4", "A93.0500", ATA_HORKAGE_BROKEN_HPA }, + /* this one allows HPA unlocking but fails IOs on the area */ + { "OCZ-VERTEX", "1.30", ATA_HORKAGE_BROKEN_HPA }, + /* Devices which report 1 sector over size HPA */ { "ST340823A", NULL, ATA_HORKAGE_HPA_SIZE, }, { "ST320413A", NULL, ATA_HORKAGE_HPA_SIZE, }, -- cgit v0.10.2 From 51c8949950647afeeb897e08dd75ad99078adb50 Mon Sep 17 00:00:00 2001 From: Tony Vroon Date: Thu, 6 Aug 2009 00:50:09 +0100 Subject: sata_nv: MSI support, disabled by default At least the nVidia MCP55 controller quite happily supports MSI. This adds an option to use it. It is disabled by default. As per feedback by Robert Hancock, it will honour the user request as the kernel will not enable MSI where the controller or the specific system configuration do not support it. Signed-off-by: Tony Vroon Cc: Robert Hancock Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index b2d11f3..86a4058 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -602,6 +602,7 @@ MODULE_VERSION(DRV_VERSION); static int adma_enabled; static int swncq_enabled = 1; +static int msi_enabled; static void nv_adma_register_mode(struct ata_port *ap) { @@ -2459,6 +2460,11 @@ static int nv_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) } else if (type == SWNCQ) nv_swncq_host_init(host); + if (msi_enabled) { + dev_printk(KERN_NOTICE, &pdev->dev, "Using MSI\n"); + pci_enable_msi(pdev); + } + pci_set_master(pdev); return ata_host_activate(host, pdev->irq, ipriv->irq_handler, IRQF_SHARED, ipriv->sht); @@ -2558,4 +2564,6 @@ module_param_named(adma, adma_enabled, bool, 0444); MODULE_PARM_DESC(adma, "Enable use of ADMA (Default: false)"); module_param_named(swncq, swncq_enabled, bool, 0444); MODULE_PARM_DESC(swncq, "Enable use of SWNCQ (Default: true)"); +module_param_named(msi, msi_enabled, bool, 0444); +MODULE_PARM_DESC(msi, "Enable use of MSI (Default: false)"); -- cgit v0.10.2 From 20308871588518b5e209c403de2a3ad9a2eba9af Mon Sep 17 00:00:00 2001 From: Michael Prokop Date: Thu, 6 Aug 2009 00:14:10 +0200 Subject: Documentation/kernel-parameters.txt: document libata's ignore_hpa option By default the kernel honors the HPA (host protected area) of hard drives. Using libata's ignore_hpa module option it's possible to change this behaviour. Document usage and options of libata.ignore_hpa in Documentation/kernel-parameters.txt. Signed-off-by: Michael Prokop Signed-off-by: Jeff Garzik diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index dd1a6d4..7936b80 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1115,6 +1115,10 @@ and is between 256 and 4096 characters. It is defined in the file libata.dma=4 Compact Flash DMA only Combinations also work, so libata.dma=3 enables DMA for disks and CDROMs, but not CFs. + + libata.ignore_hpa= [LIBATA] Ignore HPA limit + libata.ignore_hpa=0 keep BIOS limits (default) + libata.ignore_hpa=1 ignore limits, using full disk libata.noacpi [LIBATA] Disables use of ACPI in libata suspend/resume when set. -- cgit v0.10.2 From b6931c1fbaf7fda9ea7f120228a96600d7090049 Mon Sep 17 00:00:00 2001 From: Shane Huang Date: Wed, 5 Aug 2009 10:10:41 +0800 Subject: ahci: Soften up the dmesg on SB600 PMP softreset failure recovery Too strong words led to spurious bug reports: Novell bugzilla #527748, RedHat bugzilla #468800. This patch is used to soften up the dmesg on SB600 PMP softreset failure recovery, so as to remove the scariness and concern from community. Reported-by: pgnet Dev Signed-off-by: Shane Huang Cc: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 958c1fa..838ff73 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1773,7 +1773,8 @@ static int ahci_sb600_softreset(struct ata_link *link, unsigned int *class, irq_sts = readl(port_mmio + PORT_IRQ_STAT); if (irq_sts & PORT_IRQ_BAD_PMP) { ata_link_printk(link, KERN_WARNING, - "failed due to HW bug, retry pmp=0\n"); + "applying SB600 PMP SRST workaround " + "and retrying\n"); rc = ahci_do_softreset(link, class, 0, deadline, ahci_check_ready); } -- cgit v0.10.2 From 5594639aab8b5614cb27a3e5b2b627505cbcd137 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Aug 2009 14:30:08 +0900 Subject: ahci: add workaround for on-board 5723s on some gigabyte boards Some gigabytes have on-board SIMG5723s connected to JMB ahcis. These are used to implement hardware raid. Unfortunately some firmware revisions on these 5723s don't bring the link down when all the downstream ports are unoccupied while not responding to reset protocol which makes libata think that there's device attached to the port but is not responding and retry. This results in painfully wrong boot detection time for these ports when they're empty. This patch quirks those boards such that ahci gives up after the initial timeout. Combined with parallel probing, this gives quick enough probing and also is safe because SIMG5723 will respond to the first try if any of the downstream ports is occupied. Signed-off-by: Tejun Heo Reported-by: Marc Bowes Reported-by: Nicolas Mailhot Signed-off-by: Jeff Garzik diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 838ff73..fe3eba5 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -219,6 +219,8 @@ enum { AHCI_HFLAG_SECT255 = (1 << 8), /* max 255 sectors */ AHCI_HFLAG_YES_NCQ = (1 << 9), /* force NCQ cap on */ AHCI_HFLAG_NO_SUSPEND = (1 << 10), /* don't suspend */ + AHCI_HFLAG_SRST_TOUT_IS_OFFLINE = (1 << 11), /* treat SRST timeout as + link offline */ /* ap->flags bits */ @@ -1663,6 +1665,7 @@ static int ahci_do_softreset(struct ata_link *link, unsigned int *class, int (*check_ready)(struct ata_link *link)) { struct ata_port *ap = link->ap; + struct ahci_host_priv *hpriv = ap->host->private_data; const char *reason = NULL; unsigned long now, msecs; struct ata_taskfile tf; @@ -1701,12 +1704,21 @@ static int ahci_do_softreset(struct ata_link *link, unsigned int *class, /* wait for link to become ready */ rc = ata_wait_after_reset(link, deadline, check_ready); - /* link occupied, -ENODEV too is an error */ - if (rc) { + if (rc == -EBUSY && hpriv->flags & AHCI_HFLAG_SRST_TOUT_IS_OFFLINE) { + /* + * Workaround for cases where link online status can't + * be trusted. Treat device readiness timeout as link + * offline. + */ + ata_link_printk(link, KERN_INFO, + "device not ready, treating as offline\n"); + *class = ATA_DEV_NONE; + } else if (rc) { + /* link occupied, -ENODEV too is an error */ reason = "device not ready"; goto fail; - } - *class = ahci_dev_classify(ap); + } else + *class = ahci_dev_classify(ap); DPRINTK("EXIT, class=%u\n", *class); return 0; @@ -2727,6 +2739,56 @@ static bool ahci_broken_suspend(struct pci_dev *pdev) return !ver || strcmp(ver, dmi->driver_data) < 0; } +static bool ahci_broken_online(struct pci_dev *pdev) +{ +#define ENCODE_BUSDEVFN(bus, slot, func) \ + (void *)(unsigned long)(((bus) << 8) | PCI_DEVFN((slot), (func))) + static const struct dmi_system_id sysids[] = { + /* + * There are several gigabyte boards which use + * SIMG5723s configured as hardware RAID. Certain + * 5723 firmware revisions shipped there keep the link + * online but fail to answer properly to SRST or + * IDENTIFY when no device is attached downstream + * causing libata to retry quite a few times leading + * to excessive detection delay. + * + * As these firmwares respond to the second reset try + * with invalid device signature, considering unknown + * sig as offline works around the problem acceptably. + */ + { + .ident = "EP45-DQ6", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, + "Gigabyte Technology Co., Ltd."), + DMI_MATCH(DMI_BOARD_NAME, "EP45-DQ6"), + }, + .driver_data = ENCODE_BUSDEVFN(0x0a, 0x00, 0), + }, + { + .ident = "EP45-DS5", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, + "Gigabyte Technology Co., Ltd."), + DMI_MATCH(DMI_BOARD_NAME, "EP45-DS5"), + }, + .driver_data = ENCODE_BUSDEVFN(0x03, 0x00, 0), + }, + { } /* terminate list */ + }; +#undef ENCODE_BUSDEVFN + const struct dmi_system_id *dmi = dmi_first_match(sysids); + unsigned int val; + + if (!dmi) + return false; + + val = (unsigned long)dmi->driver_data; + + return pdev->bus->number == (val >> 8) && pdev->devfn == (val & 0xff); +} + static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { static int printed_version; @@ -2842,6 +2904,12 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) "BIOS update required for suspend/resume\n"); } + if (ahci_broken_online(pdev)) { + hpriv->flags |= AHCI_HFLAG_SRST_TOUT_IS_OFFLINE; + dev_info(&pdev->dev, + "online status unreliable, applying workaround\n"); + } + /* CAP.NP sometimes indicate the index of the last enabled * port, at other times, that of the last possible port, so * determining the maximum port number requires looking at -- cgit v0.10.2 From 67fe0688082509c52bd451d10a61b3565169c23e Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Wed, 12 Aug 2009 06:29:57 -0400 Subject: Remove zero-length file drivers/mtd/maps/sbc8240.c It was "deleted" in commit 2bf961b7ccd69e108ac435c67e2b0522b403c578 Signed-off-by: Jeff Garzik diff --git a/drivers/mtd/maps/sbc8240.c b/drivers/mtd/maps/sbc8240.c deleted file mode 100644 index e69de29..0000000 -- cgit v0.10.2 From 2a8083f063472f27c253545dd64e1a7bbbb1ab61 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2009 16:22:00 -0300 Subject: perf record: Fix .tid and .pid fill-in when synthesizing events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Noticed when trying to record events for a firefox thread. We were synthesizing both .tid and .pid with the pid passed via --pid. Fix it by reading /proc/PID/status and getting the tgid to use in .pid, .tid gets the specified "pid". Signed-off-by: Arnaldo Carvalho de Melo Cc: "H. Peter Anvin" Cc: Frédéric Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20090811192200.GF18061@ghostprotocols.net> Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 0345aad..30b83de 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -203,46 +203,48 @@ static void sig_atexit(void) kill(getpid(), signr); } -static void pid_synthesize_comm_event(pid_t pid, int full) +static pid_t pid_synthesize_comm_event(pid_t pid, int full) { struct comm_event comm_ev; char filename[PATH_MAX]; char bf[BUFSIZ]; - int fd; - size_t size; - char *field, *sep; + FILE *fp; + size_t size = 0; DIR *tasks; struct dirent dirent, *next; + pid_t tgid = 0; - snprintf(filename, sizeof(filename), "/proc/%d/stat", pid); + snprintf(filename, sizeof(filename), "/proc/%d/status", pid); - fd = open(filename, O_RDONLY); - if (fd < 0) { + fp = fopen(filename, "r"); + if (fd == NULL) { /* * We raced with a task exiting - just return: */ if (verbose) fprintf(stderr, "couldn't open %s\n", filename); - return; + return 0; } - if (read(fd, bf, sizeof(bf)) < 0) { - fprintf(stderr, "couldn't read %s\n", filename); - exit(EXIT_FAILURE); - } - close(fd); - /* 9027 (cat) R 6747 9027 6747 34816 9027 ... */ memset(&comm_ev, 0, sizeof(comm_ev)); - field = strchr(bf, '('); - if (field == NULL) - goto out_failure; - sep = strchr(++field, ')'); - if (sep == NULL) - goto out_failure; - size = sep - field; - memcpy(comm_ev.comm, field, size++); - - comm_ev.pid = pid; + while (!comm_ev.comm[0] || !comm_ev.pid) { + if (fgets(bf, sizeof(bf), fp) == NULL) + goto out_failure; + + if (memcmp(bf, "Name:", 5) == 0) { + char *name = bf + 5; + while (*name && isspace(*name)) + ++name; + size = strlen(name) - 1; + memcpy(comm_ev.comm, name, size++); + } else if (memcmp(bf, "Tgid:", 5) == 0) { + char *tgids = bf + 5; + while (*tgids && isspace(*tgids)) + ++tgids; + tgid = comm_ev.pid = atoi(tgids); + } + } + comm_ev.header.type = PERF_EVENT_COMM; size = ALIGN(size, sizeof(u64)); comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size); @@ -251,7 +253,7 @@ static void pid_synthesize_comm_event(pid_t pid, int full) comm_ev.tid = pid; write_output(&comm_ev, comm_ev.header.size); - return; + goto out_fclose; } snprintf(filename, sizeof(filename), "/proc/%d/task", pid); @@ -268,7 +270,10 @@ static void pid_synthesize_comm_event(pid_t pid, int full) write_output(&comm_ev, comm_ev.header.size); } closedir(tasks); - return; + +out_fclose: + fclose(fp); + return tgid; out_failure: fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n", @@ -276,7 +281,7 @@ out_failure: exit(EXIT_FAILURE); } -static void pid_synthesize_mmap_samples(pid_t pid) +static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid) { char filename[PATH_MAX]; FILE *fp; @@ -328,7 +333,7 @@ static void pid_synthesize_mmap_samples(pid_t pid) mmap_ev.len -= mmap_ev.start; mmap_ev.header.size = (sizeof(mmap_ev) - (sizeof(mmap_ev.filename) - size)); - mmap_ev.pid = pid; + mmap_ev.pid = tgid; mmap_ev.tid = pid; write_output(&mmap_ev, mmap_ev.header.size); @@ -347,14 +352,14 @@ static void synthesize_all(void) while (!readdir_r(proc, &dirent, &next) && next) { char *end; - pid_t pid; + pid_t pid, tgid; pid = strtol(dirent.d_name, &end, 10); if (*end) /* only interested in proper numerical dirents */ continue; - pid_synthesize_comm_event(pid, 1); - pid_synthesize_mmap_samples(pid); + tgid = pid_synthesize_comm_event(pid, 1); + pid_synthesize_mmap_samples(pid, tgid); } closedir(proc); @@ -567,8 +572,8 @@ static int __cmd_record(int argc, const char **argv) perf_header__write(header, output); if (!system_wide) { - pid_synthesize_comm_event(pid, 0); - pid_synthesize_mmap_samples(pid); + pid_t tgid = pid_synthesize_comm_event(pid, 0); + pid_synthesize_mmap_samples(pid, tgid); } else synthesize_all(); -- cgit v0.10.2 From 94a24752fe95ca1e7f98b197052d44e6a207740d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2009 16:21:38 -0300 Subject: perf report: Show the tid too in -D MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This made it easier to find the firefox threading related bug. Signed-off-by: Arnaldo Carvalho de Melo Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Frédéric Weisbecker LKML-Reference: <20090811192138.GE18061@ghostprotocols.net> Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 99274ce..23e1457 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1526,11 +1526,11 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) more_data += sizeof(u64); } - dprintf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d: %p period: %Ld\n", + dprintf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, - event->ip.pid, + event->ip.pid, event->ip.tid, (void *)(long)ip, (long long)period); @@ -1612,10 +1612,11 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) struct thread *thread = threads__findnew(event->mmap.pid); struct map *map = map__new(&event->mmap); - dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n", + dprintf("%p [%p]: PERF_EVENT_MMAP %d/%d: [%p(%p) @ %p]: %s\n", (void *)(offset + head), (void *)(long)(event->header.size), event->mmap.pid, + event->mmap.tid, (void *)(long)event->mmap.start, (void *)(long)event->mmap.len, (void *)(long)event->mmap.pgoff, -- cgit v0.10.2 From 247648e3742ded01e42a4b14c2da330b13cbb47f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2009 16:22:11 -0300 Subject: perf tools: Fix fallback to cplus_demangle() when bfd_demangle() is not available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In old binutils we can't access bfd_demangle(), use cplus_demangle() just like oprofile. Signed-off-by: Arnaldo Carvalho de Melo Cc: Luis Claudio R. Gonçalves Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Frédéric Weisbecker LKML-Reference: <20090811192211.GG18061@ghostprotocols.net> Signed-off-by: Ingo Molnar diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 60411e9..c045b42 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -382,22 +382,29 @@ endif ifdef NO_DEMANGLE BASIC_CFLAGS += -DNO_DEMANGLE else - has_bfd := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd > /dev/null 2>&1 && echo y") - has_bfd_iberty := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y") - - has_bfd_iberty_z := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y") - ifeq ($(has_bfd),y) EXTLIBS += -lbfd - else ifeq ($(has_bfd_iberty),y) - EXTLIBS += -lbfd -liberty - else ifeq ($(has_bfd_iberty_z),y) - EXTLIBS += -lbfd -liberty -lz else - msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling) - BASIC_CFLAGS += -DNO_DEMANGLE + has_bfd_iberty := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y") + ifeq ($(has_bfd_iberty),y) + EXTLIBS += -lbfd -liberty + else + has_bfd_iberty_z := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y") + ifeq ($(has_bfd_iberty_z),y) + EXTLIBS += -lbfd -liberty -lz + else + has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -liberty > /dev/null 2>&1 && echo y") + ifeq ($(has_cplus_demangle),y) + EXTLIBS += -liberty + BASIC_CFLAGS += -DHAVE_CPLUS_DEMANGLE + else + msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling) + BASIC_CFLAGS += -DNO_DEMANGLE + endif + endif + endif endif endif diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index f1dcede..2473fd4 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -7,23 +7,8 @@ #include #include -#ifndef NO_DEMANGLE -#include -#else -static inline -char *bfd_demangle(void __used *v, const char __used *c, int __used i) -{ - return NULL; -} -#endif - const char *sym_hist_filter; -#ifndef DMGL_PARAMS -#define DMGL_PARAMS (1 << 0) /* Include function args */ -#define DMGL_ANSI (1 << 1) /* Include const, volatile, etc */ -#endif - enum dso_origin { DSO__ORIG_KERNEL = 0, DSO__ORIG_JAVA_JIT, diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 1e003ec..b53bf01 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -7,6 +7,30 @@ #include #include "module.h" +#ifdef HAVE_CPLUS_DEMANGLE +extern char *cplus_demangle(const char *, int); + +static inline char *bfd_demangle(void __used *v, const char *c, int i) +{ + return cplus_demangle(c, i); +} +#else +#ifdef NO_DEMANGLE +static inline char *bfd_demangle(void __used *v, const char __used *c, + int __used i) +{ + return NULL; +} +#else +#include +#endif +#endif + +#ifndef DMGL_PARAMS +#define DMGL_PARAMS (1 << 0) /* Include function args */ +#define DMGL_ANSI (1 << 1) /* Include const, volatile, etc */ +#endif + struct symbol { struct rb_node rb_node; u64 start; -- cgit v0.10.2 From 1340e6bbaff7ff7f6f75eb4a5c34933efce84a84 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2009 17:04:36 -0300 Subject: perf tools: Fix dso__new handle() to handle deleted DSOs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is better than showing the map addr, this way at least we know that we can't get the symtabs because the DSO was deleted (system update) while an app still used such DSO. Yeah, don't do that, but if you do, you'll figure it out quicker this way. [acme@doppio linux-2.6-tip]$ perf report | head -15 # Samples: 3796 # # Overhead Command Shared Object Symbol # ........ ....... ................................................................... ...... # 23.55% pidgin /lib64/libglib-2.0.so.0.2000.4.#prelink#.Pd98lu (deleted) [.] 0x00000000038844 21.55% pidgin /lib64/libpthread-2.10.1.so.#prelink#.AFwK8Q (deleted) [.] 0x0000000000a42d 10.85% pidgin [kernel] [.] vread_hpet 7.85% pidgin /lib64/libgobject-2.0.so.0.2000.4.#prelink#.o1vpU7 (deleted) [.] 0x00000000014de8 3.35% pidgin /lib64/libc-2.10.1.so (deleted) [.] 0x0000000007a875 3.19% pidgin /lib64/libdbus-1.so.3.4.0.#prelink#.6mwgZP (deleted) [.] 0x0000000001d254 3.06% pidgin /usr/lib64/libgtk-x11-2.0.so.0.1600.5.#prelink#.511hAl (deleted) [.] 0x000000002334e7 2.90% pidgin /usr/lib64/libgdk-x11-2.0.so.0.1600.5.#prelink#.5qlMo1 (deleted) [.] 0x00000000037b2d 1.84% pidgin [kernel] [k] do_sys_poll 1.45% pidgin /usr/lib64/libX11.so.6.2.0.#prelink#.iR59Rx (deleted) [.] 0x0000000004c751 [acme@doppio linux-2.6-tip]$ Signed-off-by: Arnaldo Carvalho de Melo Cc: Luis Claudio R. Gonçalves Cc: Clark Williams Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Frédéric Weisbecker LKML-Reference: <20090811200436.GA3478@ghostprotocols.net> Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 2473fd4..5c0f42e 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -801,6 +801,8 @@ more: } out: free(name); + if (ret < 0 && strstr(self->name, " (deleted)") != NULL) + return 0; return ret; } -- cgit v0.10.2 From 0a5ac84650fb7a7f226814103d95724e34b012ae Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Aug 2009 11:18:01 +0200 Subject: perf record: Add missing -C option support for specifying profile cpu perf top supports a -C for setting the profile CPU, but perf record does not. This adds the same option for perf record, allowing the user to specify a specific target profile CPU. Signed-off-by: Jens Axboe Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20090812091801.GC12579@kernel.dk> Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 30b83de..de76008 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -35,6 +35,7 @@ static const char *output_name = "perf.data"; static int group = 0; static unsigned int realtime_prio = 0; static int system_wide = 0; +static int profile_cpu = -1; static pid_t target_pid = -1; static int inherit = 1; static int force = 0; @@ -431,6 +432,8 @@ try_again: if (err == EPERM) die("Permission error - are you root?\n"); + else if (err == ENODEV && profile_cpu != -1) + die("No such device - did you specify an out-of-range profile CPU?\n"); /* * If it's cycles then fall back to hrtimer @@ -564,9 +567,15 @@ static int __cmd_record(int argc, const char **argv) if (pid == -1) pid = getpid(); - open_counters(-1, pid); - } else for (i = 0; i < nr_cpus; i++) - open_counters(i, target_pid); + open_counters(profile_cpu, pid); + } else { + if (profile_cpu != -1) { + open_counters(profile_cpu, target_pid); + } else { + for (i = 0; i < nr_cpus; i++) + open_counters(i, target_pid); + } + } if (file_new) perf_header__write(header, output); @@ -645,6 +654,8 @@ static const struct option options[] = { "system-wide collection from all CPUs"), OPT_BOOLEAN('A', "append", &append_file, "append to the output file to do incremental profiling"), + OPT_INTEGER('C', "profile_cpu", &profile_cpu, + "CPU to profile on"), OPT_BOOLEAN('f', "force", &force, "overwrite existing data file"), OPT_LONG('c', "count", &default_interval, -- cgit v0.10.2 From 04da8a43da804723a550f00dd158fd5b5e25ae35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:40:08 +0200 Subject: perf_counter, x86: Fix/improve apic fallback Johannes Stezenbach reported that his Pentium-M based laptop does not have the local APIC enabled by default, and hence perfcounters do not get initialized. Add a fallback for this case: allow non-sampled counters and return with an error on sampled counters. This allows 'perf stat' to work out of box - and allows 'perf top' and 'perf record' to fall back on a hrtimer based sampling method. ( Passing 'lapic' on the boot line will allow hardware sampling to occur - but if the APIC is disabled permanently by the hardware then this fallback still allows more systems to use perfcounters. ) Also decouple perfcounter support from X86_LOCAL_APIC. -v2: fix typo breaking counters on all other systems ... Reported-by: Johannes Stezenbach Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6..13ffa5d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,6 +24,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE + select HAVE_PERF_COUNTERS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB @@ -742,7 +743,6 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC - select HAVE_PERF_COUNTERS if (!M386 && !M486) config X86_IO_APIC def_bool y diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fffc126..900332b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -55,6 +55,7 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + int apic; u64 max_period; u64 intel_ctrl; }; @@ -613,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; if (nmi_watchdog == NMI_LOCAL_APIC) @@ -627,9 +629,11 @@ static bool reserve_pmc_hardware(void) if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } +#endif return true; +#ifdef CONFIG_X86_LOCAL_APIC eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); @@ -644,10 +648,12 @@ perfctr_fail: enable_lapic_nmi_watchdog(); return false; +#endif } static void release_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; for (i = 0; i < x86_pmu.num_counters; i++) { @@ -657,6 +663,7 @@ static void release_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); +#endif } static void hw_perf_counter_destroy(struct perf_counter *counter) @@ -748,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * counters (user-space has to fall back and + * sample via a hrtimer based software counter): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; } counter->destroy = hw_perf_counter_destroy; @@ -1449,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) void set_perf_counter_pending(void) { +#ifdef CONFIG_X86_LOCAL_APIC apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif } void perf_counters_lapic_init(void) { - if (!x86_pmu_initialized()) +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif } static int __kprobes @@ -1484,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; +#ifdef CONFIG_X86_LOCAL_APIC apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif /* * Can't rely on the handled return value to say it was our NMI, two * counters could trigger 'simultaneously' raising two back-to-back NMIs. @@ -1515,6 +1537,7 @@ static struct x86_pmu p6_pmu = { .event_map = p6_pmu_event_map, .raw_event = p6_pmu_raw_event, .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, .num_counters = 2, @@ -1541,6 +1564,7 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -1564,6 +1588,7 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, }; @@ -1589,13 +1614,14 @@ static int p6_pmu_init(void) return -ENODEV; } + x86_pmu = p6_pmu; + if (!cpu_has_apic) { pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - return -ENODEV; + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; } - x86_pmu = p6_pmu; - return 0; } -- cgit v0.10.2 From 1ae88b2e446261c038f2c0c3150ffae142b227a2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Aug 2009 09:12:30 -0400 Subject: NFS: Fix an O_DIRECT Oops... We can't call nfs_readdata_release()/nfs_writedata_release() without first initialising and referencing args.context. Doing so inside nfs_direct_read_schedule_segment()/nfs_direct_write_schedule_segment() causes an Oops. We should rather be calling nfs_readdata_free()/nfs_writedata_free() in those cases. Looking at the O_DIRECT code, the "struct nfs_direct_req" is already referencing the nfs_open_context for us. Since the readdata and writedata structures carry a reference to that, we can simplify things by getting rid of the extra nfs_open_context references, so that we can replace all instances of nfs_readdata_release()/nfs_writedata_release(). Reported-by: Catalin Marinas Signed-off-by: Trond Myklebust Tested-by: Catalin Marinas Cc: stable@kernel.org Signed-off-by: Linus Torvalds diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 489fc01..e4e089a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata) if (put_dreq(dreq)) nfs_direct_complete(dreq); - nfs_readdata_release(calldata); + nfs_readdata_free(data); } static const struct rpc_call_ops nfs_read_direct_ops = { @@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->npages, 1, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_readdata_release(data); + nfs_readdata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_readdata_release(data); + nfs_readdata_free(data); break; } bytes -= pgbase; @@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); list_del(&data->pages); nfs_direct_release_pages(data->pagevec, data->npages); - nfs_writedata_release(data); + nfs_writedata_free(data); } } @@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata) dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); nfs_direct_write_complete(dreq, data->inode); - nfs_commitdata_release(calldata); + nfs_commit_free(data); } static const struct rpc_call_ops nfs_commit_direct_ops = { @@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.fh = NFS_FH(data->inode); data->args.offset = 0; data->args.count = 0; - data->args.context = get_nfs_open_context(dreq->ctx); + data->args.context = dreq->ctx; data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->npages, 0, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_writedata_release(data); + nfs_writedata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_writedata_release(data); + nfs_writedata_free(data); break; } bytes -= pgbase; @@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 73ea5e8..12c9e66 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -60,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) return p; } -static void nfs_readdata_free(struct nfs_read_data *p) +void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_rdata_mempool); } -void nfs_readdata_release(void *data) +static void nfs_readdata_release(struct nfs_read_data *rdata) { - struct nfs_read_data *rdata = data; - put_nfs_open_context(rdata->args.context); nfs_readdata_free(rdata); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0a0a2ff..a34fae2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -87,17 +87,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) return p; } -static void nfs_writedata_free(struct nfs_write_data *p) +void nfs_writedata_free(struct nfs_write_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_wdata_mempool); } -void nfs_writedata_release(void *data) +static void nfs_writedata_release(struct nfs_write_data *wdata) { - struct nfs_write_data *wdata = data; - put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index fdffb41..f6b9024 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -473,7 +473,6 @@ extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); -extern void nfs_writedata_release(void *); /* * Try to write back everything synchronously (but check the @@ -488,7 +487,6 @@ extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_write_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_write_data *wdata); -extern void nfs_commitdata_release(void *wdata); #else static inline int nfs_commit_inode(struct inode *inode, int how) @@ -507,6 +505,7 @@ nfs_have_writebacks(struct inode *inode) * Allocate nfs_write_data structures */ extern struct nfs_write_data *nfs_writedata_alloc(unsigned int npages); +extern void nfs_writedata_free(struct nfs_write_data *); /* * linux/fs/nfs/read.c @@ -515,7 +514,6 @@ extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); extern int nfs_readpage_result(struct rpc_task *, struct nfs_read_data *); -extern void nfs_readdata_release(void *data); extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, struct page *); @@ -523,6 +521,7 @@ extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, * Allocate nfs_read_data structures */ extern struct nfs_read_data *nfs_readdata_alloc(unsigned int npages); +extern void nfs_readdata_free(struct nfs_read_data *); /* * linux/fs/nfs3proc.c -- cgit v0.10.2 From 39cbb602b543e477df71dca84b5b2e36f8bd29fc Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Fri, 7 Aug 2009 12:01:08 -0400 Subject: Remove double removal of blktrace directory commit fd51d251e4cdb21f68e9dbc4336514d64a105a79 Author: Stefan Raspl Date: Tue May 19 09:59:08 2009 +0200 blktrace: remove debugfs entries on bad path added in an explicit invocation of debugfs_remove for bt->dir, in blk_remove_buf_file_callback we are also getting the directory removed. On occasion I am seeing memory corruption that I have bisected down to this commit. [The testing involves a (long) series of I/O benchmarks with blktrace invoked around the actual runs.] I believe that this committed patch is correct, but the problem actually lies in the code in blk_remove_buf_file_callback. With this patch I am able to consistently get complete runs whereas previously I could not get a single run to complete. The first part of the patch simply moves the debugfs_remove below the relay_close: the relay_close call will remove files under bt->dir, and so we should not remove the directory until all the files we created have been removed. (Note: This is not sufficient to fix the problem - the file system code has ref counts on the directoy, so our invocation does not cause the directory to actually be removed. Nonetheless, we should not rely upon that feature.) Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1090b0a..7a34cb5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -267,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); - debugfs_remove(bt->dir); relay_close(bt->rchan); + debugfs_remove(bt->dir); free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -378,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, static int blk_remove_buf_file_callback(struct dentry *dentry) { - struct dentry *parent = dentry->d_parent; debugfs_remove(dentry); - /* - * this will fail for all but the last file, but that is ok. what we - * care about is the top level buts->name directory going away, when - * the last trace file is gone. Then we don't have to rmdir() that - * manually on trace stop, so it nicely solves the issue with - * force killing of running traces. - */ - - debugfs_remove(parent); return 0; } -- cgit v0.10.2 From 51d5668cb2e3fd1827a55184e48606fff054c5be Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 09:54:02 +1000 Subject: md: never advance 'events' counter by more than 1. When assembling arrays, md allows two devices to have different event counts as long as the difference is only '1'. This is to cope with a system failure between updating the metadata on two difference devices. However there are currently times when we update the event count by 2. This was done to keep the event count even when the array is clean and odd when it is dirty, which allows us to avoid writing common update to spare devices and so allow those spares to go to sleep. This is bad for the above reason. So change it to never increase by two. This means that the alignment between 'odd/even' and 'clean/dirty' might take a little longer to attain, but that is only a small cost. The spares will get a few more updates but that will still be spared (;-) most updates and can still go to sleep. Prior to this patch there was a small chance that after a crash an array would fail to assemble due to the overly large event count mismatch. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index 5614500..d18805f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1975,17 +1975,14 @@ repeat: /* otherwise we have to go forward and ... */ mddev->events ++; if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ - /* .. if the array isn't clean, insist on an odd 'events' */ - if ((mddev->events&1)==0) { - mddev->events++; + /* .. if the array isn't clean, an 'even' event must also go + * to spares. */ + if ((mddev->events&1)==0) nospares = 0; - } } else { - /* otherwise insist on an even 'events' (for clean states) */ - if ((mddev->events&1)) { - mddev->events++; + /* otherwise an 'odd' event must go to spares */ + if ((mddev->events&1)) nospares = 0; - } } } -- cgit v0.10.2 From 67ac6011db5d2b0c853d573ff474b25c85dfb644 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 10:06:24 +1000 Subject: md/raid5: allow new reshape modes to be restarted in the middle. md/raid5 doesn't allow a reshape to restart if it involves writing over the same part of disk that it would be reading from. This happens at the beginning of a reshape that increases the number of devices, at the end of a reshape that decreases the number of devices, and continuously for a reshape that does not change the number of devices. The current code is correct for the "increase number of devices" case as the critical section at the start is handled by userspace performing a backup. It does not work for reducing the number of devices, or the no-change case. For 'reducing', we need to invert the test. For no-change we cannot really be sure things will be safe, so simply require the array to be read-only, which is how the user-space code which carefully starts such arrays works. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2b521ee..b8a22a2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4509,7 +4509,26 @@ static int run(mddev_t *mddev) (old_disks-max_degraded)); /* here_old is the first stripe that we might need to read * from */ - if (here_new >= here_old) { + if (mddev->delta_disks == 0) { + /* We cannot be sure it is safe to start an in-place + * reshape. It is only safe if user-space if monitoring + * and taking constant backups. + * mdadm always starts a situation like this in + * readonly mode so it can take control before + * allowing any writes. So just check for that. + */ + if ((here_new * mddev->new_chunk_sectors != + here_old * mddev->chunk_sectors) || + mddev->ro == 0) { + printk(KERN_ERR "raid5: in-place reshape must be started" + " in read-only mode - aborting\n"); + return -EINVAL; + } + } else if (mddev->delta_disks < 0 + ? (here_new * mddev->new_chunk_sectors <= + here_old * mddev->chunk_sectors) + : (here_new * mddev->new_chunk_sectors >= + here_old * mddev->chunk_sectors)) { /* Reading from the same stripe as writing to - bad */ printk(KERN_ERR "raid5: reshape_position too early for " "auto-recovery - aborting.\n"); -- cgit v0.10.2 From a639755cf885e437b2fe4168d35157fa90d530ab Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 10:13:00 +1000 Subject: md/raid5: make sure a reshape restarts at the correct address. This "if" don't allow for the possibility that the number of devices doesn't change, and so sector_nr isn't set correctly in that case. So change '>' to '>='. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b8a22a2..94a74cb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3785,7 +3785,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped conf->reshape_progress < raid5_size(mddev, 0, 0)) { sector_nr = raid5_size(mddev, 0, 0) - conf->reshape_progress; - } else if (mddev->delta_disks > 0 && + } else if (mddev->delta_disks >= 0 && conf->reshape_progress > 0) sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); -- cgit v0.10.2 From 1a67dde0abba36421a1257d01ba9de2f6d1c160a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 10:41:49 +1000 Subject: md/raid5: Properly remove excess drives after shrinking a raid5/6 We were removing the drives, from the array, but not removing symlinks from /sys/.... and not marking the device as having been removed. Signed-off-by: NeilBrown diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 94a74cb..b8a2c5d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5097,8 +5097,15 @@ static void raid5_finish_reshape(mddev_t *mddev) mddev->degraded--; for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; - d++) - raid5_remove_disk(mddev, d); + d++) { + mdk_rdev_t *rdev = conf->disks[d].rdev; + if (rdev && raid5_remove_disk(mddev, d) == 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + rdev->raid_disk = -1; + } + } } mddev->layout = conf->algorithm; mddev->chunk_sectors = conf->chunk_sectors; -- cgit v0.10.2 From 4d484a4a7a5126410eed5f8dd329a33f6eeed068 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 10:41:50 +1000 Subject: md: allow upper limit for resync/reshape to be set when array is read-only Normally we only allow the upper limit for a reshape to be decreased when the array not performing a sync/recovery/reshape, otherwise there could be races. But if an array is part-way through a reshape when it is assembled the reshape is started immediately leaving no window to set an upper bound. If the array is started read-only, the reshape will be suspended until the array becomes writable, so that provides a window during which it is perfectly safe to reduce the upper limit of a reshape. So: allow the upper limit (sync_max) to be reduced even if the reshape thread is running, as long as the array is still read-only. Signed-off-by: NeilBrown diff --git a/drivers/md/md.c b/drivers/md/md.c index d18805f..103f2d3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3599,6 +3599,7 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) if (max < mddev->resync_min) return -EINVAL; if (max < mddev->resync_max && + mddev->ro == 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; -- cgit v0.10.2 From ba9a633787eed1e90d587282642580ad3d44f7fd Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 22 Jul 2009 15:14:29 +0000 Subject: sh: convert processor device setup functions to arch_initcall() Convert the processor platform device setup functions from __initcall() and sometimes device_initcall() to arch_initcall(). This makes sure that the platform devices are registered a bit earlier so the devices are available when drivers register using initcall levels earlier than device_initcall(). A good example is platform devices needed by i2c-sh_mobile.c which registers a bit earlier using subsys_initcall(). Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c index 1379873..8555c05 100644 --- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c +++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c @@ -187,7 +187,7 @@ static int __init sh7619_devices_setup(void) return platform_add_devices(sh7619_devices, ARRAY_SIZE(sh7619_devices)); } -__initcall(sh7619_devices_setup); +arch_initcall(sh7619_devices_setup); void __init plat_irq_setup(void) { diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c index 869c2da..b673764 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c +++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c @@ -238,7 +238,7 @@ static int __init mxg_devices_setup(void) return platform_add_devices(mxg_devices, ARRAY_SIZE(mxg_devices)); } -__initcall(mxg_devices_setup); +arch_initcall(mxg_devices_setup); void __init plat_irq_setup(void) { diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c index d8febe1..fbde5b7 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c @@ -357,7 +357,7 @@ static int __init sh7201_devices_setup(void) return platform_add_devices(sh7201_devices, ARRAY_SIZE(sh7201_devices)); } -__initcall(sh7201_devices_setup); +arch_initcall(sh7201_devices_setup); void __init plat_irq_setup(void) { diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c index 62e3039..d3fd536 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c @@ -367,7 +367,7 @@ static int __init sh7203_devices_setup(void) return platform_add_devices(sh7203_devices, ARRAY_SIZE(sh7203_devices)); } -__initcall(sh7203_devices_setup); +arch_initcall(sh7203_devices_setup); void __init plat_irq_setup(void) { diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c index 3e6f3d7..a9ccc5e 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c @@ -338,7 +338,7 @@ static int __init sh7206_devices_setup(void) return platform_add_devices(sh7206_devices, ARRAY_SIZE(sh7206_devices)); } -__initcall(sh7206_devices_setup); +arch_initcall(sh7206_devices_setup); void __init plat_irq_setup(void) { diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7705.c b/arch/sh/kernel/cpu/sh3/setup-sh7705.c index 88f742f..c231059 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7705.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7705.c @@ -222,7 +222,7 @@ static int __init sh7705_devices_setup(void) return platform_add_devices(sh7705_devices, ARRAY_SIZE(sh7705_devices)); } -__initcall(sh7705_devices_setup); +arch_initcall(sh7705_devices_setup); static struct platform_device *sh7705_early_devices[] __initdata = { &tmu0_device, diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c index c563067..347ab35 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c @@ -250,7 +250,7 @@ static int __init sh770x_devices_setup(void) return platform_add_devices(sh770x_devices, ARRAY_SIZE(sh770x_devices)); } -__initcall(sh770x_devices_setup); +arch_initcall(sh770x_devices_setup); static struct platform_device *sh770x_early_devices[] __initdata = { &tmu0_device, diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7710.c b/arch/sh/kernel/cpu/sh3/setup-sh7710.c index efa76c8..717e90a 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7710.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7710.c @@ -226,7 +226,7 @@ static int __init sh7710_devices_setup(void) return platform_add_devices(sh7710_devices, ARRAY_SIZE(sh7710_devices)); } -__initcall(sh7710_devices_setup); +arch_initcall(sh7710_devices_setup); static struct platform_device *sh7710_early_devices[] __initdata = { &tmu0_device, diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c index 5b21077..74d8baa 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c @@ -388,7 +388,7 @@ static int __init sh7720_devices_setup(void) return platform_add_devices(sh7720_devices, ARRAY_SIZE(sh7720_devices)); } -__initcall(sh7720_devices_setup); +arch_initcall(sh7720_devices_setup); static struct platform_device *sh7720_early_devices[] __initdata = { &cmt0_device, diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c index 6d088d1..de4827d 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c @@ -138,7 +138,7 @@ static int __init sh4202_devices_setup(void) return platform_add_devices(sh4202_devices, ARRAY_SIZE(sh4202_devices)); } -__initcall(sh4202_devices_setup); +arch_initcall(sh4202_devices_setup); static struct platform_device *sh4202_early_devices[] __initdata = { &tmu0_device, diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c index 851672d..1b8b122 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c @@ -239,7 +239,7 @@ static int __init sh7750_devices_setup(void) return platform_add_devices(sh7750_devices, ARRAY_SIZE(sh7750_devices)); } -__initcall(sh7750_devices_setup); +arch_initcall(sh7750_devices_setup); static struct platform_device *sh7750_early_devices[] __initdata = { &tmu0_device, diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c index 5b82251..7fbb7be 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c @@ -265,7 +265,7 @@ static int __init sh7760_devices_setup(void) return platform_add_devices(sh7760_devices, ARRAY_SIZE(sh7760_devices)); } -__initcall(sh7760_devices_setup); +arch_initcall(sh7760_devices_setup); static struct platform_device *sh7760_early_devices[] __initdata = { &tmu0_device, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c index 6307e08..ac4d567 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c @@ -325,7 +325,7 @@ static int __init sh7343_devices_setup(void) return platform_add_devices(sh7343_devices, ARRAY_SIZE(sh7343_devices)); } -__initcall(sh7343_devices_setup); +arch_initcall(sh7343_devices_setup); static struct platform_device *sh7343_early_devices[] __initdata = { &cmt_device, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c index c18f7d0..1a956b1 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c @@ -318,7 +318,7 @@ static int __init sh7366_devices_setup(void) return platform_add_devices(sh7366_devices, ARRAY_SIZE(sh7366_devices)); } -__initcall(sh7366_devices_setup); +arch_initcall(sh7366_devices_setup); static struct platform_device *sh7366_early_devices[] __initdata = { &cmt_device, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index ea524a2..cda76eb 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -359,7 +359,7 @@ static int __init sh7722_devices_setup(void) return platform_add_devices(sh7722_devices, ARRAY_SIZE(sh7722_devices)); } -__initcall(sh7722_devices_setup); +arch_initcall(sh7722_devices_setup); static struct platform_device *sh7722_early_devices[] __initdata = { &cmt_device, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c index e1bb80b..b45dace 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c @@ -473,7 +473,7 @@ static int __init sh7723_devices_setup(void) return platform_add_devices(sh7723_devices, ARRAY_SIZE(sh7723_devices)); } -__initcall(sh7723_devices_setup); +arch_initcall(sh7723_devices_setup); static struct platform_device *sh7723_early_devices[] __initdata = { &cmt_device, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index e5ac9eb..a04edaa 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -508,7 +508,7 @@ static int __init sh7724_devices_setup(void) return platform_add_devices(sh7724_devices, ARRAY_SIZE(sh7724_devices)); } -device_initcall(sh7724_devices_setup); +arch_initcall(sh7724_devices_setup); static struct platform_device *sh7724_early_devices[] __initdata = { &cmt_device, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c index f1e0c0d..4659fff 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c @@ -314,7 +314,7 @@ static int __init sh7763_devices_setup(void) return platform_add_devices(sh7763_devices, ARRAY_SIZE(sh7763_devices)); } -__initcall(sh7763_devices_setup); +arch_initcall(sh7763_devices_setup); static struct platform_device *sh7763_early_devices[] __initdata = { &tmu0_device, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c index 1e86209..eead08d 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c @@ -368,7 +368,7 @@ static int __init sh7770_devices_setup(void) return platform_add_devices(sh7770_devices, ARRAY_SIZE(sh7770_devices)); } -__initcall(sh7770_devices_setup); +arch_initcall(sh7770_devices_setup); static struct platform_device *sh7770_early_devices[] __initdata = { &tmu0_device, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c index 715e05b..2c901f4 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c @@ -256,7 +256,7 @@ static int __init sh7780_devices_setup(void) return platform_add_devices(sh7780_devices, ARRAY_SIZE(sh7780_devices)); } -__initcall(sh7780_devices_setup); +arch_initcall(sh7780_devices_setup); static struct platform_device *sh7780_early_devices[] __initdata = { &tmu0_device, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c index af56140..7f6c718 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c @@ -263,7 +263,7 @@ static int __init sh7785_devices_setup(void) return platform_add_devices(sh7785_devices, ARRAY_SIZE(sh7785_devices)); } -__initcall(sh7785_devices_setup); +arch_initcall(sh7785_devices_setup); static struct platform_device *sh7785_early_devices[] __initdata = { &tmu0_device, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c index b700494..0104a8e 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c @@ -547,7 +547,7 @@ static int __init sh7786_devices_setup(void) return platform_add_devices(sh7786_devices, ARRAY_SIZE(sh7786_devices)); } -device_initcall(sh7786_devices_setup); +arch_initcall(sh7786_devices_setup); void __init plat_early_device_setup(void) { diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c index 53c65fd..07f0789 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c @@ -256,7 +256,7 @@ static int __init shx3_devices_setup(void) return platform_add_devices(shx3_devices, ARRAY_SIZE(shx3_devices)); } -__initcall(shx3_devices_setup); +arch_initcall(shx3_devices_setup); void __init plat_early_device_setup(void) { diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c index f5ff1ac..6a0f82f 100644 --- a/arch/sh/kernel/cpu/sh5/setup-sh5.c +++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c @@ -186,7 +186,7 @@ static int __init sh5_devices_setup(void) return platform_add_devices(sh5_devices, ARRAY_SIZE(sh5_devices)); } -__initcall(sh5_devices_setup); +arch_initcall(sh5_devices_setup); void __init plat_early_device_setup(void) { -- cgit v0.10.2 From ba3a17019181af5d6ab898760620c4e9916396c2 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 13 Aug 2009 11:39:02 +0900 Subject: sh: fix i2c init order on Migo-R V2 Convert the Migo-R board code to register devices at arch_initcall() time instead of __initcall(). This fix unbreaks migor_ts touch screen driver support. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c index f70f464..f9b2e4d 100644 --- a/arch/sh/boards/mach-migor/setup.c +++ b/arch/sh/boards/mach-migor/setup.c @@ -608,7 +608,7 @@ static int __init migor_devices_setup(void) return platform_add_devices(migor_devices, ARRAY_SIZE(migor_devices)); } -__initcall(migor_devices_setup); +arch_initcall(migor_devices_setup); /* Return the board specific boot mode pin configuration */ static int migor_mode_pins(void) -- cgit v0.10.2 From dbefd606a3b3634799b625f4900336e61c89e868 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 7 Aug 2009 03:52:18 +0000 Subject: sh: fix i2c init order on ap325rxa V2 Convert the AP325RXA board code to register devices at arch_initcall() time instead of device_initcall(). This fix unbreaks pcf8563 RTC driver support. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt diff --git a/arch/sh/boards/board-ap325rxa.c b/arch/sh/boards/board-ap325rxa.c index 7ffd1b43..b9c88cc 100644 --- a/arch/sh/boards/board-ap325rxa.c +++ b/arch/sh/boards/board-ap325rxa.c @@ -547,7 +547,7 @@ static int __init ap325rxa_devices_setup(void) return platform_add_devices(ap325rxa_devices, ARRAY_SIZE(ap325rxa_devices)); } -device_initcall(ap325rxa_devices_setup); +arch_initcall(ap325rxa_devices_setup); /* Return the board specific boot mode pin configuration */ static int ap325rxa_mode_pins(void) -- cgit v0.10.2 From 8f7a0dc51674ad0dd06155291b0aed60d655943c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Aug 2009 14:44:59 -0300 Subject: perf list: Fix large list output by using the pager When /sys/kernel/debug is mounted the list can be imense, so use the pager like the other tools. Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frederic Weisbecker Cc: Peter Zijlstra LKML-Reference: <20090812174459.GB3495@ghostprotocols.net> Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index f990fa8..d88c696 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -10,11 +10,12 @@ #include "perf.h" -#include "util/parse-options.h" #include "util/parse-events.h" +#include "util/cache.h" int cmd_list(int argc __used, const char **argv __used, const char *prefix __used) { + setup_pager(); print_events(); return 0; } -- cgit v0.10.2 From 28402971d869e26271b25301011f667d3a5640c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Aug 2009 10:13:22 +0200 Subject: perf_counter: Provide hw_perf_counter_setup_online() APIs Provide weak aliases for hw_perf_counter_setup_online(). This is used by the BTS patches (for v2.6.32), but it interacts with fixes so propagate this upstream. (it has no effect as of yet) Also export perf_counter_output() to architecture code. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a9d823a..2b36afe 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -694,6 +694,8 @@ struct perf_sample_data { extern int perf_counter_overflow(struct perf_counter *counter, int nmi, struct perf_sample_data *data); +extern void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); /* * Return 1 for a software counter, 0 for a hardware counter diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b0b20a0..e26d2fc 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -88,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } +void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, int nmi, +void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { int ret; @@ -4592,6 +4593,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_counter_init_cpu(cpu); break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_counter_setup_online(cpu); + break; + case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_counter_exit_cpu(cpu); @@ -4616,6 +4622,8 @@ void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); } -- cgit v0.10.2 From 3a9f131fb00b8ac5950a11ad1599e45edfb5ae44 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 13 Aug 2009 10:27:18 +0200 Subject: perf tools: Add a per tracepoint counter attribute to get raw sample Add a new flag field while opening a tracepoint perf counter: -e tracepoint_subsystem:tracepoint_name:flags This is intended to be generic although for now it only supports the r[e[c[o[r[d]]]]] flag: ./perf record -e workqueue:workqueue_insertion:record ./perf record -e workqueue:workqueue_insertion:r will have the same effect: enabling the raw samples record for the given tracepoint counter. In the future, we may want to support further flags, separated by commas. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1250152039-7284-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index de76008..78adc47 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -398,7 +398,7 @@ static void create_counter(int counter, int cpu, pid_t pid) PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID; - attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; + attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID; if (freq) { attr->sample_type |= PERF_SAMPLE_PERIOD; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 4858d83..0441784 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -379,6 +379,7 @@ static int parse_tracepoint_event(const char **strp, struct perf_counter_attr *attr) { const char *evt_name; + char *flags; char sys_name[MAX_EVENT_LENGTH]; char id_buf[4]; int fd; @@ -400,6 +401,15 @@ static int parse_tracepoint_event(const char **strp, strncpy(sys_name, *strp, sys_length); sys_name[sys_length] = '\0'; evt_name = evt_name + 1; + + flags = strchr(evt_name, ':'); + if (flags) { + *flags = '\0'; + flags++; + if (!strncmp(flags, "record", strlen(flags))) + attr->sample_type |= PERF_SAMPLE_RAW; + } + evt_length = strlen(evt_name); if (evt_length >= MAX_EVENT_LENGTH) return 0; -- cgit v0.10.2 From daac07b2e6b77f1bd44104aa2f0593e5505f27d4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 13 Aug 2009 10:27:19 +0200 Subject: perf tools: Add a general option to enable raw sample records While we can enable the perf sample records per tracepoint counter, we may also want to enable this option for every tracepoint counters to open, so that we don't need to add a :record flag for all of them. Add the -R, --raw-samples options for this purpose. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1250152039-7284-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 78adc47..3d051b9 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -34,6 +34,7 @@ static int output; static const char *output_name = "perf.data"; static int group = 0; static unsigned int realtime_prio = 0; +static int raw_samples = 0; static int system_wide = 0; static int profile_cpu = -1; static pid_t target_pid = -1; @@ -418,6 +419,8 @@ static void create_counter(int counter, int cpu, pid_t pid) if (call_graph) attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + if (raw_samples) + attr->sample_type |= PERF_SAMPLE_RAW; attr->mmap = track; attr->comm = track; @@ -650,6 +653,8 @@ static const struct option options[] = { "record events on existing pid"), OPT_INTEGER('r', "realtime", &realtime_prio, "collect data with this RT SCHED_FIFO priority"), + OPT_BOOLEAN('R', "raw-samples", &raw_samples, + "collect raw sample records from all opened counters"), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), OPT_BOOLEAN('A', "append", &append_file, -- cgit v0.10.2 From 8fd101f20bdf771949a8f3a5a779877d09b2fb56 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Aug 2009 18:19:57 -0300 Subject: perf report: Don't show unresolved DSOs and symbols when -S/-d is used We're interested in just those symbols/DSOs, so filter out the unresolved ones. Signed-off-by: Arnaldo Carvalho de Melo Cc: Peter Zijlstra LKML-Reference: <20090812211957.GE3495@ghostprotocols.net> Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 23e1457..b53a60f 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1590,10 +1590,11 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) if (show & show_mask) { struct symbol *sym = resolve_symbol(thread, &map, &dso, &ip); - if (dso_list && dso && dso->name && !strlist__has_entry(dso_list, dso->name)) + if (dso_list && (!dso || !dso->name || + !strlist__has_entry(dso_list, dso->name))) return 0; - if (sym_list && sym && !strlist__has_entry(sym_list, sym->name)) + if (sym_list && (!sym || !strlist__has_entry(sym_list, sym->name))) return 0; if (hist_entry__add(thread, map, dso, sym, ip, chain, level, period)) { -- cgit v0.10.2 From bcfc2602e8541ac13b1def38e2591dca072cff7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 09:51:55 +0200 Subject: perf_counter: Fix swcounter context invariance perf_swcounter_is_counting() uses a lock, which means we cannot use swcounters from NMI or when holding that particular lock, this is unintended. The below removes the lock, this opens up race window, but not worse than the swcounters already experience due to RCU traversal of the context in perf_swcounter_ctx_event(). This also fixes the hard lockups while opening a lockdep tracepoint counter. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Paul Mackerras Cc: stephane eranian Cc: Corey J Ashford LKML-Reference: <1250149915.10001.66.camel@twins> Signed-off-by: Ingo Molnar diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e26d2fc..3dd4339 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3444,40 +3444,32 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, static int perf_swcounter_is_counting(struct perf_counter *counter) { - struct perf_counter_context *ctx; - unsigned long flags; - int count; - + /* + * The counter is active, we're good! + */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) return 1; + /* + * The counter is off/error, not counting. + */ if (counter->state != PERF_COUNTER_STATE_INACTIVE) return 0; /* - * If the counter is inactive, it could be just because - * its task is scheduled out, or because it's in a group - * which could not go on the PMU. We want to count in - * the first case but not the second. If the context is - * currently active then an inactive software counter must - * be the second case. If it's not currently active then - * we need to know whether the counter was active when the - * context was last active, which we can determine by - * comparing counter->tstamp_stopped with ctx->time. - * - * We are within an RCU read-side critical section, - * which protects the existence of *ctx. + * The counter is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. */ - ctx = counter->ctx; - spin_lock_irqsave(&ctx->lock, flags); - count = 1; - /* Re-check state now we have the lock */ - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->ctx->is_active || - counter->tstamp_stopped < ctx->time) - count = 0; - spin_unlock_irqrestore(&ctx->lock, flags); - return count; + if (counter->ctx->is_active) + return 0; + + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v0.10.2 From 3dab77fb1bf89664bb1c9544607159dcab6f7d57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:53 +0200 Subject: perf: Rework/fix the whole read vs group stuff Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce PERF_FORMAT_GROUP to deal with group reads in a more generic way. This allows you to get group reads out of read() as well. Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.117411814@chello.nl> Signed-off-by: Ingo Molnar diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b36afe..b53f700 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -115,7 +115,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_TID = 1U << 1, PERF_SAMPLE_TIME = 1U << 2, PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_READ = 1U << 4, PERF_SAMPLE_CALLCHAIN = 1U << 5, PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, @@ -127,16 +127,32 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in attr.read_format to request that - * reads on the counter should return the indicated quantities, - * in increasing order of bit value, after the counter value. + * The format of the data returned by read() on a perf counter fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; */ enum perf_counter_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, - PERF_FORMAT_MAX = 1U << 3, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ @@ -343,10 +359,8 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u32 pid, tid; - * u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 parent_id; } && PERF_FORMAT_ID + * + * struct read_format values; * }; */ PERF_EVENT_READ = 8, @@ -364,11 +378,22 @@ enum perf_event_type { * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 period; } && PERF_SAMPLE_PERIOD * - * { u64 nr; - * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP + * { struct read_format values; } && PERF_SAMPLE_READ * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # + * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW * }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3dd4339..b8c6b97 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1692,7 +1692,32 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -static u64 perf_counter_read_tree(struct perf_counter *counter) +static int perf_counter_read_size(struct perf_counter *counter) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_GROUP) { + nr += counter->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_counter_read_value(struct perf_counter *counter) { struct perf_counter *child; u64 total = 0; @@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct perf_counter *counter) return total; } +static int perf_counter_read_entry(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_counter_read_group(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + struct perf_counter *leader = counter->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_counter_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + err = perf_counter_read_entry(counter, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_counter_read_one(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + /* * Read the performance counter - simple non blocking version for now */ static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 values[4]; - int n; + u64 read_format = counter->attr.read_format; + int ret; /* * Return end-of-file for a read on a counter that is in @@ -1721,28 +1828,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; + if (count < perf_counter_read_size(counter)) + return -ENOSPC; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read_tree(counter); - n = 1; - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_counter_read_group(counter, read_format, buf); + else + ret = perf_counter_read_one(counter, read_format, buf); mutex_unlock(&counter->child_mutex); - if (count < n * sizeof(u64)) - return -EINVAL; - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; + return ret; } static ssize_t @@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + u64 read_format = counter->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&counter->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + struct perf_counter *leader = counter->group_leader, *sub; + u64 read_format = counter->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != counter) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + n = 0; + + if (sub != counter) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + if (counter->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, counter); + else + perf_output_read_one(handle, counter); +} + void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { @@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_counter *counter, int nmi, struct { u32 pid, tid; } tid_entry; - struct { - u64 id; - u64 counter; - } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; @@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) header.size += sizeof(u64); - if (sample_type & PERF_SAMPLE_GROUP) { - header.size += sizeof(u64) + - counter->nr_siblings * sizeof(group_entry); - } + if (sample_type & PERF_SAMPLE_READ) + header.size += perf_counter_read_size(counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { callchain = perf_callchain(data->regs); @@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(&handle, data->period); - /* - * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. - */ - if (sample_type & PERF_SAMPLE_GROUP) { - struct perf_counter *leader, *sub; - u64 nr = counter->nr_siblings; - - perf_output_put(&handle, nr); - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->pmu->read(sub); - - group_entry.id = primary_counter_id(sub); - group_entry.counter = atomic64_read(&sub->count); - - perf_output_put(&handle, group_entry); - } - } + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(&handle, counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { if (callchain) @@ -2818,8 +2964,6 @@ struct perf_read_event { u32 pid; u32 tid; - u64 value; - u64 format[3]; }; static void @@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_counter *counter, .header = { .type = PERF_EVENT_READ, .misc = 0, - .size = sizeof(event) - sizeof(event.format), + .size = sizeof(event) + perf_counter_read_size(counter), }, .pid = perf_counter_pid(counter, task), .tid = perf_counter_tid(counter, task), - .value = atomic64_read(&counter->count), }; - int ret, i = 0; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_enabled; - } - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_running; - } - - if (counter->attr.read_format & PERF_FORMAT_ID) { - event.header.size += sizeof(u64); - event.format[i++] = primary_counter_id(counter); - } + int ret; ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); if (ret) return; - perf_output_copy(&handle, &event, event.header.size); + perf_output_put(&handle, event); + perf_output_read(&handle, counter); + perf_output_end(&handle); } @@ -3921,9 +4051,9 @@ perf_counter_alloc(struct perf_counter_attr *attr, atomic64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_SAMPLE_GROUP on inherited counters + * we currently do not support PERF_FORMAT_GROUP on inherited counters */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; switch (attr->type) { -- cgit v0.10.2 From 970892a9031a5dc7217bd394fb9d89fa75a4a7bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:54 +0200 Subject: perf_counter: Fix an ipi-deadlock perf_pending_counter() is called from IRQ context and will call perf_counter_disable(), however perf_counter_disable() uses smp_call_function_single() which doesn't fancy being used with IRQs disabled due to IPI deadlocks. Fix this by making it use the local __perf_counter_disable() call and teaching the counter_sched_out() code about pending disables as well. This should cover the case where a counter migrates before the pending queue gets processed. Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.244097721@chello.nl> Signed-off-by: Ingo Molnar diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b8c6b97..3f841be 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -307,6 +307,10 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; + if (counter->pending_disable) { + counter->pending_disable = 0; + counter->state = PERF_COUNTER_STATE_OFF; + } counter->tstamp_stopped = ctx->time; counter->pmu->disable(counter); counter->oncpu = -1; @@ -2343,7 +2347,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry) if (counter->pending_disable) { counter->pending_disable = 0; - perf_counter_disable(counter); + __perf_counter_disable(counter); } if (counter->pending_wakeup) { -- cgit v0.10.2 From 94d5d1b2d891f1fd5205f978246b7864d998b25c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 16:14:42 +0200 Subject: perf_counter: Report the cloning task as parent on perf_counter_fork() A bug in (9f498cc: perf_counter: Full task tracing) makes profiling multi-threaded apps it go belly up. [ output as: (PID:TID):(PPID:PTID) ] # ./perf report -D | grep FORK 0x4b0 [0x18]: PERF_EVENT_FORK: (3237:3237):(3236:3236) 0xa10 [0x18]: PERF_EVENT_FORK: (3237:3238):(3236:3236) 0xa70 [0x18]: PERF_EVENT_FORK: (3237:3239):(3236:3236) 0xad0 [0x18]: PERF_EVENT_FORK: (3237:3240):(3236:3236) 0xb18 [0x18]: PERF_EVENT_FORK: (3237:3241):(3236:3236) Shows us that the test (27d028d perf report: Update for the new FORK/EXIT events) in builtin-report.c: /* * A thread clone will have the same PID for both * parent and child. */ if (thread == parent) return 0; Will clearly fail. The problem is that perf_counter_fork() reports the actual parent, instead of the cloning thread. Fixing that (with the below patch), yields: # ./perf report -D | grep FORK 0x4c8 [0x18]: PERF_EVENT_FORK: (1590:1590):(1589:1589) 0xbd8 [0x18]: PERF_EVENT_FORK: (1590:1591):(1590:1590) 0xc80 [0x18]: PERF_EVENT_FORK: (1590:1592):(1590:1590) 0x3338 [0x18]: PERF_EVENT_FORK: (1590:1593):(1590:1590) 0x66b0 [0x18]: PERF_EVENT_FORK: (1590:1594):(1590:1590) Which both makes more sense and doesn't confuse perf report anymore. Reported-by: Pekka Enberg Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: Anton Blanchard Cc: Arjan van de Ven LKML-Reference: <1250172882.5241.62.camel@twins> Signed-off-by: Ingo Molnar diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3f841be..534e20d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3028,10 +3028,10 @@ static void perf_counter_task_output(struct perf_counter *counter, return; task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, task->real_parent); + task_event->event.ppid = perf_counter_pid(counter, current); task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, task->real_parent); + task_event->event.ptid = perf_counter_tid(counter, current); perf_output_put(&handle, task_event->event); perf_output_end(&handle); -- cgit v0.10.2 From e694958388c50148389b0e9b9e9e8945cf0f1b98 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Aug 2009 08:28:36 -0700 Subject: Make sock_sendpage() use kernel_sendpage() kernel_sendpage() does the proper default case handling for when the socket doesn't have a native sendpage implementation. Now, arguably this might be something that we could instead solve by just specifying that all protocols should do it themselves at the protocol level, but we really only care about the common protocols. Does anybody really care about sendpage on something like Appletalk? Not likely. Acked-by: David S. Miller Acked-by: Julien TINNES Acked-by: Tavis Ormandy Cc: stable@kernel.org Signed-off-by: Linus Torvalds diff --git a/net/socket.c b/net/socket.c index 791d71a..6d47165 100644 --- a/net/socket.c +++ b/net/socket.c @@ -736,7 +736,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, if (more) flags |= MSG_MORE; - return sock->ops->sendpage(sock, page, offset, size, flags); + return kernel_sendpage(sock, page, offset, size, flags); } static ssize_t sock_splice_read(struct file *file, loff_t *ppos, -- cgit v0.10.2 From 2d860ad76f4ee4d2eba0fe3797c8d7cdce432cc0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Aug 2009 13:05:10 -0700 Subject: genirq: prevent wakeup of freed irq thread free_irq() can remove an irqaction while the corresponding interrupt is in progress, but free_irq() sets action->thread to NULL unconditionally, which might lead to a NULL pointer dereference in handle_IRQ_event() when the hard interrupt context tries to wake up the handler thread. Prevent this by moving the thread stop after synchronize_irq(). No need to set action->thread to NULL either as action is going to be freed anyway. This fixes a boot crash reported against preempt-rt which uses the mainline irq threads code to implement full irq threading. [ tglx: removed local irqthread variable ] Signed-off-by: Linus Torvalds Signed-off-by: Thomas Gleixner diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 61c679d..d222515 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -761,7 +761,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; - struct task_struct *irqthread; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -809,9 +808,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } - irqthread = action->thread; - action->thread = NULL; - spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -819,12 +815,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU: */ synchronize_irq(irq); - if (irqthread) { - if (!test_bit(IRQTF_DIED, &action->thread_flags)) - kthread_stop(irqthread); - put_task_struct(irqthread); - } - #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -840,6 +830,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) local_irq_restore(flags); } #endif + + if (action->thread) { + if (!test_bit(IRQTF_DIED, &action->thread_flags)) + kthread_stop(action->thread); + put_task_struct(action->thread); + } + return action; } -- cgit v0.10.2 From 64f1607ffbbc772685733ea63e6f7f4183df1b16 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Aug 2009 15:43:34 -0700 Subject: Linux 2.6.31-rc6 diff --git a/Makefile b/Makefile index 0d46615..abcfa85 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 31 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Man-Eating Seals of Antiquity # *DOCUMENTATION* -- cgit v0.10.2 From 9f162d2a810b4db48f7b8d7e734d0932c81ec2a1 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:18:44 +0300 Subject: sunrpc: hton -> cpu_to_be* htonl is already defined as cpu_to_be32. cpu_to_be64 has architecture specific optimized implementations. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index b99c625..f94bbdc 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -117,9 +117,8 @@ static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int le static inline __be32 * xdr_encode_hyper(__be32 *p, __u64 val) { - *p++ = htonl(val >> 32); - *p++ = htonl(val & 0xFFFFFFFF); - return p; + *(__be64 *)p = cpu_to_be64(val); + return p + 2; } static inline __be32 * diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 406e26d..0d05d25 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -24,7 +24,7 @@ xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj) unsigned int quadlen = XDR_QUADLEN(obj->len); p[quadlen] = 0; /* zero trailing bytes */ - *p++ = htonl(obj->len); + *p++ = cpu_to_be32(obj->len); memcpy(p, obj->data, obj->len); return p + XDR_QUADLEN(obj->len); } @@ -83,7 +83,7 @@ EXPORT_SYMBOL_GPL(xdr_encode_opaque_fixed); */ __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int nbytes) { - *p++ = htonl(nbytes); + *p++ = cpu_to_be32(nbytes); return xdr_encode_opaque_fixed(p, ptr, nbytes); } EXPORT_SYMBOL_GPL(xdr_encode_opaque); @@ -779,7 +779,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_word); int xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) { - __be32 raw = htonl(obj); + __be32 raw = cpu_to_be32(obj); return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj)); } -- cgit v0.10.2 From 98866b5abe1513cdacc011874ca045d40002eccd Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:18:49 +0300 Subject: sunrpc: ntoh -> be*_to_cpu ntohl is already defined as be32_to_cpu. be64_to_cpu has architecture specific optimized implementations. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index f94bbdc..7da466b 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -124,9 +124,8 @@ xdr_encode_hyper(__be32 *p, __u64 val) static inline __be32 * xdr_decode_hyper(__be32 *p, __u64 *valp) { - *valp = ((__u64) ntohl(*p++)) << 32; - *valp |= ntohl(*p++); - return p; + *valp = be64_to_cpup((__be64 *)p); + return p + 2; } /* diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 0d05d25..8bd690c 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -35,7 +35,7 @@ xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj) { unsigned int len; - if ((len = ntohl(*p++)) > XDR_MAX_NETOBJ) + if ((len = be32_to_cpu(*p++)) > XDR_MAX_NETOBJ) return NULL; obj->len = len; obj->data = (u8 *) p; @@ -101,7 +101,7 @@ xdr_decode_string_inplace(__be32 *p, char **sp, { u32 len; - len = ntohl(*p++); + len = be32_to_cpu(*p++); if (len > maxlen) return NULL; *lenp = len; @@ -771,7 +771,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj)); if (status) return status; - *obj = ntohl(raw); + *obj = be32_to_cpu(raw); return 0; } EXPORT_SYMBOL_GPL(xdr_decode_word); -- cgit v0.10.2 From e75bc1c89e0c7dda0b140408ddee2ffaef7ba6d4 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:18:54 +0300 Subject: nfs: nfs4xdr: get rid of WRITE32 s/WRITE32/*p++ = cpu_to_be32/ Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 617273e..9a03b24 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -712,7 +712,6 @@ struct compound_hdr { * task to translate them into Linux-specific versions which are more * consistent with the style used in NFSv2/v3... */ -#define WRITE32(n) *p++ = htonl(n) #define WRITE64(n) do { \ *p++ = htonl((uint32_t)((n) >> 32)); \ *p++ = htonl((uint32_t)(n)); \ @@ -750,11 +749,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr, dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); - WRITE32(hdr->taglen); + *p++ = cpu_to_be32(hdr->taglen); WRITEMEM(hdr->tag, hdr->taglen); - WRITE32(hdr->minorversion); + *p++ = cpu_to_be32(hdr->minorversion); hdr->nops_p = p; - WRITE32(hdr->nops); + *p++ = cpu_to_be32(hdr->nops); } static void encode_nops(struct compound_hdr *hdr) @@ -835,7 +834,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const * We write the bitmap length now, but leave the bitmap and the attribute * buffer length to be backfilled at the end of this routine. */ - WRITE32(2); + *p++ = cpu_to_be32(2); q = p; p += 3; @@ -845,39 +844,39 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const } if (iap->ia_valid & ATTR_MODE) { bmval1 |= FATTR4_WORD1_MODE; - WRITE32(iap->ia_mode & S_IALLUGO); + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); } if (iap->ia_valid & ATTR_UID) { bmval1 |= FATTR4_WORD1_OWNER; - WRITE32(owner_namelen); + *p++ = cpu_to_be32(owner_namelen); WRITEMEM(owner_name, owner_namelen); } if (iap->ia_valid & ATTR_GID) { bmval1 |= FATTR4_WORD1_OWNER_GROUP; - WRITE32(owner_grouplen); + *p++ = cpu_to_be32(owner_grouplen); WRITEMEM(owner_group, owner_grouplen); } if (iap->ia_valid & ATTR_ATIME_SET) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_ATIME) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (iap->ia_valid & ATTR_MTIME_SET) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_MTIME) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } /* @@ -901,8 +900,8 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd __be32 *p; RESERVE_SPACE(8); - WRITE32(OP_ACCESS); - WRITE32(access); + *p++ = cpu_to_be32(OP_ACCESS); + *p++ = cpu_to_be32(access); hdr->nops++; hdr->replen += decode_access_maxsz; } @@ -912,8 +911,8 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg __be32 *p; RESERVE_SPACE(8+NFS4_STATEID_SIZE); - WRITE32(OP_CLOSE); - WRITE32(arg->seqid->sequence->counter); + *p++ = cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_close_maxsz; @@ -924,9 +923,9 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar __be32 *p; RESERVE_SPACE(16); - WRITE32(OP_COMMIT); + *p++ = cpu_to_be32(OP_COMMIT); WRITE64(args->offset); - WRITE32(args->count); + *p++ = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_commit_maxsz; } @@ -936,20 +935,20 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * __be32 *p; RESERVE_SPACE(8); - WRITE32(OP_CREATE); - WRITE32(create->ftype); + *p++ = cpu_to_be32(OP_CREATE); + *p++ = cpu_to_be32(create->ftype); switch (create->ftype) { case NF4LNK: RESERVE_SPACE(4); - WRITE32(create->u.symlink.len); + *p++ = cpu_to_be32(create->u.symlink.len); xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); break; case NF4BLK: case NF4CHR: RESERVE_SPACE(8); - WRITE32(create->u.device.specdata1); - WRITE32(create->u.device.specdata2); + *p++ = cpu_to_be32(create->u.device.specdata1); + *p++ = cpu_to_be32(create->u.device.specdata2); break; default: @@ -957,7 +956,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } RESERVE_SPACE(4 + create->name->len); - WRITE32(create->name->len); + *p++ = cpu_to_be32(create->name->len); WRITEMEM(create->name->name, create->name->len); hdr->nops++; hdr->replen += decode_create_maxsz; @@ -970,9 +969,9 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c __be32 *p; RESERVE_SPACE(12); - WRITE32(OP_GETATTR); - WRITE32(1); - WRITE32(bitmap); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(bitmap); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -982,10 +981,10 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm __be32 *p; RESERVE_SPACE(16); - WRITE32(OP_GETATTR); - WRITE32(2); - WRITE32(bm0); - WRITE32(bm1); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p++ = cpu_to_be32(bm1); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -1013,7 +1012,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) __be32 *p; RESERVE_SPACE(4); - WRITE32(OP_GETFH); + *p++ = cpu_to_be32(OP_GETFH); hdr->nops++; hdr->replen += decode_getfh_maxsz; } @@ -1023,8 +1022,8 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct __be32 *p; RESERVE_SPACE(8 + name->len); - WRITE32(OP_LINK); - WRITE32(name->len); + *p++ = cpu_to_be32(OP_LINK); + *p++ = cpu_to_be32(name->len); WRITEMEM(name->name, name->len); hdr->nops++; hdr->replen += decode_link_maxsz; @@ -1053,26 +1052,26 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args __be32 *p; RESERVE_SPACE(32); - WRITE32(OP_LOCK); - WRITE32(nfs4_lock_type(args->fl, args->block)); - WRITE32(args->reclaim); + *p++ = cpu_to_be32(OP_LOCK); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); + *p++ = cpu_to_be32(args->reclaim); WRITE64(args->fl->fl_start); WRITE64(nfs4_lock_length(args->fl)); - WRITE32(args->new_lock_owner); + *p++ = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); - WRITE32(args->open_seqid->sequence->counter); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); - WRITE32(args->lock_seqid->sequence->counter); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); WRITE64(args->lock_owner.clientid); - WRITE32(16); + *p++ = cpu_to_be32(16); WRITEMEM("lock id:", 8); WRITE64(args->lock_owner.id); } else { RESERVE_SPACE(NFS4_STATEID_SIZE+4); WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); - WRITE32(args->lock_seqid->sequence->counter); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); } hdr->nops++; hdr->replen += decode_lock_maxsz; @@ -1083,12 +1082,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar __be32 *p; RESERVE_SPACE(52); - WRITE32(OP_LOCKT); - WRITE32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); WRITE64(args->fl->fl_start); WRITE64(nfs4_lock_length(args->fl)); WRITE64(args->lock_owner.clientid); - WRITE32(16); + *p++ = cpu_to_be32(16); WRITEMEM("lock id:", 8); WRITE64(args->lock_owner.id); hdr->nops++; @@ -1100,9 +1099,9 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar __be32 *p; RESERVE_SPACE(12+NFS4_STATEID_SIZE+16); - WRITE32(OP_LOCKU); - WRITE32(nfs4_lock_type(args->fl, 0)); - WRITE32(args->seqid->sequence->counter); + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); WRITE64(args->fl->fl_start); WRITE64(nfs4_lock_length(args->fl)); @@ -1116,8 +1115,8 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc __be32 *p; RESERVE_SPACE(8 + len); - WRITE32(OP_LOOKUP); - WRITE32(len); + *p++ = cpu_to_be32(OP_LOOKUP); + *p++ = cpu_to_be32(len); WRITEMEM(name->name, len); hdr->nops++; hdr->replen += decode_lookup_maxsz; @@ -1130,18 +1129,18 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) RESERVE_SPACE(8); switch (fmode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: - WRITE32(NFS4_SHARE_ACCESS_READ); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); break; case FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_WRITE); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE); break; case FMODE_READ|FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_BOTH); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH); break; default: - WRITE32(0); + *p++ = cpu_to_be32(0); } - WRITE32(0); /* for linux, share_deny = 0 always */ + *p++ = cpu_to_be32(0); /* for linux, share_deny = 0 always */ } static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) @@ -1152,12 +1151,12 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena * owner 4 = 32 */ RESERVE_SPACE(8); - WRITE32(OP_OPEN); - WRITE32(arg->seqid->sequence->counter); + *p++ = cpu_to_be32(OP_OPEN); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); RESERVE_SPACE(28); WRITE64(arg->clientid); - WRITE32(16); + *p++ = cpu_to_be32(16); WRITEMEM("open id:", 8); WRITE64(arg->id); } @@ -1169,11 +1168,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op RESERVE_SPACE(4); switch(arg->open_flags & O_EXCL) { case 0: - WRITE32(NFS4_CREATE_UNCHECKED); + *p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED); encode_attrs(xdr, arg->u.attrs, arg->server); break; default: - WRITE32(NFS4_CREATE_EXCLUSIVE); + *p++ = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); encode_nfs4_verifier(xdr, &arg->u.verifier); } } @@ -1185,11 +1184,11 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a RESERVE_SPACE(4); switch (arg->open_flags & O_CREAT) { case 0: - WRITE32(NFS4_OPEN_NOCREATE); + *p++ = cpu_to_be32(NFS4_OPEN_NOCREATE); break; default: BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); - WRITE32(NFS4_OPEN_CREATE); + *p++ = cpu_to_be32(NFS4_OPEN_CREATE); encode_createmode(xdr, arg); } } @@ -1201,13 +1200,13 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega RESERVE_SPACE(4); switch (delegation_type) { case 0: - WRITE32(NFS4_OPEN_DELEGATE_NONE); + *p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); break; case FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_READ); + *p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); break; case FMODE_WRITE|FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_WRITE); + *p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); break; default: BUG(); @@ -1219,7 +1218,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * __be32 *p; RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_NULL); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); encode_string(xdr, name->len, name->name); } @@ -1228,7 +1227,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) __be32 *p; RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); encode_delegation_type(xdr, type); } @@ -1237,7 +1236,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc __be32 *p; RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); WRITEMEM(stateid->data, NFS4_STATEID_SIZE); encode_string(xdr, name->len, name->name); } @@ -1268,9 +1267,9 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co __be32 *p; RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_CONFIRM); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); hdr->nops++; hdr->replen += decode_open_confirm_maxsz; } @@ -1280,9 +1279,9 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close __be32 *p; RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_DOWNGRADE); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); hdr->nops++; hdr->replen += decode_open_downgrade_maxsz; @@ -1295,8 +1294,8 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd __be32 *p; RESERVE_SPACE(8 + len); - WRITE32(OP_PUTFH); - WRITE32(len); + *p++ = cpu_to_be32(OP_PUTFH); + *p++ = cpu_to_be32(len); WRITEMEM(fh->data, len); hdr->nops++; hdr->replen += decode_putfh_maxsz; @@ -1307,7 +1306,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) __be32 *p; RESERVE_SPACE(4); - WRITE32(OP_PUTROOTFH); + *p++ = cpu_to_be32(OP_PUTROOTFH); hdr->nops++; hdr->replen += decode_putrootfh_maxsz; } @@ -1330,13 +1329,13 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, __be32 *p; RESERVE_SPACE(4); - WRITE32(OP_READ); + *p++ = cpu_to_be32(OP_READ); encode_stateid(xdr, args->context); RESERVE_SPACE(12); WRITE64(args->offset); - WRITE32(args->count); + *p++ = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_read_maxsz; } @@ -1350,19 +1349,19 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg __be32 *p; RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); - WRITE32(OP_READDIR); + *p++ = cpu_to_be32(OP_READDIR); WRITE64(readdir->cookie); WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); - WRITE32(readdir->count >> 1); /* We're not doing readdirplus */ - WRITE32(readdir->count); - WRITE32(2); + *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ + *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(2); /* Switch to mounted_on_fileid if the server supports it */ if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) attrs[0] &= ~FATTR4_WORD0_FILEID; else attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - WRITE32(attrs[0] & readdir->bitmask[0]); - WRITE32(attrs[1] & readdir->bitmask[1]); + *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); + *p++ = cpu_to_be32(attrs[1] & readdir->bitmask[1]); hdr->nops++; hdr->replen += decode_readdir_maxsz; dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", @@ -1379,7 +1378,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink * __be32 *p; RESERVE_SPACE(4); - WRITE32(OP_READLINK); + *p++ = cpu_to_be32(OP_READLINK); hdr->nops++; hdr->replen += decode_readlink_maxsz; } @@ -1389,8 +1388,8 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc __be32 *p; RESERVE_SPACE(8 + name->len); - WRITE32(OP_REMOVE); - WRITE32(name->len); + *p++ = cpu_to_be32(OP_REMOVE); + *p++ = cpu_to_be32(name->len); WRITEMEM(name->name, name->len); hdr->nops++; hdr->replen += decode_remove_maxsz; @@ -1401,12 +1400,12 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co __be32 *p; RESERVE_SPACE(8 + oldname->len); - WRITE32(OP_RENAME); - WRITE32(oldname->len); + *p++ = cpu_to_be32(OP_RENAME); + *p++ = cpu_to_be32(oldname->len); WRITEMEM(oldname->name, oldname->len); RESERVE_SPACE(4 + newname->len); - WRITE32(newname->len); + *p++ = cpu_to_be32(newname->len); WRITEMEM(newname->name, newname->len); hdr->nops++; hdr->replen += decode_rename_maxsz; @@ -1417,7 +1416,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client __be32 *p; RESERVE_SPACE(12); - WRITE32(OP_RENEW); + *p++ = cpu_to_be32(OP_RENEW); WRITE64(client_stateid->cl_clientid); hdr->nops++; hdr->replen += decode_renew_maxsz; @@ -1429,7 +1428,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) __be32 *p; RESERVE_SPACE(4); - WRITE32(OP_RESTOREFH); + *p++ = cpu_to_be32(OP_RESTOREFH); hdr->nops++; hdr->replen += decode_restorefh_maxsz; } @@ -1440,15 +1439,15 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun __be32 *p; RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); + *p++ = cpu_to_be32(OP_SETATTR); WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); RESERVE_SPACE(2*4); - WRITE32(1); - WRITE32(FATTR4_WORD0_ACL); + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(FATTR4_WORD0_ACL); if (arg->acl_len % 4) return -EINVAL; RESERVE_SPACE(4); - WRITE32(arg->acl_len); + *p++ = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); hdr->nops++; hdr->replen += decode_setacl_maxsz; @@ -1461,7 +1460,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) __be32 *p; RESERVE_SPACE(4); - WRITE32(OP_SAVEFH); + *p++ = cpu_to_be32(OP_SAVEFH); hdr->nops++; hdr->replen += decode_savefh_maxsz; } @@ -1471,7 +1470,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs __be32 *p; RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); + *p++ = cpu_to_be32(OP_SETATTR); WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_setattr_maxsz; @@ -1483,16 +1482,16 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie __be32 *p; RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID); + *p++ = cpu_to_be32(OP_SETCLIENTID); WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); RESERVE_SPACE(4); - WRITE32(setclientid->sc_prog); + *p++ = cpu_to_be32(setclientid->sc_prog); encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); RESERVE_SPACE(4); - WRITE32(setclientid->sc_cb_ident); + *p++ = cpu_to_be32(setclientid->sc_cb_ident); hdr->nops++; hdr->replen += decode_setclientid_maxsz; } @@ -1502,7 +1501,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_ __be32 *p; RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID_CONFIRM); + *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); WRITE64(client_state->cl_clientid); WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); hdr->nops++; @@ -1514,14 +1513,14 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg __be32 *p; RESERVE_SPACE(4); - WRITE32(OP_WRITE); + *p++ = cpu_to_be32(OP_WRITE); encode_stateid(xdr, args->context); RESERVE_SPACE(16); WRITE64(args->offset); - WRITE32(args->stable); - WRITE32(args->count); + *p++ = cpu_to_be32(args->stable); + *p++ = cpu_to_be32(args->count); xdr_write_pages(xdr, args->pages, args->pgbase, args->count); hdr->nops++; @@ -1534,7 +1533,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_DELEGRETURN); + *p++ = cpu_to_be32(OP_DELEGRETURN); WRITEMEM(stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_delegreturn_maxsz; @@ -1549,15 +1548,15 @@ static void encode_exchange_id(struct xdr_stream *xdr, __be32 *p; RESERVE_SPACE(4 + sizeof(args->verifier->data)); - WRITE32(OP_EXCHANGE_ID); + *p++ = cpu_to_be32(OP_EXCHANGE_ID); WRITEMEM(args->verifier->data, sizeof(args->verifier->data)); encode_string(xdr, args->id_len, args->id); RESERVE_SPACE(12); - WRITE32(args->flags); - WRITE32(0); /* zero length state_protect4_a */ - WRITE32(0); /* zero length implementation id array */ + *p++ = cpu_to_be32(args->flags); + *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ + *p++ = cpu_to_be32(0); /* zero length implementation id array */ hdr->nops++; hdr->replen += decode_exchange_id_maxsz; } @@ -1572,54 +1571,54 @@ static void encode_create_session(struct xdr_stream *xdr, struct nfs_client *clp = args->client; RESERVE_SPACE(4); - WRITE32(OP_CREATE_SESSION); + *p++ = cpu_to_be32(OP_CREATE_SESSION); RESERVE_SPACE(8); WRITE64(clp->cl_ex_clid); RESERVE_SPACE(8); - WRITE32(clp->cl_seqid); /*Sequence id */ - WRITE32(args->flags); /*flags */ + *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ + *p++ = cpu_to_be32(args->flags); /*flags */ RESERVE_SPACE(2*28); /* 2 channel_attrs */ /* Fore Channel */ - WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ - WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */ - WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */ - WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ - WRITE32(args->fc_attrs.max_ops); /* max operations */ - WRITE32(args->fc_attrs.max_reqs); /* max requests */ - WRITE32(0); /* rdmachannel_attrs */ + *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ /* Back Channel */ - WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ - WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */ - WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */ - WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ - WRITE32(args->bc_attrs.max_ops); /* max operations */ - WRITE32(args->bc_attrs.max_reqs); /* max requests */ - WRITE32(0); /* rdmachannel_attrs */ + *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ RESERVE_SPACE(4); - WRITE32(args->cb_program); /* cb_program */ + *p++ = cpu_to_be32(args->cb_program); /* cb_program */ RESERVE_SPACE(4); /* # of security flavors */ - WRITE32(1); + *p++ = cpu_to_be32(1); RESERVE_SPACE(4); - WRITE32(RPC_AUTH_UNIX); /* auth_sys */ + *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ RESERVE_SPACE(4); - WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ + *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ len = scnprintf(machine_name, sizeof(machine_name), "%s", clp->cl_ipaddr); RESERVE_SPACE(16 + len); - WRITE32(len); + *p++ = cpu_to_be32(len); WRITEMEM(machine_name, len); - WRITE32(0); /* UID */ - WRITE32(0); /* GID */ - WRITE32(0); /* No more gids */ + *p++ = cpu_to_be32(0); /* UID */ + *p++ = cpu_to_be32(0); /* GID */ + *p++ = cpu_to_be32(0); /* No more gids */ hdr->nops++; hdr->replen += decode_create_session_maxsz; } @@ -1630,7 +1629,7 @@ static void encode_destroy_session(struct xdr_stream *xdr, { __be32 *p; RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN); - WRITE32(OP_DESTROY_SESSION); + *p++ = cpu_to_be32(OP_DESTROY_SESSION); WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); hdr->nops++; hdr->replen += decode_destroy_session_maxsz; @@ -1656,7 +1655,7 @@ static void encode_sequence(struct xdr_stream *xdr, slot = tp->slots + args->sa_slotid; RESERVE_SPACE(4); - WRITE32(OP_SEQUENCE); + *p++ = cpu_to_be32(OP_SEQUENCE); /* * Sessionid + seqid + slotid + max slotid + cache_this @@ -1672,10 +1671,10 @@ static void encode_sequence(struct xdr_stream *xdr, tp->highest_used_slotid, args->sa_cache_this); RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16); WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(slot->seq_nr); - WRITE32(args->sa_slotid); - WRITE32(tp->highest_used_slotid); - WRITE32(args->sa_cache_this); + *p++ = cpu_to_be32(slot->seq_nr); + *p++ = cpu_to_be32(args->sa_slotid); + *p++ = cpu_to_be32(tp->highest_used_slotid); + *p++ = cpu_to_be32(args->sa_cache_this); hdr->nops++; hdr->replen += decode_sequence_maxsz; #endif /* CONFIG_NFS_V4_1 */ -- cgit v0.10.2 From b95be5a976848febff82edb21d5b4351b3997bf6 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:01 +0300 Subject: nfs: nfs4xdr: get rid of WRITE64 s/WRITE64/p = xdr_encode_hyper(p, / Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9a03b24..5d80766 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -712,10 +712,6 @@ struct compound_hdr { * task to translate them into Linux-specific versions which are more * consistent with the style used in NFSv2/v3... */ -#define WRITE64(n) do { \ - *p++ = htonl((uint32_t)((n) >> 32)); \ - *p++ = htonl((uint32_t)(n)); \ -} while (0) #define WRITEMEM(ptr,nbytes) do { \ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ } while (0) @@ -840,7 +836,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_SIZE) { bmval0 |= FATTR4_WORD0_SIZE; - WRITE64(iap->ia_size); + p = xdr_encode_hyper(p, iap->ia_size); } if (iap->ia_valid & ATTR_MODE) { bmval1 |= FATTR4_WORD1_MODE; @@ -924,7 +920,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar RESERVE_SPACE(16); *p++ = cpu_to_be32(OP_COMMIT); - WRITE64(args->offset); + p = xdr_encode_hyper(p, args->offset); *p++ = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_commit_maxsz; @@ -1055,18 +1051,18 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args *p++ = cpu_to_be32(OP_LOCK); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); *p++ = cpu_to_be32(args->reclaim); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); *p++ = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); *p++ = cpu_to_be32(args->open_seqid->sequence->counter); WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); - WRITE64(args->lock_owner.clientid); + p = xdr_encode_hyper(p, args->lock_owner.clientid); *p++ = cpu_to_be32(16); WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); + p = xdr_encode_hyper(p, args->lock_owner.id); } else { RESERVE_SPACE(NFS4_STATEID_SIZE+4); @@ -1084,12 +1080,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar RESERVE_SPACE(52); *p++ = cpu_to_be32(OP_LOCKT); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - WRITE64(args->lock_owner.clientid); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + p = xdr_encode_hyper(p, args->lock_owner.clientid); *p++ = cpu_to_be32(16); WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); + p = xdr_encode_hyper(p, args->lock_owner.id); hdr->nops++; hdr->replen += decode_lockt_maxsz; } @@ -1103,8 +1099,8 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); *p++ = cpu_to_be32(args->seqid->sequence->counter); WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); hdr->nops++; hdr->replen += decode_locku_maxsz; } @@ -1155,10 +1151,10 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena *p++ = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); RESERVE_SPACE(28); - WRITE64(arg->clientid); + p = xdr_encode_hyper(p, arg->clientid); *p++ = cpu_to_be32(16); WRITEMEM("open id:", 8); - WRITE64(arg->id); + p = xdr_encode_hyper(p, arg->id); } static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) @@ -1334,7 +1330,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, encode_stateid(xdr, args->context); RESERVE_SPACE(12); - WRITE64(args->offset); + p = xdr_encode_hyper(p, args->offset); *p++ = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_read_maxsz; @@ -1350,7 +1346,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); *p++ = cpu_to_be32(OP_READDIR); - WRITE64(readdir->cookie); + p = xdr_encode_hyper(p, readdir->cookie); WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ *p++ = cpu_to_be32(readdir->count); @@ -1417,7 +1413,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client RESERVE_SPACE(12); *p++ = cpu_to_be32(OP_RENEW); - WRITE64(client_stateid->cl_clientid); + p = xdr_encode_hyper(p, client_stateid->cl_clientid); hdr->nops++; hdr->replen += decode_renew_maxsz; } @@ -1502,7 +1498,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_ RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); - WRITE64(client_state->cl_clientid); + p = xdr_encode_hyper(p, client_state->cl_clientid); WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); hdr->nops++; hdr->replen += decode_setclientid_confirm_maxsz; @@ -1518,7 +1514,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg encode_stateid(xdr, args->context); RESERVE_SPACE(16); - WRITE64(args->offset); + p = xdr_encode_hyper(p, args->offset); *p++ = cpu_to_be32(args->stable); *p++ = cpu_to_be32(args->count); @@ -1574,7 +1570,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(OP_CREATE_SESSION); RESERVE_SPACE(8); - WRITE64(clp->cl_ex_clid); + p = xdr_encode_hyper(p, clp->cl_ex_clid); RESERVE_SPACE(8); *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ -- cgit v0.10.2 From 93f0cf25944695e1229fe90a2897af0211fbd425 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:06 +0300 Subject: nfs: nfs4xdr: get rid of WRITEMEM s/WRITEMEM(/p = xdr_encode_opaque_fixed(p, / Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5d80766..17915c8 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -712,10 +712,6 @@ struct compound_hdr { * task to translate them into Linux-specific versions which are more * consistent with the style used in NFSv2/v3... */ -#define WRITEMEM(ptr,nbytes) do { \ - p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ -} while (0) - #define RESERVE_SPACE(nbytes) do { \ p = xdr_reserve_space(xdr, nbytes); \ BUG_ON(!p); \ @@ -746,7 +742,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr, BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); *p++ = cpu_to_be32(hdr->taglen); - WRITEMEM(hdr->tag, hdr->taglen); + p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen); *p++ = cpu_to_be32(hdr->minorversion); hdr->nops_p = p; *p++ = cpu_to_be32(hdr->nops); @@ -845,12 +841,12 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_UID) { bmval1 |= FATTR4_WORD1_OWNER; *p++ = cpu_to_be32(owner_namelen); - WRITEMEM(owner_name, owner_namelen); + p = xdr_encode_opaque_fixed(p, owner_name, owner_namelen); } if (iap->ia_valid & ATTR_GID) { bmval1 |= FATTR4_WORD1_OWNER_GROUP; *p++ = cpu_to_be32(owner_grouplen); - WRITEMEM(owner_group, owner_grouplen); + p = xdr_encode_opaque_fixed(p, owner_group, owner_grouplen); } if (iap->ia_valid & ATTR_ATIME_SET) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; @@ -909,7 +905,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg RESERVE_SPACE(8+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_CLOSE); *p++ = cpu_to_be32(arg->seqid->sequence->counter); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_close_maxsz; } @@ -953,7 +949,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * RESERVE_SPACE(4 + create->name->len); *p++ = cpu_to_be32(create->name->len); - WRITEMEM(create->name->name, create->name->len); + p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len); hdr->nops++; hdr->replen += decode_create_maxsz; @@ -1020,7 +1016,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct RESERVE_SPACE(8 + name->len); *p++ = cpu_to_be32(OP_LINK); *p++ = cpu_to_be32(name->len); - WRITEMEM(name->name, name->len); + p = xdr_encode_opaque_fixed(p, name->name, name->len); hdr->nops++; hdr->replen += decode_link_maxsz; } @@ -1057,16 +1053,16 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args if (args->new_lock_owner){ RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); *p++ = cpu_to_be32(args->open_seqid->sequence->counter); - WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); p = xdr_encode_hyper(p, args->lock_owner.clientid); *p++ = cpu_to_be32(16); - WRITEMEM("lock id:", 8); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); p = xdr_encode_hyper(p, args->lock_owner.id); } else { RESERVE_SPACE(NFS4_STATEID_SIZE+4); - WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); } hdr->nops++; @@ -1084,7 +1080,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); p = xdr_encode_hyper(p, args->lock_owner.clientid); *p++ = cpu_to_be32(16); - WRITEMEM("lock id:", 8); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); p = xdr_encode_hyper(p, args->lock_owner.id); hdr->nops++; hdr->replen += decode_lockt_maxsz; @@ -1098,7 +1094,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar *p++ = cpu_to_be32(OP_LOCKU); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); *p++ = cpu_to_be32(args->seqid->sequence->counter); - WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); p = xdr_encode_hyper(p, args->fl->fl_start); p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); hdr->nops++; @@ -1113,7 +1109,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc RESERVE_SPACE(8 + len); *p++ = cpu_to_be32(OP_LOOKUP); *p++ = cpu_to_be32(len); - WRITEMEM(name->name, len); + p = xdr_encode_opaque_fixed(p, name->name, len); hdr->nops++; hdr->replen += decode_lookup_maxsz; } @@ -1153,7 +1149,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena RESERVE_SPACE(28); p = xdr_encode_hyper(p, arg->clientid); *p++ = cpu_to_be32(16); - WRITEMEM("open id:", 8); + p = xdr_encode_opaque_fixed(p, "open id:", 8); p = xdr_encode_hyper(p, arg->id); } @@ -1233,7 +1229,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc RESERVE_SPACE(4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); encode_string(xdr, name->len, name->name); } @@ -1264,7 +1260,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); *p++ = cpu_to_be32(OP_OPEN_CONFIRM); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(arg->seqid->sequence->counter); hdr->nops++; hdr->replen += decode_open_confirm_maxsz; @@ -1276,7 +1272,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); hdr->nops++; @@ -1292,7 +1288,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd RESERVE_SPACE(8 + len); *p++ = cpu_to_be32(OP_PUTFH); *p++ = cpu_to_be32(len); - WRITEMEM(fh->data, len); + p = xdr_encode_opaque_fixed(p, fh->data, len); hdr->nops++; hdr->replen += decode_putfh_maxsz; } @@ -1315,9 +1311,9 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context RESERVE_SPACE(NFS4_STATEID_SIZE); if (ctx->state != NULL) { nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); - WRITEMEM(stateid.data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); } static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) @@ -1347,7 +1343,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); *p++ = cpu_to_be32(OP_READDIR); p = xdr_encode_hyper(p, readdir->cookie); - WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); + p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ *p++ = cpu_to_be32(readdir->count); *p++ = cpu_to_be32(2); @@ -1386,7 +1382,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc RESERVE_SPACE(8 + name->len); *p++ = cpu_to_be32(OP_REMOVE); *p++ = cpu_to_be32(name->len); - WRITEMEM(name->name, name->len); + p = xdr_encode_opaque_fixed(p, name->name, name->len); hdr->nops++; hdr->replen += decode_remove_maxsz; } @@ -1398,11 +1394,11 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co RESERVE_SPACE(8 + oldname->len); *p++ = cpu_to_be32(OP_RENAME); *p++ = cpu_to_be32(oldname->len); - WRITEMEM(oldname->name, oldname->len); + p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len); RESERVE_SPACE(4 + newname->len); *p++ = cpu_to_be32(newname->len); - WRITEMEM(newname->name, newname->len); + p = xdr_encode_opaque_fixed(p, newname->name, newname->len); hdr->nops++; hdr->replen += decode_rename_maxsz; } @@ -1436,7 +1432,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun RESERVE_SPACE(4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_SETATTR); - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); RESERVE_SPACE(2*4); *p++ = cpu_to_be32(1); *p++ = cpu_to_be32(FATTR4_WORD0_ACL); @@ -1467,7 +1463,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs RESERVE_SPACE(4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_SETATTR); - WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_setattr_maxsz; encode_attrs(xdr, arg->iap, server); @@ -1479,7 +1475,7 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); *p++ = cpu_to_be32(OP_SETCLIENTID); - WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); + p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); RESERVE_SPACE(4); @@ -1499,7 +1495,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_ RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); p = xdr_encode_hyper(p, client_state->cl_clientid); - WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); + p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); hdr->nops++; hdr->replen += decode_setclientid_confirm_maxsz; } @@ -1530,7 +1526,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state RESERVE_SPACE(4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_DELEGRETURN); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_delegreturn_maxsz; } @@ -1545,7 +1541,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, RESERVE_SPACE(4 + sizeof(args->verifier->data)); *p++ = cpu_to_be32(OP_EXCHANGE_ID); - WRITEMEM(args->verifier->data, sizeof(args->verifier->data)); + p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); encode_string(xdr, args->id_len, args->id); @@ -1611,7 +1607,7 @@ static void encode_create_session(struct xdr_stream *xdr, clp->cl_ipaddr); RESERVE_SPACE(16 + len); *p++ = cpu_to_be32(len); - WRITEMEM(machine_name, len); + p = xdr_encode_opaque_fixed(p, machine_name, len); *p++ = cpu_to_be32(0); /* UID */ *p++ = cpu_to_be32(0); /* GID */ *p++ = cpu_to_be32(0); /* No more gids */ @@ -1626,7 +1622,7 @@ static void encode_destroy_session(struct xdr_stream *xdr, __be32 *p; RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN); *p++ = cpu_to_be32(OP_DESTROY_SESSION); - WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); hdr->nops++; hdr->replen += decode_destroy_session_maxsz; } @@ -1666,7 +1662,7 @@ static void encode_sequence(struct xdr_stream *xdr, slot->seq_nr, args->sa_slotid, tp->highest_used_slotid, args->sa_cache_this); RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16); - WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); *p++ = cpu_to_be32(slot->seq_nr); *p++ = cpu_to_be32(args->sa_slotid); *p++ = cpu_to_be32(tp->highest_used_slotid); -- cgit v0.10.2 From 42edd698125b76a38bd9999015202db036dfbc76 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:13 +0300 Subject: nfs: nfs4xdr: optimize RESERVE_SPACE in encode_create_session and encode_sequence Coalesce multilpe constant RESERVE_SPACEs into one Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 17915c8..d460d81 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1562,17 +1562,15 @@ static void encode_create_session(struct xdr_stream *xdr, uint32_t len; struct nfs_client *clp = args->client; - RESERVE_SPACE(4); - *p++ = cpu_to_be32(OP_CREATE_SESSION); + len = scnprintf(machine_name, sizeof(machine_name), "%s", + clp->cl_ipaddr); - RESERVE_SPACE(8); + RESERVE_SPACE(20 + 2*28 + 20 + len + 12); + *p++ = cpu_to_be32(OP_CREATE_SESSION); p = xdr_encode_hyper(p, clp->cl_ex_clid); - - RESERVE_SPACE(8); *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ *p++ = cpu_to_be32(args->flags); /*flags */ - RESERVE_SPACE(2*28); /* 2 channel_attrs */ /* Fore Channel */ *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ @@ -1591,21 +1589,12 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */ *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ - RESERVE_SPACE(4); *p++ = cpu_to_be32(args->cb_program); /* cb_program */ - - RESERVE_SPACE(4); /* # of security flavors */ *p++ = cpu_to_be32(1); - - RESERVE_SPACE(4); *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ - RESERVE_SPACE(4); *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ - len = scnprintf(machine_name, sizeof(machine_name), "%s", - clp->cl_ipaddr); - RESERVE_SPACE(16 + len); *p++ = cpu_to_be32(len); p = xdr_encode_opaque_fixed(p, machine_name, len); *p++ = cpu_to_be32(0); /* UID */ @@ -1646,7 +1635,7 @@ static void encode_sequence(struct xdr_stream *xdr, WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); slot = tp->slots + args->sa_slotid; - RESERVE_SPACE(4); + RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN + 16); *p++ = cpu_to_be32(OP_SEQUENCE); /* @@ -1661,7 +1650,6 @@ static void encode_sequence(struct xdr_stream *xdr, ((u32 *)session->sess_id.data)[3], slot->seq_nr, args->sa_slotid, tp->highest_used_slotid, args->sa_cache_this); - RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16); p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); *p++ = cpu_to_be32(slot->seq_nr); *p++ = cpu_to_be32(args->sa_slotid); -- cgit v0.10.2 From 2220f13a8b90d2259f3094cb54cf4de67d8eee2d Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:18 +0300 Subject: nfs: nfs4xdr: encode_compound_hdr does not have to round up reserved bytes This is already done by xdr_reserve_space and since encode_compound_hdr is adding a byte count to "12" which is already word aligned, the xdr level rounding will work just as well. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d460d81..7c2b162 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -740,7 +740,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr, dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); - RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); + RESERVE_SPACE(12 + hdr->taglen); *p++ = cpu_to_be32(hdr->taglen); p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen); *p++ = cpu_to_be32(hdr->minorversion); -- cgit v0.10.2 From 13c65ce90006badccd5663e558e3c85869ae5ce6 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:25 +0300 Subject: nfs: nfs4xdr: change RESERVE_SPACE macro into a static helper In order to open code and expose the result pointer assignment. Alternatively, we can open code the call to xdr_reserve_space and do the BUG_ON an the error case at the call site. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7c2b162..3bcde49 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -702,20 +702,12 @@ struct compound_hdr { u32 minorversion; }; -/* - * START OF "GENERIC" ENCODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). - * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... - */ -#define RESERVE_SPACE(nbytes) do { \ - p = xdr_reserve_space(xdr, nbytes); \ - BUG_ON(!p); \ -} while (0) +static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr_reserve_space(xdr, nbytes); + BUG_ON(!p); + return p; +} static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { @@ -740,7 +732,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr, dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); - RESERVE_SPACE(12 + hdr->taglen); + p = reserve_space(xdr, 12 + hdr->taglen); *p++ = cpu_to_be32(hdr->taglen); p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen); *p++ = cpu_to_be32(hdr->minorversion); @@ -820,7 +812,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len += 16; else if (iap->ia_valid & ATTR_MTIME) len += 4; - RESERVE_SPACE(len); + p = reserve_space(xdr, len); /* * We write the bitmap length now, but leave the bitmap and the attribute @@ -891,7 +883,7 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd { __be32 *p; - RESERVE_SPACE(8); + p = reserve_space(xdr, 8); *p++ = cpu_to_be32(OP_ACCESS); *p++ = cpu_to_be32(access); hdr->nops++; @@ -902,7 +894,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg { __be32 *p; - RESERVE_SPACE(8+NFS4_STATEID_SIZE); + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_CLOSE); *p++ = cpu_to_be32(arg->seqid->sequence->counter); p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); @@ -914,7 +906,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar { __be32 *p; - RESERVE_SPACE(16); + p = reserve_space(xdr, 16); *p++ = cpu_to_be32(OP_COMMIT); p = xdr_encode_hyper(p, args->offset); *p++ = cpu_to_be32(args->count); @@ -926,19 +918,19 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * { __be32 *p; - RESERVE_SPACE(8); + p = reserve_space(xdr, 8); *p++ = cpu_to_be32(OP_CREATE); *p++ = cpu_to_be32(create->ftype); switch (create->ftype) { case NF4LNK: - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(create->u.symlink.len); xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); break; case NF4BLK: case NF4CHR: - RESERVE_SPACE(8); + p = reserve_space(xdr, 8); *p++ = cpu_to_be32(create->u.device.specdata1); *p++ = cpu_to_be32(create->u.device.specdata2); break; @@ -947,7 +939,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * break; } - RESERVE_SPACE(4 + create->name->len); + p = reserve_space(xdr, 4 + create->name->len); *p++ = cpu_to_be32(create->name->len); p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len); hdr->nops++; @@ -960,7 +952,7 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c { __be32 *p; - RESERVE_SPACE(12); + p = reserve_space(xdr, 12); *p++ = cpu_to_be32(OP_GETATTR); *p++ = cpu_to_be32(1); *p++ = cpu_to_be32(bitmap); @@ -972,7 +964,7 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm { __be32 *p; - RESERVE_SPACE(16); + p = reserve_space(xdr, 16); *p++ = cpu_to_be32(OP_GETATTR); *p++ = cpu_to_be32(2); *p++ = cpu_to_be32(bm0); @@ -1003,7 +995,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(OP_GETFH); hdr->nops++; hdr->replen += decode_getfh_maxsz; @@ -1013,7 +1005,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct { __be32 *p; - RESERVE_SPACE(8 + name->len); + p = reserve_space(xdr, 8 + name->len); *p++ = cpu_to_be32(OP_LINK); *p++ = cpu_to_be32(name->len); p = xdr_encode_opaque_fixed(p, name->name, name->len); @@ -1043,7 +1035,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args { __be32 *p; - RESERVE_SPACE(32); + p = reserve_space(xdr, 32); *p++ = cpu_to_be32(OP_LOCK); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); *p++ = cpu_to_be32(args->reclaim); @@ -1051,7 +1043,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); *p++ = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ - RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); *p++ = cpu_to_be32(args->open_seqid->sequence->counter); p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); @@ -1061,7 +1053,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args p = xdr_encode_hyper(p, args->lock_owner.id); } else { - RESERVE_SPACE(NFS4_STATEID_SIZE+4); + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); } @@ -1073,7 +1065,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar { __be32 *p; - RESERVE_SPACE(52); + p = reserve_space(xdr, 52); *p++ = cpu_to_be32(OP_LOCKT); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); p = xdr_encode_hyper(p, args->fl->fl_start); @@ -1090,7 +1082,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar { __be32 *p; - RESERVE_SPACE(12+NFS4_STATEID_SIZE+16); + p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); *p++ = cpu_to_be32(OP_LOCKU); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); *p++ = cpu_to_be32(args->seqid->sequence->counter); @@ -1106,7 +1098,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc int len = name->len; __be32 *p; - RESERVE_SPACE(8 + len); + p = reserve_space(xdr, 8 + len); *p++ = cpu_to_be32(OP_LOOKUP); *p++ = cpu_to_be32(len); p = xdr_encode_opaque_fixed(p, name->name, len); @@ -1118,7 +1110,7 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) { __be32 *p; - RESERVE_SPACE(8); + p = reserve_space(xdr, 8); switch (fmode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); @@ -1142,11 +1134,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, * owner 4 = 32 */ - RESERVE_SPACE(8); + p = reserve_space(xdr, 8); *p++ = cpu_to_be32(OP_OPEN); *p++ = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); - RESERVE_SPACE(28); + p = reserve_space(xdr, 28); p = xdr_encode_hyper(p, arg->clientid); *p++ = cpu_to_be32(16); p = xdr_encode_opaque_fixed(p, "open id:", 8); @@ -1157,7 +1149,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch(arg->open_flags & O_EXCL) { case 0: *p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED); @@ -1173,7 +1165,7 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch (arg->open_flags & O_CREAT) { case 0: *p++ = cpu_to_be32(NFS4_OPEN_NOCREATE); @@ -1189,7 +1181,7 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch (delegation_type) { case 0: *p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); @@ -1209,7 +1201,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); encode_string(xdr, name->len, name->name); } @@ -1218,7 +1210,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); encode_delegation_type(xdr, type); } @@ -1227,7 +1219,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); encode_string(xdr, name->len, name->name); @@ -1258,7 +1250,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); *p++ = cpu_to_be32(OP_OPEN_CONFIRM); p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(arg->seqid->sequence->counter); @@ -1270,7 +1262,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(arg->seqid->sequence->counter); @@ -1285,7 +1277,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd int len = fh->size; __be32 *p; - RESERVE_SPACE(8 + len); + p = reserve_space(xdr, 8 + len); *p++ = cpu_to_be32(OP_PUTFH); *p++ = cpu_to_be32(len); p = xdr_encode_opaque_fixed(p, fh->data, len); @@ -1297,7 +1289,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(OP_PUTROOTFH); hdr->nops++; hdr->replen += decode_putrootfh_maxsz; @@ -1308,7 +1300,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context nfs4_stateid stateid; __be32 *p; - RESERVE_SPACE(NFS4_STATEID_SIZE); + p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); @@ -1320,12 +1312,12 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(OP_READ); encode_stateid(xdr, args->context); - RESERVE_SPACE(12); + p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); *p++ = cpu_to_be32(args->count); hdr->nops++; @@ -1340,7 +1332,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg }; __be32 *p; - RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); + p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); *p++ = cpu_to_be32(OP_READDIR); p = xdr_encode_hyper(p, readdir->cookie); p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); @@ -1369,7 +1361,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink * { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(OP_READLINK); hdr->nops++; hdr->replen += decode_readlink_maxsz; @@ -1379,7 +1371,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc { __be32 *p; - RESERVE_SPACE(8 + name->len); + p = reserve_space(xdr, 8 + name->len); *p++ = cpu_to_be32(OP_REMOVE); *p++ = cpu_to_be32(name->len); p = xdr_encode_opaque_fixed(p, name->name, name->len); @@ -1391,12 +1383,12 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co { __be32 *p; - RESERVE_SPACE(8 + oldname->len); + p = reserve_space(xdr, 8 + oldname->len); *p++ = cpu_to_be32(OP_RENAME); *p++ = cpu_to_be32(oldname->len); p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len); - RESERVE_SPACE(4 + newname->len); + p = reserve_space(xdr, 4 + newname->len); *p++ = cpu_to_be32(newname->len); p = xdr_encode_opaque_fixed(p, newname->name, newname->len); hdr->nops++; @@ -1407,7 +1399,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client { __be32 *p; - RESERVE_SPACE(12); + p = reserve_space(xdr, 12); *p++ = cpu_to_be32(OP_RENEW); p = xdr_encode_hyper(p, client_stateid->cl_clientid); hdr->nops++; @@ -1419,7 +1411,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(OP_RESTOREFH); hdr->nops++; hdr->replen += decode_restorefh_maxsz; @@ -1430,15 +1422,15 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_SETATTR); p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); - RESERVE_SPACE(2*4); + p = reserve_space(xdr, 2*4); *p++ = cpu_to_be32(1); *p++ = cpu_to_be32(FATTR4_WORD0_ACL); if (arg->acl_len % 4) return -EINVAL; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); hdr->nops++; @@ -1451,7 +1443,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(OP_SAVEFH); hdr->nops++; hdr->replen += decode_savefh_maxsz; @@ -1461,7 +1453,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_SETATTR); p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); hdr->nops++; @@ -1473,16 +1465,16 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie { __be32 *p; - RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); + p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); *p++ = cpu_to_be32(OP_SETCLIENTID); p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(setclientid->sc_prog); encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(setclientid->sc_cb_ident); hdr->nops++; hdr->replen += decode_setclientid_maxsz; @@ -1492,7 +1484,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_ { __be32 *p; - RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); + p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); p = xdr_encode_hyper(p, client_state->cl_clientid); p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); @@ -1504,12 +1496,12 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); *p++ = cpu_to_be32(OP_WRITE); encode_stateid(xdr, args->context); - RESERVE_SPACE(16); + p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); *p++ = cpu_to_be32(args->stable); *p++ = cpu_to_be32(args->count); @@ -1523,7 +1515,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_DELEGRETURN); p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); @@ -1539,13 +1531,13 @@ static void encode_exchange_id(struct xdr_stream *xdr, { __be32 *p; - RESERVE_SPACE(4 + sizeof(args->verifier->data)); + p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); *p++ = cpu_to_be32(OP_EXCHANGE_ID); p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); encode_string(xdr, args->id_len, args->id); - RESERVE_SPACE(12); + p = reserve_space(xdr, 12); *p++ = cpu_to_be32(args->flags); *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ *p++ = cpu_to_be32(0); /* zero length implementation id array */ @@ -1565,7 +1557,7 @@ static void encode_create_session(struct xdr_stream *xdr, len = scnprintf(machine_name, sizeof(machine_name), "%s", clp->cl_ipaddr); - RESERVE_SPACE(20 + 2*28 + 20 + len + 12); + p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); *p++ = cpu_to_be32(OP_CREATE_SESSION); p = xdr_encode_hyper(p, clp->cl_ex_clid); *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ @@ -1609,7 +1601,7 @@ static void encode_destroy_session(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN); + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); *p++ = cpu_to_be32(OP_DESTROY_SESSION); p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); hdr->nops++; @@ -1635,7 +1627,7 @@ static void encode_sequence(struct xdr_stream *xdr, WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); slot = tp->slots + args->sa_slotid; - RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN + 16); + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); *p++ = cpu_to_be32(OP_SEQUENCE); /* -- cgit v0.10.2 From 345585132a204859fbb7d8b662e9b6e5b563c6dc Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:30 +0300 Subject: nfs: nfs4xdr: optimize low level encoding do not increment encoding ptr if not needed. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3bcde49..d75f821 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -737,7 +737,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr, p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen); *p++ = cpu_to_be32(hdr->minorversion); hdr->nops_p = p; - *p++ = cpu_to_be32(hdr->nops); + *p = cpu_to_be32(hdr->nops); } static void encode_nops(struct compound_hdr *hdr) @@ -874,7 +874,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len = (char *)p - (char *)q - 12; *q++ = htonl(bmval0); *q++ = htonl(bmval1); - *q++ = htonl(len); + *q = htonl(len); /* out: */ } @@ -885,7 +885,7 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd p = reserve_space(xdr, 8); *p++ = cpu_to_be32(OP_ACCESS); - *p++ = cpu_to_be32(access); + *p = cpu_to_be32(access); hdr->nops++; hdr->replen += decode_access_maxsz; } @@ -897,7 +897,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_CLOSE); *p++ = cpu_to_be32(arg->seqid->sequence->counter); - p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_close_maxsz; } @@ -909,7 +909,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar p = reserve_space(xdr, 16); *p++ = cpu_to_be32(OP_COMMIT); p = xdr_encode_hyper(p, args->offset); - *p++ = cpu_to_be32(args->count); + *p = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_commit_maxsz; } @@ -920,19 +920,19 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * p = reserve_space(xdr, 8); *p++ = cpu_to_be32(OP_CREATE); - *p++ = cpu_to_be32(create->ftype); + *p = cpu_to_be32(create->ftype); switch (create->ftype) { case NF4LNK: p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(create->u.symlink.len); + *p = cpu_to_be32(create->u.symlink.len); xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); break; case NF4BLK: case NF4CHR: p = reserve_space(xdr, 8); *p++ = cpu_to_be32(create->u.device.specdata1); - *p++ = cpu_to_be32(create->u.device.specdata2); + *p = cpu_to_be32(create->u.device.specdata2); break; default: @@ -941,7 +941,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * p = reserve_space(xdr, 4 + create->name->len); *p++ = cpu_to_be32(create->name->len); - p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len); + xdr_encode_opaque_fixed(p, create->name->name, create->name->len); hdr->nops++; hdr->replen += decode_create_maxsz; @@ -955,7 +955,7 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c p = reserve_space(xdr, 12); *p++ = cpu_to_be32(OP_GETATTR); *p++ = cpu_to_be32(1); - *p++ = cpu_to_be32(bitmap); + *p = cpu_to_be32(bitmap); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -968,7 +968,7 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm *p++ = cpu_to_be32(OP_GETATTR); *p++ = cpu_to_be32(2); *p++ = cpu_to_be32(bm0); - *p++ = cpu_to_be32(bm1); + *p = cpu_to_be32(bm1); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -996,7 +996,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) __be32 *p; p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(OP_GETFH); + *p = cpu_to_be32(OP_GETFH); hdr->nops++; hdr->replen += decode_getfh_maxsz; } @@ -1008,7 +1008,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct p = reserve_space(xdr, 8 + name->len); *p++ = cpu_to_be32(OP_LINK); *p++ = cpu_to_be32(name->len); - p = xdr_encode_opaque_fixed(p, name->name, name->len); + xdr_encode_opaque_fixed(p, name->name, name->len); hdr->nops++; hdr->replen += decode_link_maxsz; } @@ -1041,7 +1041,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args *p++ = cpu_to_be32(args->reclaim); p = xdr_encode_hyper(p, args->fl->fl_start); p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); - *p++ = cpu_to_be32(args->new_lock_owner); + *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); *p++ = cpu_to_be32(args->open_seqid->sequence->counter); @@ -1050,12 +1050,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args p = xdr_encode_hyper(p, args->lock_owner.clientid); *p++ = cpu_to_be32(16); p = xdr_encode_opaque_fixed(p, "lock id:", 8); - p = xdr_encode_hyper(p, args->lock_owner.id); + xdr_encode_hyper(p, args->lock_owner.id); } else { p = reserve_space(xdr, NFS4_STATEID_SIZE+4); p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); } hdr->nops++; hdr->replen += decode_lock_maxsz; @@ -1073,7 +1073,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar p = xdr_encode_hyper(p, args->lock_owner.clientid); *p++ = cpu_to_be32(16); p = xdr_encode_opaque_fixed(p, "lock id:", 8); - p = xdr_encode_hyper(p, args->lock_owner.id); + xdr_encode_hyper(p, args->lock_owner.id); hdr->nops++; hdr->replen += decode_lockt_maxsz; } @@ -1088,7 +1088,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar *p++ = cpu_to_be32(args->seqid->sequence->counter); p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); p = xdr_encode_hyper(p, args->fl->fl_start); - p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); hdr->nops++; hdr->replen += decode_locku_maxsz; } @@ -1101,7 +1101,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc p = reserve_space(xdr, 8 + len); *p++ = cpu_to_be32(OP_LOOKUP); *p++ = cpu_to_be32(len); - p = xdr_encode_opaque_fixed(p, name->name, len); + xdr_encode_opaque_fixed(p, name->name, len); hdr->nops++; hdr->replen += decode_lookup_maxsz; } @@ -1124,7 +1124,7 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) default: *p++ = cpu_to_be32(0); } - *p++ = cpu_to_be32(0); /* for linux, share_deny = 0 always */ + *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ } static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) @@ -1136,13 +1136,13 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena */ p = reserve_space(xdr, 8); *p++ = cpu_to_be32(OP_OPEN); - *p++ = cpu_to_be32(arg->seqid->sequence->counter); + *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); p = reserve_space(xdr, 28); p = xdr_encode_hyper(p, arg->clientid); *p++ = cpu_to_be32(16); p = xdr_encode_opaque_fixed(p, "open id:", 8); - p = xdr_encode_hyper(p, arg->id); + xdr_encode_hyper(p, arg->id); } static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) @@ -1152,11 +1152,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op p = reserve_space(xdr, 4); switch(arg->open_flags & O_EXCL) { case 0: - *p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED); + *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); encode_attrs(xdr, arg->u.attrs, arg->server); break; default: - *p++ = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); encode_nfs4_verifier(xdr, &arg->u.verifier); } } @@ -1168,11 +1168,11 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a p = reserve_space(xdr, 4); switch (arg->open_flags & O_CREAT) { case 0: - *p++ = cpu_to_be32(NFS4_OPEN_NOCREATE); + *p = cpu_to_be32(NFS4_OPEN_NOCREATE); break; default: BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); - *p++ = cpu_to_be32(NFS4_OPEN_CREATE); + *p = cpu_to_be32(NFS4_OPEN_CREATE); encode_createmode(xdr, arg); } } @@ -1184,13 +1184,13 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega p = reserve_space(xdr, 4); switch (delegation_type) { case 0: - *p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); break; case FMODE_READ: - *p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); break; case FMODE_WRITE|FMODE_READ: - *p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); break; default: BUG(); @@ -1202,7 +1202,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * __be32 *p; p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); encode_string(xdr, name->len, name->name); } @@ -1211,7 +1211,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) __be32 *p; p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); encode_delegation_type(xdr, type); } @@ -1221,7 +1221,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); - p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); encode_string(xdr, name->len, name->name); } @@ -1253,7 +1253,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); *p++ = cpu_to_be32(OP_OPEN_CONFIRM); p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(arg->seqid->sequence->counter); + *p = cpu_to_be32(arg->seqid->sequence->counter); hdr->nops++; hdr->replen += decode_open_confirm_maxsz; } @@ -1265,7 +1265,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(arg->seqid->sequence->counter); + *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); hdr->nops++; hdr->replen += decode_open_downgrade_maxsz; @@ -1280,7 +1280,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd p = reserve_space(xdr, 8 + len); *p++ = cpu_to_be32(OP_PUTFH); *p++ = cpu_to_be32(len); - p = xdr_encode_opaque_fixed(p, fh->data, len); + xdr_encode_opaque_fixed(p, fh->data, len); hdr->nops++; hdr->replen += decode_putfh_maxsz; } @@ -1290,7 +1290,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) __be32 *p; p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(OP_PUTROOTFH); + *p = cpu_to_be32(OP_PUTROOTFH); hdr->nops++; hdr->replen += decode_putrootfh_maxsz; } @@ -1303,9 +1303,9 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); - p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else - p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); } static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) @@ -1313,13 +1313,13 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, __be32 *p; p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(OP_READ); + *p = cpu_to_be32(OP_READ); encode_stateid(xdr, args->context); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); - *p++ = cpu_to_be32(args->count); + *p = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_read_maxsz; } @@ -1345,7 +1345,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg else attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); - *p++ = cpu_to_be32(attrs[1] & readdir->bitmask[1]); + *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); hdr->nops++; hdr->replen += decode_readdir_maxsz; dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", @@ -1362,7 +1362,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink * __be32 *p; p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(OP_READLINK); + *p = cpu_to_be32(OP_READLINK); hdr->nops++; hdr->replen += decode_readlink_maxsz; } @@ -1374,7 +1374,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc p = reserve_space(xdr, 8 + name->len); *p++ = cpu_to_be32(OP_REMOVE); *p++ = cpu_to_be32(name->len); - p = xdr_encode_opaque_fixed(p, name->name, name->len); + xdr_encode_opaque_fixed(p, name->name, name->len); hdr->nops++; hdr->replen += decode_remove_maxsz; } @@ -1386,11 +1386,11 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co p = reserve_space(xdr, 8 + oldname->len); *p++ = cpu_to_be32(OP_RENAME); *p++ = cpu_to_be32(oldname->len); - p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len); + xdr_encode_opaque_fixed(p, oldname->name, oldname->len); p = reserve_space(xdr, 4 + newname->len); *p++ = cpu_to_be32(newname->len); - p = xdr_encode_opaque_fixed(p, newname->name, newname->len); + xdr_encode_opaque_fixed(p, newname->name, newname->len); hdr->nops++; hdr->replen += decode_rename_maxsz; } @@ -1401,7 +1401,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client p = reserve_space(xdr, 12); *p++ = cpu_to_be32(OP_RENEW); - p = xdr_encode_hyper(p, client_stateid->cl_clientid); + xdr_encode_hyper(p, client_stateid->cl_clientid); hdr->nops++; hdr->replen += decode_renew_maxsz; } @@ -1412,7 +1412,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) __be32 *p; p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(OP_RESTOREFH); + *p = cpu_to_be32(OP_RESTOREFH); hdr->nops++; hdr->replen += decode_restorefh_maxsz; } @@ -1424,14 +1424,14 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_SETATTR); - p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); p = reserve_space(xdr, 2*4); *p++ = cpu_to_be32(1); - *p++ = cpu_to_be32(FATTR4_WORD0_ACL); + *p = cpu_to_be32(FATTR4_WORD0_ACL); if (arg->acl_len % 4) return -EINVAL; p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(arg->acl_len); + *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); hdr->nops++; hdr->replen += decode_setacl_maxsz; @@ -1444,7 +1444,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) __be32 *p; p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(OP_SAVEFH); + *p = cpu_to_be32(OP_SAVEFH); hdr->nops++; hdr->replen += decode_savefh_maxsz; } @@ -1455,7 +1455,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_SETATTR); - p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_setattr_maxsz; encode_attrs(xdr, arg->iap, server); @@ -1467,15 +1467,15 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); *p++ = cpu_to_be32(OP_SETCLIENTID); - p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); + xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(setclientid->sc_prog); + *p = cpu_to_be32(setclientid->sc_prog); encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(setclientid->sc_cb_ident); + *p = cpu_to_be32(setclientid->sc_cb_ident); hdr->nops++; hdr->replen += decode_setclientid_maxsz; } @@ -1487,7 +1487,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_ p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); p = xdr_encode_hyper(p, client_state->cl_clientid); - p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); + xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); hdr->nops++; hdr->replen += decode_setclientid_confirm_maxsz; } @@ -1497,14 +1497,14 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg __be32 *p; p = reserve_space(xdr, 4); - *p++ = cpu_to_be32(OP_WRITE); + *p = cpu_to_be32(OP_WRITE); encode_stateid(xdr, args->context); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); *p++ = cpu_to_be32(args->stable); - *p++ = cpu_to_be32(args->count); + *p = cpu_to_be32(args->count); xdr_write_pages(xdr, args->pages, args->pgbase, args->count); hdr->nops++; @@ -1518,7 +1518,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_DELEGRETURN); - p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_delegreturn_maxsz; } @@ -1533,14 +1533,14 @@ static void encode_exchange_id(struct xdr_stream *xdr, p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); *p++ = cpu_to_be32(OP_EXCHANGE_ID); - p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); + xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); encode_string(xdr, args->id_len, args->id); p = reserve_space(xdr, 12); *p++ = cpu_to_be32(args->flags); *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ - *p++ = cpu_to_be32(0); /* zero length implementation id array */ + *p = cpu_to_be32(0); /* zero length implementation id array */ hdr->nops++; hdr->replen += decode_exchange_id_maxsz; } @@ -1591,7 +1591,7 @@ static void encode_create_session(struct xdr_stream *xdr, p = xdr_encode_opaque_fixed(p, machine_name, len); *p++ = cpu_to_be32(0); /* UID */ *p++ = cpu_to_be32(0); /* GID */ - *p++ = cpu_to_be32(0); /* No more gids */ + *p = cpu_to_be32(0); /* No more gids */ hdr->nops++; hdr->replen += decode_create_session_maxsz; } @@ -1603,7 +1603,7 @@ static void encode_destroy_session(struct xdr_stream *xdr, __be32 *p; p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); *p++ = cpu_to_be32(OP_DESTROY_SESSION); - p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); hdr->nops++; hdr->replen += decode_destroy_session_maxsz; } @@ -1646,7 +1646,7 @@ static void encode_sequence(struct xdr_stream *xdr, *p++ = cpu_to_be32(slot->seq_nr); *p++ = cpu_to_be32(args->sa_slotid); *p++ = cpu_to_be32(tp->highest_used_slotid); - *p++ = cpu_to_be32(args->sa_cache_this); + *p = cpu_to_be32(args->sa_cache_this); hdr->nops++; hdr->replen += decode_sequence_maxsz; #endif /* CONFIG_NFS_V4_1 */ -- cgit v0.10.2 From 811652bd6edd66dd35bf9caacdfe96d19f75a47e Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:34 +0300 Subject: nfs: nfs4xdr: merge xdr_encode_int+xdr_encode_opaque_fixed into xdr_encode_opaque use encode_string where appropriate. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d75f821..efa78ad 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -732,9 +732,8 @@ static void encode_compound_hdr(struct xdr_stream *xdr, dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); - p = reserve_space(xdr, 12 + hdr->taglen); - *p++ = cpu_to_be32(hdr->taglen); - p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen); + p = reserve_space(xdr, 4 + hdr->taglen + 8); + p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); *p++ = cpu_to_be32(hdr->minorversion); hdr->nops_p = p; *p = cpu_to_be32(hdr->nops); @@ -832,13 +831,11 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const } if (iap->ia_valid & ATTR_UID) { bmval1 |= FATTR4_WORD1_OWNER; - *p++ = cpu_to_be32(owner_namelen); - p = xdr_encode_opaque_fixed(p, owner_name, owner_namelen); + p = xdr_encode_opaque(p, owner_name, owner_namelen); } if (iap->ia_valid & ATTR_GID) { bmval1 |= FATTR4_WORD1_OWNER_GROUP; - *p++ = cpu_to_be32(owner_grouplen); - p = xdr_encode_opaque_fixed(p, owner_group, owner_grouplen); + p = xdr_encode_opaque(p, owner_group, owner_grouplen); } if (iap->ia_valid & ATTR_ATIME_SET) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; @@ -939,9 +936,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * break; } - p = reserve_space(xdr, 4 + create->name->len); - *p++ = cpu_to_be32(create->name->len); - xdr_encode_opaque_fixed(p, create->name->name, create->name->len); + encode_string(xdr, create->name->len, create->name->name); hdr->nops++; hdr->replen += decode_create_maxsz; @@ -1007,8 +1002,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct p = reserve_space(xdr, 8 + name->len); *p++ = cpu_to_be32(OP_LINK); - *p++ = cpu_to_be32(name->len); - xdr_encode_opaque_fixed(p, name->name, name->len); + xdr_encode_opaque(p, name->name, name->len); hdr->nops++; hdr->replen += decode_link_maxsz; } @@ -1100,8 +1094,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc p = reserve_space(xdr, 8 + len); *p++ = cpu_to_be32(OP_LOOKUP); - *p++ = cpu_to_be32(len); - xdr_encode_opaque_fixed(p, name->name, len); + xdr_encode_opaque(p, name->name, len); hdr->nops++; hdr->replen += decode_lookup_maxsz; } @@ -1279,8 +1272,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd p = reserve_space(xdr, 8 + len); *p++ = cpu_to_be32(OP_PUTFH); - *p++ = cpu_to_be32(len); - xdr_encode_opaque_fixed(p, fh->data, len); + xdr_encode_opaque(p, fh->data, len); hdr->nops++; hdr->replen += decode_putfh_maxsz; } @@ -1373,8 +1365,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc p = reserve_space(xdr, 8 + name->len); *p++ = cpu_to_be32(OP_REMOVE); - *p++ = cpu_to_be32(name->len); - xdr_encode_opaque_fixed(p, name->name, name->len); + xdr_encode_opaque(p, name->name, name->len); hdr->nops++; hdr->replen += decode_remove_maxsz; } @@ -1383,14 +1374,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co { __be32 *p; - p = reserve_space(xdr, 8 + oldname->len); - *p++ = cpu_to_be32(OP_RENAME); - *p++ = cpu_to_be32(oldname->len); - xdr_encode_opaque_fixed(p, oldname->name, oldname->len); - - p = reserve_space(xdr, 4 + newname->len); - *p++ = cpu_to_be32(newname->len); - xdr_encode_opaque_fixed(p, newname->name, newname->len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RENAME); + encode_string(xdr, oldname->len, oldname->name); + encode_string(xdr, newname->len, newname->name); hdr->nops++; hdr->replen += decode_rename_maxsz; } @@ -1587,8 +1574,7 @@ static void encode_create_session(struct xdr_stream *xdr, /* authsys_parms rfc1831 */ *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ - *p++ = cpu_to_be32(len); - p = xdr_encode_opaque_fixed(p, machine_name, len); + p = xdr_encode_opaque(p, machine_name, len); *p++ = cpu_to_be32(0); /* UID */ *p++ = cpu_to_be32(0); /* GID */ *p = cpu_to_be32(0); /* No more gids */ -- cgit v0.10.2 From 6f723f7710024bb151ca8c5277ce8c71beec4db8 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:37 +0300 Subject: nfs: nfs4xdr: get rid of READ32 s/READ32\((.*)\)/\1 = be32_to_cpup(p++)/ Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index efa78ad..5f6b46f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2433,7 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, * task to translate them into Linux-specific versions which are more * consistent with the style used in NFSv2/v3... */ -#define READ32(x) (x) = ntohl(*p++) #define READ64(x) do { \ (x) = (u64)ntohl(*p++) << 32; \ (x) |= ntohl(*p++); \ @@ -2464,7 +2463,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char __be32 *p; READ_BUF(4); - READ32(*len); + *len = be32_to_cpup(p++); READ_BUF(*len); *string = (char *)p; return 0; @@ -2475,13 +2474,13 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) __be32 *p; READ_BUF(8); - READ32(hdr->status); - READ32(hdr->taglen); + hdr->status = be32_to_cpup(p++); + hdr->taglen = be32_to_cpup(p++); READ_BUF(hdr->taglen + 4); hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); - READ32(hdr->nops); + hdr->nops = be32_to_cpup(p++); if (unlikely(hdr->nops < 1)) return nfs4_stat_to_errno(hdr->status); return 0; @@ -2494,14 +2493,14 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) int32_t nfserr; READ_BUF(8); - READ32(opnum); + opnum = be32_to_cpup(p++); if (opnum != expected) { dprintk("nfs: Server returned operation" " %d but we issued a request for %d\n", opnum, expected); return -EIO; } - READ32(nfserr); + nfserr = be32_to_cpup(p++); if (nfserr != NFS_OK) return nfs4_stat_to_errno(nfserr); return 0; @@ -2524,14 +2523,14 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) __be32 *p; READ_BUF(4); - READ32(bmlen); + bmlen = be32_to_cpup(p++); bitmap[0] = bitmap[1] = 0; READ_BUF((bmlen << 2)); if (bmlen > 0) { - READ32(bitmap[0]); + bitmap[0] = be32_to_cpup(p++); if (bmlen > 1) - READ32(bitmap[1]); + bitmap[1] = be32_to_cpup(p++); } return 0; } @@ -2541,7 +2540,7 @@ static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 *p; READ_BUF(4); - READ32(*attrlen); + *attrlen = be32_to_cpup(p++); *savep = xdr->p; return 0; } @@ -2567,7 +2566,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { READ_BUF(4); - READ32(*type); + *type = be32_to_cpup(p++); if (*type < NF4REG || *type > NF4NAMEDATTR) { dprintk("%s: bad type %d\n", __func__, *type); return -EIO; @@ -2625,7 +2624,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { READ_BUF(4); - READ32(*res); + *res = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); @@ -2641,7 +2640,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { READ_BUF(4); - READ32(*res); + *res = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); @@ -2679,7 +2678,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { READ_BUF(4); - READ32(*res); + *res = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); @@ -2695,7 +2694,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { READ_BUF(4); - READ32(*res); + *res = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); @@ -2796,7 +2795,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) int status = 0; READ_BUF(4); - READ32(n); + n = be32_to_cpup(p++); if (n == 0) goto root_path; dprintk("path "); @@ -2848,7 +2847,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (unlikely(status != 0)) goto out; READ_BUF(4); - READ32(n); + n = be32_to_cpup(p++); if (n <= 0) goto out_eio; res->nlocations = 0; @@ -2857,7 +2856,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st struct nfs4_fs_location *loc = &res->locations[res->nlocations]; READ_BUF(4); - READ32(m); + m = be32_to_cpup(p++); loc->nservers = 0; dprintk("%s: servers ", __func__); @@ -2928,7 +2927,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { READ_BUF(4); - READ32(*maxlink); + *maxlink = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } dprintk("%s: maxlink=%u\n", __func__, *maxlink); @@ -2945,7 +2944,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { READ_BUF(4); - READ32(*maxname); + *maxname = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } dprintk("%s: maxname=%u\n", __func__, *maxname); @@ -3005,7 +3004,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { READ_BUF(4); - READ32(tmp); + tmp = be32_to_cpup(p++); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; ret = NFS_ATTR_FATTR_MODE; @@ -3024,7 +3023,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { READ_BUF(4); - READ32(*nlink); + *nlink = be32_to_cpup(p++); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; ret = NFS_ATTR_FATTR_NLINK; } @@ -3043,7 +3042,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { READ_BUF(4); - READ32(len); + len = be32_to_cpup(p++); READ_BUF(len); if (len < XDR_MAX_NETOBJ) { if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) @@ -3071,7 +3070,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { READ_BUF(4); - READ32(len); + len = be32_to_cpup(p++); READ_BUF(len); if (len < XDR_MAX_NETOBJ) { if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) @@ -3101,8 +3100,8 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde dev_t tmp; READ_BUF(8); - READ32(major); - READ32(minor); + major = be32_to_cpup(p++); + minor = be32_to_cpup(p++); tmp = MKDEV(major, minor); if (MAJOR(tmp) == major && MINOR(tmp) == minor) *rdev = tmp; @@ -3191,7 +3190,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) READ_BUF(12); READ64(sec); - READ32(nsec); + nsec = be32_to_cpup(p++); time->tv_sec = (time_t)sec; time->tv_nsec = (long)nsec; return 0; @@ -3273,7 +3272,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c __be32 *p; READ_BUF(20); - READ32(cinfo->atomic); + cinfo->atomic = be32_to_cpup(p++); READ64(cinfo->before); READ64(cinfo->after); return 0; @@ -3289,8 +3288,8 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) if (status) return status; READ_BUF(8); - READ32(supp); - READ32(acc); + supp = be32_to_cpup(p++); + acc = be32_to_cpup(p++); access->supported = supp; access->access = acc; return 0; @@ -3336,7 +3335,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) if ((status = decode_change_info(xdr, cinfo))) return status; READ_BUF(4); - READ32(bmlen); + bmlen = be32_to_cpup(p++); READ_BUF(bmlen << 2); return 0; } @@ -3591,7 +3590,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) return status; READ_BUF(4); - READ32(len); + len = be32_to_cpup(p++); if (len > NFS4_FHSIZE) return -EIO; fh->size = len; @@ -3622,7 +3621,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) READ_BUF(32); READ64(offset); READ64(length); - READ32(type); + type = be32_to_cpup(p++); if (fl != NULL) { fl->fl_start = (loff_t)offset; fl->fl_end = fl->fl_start + (loff_t)length - 1; @@ -3634,7 +3633,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_pid = 0; } READ64(clientid); - READ32(namelen); + namelen = be32_to_cpup(p++); READ_BUF(namelen); return -NFS4ERR_DENIED; } @@ -3695,14 +3694,14 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) uint32_t limit_type, nblocks, blocksize; READ_BUF(12); - READ32(limit_type); + limit_type = be32_to_cpup(p++); switch (limit_type) { case 1: READ64(*maxsize); break; case 2: - READ32(nblocks); - READ32(blocksize); + nblocks = be32_to_cpup(p++); + blocksize = be32_to_cpup(p++); *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } return 0; @@ -3714,14 +3713,14 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) uint32_t delegation_type; READ_BUF(4); - READ32(delegation_type); + delegation_type = be32_to_cpup(p++); if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { res->delegation_type = 0; return 0; } READ_BUF(NFS4_STATEID_SIZE+4); COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); - READ32(res->do_recall); + res->do_recall = be32_to_cpup(p++); switch (delegation_type) { case NFS4_OPEN_DELEGATE_READ: @@ -3752,15 +3751,15 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) decode_change_info(xdr, &res->cinfo); READ_BUF(8); - READ32(res->rflags); - READ32(bmlen); + res->rflags = be32_to_cpup(p++); + bmlen = be32_to_cpup(p++); if (bmlen > 10) goto xdr_error; READ_BUF(bmlen << 2); savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) - READ32(res->attrset[i]); + res->attrset[i] = be32_to_cpup(p++); for (; i < NFS4_BITMAP_SIZE; i++) res->attrset[i] = 0; @@ -3821,8 +3820,8 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ if (status) return status; READ_BUF(8); - READ32(eof); - READ32(count); + eof = be32_to_cpup(p++); + count = be32_to_cpup(p++); hdrlen = (u8 *) p - (u8 *) iov->iov_base; recvd = req->rq_rcv_buf.len - hdrlen; if (count > recvd) { @@ -3948,7 +3947,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) /* Convert length of symlink */ READ_BUF(4); - READ32(len); + len = be32_to_cpup(p++); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); return -ENAMETOOLONG; @@ -4070,7 +4069,7 @@ static int decode_setattr(struct xdr_stream *xdr) if (status) return status; READ_BUF(4); - READ32(bmlen); + bmlen = be32_to_cpup(p++); READ_BUF(bmlen << 2); return 0; } @@ -4082,13 +4081,13 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) int32_t nfserr; READ_BUF(8); - READ32(opnum); + opnum = be32_to_cpup(p++); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" " %d\n", opnum); return -EIO; } - READ32(nfserr); + nfserr = be32_to_cpup(p++); if (nfserr == NFS_OK) { READ_BUF(8 + NFS4_VERIFIER_SIZE); READ64(clp->cl_clientid); @@ -4098,12 +4097,12 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) /* skip netid string */ READ_BUF(4); - READ32(len); + len = be32_to_cpup(p++); READ_BUF(len); /* skip uaddr string */ READ_BUF(4); - READ32(len); + len = be32_to_cpup(p++); READ_BUF(len); return -NFSERR_CLID_INUSE; } else @@ -4127,8 +4126,8 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) return status; READ_BUF(16); - READ32(res->count); - READ32(res->verf->committed); + res->count = be32_to_cpup(p++); + res->verf->committed = be32_to_cpup(p++); COPYMEM(res->verf->verifier, 8); return 0; } @@ -4154,11 +4153,11 @@ static int decode_exchange_id(struct xdr_stream *xdr, READ_BUF(8); READ64(clp->cl_ex_clid); READ_BUF(12); - READ32(clp->cl_seqid); - READ32(clp->cl_exchange_flags); + clp->cl_seqid = be32_to_cpup(p++); + clp->cl_exchange_flags = be32_to_cpup(p++); /* We ask for SP4_NONE */ - READ32(dummy); + dummy = be32_to_cpup(p++); if (dummy != SP4_NONE) return -EIO; @@ -4167,17 +4166,17 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* Throw away Major id */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); /* Throw away server_scope */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); /* Throw away Implementation id array */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); return 0; @@ -4190,13 +4189,13 @@ static int decode_chan_attrs(struct xdr_stream *xdr, u32 nr_attrs; READ_BUF(28); - READ32(attrs->headerpadsz); - READ32(attrs->max_rqst_sz); - READ32(attrs->max_resp_sz); - READ32(attrs->max_resp_sz_cached); - READ32(attrs->max_ops); - READ32(attrs->max_reqs); - READ32(nr_attrs); + attrs->headerpadsz = be32_to_cpup(p++); + attrs->max_rqst_sz = be32_to_cpup(p++); + attrs->max_resp_sz = be32_to_cpup(p++); + attrs->max_resp_sz_cached = be32_to_cpup(p++); + attrs->max_ops = be32_to_cpup(p++); + attrs->max_reqs = be32_to_cpup(p++); + nr_attrs = be32_to_cpup(p++); if (unlikely(nr_attrs > 1)) { printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", __func__, nr_attrs); @@ -4226,8 +4225,8 @@ static int decode_create_session(struct xdr_stream *xdr, /* seqid, flags */ READ_BUF(8); - READ32(clp->cl_seqid); - READ32(session->flags); + clp->cl_seqid = be32_to_cpup(p++); + session->flags = be32_to_cpup(p++); /* Channel attributes */ status = decode_chan_attrs(xdr, &session->fc_attrs); @@ -4275,23 +4274,23 @@ static int decode_sequence(struct xdr_stream *xdr, goto out_err; } /* seqid */ - READ32(dummy); + dummy = be32_to_cpup(p++); if (dummy != slot->seq_nr) { dprintk("%s Invalid sequence number\n", __func__); goto out_err; } /* slot id */ - READ32(dummy); + dummy = be32_to_cpup(p++); if (dummy != res->sr_slotid) { dprintk("%s Invalid slot id\n", __func__); goto out_err; } /* highest slot id - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* target highest slot id - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* result flags - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p++); status = 0; out_err: res->sr_status = status; -- cgit v0.10.2 From 3ceb4dbb993fdab6a6fafc69db36686278871134 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:41 +0300 Subject: nfs: nfs4xdr: get rid of READ64 s/READ64\(\*(.*)\)/p = xdr_decode_hyper(p, \1)/ s/READ64\((.*)\)/p = xdr_decode_hyper(p, &\1)/ Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5f6b46f..238189c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2433,10 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, * task to translate them into Linux-specific versions which are more * consistent with the style used in NFSv2/v3... */ -#define READ64(x) do { \ - (x) = (u64)ntohl(*p++) << 32; \ - (x) |= ntohl(*p++); \ -} while (0) #define READTIME(x) do { \ p++; \ (x.tv_sec) = ntohl(*p++); \ @@ -2588,7 +2584,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { READ_BUF(8); - READ64(*change); + p = xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; ret = NFS_ATTR_FATTR_CHANGE; } @@ -2607,7 +2603,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { READ_BUF(8); - READ64(*size); + p = xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; ret = NFS_ATTR_FATTR_SIZE; } @@ -2658,8 +2654,8 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { READ_BUF(16); - READ64(fsid->major); - READ64(fsid->minor); + p = xdr_decode_hyper(p, &fsid->major); + p = xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; ret = NFS_ATTR_FATTR_FSID; } @@ -2711,7 +2707,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { READ_BUF(8); - READ64(*fileid); + p = xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; ret = NFS_ATTR_FATTR_FILEID; } @@ -2729,7 +2725,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { READ_BUF(8); - READ64(*fileid); + p = xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; ret = NFS_ATTR_FATTR_FILEID; } @@ -2747,7 +2743,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { READ_BUF(8); - READ64(*res); + p = xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); @@ -2764,7 +2760,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { READ_BUF(8); - READ64(*res); + p = xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); @@ -2781,7 +2777,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { READ_BUF(8); - READ64(*res); + p = xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); @@ -2910,7 +2906,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { READ_BUF(8); - READ64(*res); + p = xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); @@ -2962,7 +2958,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { uint64_t maxread; READ_BUF(8); - READ64(maxread); + p = xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; *res = (uint32_t)maxread; @@ -2983,7 +2979,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { uint64_t maxwrite; READ_BUF(8); - READ64(maxwrite); + p = xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; *res = (uint32_t)maxwrite; @@ -3122,7 +3118,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { READ_BUF(8); - READ64(*res); + p = xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); @@ -3139,7 +3135,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { READ_BUF(8); - READ64(*res); + p = xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); @@ -3156,7 +3152,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { READ_BUF(8); - READ64(*res); + p = xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); @@ -3173,7 +3169,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { READ_BUF(8); - READ64(*used); + p = xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; ret = NFS_ATTR_FATTR_SPACE_USED; } @@ -3189,7 +3185,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) uint32_t nsec; READ_BUF(12); - READ64(sec); + p = xdr_decode_hyper(p, &sec); nsec = be32_to_cpup(p++); time->tv_sec = (time_t)sec; time->tv_nsec = (long)nsec; @@ -3273,8 +3269,8 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c READ_BUF(20); cinfo->atomic = be32_to_cpup(p++); - READ64(cinfo->before); - READ64(cinfo->after); + p = xdr_decode_hyper(p, &cinfo->before); + p = xdr_decode_hyper(p, &cinfo->after); return 0; } @@ -3619,8 +3615,8 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) uint32_t namelen, type; READ_BUF(32); - READ64(offset); - READ64(length); + p = xdr_decode_hyper(p, &offset); + p = xdr_decode_hyper(p, &length); type = be32_to_cpup(p++); if (fl != NULL) { fl->fl_start = (loff_t)offset; @@ -3632,7 +3628,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_type = F_RDLCK; fl->fl_pid = 0; } - READ64(clientid); + p = xdr_decode_hyper(p, &clientid); namelen = be32_to_cpup(p++); READ_BUF(namelen); return -NFS4ERR_DENIED; @@ -3697,7 +3693,7 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) limit_type = be32_to_cpup(p++); switch (limit_type) { case 1: - READ64(*maxsize); + p = xdr_decode_hyper(p, maxsize); break; case 2: nblocks = be32_to_cpup(p++); @@ -4090,7 +4086,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) nfserr = be32_to_cpup(p++); if (nfserr == NFS_OK) { READ_BUF(8 + NFS4_VERIFIER_SIZE); - READ64(clp->cl_clientid); + p = xdr_decode_hyper(p, &clp->cl_clientid); COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { uint32_t len; @@ -4151,7 +4147,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, return status; READ_BUF(8); - READ64(clp->cl_ex_clid); + p = xdr_decode_hyper(p, &clp->cl_ex_clid); READ_BUF(12); clp->cl_seqid = be32_to_cpup(p++); clp->cl_exchange_flags = be32_to_cpup(p++); -- cgit v0.10.2 From c816fd3406462702dee2e3859e70132c3aab7c10 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:44 +0300 Subject: nfs: nfs4xdr: get rid of READTIME It has no users. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 238189c..0c26bb2 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2433,11 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, * task to translate them into Linux-specific versions which are more * consistent with the style used in NFSv2/v3... */ -#define READTIME(x) do { \ - p++; \ - (x.tv_sec) = ntohl(*p++); \ - (x.tv_nsec) = ntohl(*p++); \ -} while (0) #define COPYMEM(x,nbytes) do { \ memcpy((x), p, nbytes); \ p += XDR_QUADLEN(nbytes); \ -- cgit v0.10.2 From 686841b3cc3a71918b45ed148be7a01a4f10e3f8 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:48 +0300 Subject: nfs: nfs4xdr: introduce print_overflow_msg Part fo the nfs4xdr cleanup. READ_BUF will go away. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0c26bb2..8255ec7 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2441,14 +2441,18 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, #define READ_BUF(nbytes) do { \ p = xdr_inline_decode(xdr, nbytes); \ if (unlikely(!p)) { \ - dprintk("nfs: %s: prematurely hit end of receive" \ - " buffer\n", __func__); \ - dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ - __func__, xdr->p, nbytes, xdr->end); \ + print_overflow_msg(__func__, xdr); \ return -EIO; \ } \ } while (0) +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("nfs: %s: prematurely hit end of receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { __be32 *p; -- cgit v0.10.2 From 07d30434cfe2f1a1553143c6b20f1fe68d2ef80a Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:52 +0300 Subject: nfs: nfs4xdr: introduce decode_opaque_fixed and decode_stateid helpers Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8255ec7..86e6983 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3290,19 +3290,34 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) return 0; } -static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) { __be32 *p; + + p = xdr_inline_decode(xdr, len); + if (likely(p)) { + memcpy(buf, p, len); + return 0; + } + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); +} + +static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +{ int status; status = decode_op_hdr(xdr, OP_CLOSE); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) @@ -3635,15 +3650,15 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCK); if (status == -EIO) goto out; if (status == 0) { - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + goto out; } else if (status == -NFS4ERR_DENIED) status = decode_lock_denied(xdr, NULL); if (res->open_seqid != NULL) @@ -3664,16 +3679,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCKU); if (status != -EIO) nfs_increment_lock_seqid(status, res->seqid); - if (status == 0) { - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - } + if (status == 0) + status = decode_stateid(xdr, &res->stateid); return status; } @@ -3706,6 +3718,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) { __be32 *p; uint32_t delegation_type; + int status; READ_BUF(4); delegation_type = be32_to_cpup(p++); @@ -3713,8 +3726,10 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) res->delegation_type = 0; return 0; } - READ_BUF(NFS4_STATEID_SIZE+4); - COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); + status = decode_stateid(xdr, &res->delegation); + if (unlikely(status)) + return status; + READ_BUF(4); res->do_recall = be32_to_cpup(p++); switch (delegation_type) { @@ -3738,10 +3753,10 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) status = decode_op_hdr(xdr, OP_OPEN); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) + if (!status) + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); decode_change_info(xdr, &res->cinfo); @@ -3766,32 +3781,26 @@ xdr_error: static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_putfh(struct xdr_stream *xdr) -- cgit v0.10.2 From db942bbd09563e169cc5d9004c32c1de33220fd1 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:19:56 +0300 Subject: nfs: nfs4xdr: introduce decode_verifier helper Signed-off-by: Benny Halevy [Trond: Fixed up an 'uninitialised variable' issue in decode_readdir] Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 86e6983..404f2e6 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3320,17 +3320,19 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) return status; } +static int decode_verifier(struct xdr_stream *xdr, void *verifier) +{ + return decode_opaque_fixed(xdr, verifier, 8); +} + static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_COMMIT); - if (status) - return status; - READ_BUF(8); - COPYMEM(res->verf->verifier, 8); - return 0; + if (!status) + status = decode_verifier(xdr, res->verf->verifier); + return status; } static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3852,17 +3854,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n int status; status = decode_op_hdr(xdr, OP_READDIR); - if (status) + if (!status) + status = decode_verifier(xdr, readdir->verifier.data); + if (unlikely(status)) return status; - READ_BUF(8); - COPYMEM(readdir->verifier.data, 8); dprintk("%s: verifier = %08x:%08x\n", __func__, ((u32 *)readdir->verifier.data)[0], ((u32 *)readdir->verifier.data)[1]); - hdrlen = (char *) p - (char *) iov->iov_base; + hdrlen = (char *) xdr->p - (char *) iov->iov_base; recvd = rcvbuf->len - hdrlen; if (pglen > recvd) pglen = recvd; -- cgit v0.10.2 From e78291e4e07520348b0634095cf19ed3bc868965 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:20:00 +0300 Subject: nfs: nfs4xdr: introduce decode_sessionid helper Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 404f2e6..00630f4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4212,6 +4212,11 @@ static int decode_chan_attrs(struct xdr_stream *xdr, return 0; } +static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) +{ + return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN); +} + static int decode_create_session(struct xdr_stream *xdr, struct nfs41_create_session_res *res) { @@ -4221,14 +4226,11 @@ static int decode_create_session(struct xdr_stream *xdr, struct nfs4_session *session = clp->cl_session; status = decode_op_hdr(xdr, OP_CREATE_SESSION); - - if (status) + if (!status) + status = decode_sessionid(xdr, &session->sess_id); + if (unlikely(status)) return status; - /* sessionid */ - READ_BUF(NFS4_MAX_SESSIONID_LEN); - COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN); - /* seqid, flags */ READ_BUF(8); clp->cl_seqid = be32_to_cpup(p++); @@ -4262,7 +4264,9 @@ static int decode_sequence(struct xdr_stream *xdr, return 0; status = decode_op_hdr(xdr, OP_SEQUENCE); - if (status) + if (!status) + status = decode_sessionid(xdr, &id); + if (unlikely(status)) goto out_err; /* @@ -4271,15 +4275,16 @@ static int decode_sequence(struct xdr_stream *xdr, */ status = -ESERVERFAULT; - slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; - READ_BUF(NFS4_MAX_SESSIONID_LEN + 20); - COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN); if (memcmp(id.data, res->sr_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("%s Invalid session id\n", __func__); goto out_err; } + + READ_BUF(20); + /* seqid */ + slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; dummy = be32_to_cpup(p++); if (dummy != slot->seq_nr) { dprintk("%s Invalid sequence number\n", __func__); -- cgit v0.10.2 From 99398d0655ada44ae464a1c93d13cd438a306ecd Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:20:05 +0300 Subject: nfs: nfs4xdr: get rid of COPYMEM Just directly call memcpy. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 00630f4..f69aafc 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2433,11 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, * task to translate them into Linux-specific versions which are more * consistent with the style used in NFSv2/v3... */ -#define COPYMEM(x,nbytes) do { \ - memcpy((x), p, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -} while (0) - #define READ_BUF(nbytes) do { \ p = xdr_inline_decode(xdr, nbytes); \ if (unlikely(!p)) { \ @@ -3607,7 +3602,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) return -EIO; fh->size = len; READ_BUF(len); - COPYMEM(fh->data, len); + memcpy(fh->data, p, len); return 0; } @@ -4097,7 +4092,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) if (nfserr == NFS_OK) { READ_BUF(8 + NFS4_VERIFIER_SIZE); p = xdr_decode_hyper(p, &clp->cl_clientid); - COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE); + memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { uint32_t len; @@ -4134,7 +4129,7 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) READ_BUF(16); res->count = be32_to_cpup(p++); res->verf->committed = be32_to_cpup(p++); - COPYMEM(res->verf->verifier, 8); + memcpy(res->verf->verifier, p, 8); return 0; } -- cgit v0.10.2 From 2460ba57c49c36dfef0b62c929461de09240fe17 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:20:10 +0300 Subject: nfs: nfs4xdr: simplify decode_exchange_id by reusing decode_opaque_inline Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index f69aafc..3f49da0 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4144,6 +4144,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, { __be32 *p; uint32_t dummy; + char *dummy_str; int status; struct nfs_client *clp = res->client; @@ -4166,19 +4167,19 @@ static int decode_exchange_id(struct xdr_stream *xdr, READ_BUF(8); /* Throw away Major id */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; /* Throw away server_scope */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; /* Throw away Implementation id array */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; return 0; } -- cgit v0.10.2 From c0eae66ece40bdb8cd88c5106834b71a1c9f421c Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:20:14 +0300 Subject: nfs: nfs4xdr: get rid of READ_BUF Use xdr_inline_decode instead. Open code debug printout and error return. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3f49da0..b0d690c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2423,24 +2423,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, } #endif /* CONFIG_NFS_V4_1 */ -/* - * START OF "GENERIC" DECODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). - * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... - */ -#define READ_BUF(nbytes) do { \ - p = xdr_inline_decode(xdr, nbytes); \ - if (unlikely(!p)) { \ - print_overflow_msg(__func__, xdr); \ - return -EIO; \ - } \ -} while (0) - static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) { dprintk("nfs: %s: prematurely hit end of receive buffer. " @@ -2452,28 +2434,42 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char { __be32 *p; - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; *len = be32_to_cpup(p++); - READ_BUF(*len); + p = xdr_inline_decode(xdr, *len); + if (unlikely(!p)) + goto out_overflow; *string = (char *)p; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; hdr->status = be32_to_cpup(p++); hdr->taglen = be32_to_cpup(p++); - READ_BUF(hdr->taglen + 4); + p = xdr_inline_decode(xdr, hdr->taglen + 4); + if (unlikely(!p)) + goto out_overflow; hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); hdr->nops = be32_to_cpup(p++); if (unlikely(hdr->nops < 1)) return nfs4_stat_to_errno(hdr->status); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) @@ -2482,7 +2478,9 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) uint32_t opnum; int32_t nfserr; - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; opnum = be32_to_cpup(p++); if (opnum != expected) { dprintk("nfs: Server returned operation" @@ -2494,6 +2492,9 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) if (nfserr != NFS_OK) return nfs4_stat_to_errno(nfserr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* Dummy routine */ @@ -2503,8 +2504,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) unsigned int strlen; char *str; - READ_BUF(12); - return decode_opaque_inline(xdr, &strlen, &str); + p = xdr_inline_decode(xdr, 12); + if (likely(p)) + return decode_opaque_inline(xdr, &strlen, &str); + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) @@ -2512,27 +2516,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) uint32_t bmlen; __be32 *p; - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; bmlen = be32_to_cpup(p++); bitmap[0] = bitmap[1] = 0; - READ_BUF((bmlen << 2)); + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; if (bmlen > 0) { bitmap[0] = be32_to_cpup(p++); if (bmlen > 1) bitmap[1] = be32_to_cpup(p++); } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) { __be32 *p; - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; *attrlen = be32_to_cpup(p++); *savep = xdr->p; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) @@ -2555,7 +2571,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; *type = be32_to_cpup(p++); if (*type < NF4REG || *type > NF4NAMEDATTR) { dprintk("%s: bad type %d\n", __func__, *type); @@ -2566,6 +2584,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) @@ -2577,7 +2598,9 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; ret = NFS_ATTR_FATTR_CHANGE; @@ -2585,6 +2608,9 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) @@ -2596,13 +2622,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2613,12 +2644,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; *res = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2629,12 +2665,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; *res = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) @@ -2647,7 +2688,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { - READ_BUF(16); + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, &fsid->major); p = xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; @@ -2657,6 +2700,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs (unsigned long long)fsid->major, (unsigned long long)fsid->minor); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2667,12 +2713,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; *res = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2683,12 +2734,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; *res = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -2700,13 +2756,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -2718,13 +2779,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2736,12 +2802,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2753,12 +2824,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2770,12 +2846,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) @@ -2784,7 +2865,9 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) __be32 *p; int status = 0; - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; n = be32_to_cpup(p++); if (n == 0) goto root_path; @@ -2819,6 +2902,9 @@ out_eio: dprintk(" status %d", status); status = -EIO; goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) @@ -2836,7 +2922,9 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st status = decode_pathname(xdr, &res->fs_path); if (unlikely(status != 0)) goto out; - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; n = be32_to_cpup(p++); if (n <= 0) goto out_eio; @@ -2845,7 +2933,9 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st u32 m; struct nfs4_fs_location *loc = &res->locations[res->nlocations]; - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; m = be32_to_cpup(p++); loc->nservers = 0; @@ -2885,6 +2975,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; +out_overflow: + print_overflow_msg(__func__, xdr); out_eio: status = -EIO; goto out; @@ -2899,12 +2991,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) @@ -2916,12 +3013,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; *maxlink = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } dprintk("%s: maxlink=%u\n", __func__, *maxlink); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) @@ -2933,12 +3035,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; *maxname = be32_to_cpup(p++); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } dprintk("%s: maxname=%u\n", __func__, *maxname); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2951,7 +3058,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { uint64_t maxread; - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; @@ -2960,6 +3069,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ } dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2972,7 +3084,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { uint64_t maxwrite; - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; @@ -2981,6 +3095,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 } dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) @@ -2993,7 +3110,9 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; tmp = be32_to_cpup(p++); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; @@ -3001,6 +3120,9 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) @@ -3012,13 +3134,18 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; *nlink = be32_to_cpup(p++); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) @@ -3031,9 +3158,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; len = be32_to_cpup(p++); - READ_BUF(len); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; if (len < XDR_MAX_NETOBJ) { if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) ret = NFS_ATTR_FATTR_OWNER; @@ -3047,6 +3178,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf } dprintk("%s: uid=%d\n", __func__, (int)*uid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) @@ -3059,9 +3193,13 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; len = be32_to_cpup(p++); - READ_BUF(len); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; if (len < XDR_MAX_NETOBJ) { if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) ret = NFS_ATTR_FATTR_GROUP; @@ -3075,6 +3213,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf } dprintk("%s: gid=%d\n", __func__, (int)*gid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) @@ -3089,7 +3230,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { dev_t tmp; - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; major = be32_to_cpup(p++); minor = be32_to_cpup(p++); tmp = MKDEV(major, minor); @@ -3100,6 +3243,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3111,12 +3257,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3128,12 +3279,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3145,12 +3301,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) @@ -3162,7 +3323,9 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; ret = NFS_ATTR_FATTR_SPACE_USED; @@ -3170,6 +3333,9 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) @@ -3178,12 +3344,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) uint64_t sec; uint32_t nsec; - READ_BUF(12); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, &sec); nsec = be32_to_cpup(p++); time->tv_sec = (time_t)sec; time->tv_nsec = (long)nsec; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -3261,11 +3432,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c { __be32 *p; - READ_BUF(20); + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; cinfo->atomic = be32_to_cpup(p++); p = xdr_decode_hyper(p, &cinfo->before); p = xdr_decode_hyper(p, &cinfo->after); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) @@ -3277,12 +3453,17 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) status = decode_op_hdr(xdr, OP_ACCESS); if (status) return status; - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; supp = be32_to_cpup(p++); acc = be32_to_cpup(p++); access->supported = supp; access->access = acc; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) @@ -3341,10 +3522,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) return status; if ((status = decode_change_info(xdr, cinfo))) return status; - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; bmlen = be32_to_cpup(p++); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) @@ -3596,14 +3783,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) if (status) return status; - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; len = be32_to_cpup(p++); if (len > NFS4_FHSIZE) return -EIO; fh->size = len; - READ_BUF(len); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; memcpy(fh->data, p, len); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3625,7 +3819,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) __be32 *p; uint32_t namelen, type; - READ_BUF(32); + p = xdr_inline_decode(xdr, 32); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, &offset); p = xdr_decode_hyper(p, &length); type = be32_to_cpup(p++); @@ -3641,8 +3837,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) } p = xdr_decode_hyper(p, &clientid); namelen = be32_to_cpup(p++); - READ_BUF(namelen); - return -NFS4ERR_DENIED; + p = xdr_inline_decode(xdr, namelen); + if (likely(p)) + return -NFS4ERR_DENIED; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) @@ -3697,7 +3897,9 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) __be32 *p; uint32_t limit_type, nblocks, blocksize; - READ_BUF(12); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; limit_type = be32_to_cpup(p++); switch (limit_type) { case 1: @@ -3709,6 +3911,9 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) @@ -3717,7 +3922,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) uint32_t delegation_type; int status; - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; delegation_type = be32_to_cpup(p++); if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { res->delegation_type = 0; @@ -3726,7 +3933,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) status = decode_stateid(xdr, &res->delegation); if (unlikely(status)) return status; - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; res->do_recall = be32_to_cpup(p++); switch (delegation_type) { @@ -3739,6 +3948,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) return -EIO; } return decode_ace(xdr, NULL, res->server->nfs_client); +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) @@ -3757,13 +3969,17 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) decode_change_info(xdr, &res->cinfo); - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; res->rflags = be32_to_cpup(p++); bmlen = be32_to_cpup(p++); if (bmlen > 10) goto xdr_error; - READ_BUF(bmlen << 2); + p = xdr_inline_decode(xdr, bmlen << 2); + if (unlikely(!p)) + goto out_overflow; savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) res->attrset[i] = be32_to_cpup(p++); @@ -3774,6 +3990,9 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) xdr_error: dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) @@ -3820,7 +4039,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ status = decode_op_hdr(xdr, OP_READ); if (status) return status; - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; eof = be32_to_cpup(p++); count = be32_to_cpup(p++); hdrlen = (u8 *) p - (u8 *) iov->iov_base; @@ -3835,6 +4056,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ res->eof = eof; res->count = count; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) @@ -3947,7 +4171,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) return status; /* Convert length of symlink */ - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; len = be32_to_cpup(p++); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); @@ -3972,6 +4198,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) kaddr[len+rcvbuf->page_base] = '\0'; kunmap_atomic(kaddr, KM_USER0); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -4069,10 +4298,16 @@ static int decode_setattr(struct xdr_stream *xdr) status = decode_op_hdr(xdr, OP_SETATTR); if (status) return status; - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; bmlen = be32_to_cpup(p++); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) @@ -4081,7 +4316,9 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) uint32_t opnum; int32_t nfserr; - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; opnum = be32_to_cpup(p++); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" @@ -4090,26 +4327,39 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) } nfserr = be32_to_cpup(p++); if (nfserr == NFS_OK) { - READ_BUF(8 + NFS4_VERIFIER_SIZE); + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, &clp->cl_clientid); memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { uint32_t len; /* skip netid string */ - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; len = be32_to_cpup(p++); - READ_BUF(len); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; /* skip uaddr string */ - READ_BUF(4); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; len = be32_to_cpup(p++); - READ_BUF(len); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; return -NFSERR_CLID_INUSE; } else return nfs4_stat_to_errno(nfserr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_setclientid_confirm(struct xdr_stream *xdr) @@ -4126,11 +4376,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) if (status) return status; - READ_BUF(16); + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; res->count = be32_to_cpup(p++); res->verf->committed = be32_to_cpup(p++); memcpy(res->verf->verifier, p, 8); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_delegreturn(struct xdr_stream *xdr) @@ -4152,9 +4407,13 @@ static int decode_exchange_id(struct xdr_stream *xdr, if (status) return status; - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; p = xdr_decode_hyper(p, &clp->cl_ex_clid); - READ_BUF(12); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; clp->cl_seqid = be32_to_cpup(p++); clp->cl_exchange_flags = be32_to_cpup(p++); @@ -4164,7 +4423,9 @@ static int decode_exchange_id(struct xdr_stream *xdr, return -EIO; /* Throw away minor_id */ - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; /* Throw away Major id */ status = decode_opaque_inline(xdr, &dummy, &dummy_str); @@ -4182,6 +4443,9 @@ static int decode_exchange_id(struct xdr_stream *xdr, return status; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_chan_attrs(struct xdr_stream *xdr, @@ -4190,7 +4454,9 @@ static int decode_chan_attrs(struct xdr_stream *xdr, __be32 *p; u32 nr_attrs; - READ_BUF(28); + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; attrs->headerpadsz = be32_to_cpup(p++); attrs->max_rqst_sz = be32_to_cpup(p++); attrs->max_resp_sz = be32_to_cpup(p++); @@ -4203,9 +4469,15 @@ static int decode_chan_attrs(struct xdr_stream *xdr, __func__, nr_attrs); return -EINVAL; } - if (nr_attrs == 1) - READ_BUF(4); /* skip rdma_attrs */ + if (nr_attrs == 1) { + p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ + if (unlikely(!p)) + goto out_overflow; + } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) @@ -4228,7 +4500,9 @@ static int decode_create_session(struct xdr_stream *xdr, return status; /* seqid, flags */ - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; clp->cl_seqid = be32_to_cpup(p++); session->flags = be32_to_cpup(p++); @@ -4237,6 +4511,9 @@ static int decode_create_session(struct xdr_stream *xdr, if (!status) status = decode_chan_attrs(xdr, &session->bc_attrs); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) @@ -4277,7 +4554,9 @@ static int decode_sequence(struct xdr_stream *xdr, goto out_err; } - READ_BUF(20); + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; /* seqid */ slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; @@ -4302,6 +4581,10 @@ static int decode_sequence(struct xdr_stream *xdr, out_err: res->sr_status = status; return status; +out_overflow: + print_overflow_msg(__func__, xdr); + status = -EIO; + goto out_err; #else /* CONFIG_NFS_V4_1 */ return 0; #endif /* CONFIG_NFS_V4_1 */ -- cgit v0.10.2 From cccddf4f5580131c9b963900e1d3400655e633cc Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:20:19 +0300 Subject: nfs: nfs4xdr: optimize low level decoding do not increment decoding ptr if not needed. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b0d690c..14b6f51 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2437,7 +2437,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - *len = be32_to_cpup(p++); + *len = be32_to_cpup(p); p = xdr_inline_decode(xdr, *len); if (unlikely(!p)) goto out_overflow; @@ -2456,14 +2456,14 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) if (unlikely(!p)) goto out_overflow; hdr->status = be32_to_cpup(p++); - hdr->taglen = be32_to_cpup(p++); + hdr->taglen = be32_to_cpup(p); p = xdr_inline_decode(xdr, hdr->taglen + 4); if (unlikely(!p)) goto out_overflow; hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); - hdr->nops = be32_to_cpup(p++); + hdr->nops = be32_to_cpup(p); if (unlikely(hdr->nops < 1)) return nfs4_stat_to_errno(hdr->status); return 0; @@ -2488,7 +2488,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) opnum, expected); return -EIO; } - nfserr = be32_to_cpup(p++); + nfserr = be32_to_cpup(p); if (nfserr != NFS_OK) return nfs4_stat_to_errno(nfserr); return 0; @@ -2519,7 +2519,7 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - bmlen = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); bitmap[0] = bitmap[1] = 0; p = xdr_inline_decode(xdr, (bmlen << 2)); @@ -2528,7 +2528,7 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) if (bmlen > 0) { bitmap[0] = be32_to_cpup(p++); if (bmlen > 1) - bitmap[1] = be32_to_cpup(p++); + bitmap[1] = be32_to_cpup(p); } return 0; out_overflow: @@ -2543,7 +2543,7 @@ static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - *attrlen = be32_to_cpup(p++); + *attrlen = be32_to_cpup(p); *savep = xdr->p; return 0; out_overflow: @@ -2574,7 +2574,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - *type = be32_to_cpup(p++); + *type = be32_to_cpup(p); if (*type < NF4REG || *type > NF4NAMEDATTR) { dprintk("%s: bad type %d\n", __func__, *type); return -EIO; @@ -2601,7 +2601,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, change); + xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; ret = NFS_ATTR_FATTR_CHANGE; } @@ -2625,7 +2625,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, size); + xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; ret = NFS_ATTR_FATTR_SIZE; } @@ -2647,7 +2647,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - *res = be32_to_cpup(p++); + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); @@ -2668,7 +2668,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - *res = be32_to_cpup(p++); + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); @@ -2692,7 +2692,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs if (unlikely(!p)) goto out_overflow; p = xdr_decode_hyper(p, &fsid->major); - p = xdr_decode_hyper(p, &fsid->minor); + xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; ret = NFS_ATTR_FATTR_FSID; } @@ -2716,7 +2716,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - *res = be32_to_cpup(p++); + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); @@ -2737,7 +2737,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - *res = be32_to_cpup(p++); + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); @@ -2759,7 +2759,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, fileid); + xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; ret = NFS_ATTR_FATTR_FILEID; } @@ -2782,7 +2782,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, fileid); + xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; ret = NFS_ATTR_FATTR_FILEID; } @@ -2805,7 +2805,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, res); + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); @@ -2827,7 +2827,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, res); + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); @@ -2849,7 +2849,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, res); + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); @@ -2868,7 +2868,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - n = be32_to_cpup(p++); + n = be32_to_cpup(p); if (n == 0) goto root_path; dprintk("path "); @@ -2925,7 +2925,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - n = be32_to_cpup(p++); + n = be32_to_cpup(p); if (n <= 0) goto out_eio; res->nlocations = 0; @@ -2936,7 +2936,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - m = be32_to_cpup(p++); + m = be32_to_cpup(p); loc->nservers = 0; dprintk("%s: servers ", __func__); @@ -2994,7 +2994,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, res); + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); @@ -3016,7 +3016,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - *maxlink = be32_to_cpup(p++); + *maxlink = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } dprintk("%s: maxlink=%u\n", __func__, *maxlink); @@ -3038,7 +3038,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - *maxname = be32_to_cpup(p++); + *maxname = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } dprintk("%s: maxname=%u\n", __func__, *maxname); @@ -3061,7 +3061,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, &maxread); + xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; *res = (uint32_t)maxread; @@ -3087,7 +3087,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, &maxwrite); + xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; *res = (uint32_t)maxwrite; @@ -3113,7 +3113,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - tmp = be32_to_cpup(p++); + tmp = be32_to_cpup(p); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; ret = NFS_ATTR_FATTR_MODE; @@ -3137,7 +3137,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - *nlink = be32_to_cpup(p++); + *nlink = be32_to_cpup(p); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; ret = NFS_ATTR_FATTR_NLINK; } @@ -3161,7 +3161,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - len = be32_to_cpup(p++); + len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) goto out_overflow; @@ -3196,7 +3196,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - len = be32_to_cpup(p++); + len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) goto out_overflow; @@ -3234,7 +3234,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde if (unlikely(!p)) goto out_overflow; major = be32_to_cpup(p++); - minor = be32_to_cpup(p++); + minor = be32_to_cpup(p); tmp = MKDEV(major, minor); if (MAJOR(tmp) == major && MINOR(tmp) == minor) *rdev = tmp; @@ -3260,7 +3260,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, res); + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); @@ -3282,7 +3282,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, res); + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); @@ -3304,7 +3304,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, res); + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); @@ -3326,7 +3326,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, used); + xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; ret = NFS_ATTR_FATTR_SPACE_USED; } @@ -3348,7 +3348,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) if (unlikely(!p)) goto out_overflow; p = xdr_decode_hyper(p, &sec); - nsec = be32_to_cpup(p++); + nsec = be32_to_cpup(p); time->tv_sec = (time_t)sec; time->tv_nsec = (long)nsec; return 0; @@ -3437,7 +3437,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c goto out_overflow; cinfo->atomic = be32_to_cpup(p++); p = xdr_decode_hyper(p, &cinfo->before); - p = xdr_decode_hyper(p, &cinfo->after); + xdr_decode_hyper(p, &cinfo->after); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -3457,7 +3457,7 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) if (unlikely(!p)) goto out_overflow; supp = be32_to_cpup(p++); - acc = be32_to_cpup(p++); + acc = be32_to_cpup(p); access->supported = supp; access->access = acc; return 0; @@ -3525,7 +3525,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - bmlen = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); p = xdr_inline_decode(xdr, bmlen << 2); if (likely(p)) return 0; @@ -3786,7 +3786,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - len = be32_to_cpup(p++); + len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; fh->size = len; @@ -3836,7 +3836,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_pid = 0; } p = xdr_decode_hyper(p, &clientid); - namelen = be32_to_cpup(p++); + namelen = be32_to_cpup(p); p = xdr_inline_decode(xdr, namelen); if (likely(p)) return -NFS4ERR_DENIED; @@ -3903,11 +3903,11 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) limit_type = be32_to_cpup(p++); switch (limit_type) { case 1: - p = xdr_decode_hyper(p, maxsize); + xdr_decode_hyper(p, maxsize); break; case 2: nblocks = be32_to_cpup(p++); - blocksize = be32_to_cpup(p++); + blocksize = be32_to_cpup(p); *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } return 0; @@ -3925,7 +3925,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - delegation_type = be32_to_cpup(p++); + delegation_type = be32_to_cpup(p); if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { res->delegation_type = 0; return 0; @@ -3936,7 +3936,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - res->do_recall = be32_to_cpup(p++); + res->do_recall = be32_to_cpup(p); switch (delegation_type) { case NFS4_OPEN_DELEGATE_READ: @@ -3973,7 +3973,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) if (unlikely(!p)) goto out_overflow; res->rflags = be32_to_cpup(p++); - bmlen = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); if (bmlen > 10) goto xdr_error; @@ -4043,7 +4043,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ if (unlikely(!p)) goto out_overflow; eof = be32_to_cpup(p++); - count = be32_to_cpup(p++); + count = be32_to_cpup(p); hdrlen = (u8 *) p - (u8 *) iov->iov_base; recvd = req->rq_rcv_buf.len - hdrlen; if (count > recvd) { @@ -4174,7 +4174,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - len = be32_to_cpup(p++); + len = be32_to_cpup(p); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); return -ENAMETOOLONG; @@ -4301,7 +4301,7 @@ static int decode_setattr(struct xdr_stream *xdr) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - bmlen = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); p = xdr_inline_decode(xdr, bmlen << 2); if (likely(p)) return 0; @@ -4325,7 +4325,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) " %d\n", opnum); return -EIO; } - nfserr = be32_to_cpup(p++); + nfserr = be32_to_cpup(p); if (nfserr == NFS_OK) { p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); if (unlikely(!p)) @@ -4339,7 +4339,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - len = be32_to_cpup(p++); + len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) goto out_overflow; @@ -4348,7 +4348,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - len = be32_to_cpup(p++); + len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) goto out_overflow; @@ -4410,7 +4410,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, &clp->cl_ex_clid); + xdr_decode_hyper(p, &clp->cl_ex_clid); p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; @@ -4418,7 +4418,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, clp->cl_exchange_flags = be32_to_cpup(p++); /* We ask for SP4_NONE */ - dummy = be32_to_cpup(p++); + dummy = be32_to_cpup(p); if (dummy != SP4_NONE) return -EIO; @@ -4463,7 +4463,7 @@ static int decode_chan_attrs(struct xdr_stream *xdr, attrs->max_resp_sz_cached = be32_to_cpup(p++); attrs->max_ops = be32_to_cpup(p++); attrs->max_reqs = be32_to_cpup(p++); - nr_attrs = be32_to_cpup(p++); + nr_attrs = be32_to_cpup(p); if (unlikely(nr_attrs > 1)) { printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", __func__, nr_attrs); @@ -4504,7 +4504,7 @@ static int decode_create_session(struct xdr_stream *xdr, if (unlikely(!p)) goto out_overflow; clp->cl_seqid = be32_to_cpup(p++); - session->flags = be32_to_cpup(p++); + session->flags = be32_to_cpup(p); /* Channel attributes */ status = decode_chan_attrs(xdr, &session->fc_attrs); @@ -4576,7 +4576,7 @@ static int decode_sequence(struct xdr_stream *xdr, /* target highest slot id - currently not processed */ dummy = be32_to_cpup(p++); /* result flags - currently not processed */ - dummy = be32_to_cpup(p++); + dummy = be32_to_cpup(p); status = 0; out_err: res->sr_status = status; -- cgit v0.10.2