Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore          |    5
-rw-r--r--  kernel/acct.c              |   16
-rw-r--r--  kernel/audit.c             |    4
-rw-r--r--  kernel/configs.c           |    2
-rw-r--r--  kernel/futex.c             |   13
-rw-r--r--  kernel/irq/manage.c        |    2
-rw-r--r--  kernel/irq/proc.c          |    4
-rw-r--r--  kernel/kprobes.c           |   36
-rw-r--r--  kernel/ksysfs.c            |   30
-rw-r--r--  kernel/module.c            |    3
-rw-r--r--  kernel/panic.c             |    4
-rw-r--r--  kernel/params.c            |    2
-rw-r--r--  kernel/posix-cpu-timers.c  |   13
-rw-r--r--  kernel/power/disk.c        |   92
-rw-r--r--  kernel/power/power.h       |   24
-rw-r--r--  kernel/power/snapshot.c    |   89
-rw-r--r--  kernel/power/swsusp.c      | 1020
-rw-r--r--  kernel/rcupdate.c          |   59
-rw-r--r--  kernel/rcutorture.c        |    3
-rw-r--r--  kernel/sys.c               |    3
-rw-r--r--  kernel/sysctl.c            |   43
-rw-r--r--  kernel/time.c              |   22
22 files changed, 851 insertions, 638 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore new file mode 100644 index 0000000..f2ab700 --- /dev/null +++ b/kernel/.gitignore @@ -0,0 +1,5 @@ +# +# Generated files +# +config_data.h +config_data.gz diff --git a/kernel/acct.c b/kernel/acct.c index 6312d6b..38d57fa 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -427,6 +427,7 @@ static void do_acct_process(long exitcode, struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; + unsigned long jiffies; /* * First check to see if there is enough free_space to continue @@ -467,12 +468,12 @@ static void do_acct_process(long exitcode, struct file *file) #endif do_div(elapsed, AHZ); ac.ac_btime = xtime.tv_sec - elapsed; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ( - current->signal->utime + - current->group_leader->utime)); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ( - current->signal->stime + - current->group_leader->stime)); + jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, + current->signal->utime)); + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); + jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, + current->signal->stime)); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; @@ -580,7 +581,8 @@ void acct_process(long exitcode) void acct_update_integrals(struct task_struct *tsk) { if (likely(tsk->mm)) { - long delta = tsk->stime - tsk->acct_stimexpd; + long delta = + cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; if (delta == 0) return; diff --git a/kernel/audit.c b/kernel/audit.c index 0c56320..32fa03a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -291,8 +291,10 @@ int kauditd_thread(void *dummy) set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&kauditd_wait, &wait); - if (!skb_queue_len(&audit_skb_queue)) + if (!skb_queue_len(&audit_skb_queue)) { + try_to_freeze(); schedule(); + } __set_current_state(TASK_RUNNING); remove_wait_queue(&kauditd_wait, &wait); diff --git a/kernel/configs.c b/kernel/configs.c index 986f7af..009e1eb 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -3,7 +3,7 @@ * Echo the kernel .config file used to build the kernel * * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com> - * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org> + * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net> * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com> * Copyright (C) 2002 Hewlett-Packard Company * diff --git a/kernel/futex.c b/kernel/futex.c index 5872e35..5efa2f9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -270,7 +270,13 @@ static void wake_futex(struct futex_q *q) /* * The waiting task can free the futex_q as soon as this is written, * without taking any locks. This must come last. + * + * A memory barrier is required here to prevent the following store + * to lock_ptr from getting ahead of the wakeup. Clearing the lock + * at the end of wake_up_all() does not prevent this store from + * moving. 
*/ + wmb(); q->lock_ptr = NULL; } @@ -350,6 +356,13 @@ retry: if (bh1 != bh2) spin_unlock(&bh2->lock); +#ifndef CONFIG_MMU + /* we don't get EFAULT from MMU faults if we don't have an MMU, + * but we might get them from range checking */ + ret = op_ret; + goto out; +#endif + if (unlikely(op_ret != -EFAULT)) { ret = op_ret; goto out; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 81c49a4..97d5559 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -366,6 +366,8 @@ int request_irq(unsigned int irq, action->next = NULL; action->dev_id = dev_id; + select_smp_affinity(irq); + retval = setup_irq(irq, action); if (retval) kfree(action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f26e534..8a64a48 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -68,7 +68,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, */ cpus_and(tmp, new_value, cpu_online_map); if (cpus_empty(tmp)) - return -EINVAL; + /* Special case for empty set - allow the architecture + code to set default SMP affinity. */ + return select_smp_affinity(irq) ? -EINVAL : full_count; proc_set_irq_affinity(irq, new_value); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5beda37..3bb71e6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -246,6 +246,19 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) return ret; } +/* Walks the list and increments nmissed count for multiprobe case */ +void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) +{ + struct kprobe *kp; + if (p->pre_handler != aggr_pre_handler) { + p->nmissed++; + } else { + list_for_each_entry_rcu(kp, &p->list, list) + kp->nmissed++; + } + return; +} + /* Called with kretprobe_lock held */ struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) { @@ -399,10 +412,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) INIT_LIST_HEAD(&ap->list); list_add_rcu(&p->list, &ap->list); - INIT_HLIST_NODE(&ap->hlist); - hlist_del_rcu(&p->hlist); - hlist_add_head_rcu(&ap->hlist, - &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); + hlist_replace_rcu(&p->hlist, &ap->hlist); } /* @@ -462,9 +472,16 @@ int __kprobes register_kprobe(struct kprobe *p) int ret = 0; unsigned long flags = 0; struct kprobe *old_p; + struct module *mod; + + if ((!kernel_text_address((unsigned long) p->addr)) || + in_kprobes_functions((unsigned long) p->addr)) + return -EINVAL; + + if ((mod = module_text_address((unsigned long) p->addr)) && + (unlikely(!try_module_get(mod)))) + return -EINVAL; - if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0) - return ret; if ((ret = arch_prepare_kprobe(p)) != 0) goto rm_kprobe; @@ -488,6 +505,8 @@ out: rm_kprobe: if (ret == -EEXIST) arch_remove_kprobe(p); + if (ret && mod) + module_put(mod); return ret; } @@ -495,6 +514,7 @@ void __kprobes unregister_kprobe(struct kprobe *p) { unsigned long flags; struct kprobe *old_p; + struct module *mod; spin_lock_irqsave(&kprobe_lock, flags); old_p = get_kprobe(p->addr); @@ -506,6 +526,10 @@ void __kprobes unregister_kprobe(struct kprobe *p) cleanup_kprobe(p, flags); synchronize_sched(); + + if ((mod = module_text_address((unsigned long)p->addr))) + module_put(mod); + if (old_p->pre_handler == aggr_pre_handler && list_empty(&old_p->list)) kfree(old_p); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 015fb69..99af8b0 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -15,6 +15,9 @@ #include <linux/module.h> #include <linux/init.h> +u64 uevent_seqnum; +char 
uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug"; + #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -23,11 +26,29 @@ static struct subsys_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) #ifdef CONFIG_HOTPLUG -static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) +/* current uevent sequence number */ +static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); +} +KERNEL_ATTR_RO(uevent_seqnum); + +/* uevent helper program, used during early boo */ +static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%s\n", uevent_helper); +} +static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) { - return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); + if (count+1 > UEVENT_HELPER_PATH_LEN) + return -ENOENT; + memcpy(uevent_helper, page, count); + uevent_helper[count] = '\0'; + if (count && uevent_helper[count-1] == '\n') + uevent_helper[count-1] = '\0'; + return count; } -KERNEL_ATTR_RO(hotplug_seqnum); +KERNEL_ATTR_RW(uevent_helper); #endif #ifdef CONFIG_KEXEC @@ -45,7 +66,8 @@ EXPORT_SYMBOL_GPL(kernel_subsys); static struct attribute * kernel_attrs[] = { #ifdef CONFIG_HOTPLUG - &hotplug_seqnum_attr.attr, + &uevent_seqnum_attr.attr, + &uevent_helper_attr.attr, #endif #ifdef CONFIG_KEXEC &crash_notes_attr.attr, diff --git a/kernel/module.c b/kernel/module.c index 2ea929d..4b06bba 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1854,8 +1854,7 @@ static struct module *load_module(void __user *umod, kfree(args); free_hdr: vfree(hdr); - if (err < 0) return ERR_PTR(err); - else return ptr; + return ERR_PTR(err); truncated: printk(KERN_ERR "Module len %lu truncated\n", len); diff --git a/kernel/panic.c b/kernel/panic.c index aabc5f8..c5c4ab2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...) long i; static char buf[1024]; va_list args; -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) unsigned long caller = (unsigned long) __builtin_return_address(0); #endif @@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...) printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); } #endif -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) disabled_wait(caller); #endif local_irq_enable(); diff --git a/kernel/params.c b/kernel/params.c index 47ba695..c76ad25 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -619,7 +619,7 @@ static void __init param_sysfs_builtin(void) /* module-related sysfs stuff */ -#ifdef CONFIG_MODULES +#ifdef CONFIG_SYSFS #define to_module_attr(n) container_of(n, struct module_attribute, attr); #define to_module_kobject(n) container_of(n, struct module_kobject, kobj); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index cae4f57..4c68edf 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -238,18 +238,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, while ((t = next_thread(t)) != p) { cpu->sched += t->sched_time; } - if (p->tgid == current->tgid) { - /* - * We're sampling ourselves, so include the - * cycles not yet banked. We still omit - * other threads running on other CPUs, - * so the total can always be behind as - * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ). 
- */ - cpu->sched += current_sched_time(current); - } else { - cpu->sched += p->sched_time; - } + cpu->sched += sched_ns(p); break; } return 0; diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 027322a..e24446f 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -24,10 +24,11 @@ extern suspend_disk_method_t pm_disk_mode; +extern int swsusp_shrink_memory(void); extern int swsusp_suspend(void); -extern int swsusp_write(void); +extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); extern int swsusp_check(void); -extern int swsusp_read(void); +extern int swsusp_read(struct pbe **pblist_ptr); extern void swsusp_close(void); extern int swsusp_resume(void); @@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode) static int in_suspend __nosavedata = 0; -/** - * free_some_memory - Try to free as much memory as possible - * - * ... but do not OOM-kill anyone - * - * Notice: all userland should be stopped at this point, or - * livelock is possible. - */ - -static void free_some_memory(void) -{ - unsigned int i = 0; - unsigned int tmp; - unsigned long pages = 0; - char *p = "-\\|/"; - - printk("Freeing memory... "); - while ((tmp = shrink_all_memory(10000))) { - pages += tmp; - printk("\b%c", p[i++ % 4]); - } - printk("\bdone (%li pages freed)\n", pages); -} - - static inline void platform_finish(void) { if (pm_disk_mode == PM_DISK_PLATFORM) { @@ -127,8 +103,8 @@ static int prepare_processes(void) } /* Free memory before shutting down devices. */ - free_some_memory(); - return 0; + if (!(error = swsusp_shrink_memory())) + return 0; thaw: thaw_processes(); enable_nonboot_cpus(); @@ -176,7 +152,7 @@ int pm_suspend_disk(void) if (in_suspend) { device_resume(); pr_debug("PM: writing image.\n"); - error = swsusp_write(); + error = swsusp_write(pagedir_nosave, nr_copy_pages); if (!error) power_down(pm_disk_mode); else { @@ -247,7 +223,7 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read())) { + if ((error = swsusp_read(&pagedir_nosave))) { swsusp_free(); goto Thaw; } @@ -363,37 +339,55 @@ static ssize_t resume_show(struct subsystem * subsys, char *buf) MINOR(swsusp_resume_device)); } -static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n) +static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) { - int len; - char *p; unsigned int maj, min; - int error = -EINVAL; dev_t res; + int ret = -EINVAL; - p = memchr(buf, '\n', n); - len = p ? p - buf : n; + if (sscanf(buf, "%u:%u", &maj, &min) != 2) + goto out; - if (sscanf(buf, "%u:%u", &maj, &min) == 2) { - res = MKDEV(maj,min); - if (maj == MAJOR(res) && min == MINOR(res)) { - down(&pm_sem); - swsusp_resume_device = res; - up(&pm_sem); - printk("Attempting manual resume\n"); - noresume = 0; - software_resume(); - } - } + res = MKDEV(maj,min); + if (maj != MAJOR(res) || min != MINOR(res)) + goto out; - return error >= 0 ? 
n : error; + down(&pm_sem); + swsusp_resume_device = res; + up(&pm_sem); + printk("Attempting manual resume\n"); + noresume = 0; + software_resume(); + ret = n; +out: + return ret; } power_attr(resume); +static ssize_t image_size_show(struct subsystem * subsys, char *buf) +{ + return sprintf(buf, "%u\n", image_size); +} + +static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) +{ + unsigned int size; + + if (sscanf(buf, "%u", &size) == 1) { + image_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(image_size); + static struct attribute * g[] = { &disk_attr.attr, &resume_attr.attr, + &image_size_attr.attr, NULL, }; diff --git a/kernel/power/power.h b/kernel/power/power.h index 6c042b5..7e8492f 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -9,19 +9,13 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) #endif -#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \ - - 4 - 3*sizeof(unsigned long) - sizeof(int) \ - - sizeof(void *)) / sizeof(swp_entry_t)) - struct swsusp_info { struct new_utsname uts; u32 version_code; unsigned long num_physpages; int cpus; unsigned long image_pages; - unsigned long pagedir_pages; - suspend_pagedir_t * suspend_pagedir; - swp_entry_t pagedir[MAX_PBES]; + unsigned long pages; } __attribute__((aligned(PAGE_SIZE))); @@ -48,25 +42,27 @@ static struct subsys_attribute _name##_attr = { \ extern struct subsystem power_subsys; -extern int freeze_processes(void); -extern void thaw_processes(void); - extern int pm_prepare_console(void); extern void pm_restore_console(void); - /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; extern unsigned int nr_copy_pages; -extern suspend_pagedir_t *pagedir_nosave; -extern suspend_pagedir_t *pagedir_save; +extern struct pbe *pagedir_nosave; + +/* Preferred image size in MB (default 500) */ +extern unsigned int image_size; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); +extern unsigned int count_data_pages(void); extern void free_pagedir(struct pbe *pblist); +extern void release_eaten_pages(void); extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); -extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); extern void swsusp_free(void); extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); +extern unsigned int snapshot_nr_pages(void); +extern struct pbe *snapshot_pblist(void); +extern void snapshot_pblist_set(struct pbe *pblist); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4a6dbce..41f6636 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -33,7 +33,35 @@ #include "power.h" +struct pbe *pagedir_nosave; +unsigned int nr_copy_pages; + #ifdef CONFIG_HIGHMEM +unsigned int count_highmem_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + unsigned int n = 0; + + for_each_zone (zone) + if (is_highmem(zone)) { + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { + struct page *page; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (PageReserved(page)) + continue; + if (PageNosaveFree(page)) + continue; + n++; + } + } + return n; +} + struct highmem_page { char *data; struct page *page; @@ -149,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn) BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; - if 
(PageReserved(page) && pfn_is_nosave(pfn)) { - pr_debug("[nosave pfn 0x%lx]", pfn); + if (PageReserved(page) && pfn_is_nosave(pfn)) return 0; - } if (PageNosaveFree(page)) return 0; return 1; } -static unsigned count_data_pages(void) +unsigned int count_data_pages(void) { struct zone *zone; unsigned long zone_pfn; @@ -244,7 +270,7 @@ static inline void fill_pb_page(struct pbe *pbpage) * of memory pages allocated with alloc_pagedir() */ -void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) +static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) { struct pbe *pbpage, *p; unsigned int num = PBES_PER_PAGE; @@ -261,7 +287,35 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) p->next = p + 1; p->next = NULL; } - pr_debug("create_pbe_list(): initialized %d PBEs\n", num); +} + +/** + * On resume it is necessary to trace and eventually free the unsafe + * pages that have been allocated, because they are needed for I/O + * (on x86-64 we likely will "eat" these pages once again while + * creating the temporary page translation tables) + */ + +struct eaten_page { + struct eaten_page *next; + char padding[PAGE_SIZE - sizeof(void *)]; +}; + +static struct eaten_page *eaten_pages = NULL; + +void release_eaten_pages(void) +{ + struct eaten_page *p, *q; + + p = eaten_pages; + while (p) { + q = p->next; + /* We don't want swsusp_free() to free this page again */ + ClearPageNosave(virt_to_page(p)); + free_page((unsigned long)p); + p = q; + } + eaten_pages = NULL; } /** @@ -282,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) if (safe_needed) do { res = (void *)get_zeroed_page(gfp_mask); - if (res && PageNosaveFree(virt_to_page(res))) + if (res && PageNosaveFree(virt_to_page(res))) { /* This is for swsusp_free() */ SetPageNosave(virt_to_page(res)); + ((struct eaten_page *)res)->next = eaten_pages; + eaten_pages = res; + } } while (res && PageNosaveFree(virt_to_page(res))); else res = (void *)get_zeroed_page(gfp_mask); @@ -332,7 +389,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed if (!pbe) { /* get_zeroed_page() failed */ free_pagedir(pblist); pblist = NULL; - } + } else + create_pbe_list(pblist, nr_pages); return pblist; } @@ -370,8 +428,14 @@ void swsusp_free(void) static int enough_free_mem(unsigned int nr_pages) { - pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); - return nr_free_pages() > (nr_pages + PAGES_FOR_IO + + struct zone *zone; + unsigned int n = 0; + + for_each_zone (zone) + if (!is_highmem(zone)) + n += zone->free_pages; + pr_debug("swsusp: available memory: %u pages\n", n); + return n > (nr_pages + PAGES_FOR_IO + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } @@ -395,7 +459,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages) printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); return NULL; } - create_pbe_list(pblist, nr_pages); if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { printk(KERN_ERR "suspend: Allocating image pages failed.\n"); @@ -421,10 +484,6 @@ asmlinkage int swsusp_save(void) (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, PAGES_FOR_IO, nr_free_pages()); - /* This is needed because of the fixed size of swsusp_info */ - if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) - return -ENOSPC; - if (!enough_free_mem(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free memory\n"); return -ENOMEM; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c05f46e..55a18d2 100644 --- 
a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -30,8 +30,8 @@ * Alex Badea <vampire@go.ro>: * Fixed runaway init * - * Andreas Steinmetz <ast@domdv.de>: - * Added encrypted suspend option + * Rafael J. Wysocki <rjw@sisk.pl> + * Added the swap map data structure and reworked the handling of swap * * More state savers are welcome. Especially for the scsi layer... * @@ -67,44 +67,33 @@ #include <asm/tlbflush.h> #include <asm/io.h> -#include <linux/random.h> -#include <linux/crypto.h> -#include <asm/scatterlist.h> - #include "power.h" +/* + * Preferred image size in MB (tunable via /sys/power/image_size). + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N MB, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned int image_size = 500; + #ifdef CONFIG_HIGHMEM +unsigned int count_highmem_pages(void); int save_highmem(void); int restore_highmem(void); #else static int save_highmem(void) { return 0; } static int restore_highmem(void) { return 0; } +static unsigned int count_highmem_pages(void) { return 0; } #endif -#define CIPHER "aes" -#define MAXKEY 32 -#define MAXIV 32 - extern char resume_file[]; -/* Local variables that should not be affected by save */ -unsigned int nr_copy_pages __nosavedata = 0; - -/* Suspend pagedir is allocated before final copy, therefore it - must be freed after resume - - Warning: this is even more evil than it seems. Pagedirs this file - talks about are completely different from page directories used by - MMU hardware. - */ -suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; - #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; - u8 key_iv[MAXKEY+MAXIV]; - swp_entry_t swsusp_info; + char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; + swp_entry_t image; char orig_sig[10]; char sig[10]; } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; @@ -115,140 +104,9 @@ static struct swsusp_info swsusp_info; * Saving part... */ -/* We memorize in swapfile_used what swap devices are used for suspension */ -#define SWAPFILE_UNUSED 0 -#define SWAPFILE_SUSPEND 1 /* This is the suspending device */ -#define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */ - -static unsigned short swapfile_used[MAX_SWAPFILES]; -static unsigned short root_swap; - -static int write_page(unsigned long addr, swp_entry_t *loc); -static int bio_read_page(pgoff_t page_off, void *page); - -static u8 key_iv[MAXKEY+MAXIV]; - -#ifdef CONFIG_SWSUSP_ENCRYPT - -static int crypto_init(int mode, void **mem) -{ - int error = 0; - int len; - char *modemsg; - struct crypto_tfm *tfm; - - modemsg = mode ? 
"suspend not possible" : "resume not possible"; - - tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC); - if(!tfm) { - printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg); - error = -EINVAL; - goto out; - } - - if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) { - printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg); - error = -ENOKEY; - goto fail; - } - - if (mode) - get_random_bytes(key_iv, MAXKEY+MAXIV); - - len = crypto_tfm_alg_max_keysize(tfm); - if (len > MAXKEY) - len = MAXKEY; - - if (crypto_cipher_setkey(tfm, key_iv, len)) { - printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg); - error = -EKEYREJECTED; - goto fail; - } - - len = crypto_tfm_alg_ivsize(tfm); - - if (MAXIV < len) { - printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg); - error = -EOVERFLOW; - goto fail; - } - - crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len); - - *mem=(void *)tfm; - - goto out; - -fail: crypto_free_tfm(tfm); -out: return error; -} - -static __inline__ void crypto_exit(void *mem) -{ - crypto_free_tfm((struct crypto_tfm *)mem); -} - -static __inline__ int crypto_write(struct pbe *p, void *mem) -{ - int error = 0; - struct scatterlist src, dst; - - src.page = virt_to_page(p->address); - src.offset = 0; - src.length = PAGE_SIZE; - dst.page = virt_to_page((void *)&swsusp_header); - dst.offset = 0; - dst.length = PAGE_SIZE; - - error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src, - PAGE_SIZE); - - if (!error) - error = write_page((unsigned long)&swsusp_header, - &(p->swap_address)); - return error; -} - -static __inline__ int crypto_read(struct pbe *p, void *mem) -{ - int error = 0; - struct scatterlist src, dst; - - error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); - if (!error) { - src.offset = 0; - src.length = PAGE_SIZE; - dst.offset = 0; - dst.length = PAGE_SIZE; - src.page = dst.page = virt_to_page((void *)p->address); - - error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst, - &src, PAGE_SIZE); - } - return error; -} -#else -static __inline__ int crypto_init(int mode, void *mem) -{ - return 0; -} - -static __inline__ void crypto_exit(void *mem) -{ -} - -static __inline__ int crypto_write(struct pbe *p, void *mem) -{ - return write_page(p->address, &(p->swap_address)); -} +static unsigned short root_swap = 0xffff; -static __inline__ int crypto_read(struct pbe *p, void *mem) -{ - return bio_read_page(swp_offset(p->swap_address), (void *)p->address); -} -#endif - -static int mark_swapfiles(swp_entry_t prev) +static int mark_swapfiles(swp_entry_t start) { int error; @@ -259,8 +117,7 @@ static int mark_swapfiles(swp_entry_t prev) !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); - swsusp_header.swsusp_info = prev; + swsusp_header.image = start; error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), virt_to_page((unsigned long) @@ -283,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev) * devfs, since the resume code can only recognize the form /dev/hda4, * but the suspend code would see the long name.) 
*/ -static int is_resume_device(const struct swap_info_struct *swap_info) +static inline int is_resume_device(const struct swap_info_struct *swap_info) { struct file *file = swap_info->swap_file; struct inode *inode = file->f_dentry->d_inode; @@ -294,54 +151,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info) static int swsusp_swap_check(void) /* This is called before saving image */ { - int i, len; - - len=strlen(resume_file); - root_swap = 0xFFFF; - - spin_lock(&swap_lock); - for (i=0; i<MAX_SWAPFILES; i++) { - if (!(swap_info[i].flags & SWP_WRITEOK)) { - swapfile_used[i]=SWAPFILE_UNUSED; - } else { - if (!len) { - printk(KERN_WARNING "resume= option should be used to set suspend device" ); - if (root_swap == 0xFFFF) { - swapfile_used[i] = SWAPFILE_SUSPEND; - root_swap = i; - } else - swapfile_used[i] = SWAPFILE_IGNORED; - } else { - /* we ignore all swap devices that are not the resume_file */ - if (is_resume_device(&swap_info[i])) { - swapfile_used[i] = SWAPFILE_SUSPEND; - root_swap = i; - } else { - swapfile_used[i] = SWAPFILE_IGNORED; - } - } - } - } - spin_unlock(&swap_lock); - return (root_swap != 0xffff) ? 0 : -ENODEV; -} - -/** - * This is called after saving image so modification - * will be lost after resume... and that's what we want. - * we make the device unusable. A new call to - * lock_swapdevices can unlock the devices. - */ -static void lock_swapdevices(void) -{ int i; + if (!swsusp_resume_device) + return -ENODEV; spin_lock(&swap_lock); - for (i = 0; i< MAX_SWAPFILES; i++) - if (swapfile_used[i] == SWAPFILE_IGNORED) { - swap_info[i].flags ^= SWP_WRITEOK; + for (i = 0; i < MAX_SWAPFILES; i++) { + if (!(swap_info[i].flags & SWP_WRITEOK)) + continue; + if (is_resume_device(swap_info + i)) { + spin_unlock(&swap_lock); + root_swap = i; + return 0; } + } spin_unlock(&swap_lock); + return -ENODEV; } /** @@ -359,72 +184,217 @@ static void lock_swapdevices(void) static int write_page(unsigned long addr, swp_entry_t *loc) { swp_entry_t entry; - int error = 0; + int error = -ENOSPC; - entry = get_swap_page(); - if (swp_offset(entry) && - swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { - error = rw_swap_page_sync(WRITE, entry, - virt_to_page(addr)); - if (error == -EIO) - error = 0; - if (!error) + entry = get_swap_page_of_type(root_swap); + if (swp_offset(entry)) { + error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr)); + if (!error || error == -EIO) *loc = entry; - } else - error = -ENOSPC; + } return error; } /** - * data_free - Free the swap entries used by the saved image. + * Swap map-handling functions + * + * The swap map is a data structure used for keeping track of each page + * written to the swap. It consists of many swap_map_page structures + * that contain each an array of MAP_PAGE_SIZE swap entries. + * These structures are linked together with the help of either the + * .next (in memory) or the .next_swap (in swap) member. * - * Walk the list of used swap entries and free each one. - * This is only used for cleanup when suspend fails. + * The swap map is created during suspend. At that time we need to keep + * it in memory, because we have to free all of the allocated swap + * entries if an error occurs. The memory needed is preallocated + * so that we know in advance if there's enough of it. + * + * The first swap_map_page structure is filled with the swap entries that + * correspond to the first MAP_PAGE_SIZE data pages written to swap and + * so on. 
After the all of the data pages have been written, the order + * of the swap_map_page structures in the map is reversed so that they + * can be read from swap in the original order. This causes the data + * pages to be loaded in exactly the same order in which they have been + * saved. + * + * During resume we only need to use one swap_map_page structure + * at a time, which means that we only need to use two memory pages for + * reading the image - one for reading the swap_map_page structures + * and the second for reading the data pages from swap. */ -static void data_free(void) + +#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ + / sizeof(swp_entry_t)) + +struct swap_map_page { + swp_entry_t entries[MAP_PAGE_SIZE]; + swp_entry_t next_swap; + struct swap_map_page *next; +}; + +static inline void free_swap_map(struct swap_map_page *swap_map) { - swp_entry_t entry; - struct pbe *p; + struct swap_map_page *swp; - for_each_pbe (p, pagedir_nosave) { - entry = p->swap_address; - if (entry.val) - swap_free(entry); - else - break; + while (swap_map) { + swp = swap_map->next; + free_page((unsigned long)swap_map); + swap_map = swp; } } +static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) +{ + struct swap_map_page *swap_map, *swp; + unsigned n = 0; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); + swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + swp = swap_map; + for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { + swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + swp = swp->next; + if (!swp) { + free_swap_map(swap_map); + return NULL; + } + } + return swap_map; +} + /** - * data_write - Write saved image to swap. - * - * Walk the list of pages in the image and sync each one to swap. + * reverse_swap_map - reverse the order of pages in the swap map + * @swap_map */ -static int data_write(void) + +static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map) { - int error = 0, i = 0; - unsigned int mod = nr_copy_pages / 100; - struct pbe *p; - void *tfm; + struct swap_map_page *prev, *next; + + prev = NULL; + while (swap_map) { + next = swap_map->next; + swap_map->next = prev; + prev = swap_map; + swap_map = next; + } + return prev; +} - if ((error = crypto_init(1, &tfm))) - return error; +/** + * free_swap_map_entries - free the swap entries allocated to store + * the swap map @swap_map (this is only called in case of an error) + */ +static inline void free_swap_map_entries(struct swap_map_page *swap_map) +{ + while (swap_map) { + if (swap_map->next_swap.val) + swap_free(swap_map->next_swap); + swap_map = swap_map->next; + } +} - if (!mod) - mod = 1; +/** + * save_swap_map - save the swap map used for tracing the data pages + * stored in the swap + */ - printk( "Writing data to swap (%d pages)... 
", nr_copy_pages ); - for_each_pbe (p, pagedir_nosave) { - if (!(i%mod)) - printk( "\b\b\b\b%3d%%", i / mod ); - if ((error = crypto_write(p, tfm))) { - crypto_exit(tfm); +static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start) +{ + swp_entry_t entry = (swp_entry_t){0}; + int error; + + while (swap_map) { + swap_map->next_swap = entry; + if ((error = write_page((unsigned long)swap_map, &entry))) return error; - } - i++; + swap_map = swap_map->next; } - printk("\b\b\b\bdone\n"); - crypto_exit(tfm); + *start = entry; + return 0; +} + +/** + * free_image_entries - free the swap entries allocated to store + * the image data pages (this is only called in case of an error) + */ + +static inline void free_image_entries(struct swap_map_page *swp) +{ + unsigned k; + + while (swp) { + for (k = 0; k < MAP_PAGE_SIZE; k++) + if (swp->entries[k].val) + swap_free(swp->entries[k]); + swp = swp->next; + } +} + +/** + * The swap_map_handle structure is used for handling the swap map in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + unsigned int k; +}; + +static inline void init_swap_map_handle(struct swap_map_handle *handle, + struct swap_map_page *map) +{ + handle->cur = map; + handle->k = 0; +} + +static inline int swap_map_write_page(struct swap_map_handle *handle, + unsigned long addr) +{ + int error; + + error = write_page(addr, handle->cur->entries + handle->k); + if (error) + return error; + if (++handle->k >= MAP_PAGE_SIZE) { + handle->cur = handle->cur->next; + handle->k = 0; + } + return 0; +} + +/** + * save_image_data - save the data pages pointed to by the PBEs + * from the list @pblist using the swap map handle @handle + * (assume there are @nr_pages data pages to save) + */ + +static int save_image_data(struct pbe *pblist, + struct swap_map_handle *handle, + unsigned int nr_pages) +{ + unsigned int m; + struct pbe *p; + int error = 0; + + printk("Saving image data pages (%u pages) ... ", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + for_each_pbe (p, pblist) { + error = swap_map_write_page(handle, p->address); + if (error) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + if (!error) + printk("\b\b\b\bdone\n"); return error; } @@ -440,70 +410,70 @@ static void dump_info(void) pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); - pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages); + pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages); } -static void init_header(void) +static void init_header(unsigned int nr_pages) { memset(&swsusp_info, 0, sizeof(swsusp_info)); swsusp_info.version_code = LINUX_VERSION_CODE; swsusp_info.num_physpages = num_physpages; memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); - swsusp_info.suspend_pagedir = pagedir_nosave; swsusp_info.cpus = num_online_cpus(); - swsusp_info.image_pages = nr_copy_pages; -} - -static int close_swap(void) -{ - swp_entry_t entry; - int error; - - dump_info(); - error = write_page((unsigned long)&swsusp_info, &entry); - if (!error) { - printk( "S" ); - error = mark_swapfiles(entry); - printk( "|\n" ); - } - return error; + swsusp_info.image_pages = nr_pages; + swsusp_info.pages = nr_pages + + ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; } /** - * free_pagedir_entries - Free pages used by the page directory. 
- * - * This is used during suspend for error recovery. + * pack_orig_addresses - the .orig_address fields of the PBEs from the + * list starting at @pbe are stored in the array @buf[] (1 page) */ -static void free_pagedir_entries(void) +static inline struct pbe *pack_orig_addresses(unsigned long *buf, + struct pbe *pbe) { - int i; + int j; - for (i = 0; i < swsusp_info.pagedir_pages; i++) - swap_free(swsusp_info.pagedir[i]); + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + buf[j] = pbe->orig_address; + pbe = pbe->next; + } + if (!pbe) + for (; j < PAGE_SIZE / sizeof(long); j++) + buf[j] = 0; + return pbe; } - /** - * write_pagedir - Write the array of pages holding the page directory. - * @last: Last swap entry we write (needed for header). + * save_image_metadata - save the .orig_address fields of the PBEs + * from the list @pblist using the swap map handle @handle */ -static int write_pagedir(void) +static int save_image_metadata(struct pbe *pblist, + struct swap_map_handle *handle) { - int error = 0; + unsigned long *buf; unsigned int n = 0; - struct pbe *pbe; + struct pbe *p; + int error = 0; - printk( "Writing pagedir..."); - for_each_pb_page (pbe, pagedir_nosave) { - if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) - return error; + printk("Saving image metadata ... "); + buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); + if (!buf) + return -ENOMEM; + p = pblist; + while (p) { + p = pack_orig_addresses(buf, p); + error = swap_map_write_page(handle, (unsigned long)buf); + if (error) + break; + n++; } - - swsusp_info.pagedir_pages = n; - printk("done (%u pages)\n", n); + free_page((unsigned long)buf); + if (!error) + printk("done (%u pages saved)\n", n); return error; } @@ -511,75 +481,125 @@ static int write_pagedir(void) * enough_swap - Make sure we have enough swap to save the image. * * Returns TRUE or FALSE after checking the total amount of swap - * space avaiable. - * - * FIXME: si_swapinfo(&i) returns all swap devices information. - * We should only consider resume_device. + * space avaiable from the resume partition. */ static int enough_swap(unsigned int nr_pages) { - struct sysinfo i; + unsigned int free_swap = swap_info[root_swap].pages - + swap_info[root_swap].inuse_pages; - si_swapinfo(&i); - pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); - return i.freeswap > (nr_pages + PAGES_FOR_IO + + pr_debug("swsusp: free swap pages: %u\n", free_swap); + return free_swap > (nr_pages + PAGES_FOR_IO + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } /** - * write_suspend_image - Write entire image and metadata. + * swsusp_write - Write entire image and metadata. * + * It is important _NOT_ to umount filesystems at this point. We want + * them synced (in case something goes wrong) but we DO not want to mark + * filesystem clean: it is not. (And it does not matter, if we resume + * correctly, we'll mark system clean, anyway.) 
*/ -static int write_suspend_image(void) + +int swsusp_write(struct pbe *pblist, unsigned int nr_pages) { + struct swap_map_page *swap_map; + struct swap_map_handle handle; + swp_entry_t start; int error; - if (!enough_swap(nr_copy_pages)) { + if ((error = swsusp_swap_check())) { + printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); + return error; + } + if (!enough_swap(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free swap\n"); return -ENOSPC; } - init_header(); - if ((error = data_write())) - goto FreeData; + init_header(nr_pages); + swap_map = alloc_swap_map(swsusp_info.pages); + if (!swap_map) + return -ENOMEM; + init_swap_map_handle(&handle, swap_map); + + error = swap_map_write_page(&handle, (unsigned long)&swsusp_info); + if (!error) + error = save_image_metadata(pblist, &handle); + if (!error) + error = save_image_data(pblist, &handle, nr_pages); + if (error) + goto Free_image_entries; - if ((error = write_pagedir())) - goto FreePagedir; + swap_map = reverse_swap_map(swap_map); + error = save_swap_map(swap_map, &start); + if (error) + goto Free_map_entries; - if ((error = close_swap())) - goto FreePagedir; - Done: - memset(key_iv, 0, MAXKEY+MAXIV); + dump_info(); + printk( "S" ); + error = mark_swapfiles(start); + printk( "|\n" ); + if (error) + goto Free_map_entries; + +Free_swap_map: + free_swap_map(swap_map); return error; - FreePagedir: - free_pagedir_entries(); - FreeData: - data_free(); - goto Done; + +Free_map_entries: + free_swap_map_entries(swap_map); +Free_image_entries: + free_image_entries(swap_map); + goto Free_swap_map; } -/* It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) +/** + * swsusp_shrink_memory - Try to free as much memory as needed + * + * ... but do not OOM-kill anyone + * + * Notice: all userland should be stopped before it is called, or + * livelock is possible. */ -int swsusp_write(void) -{ - int error; - if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); - return error; - } - lock_swapdevices(); - error = write_suspend_image(); - /* This will unlock ignored swap devices since writing is finished */ - lock_swapdevices(); - return error; -} +#define SHRINK_BITE 10000 +int swsusp_shrink_memory(void) +{ + long size, tmp; + struct zone *zone; + unsigned long pages = 0; + unsigned int i = 0; + char *p = "-\\|/"; + + printk("Shrinking memory... 
"); + do { + size = 2 * count_highmem_pages(); + size += size / 50 + count_data_pages(); + size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + + PAGES_FOR_IO; + tmp = size; + for_each_zone (zone) + if (!is_highmem(zone)) + tmp -= zone->free_pages; + if (tmp > 0) { + tmp = shrink_all_memory(SHRINK_BITE); + if (!tmp) + return -ENOMEM; + pages += tmp; + } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) { + tmp = shrink_all_memory(SHRINK_BITE); + pages += tmp; + } + printk("\b%c", p[i++%4]); + } while (tmp > 0); + printk("\bdone (%lu pages freed)\n", pages); + return 0; +} int swsusp_suspend(void) { @@ -677,7 +697,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src) /* We assume both lists contain the same number of elements */ while (src) { dst->orig_address = src->orig_address; - dst->swap_address = src->swap_address; dst = dst->next; src = src->next; } @@ -757,198 +776,224 @@ static int bio_write_page(pgoff_t page_off, void *page) return submit(WRITE, page_off, page); } -/* - * Sanity check if this image makes sense with this kernel/swap context - * I really don't think that it's foolproof but more than nothing.. +/** + * The following functions allow us to read data using a swap map + * in a file-alike way */ -static const char *sanity_check(void) +static inline void release_swap_map_reader(struct swap_map_handle *handle) { - dump_info(); - if (swsusp_info.version_code != LINUX_VERSION_CODE) - return "kernel version"; - if (swsusp_info.num_physpages != num_physpages) - return "memory size"; - if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) - return "system type"; - if (strcmp(swsusp_info.uts.release,system_utsname.release)) - return "kernel release"; - if (strcmp(swsusp_info.uts.version,system_utsname.version)) - return "version"; - if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) - return "machine"; -#if 0 - /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */ - if (swsusp_info.cpus != num_possible_cpus()) - return "number of cpus"; -#endif - return NULL; + if (handle->cur) + free_page((unsigned long)handle->cur); + handle->cur = NULL; } - -static int check_header(void) +static inline int get_swap_map_reader(struct swap_map_handle *handle, + swp_entry_t start) { - const char *reason = NULL; int error; - if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) + if (!swp_offset(start)) + return -EINVAL; + handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + if (!handle->cur) + return -ENOMEM; + error = bio_read_page(swp_offset(start), handle->cur); + if (error) { + release_swap_map_reader(handle); return error; - - /* Is this same machine? */ - if ((reason = sanity_check())) { - printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason); - return -EPERM; } - nr_copy_pages = swsusp_info.image_pages; - return error; + handle->k = 0; + return 0; } -static int check_sig(void) +static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) { + unsigned long offset; int error; - memset(&swsusp_header, 0, sizeof(swsusp_header)); - if ((error = bio_read_page(0, &swsusp_header))) - return error; - if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { - memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); - memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV); - memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV); - - /* - * Reset swap signature now. 
- */ - error = bio_write_page(0, &swsusp_header); - } else { + if (!handle->cur) + return -EINVAL; + offset = swp_offset(handle->cur->entries[handle->k]); + if (!offset) return -EINVAL; + error = bio_read_page(offset, buf); + if (error) + return error; + if (++handle->k >= MAP_PAGE_SIZE) { + handle->k = 0; + offset = swp_offset(handle->cur->next_swap); + if (!offset) + release_swap_map_reader(handle); + else + error = bio_read_page(offset, handle->cur); } - if (!error) - pr_debug("swsusp: Signature found, resuming\n"); return error; } -/** - * data_read - Read image pages from swap. - * - * You do not need to check for overlaps, check_pagedir() - * already did that. - */ - -static int data_read(struct pbe *pblist) +static int check_header(void) { - struct pbe *p; - int error = 0; - int i = 0; - int mod = swsusp_info.image_pages / 100; - void *tfm; - - if ((error = crypto_init(0, &tfm))) - return error; - - if (!mod) - mod = 1; - - printk("swsusp: Reading image data (%lu pages): ", - swsusp_info.image_pages); - - for_each_pbe (p, pblist) { - if (!(i % mod)) - printk("\b\b\b\b%3d%%", i / mod); + char *reason = NULL; - if ((error = crypto_read(p, tfm))) { - crypto_exit(tfm); - return error; - } - - i++; + dump_info(); + if (swsusp_info.version_code != LINUX_VERSION_CODE) + reason = "kernel version"; + if (swsusp_info.num_physpages != num_physpages) + reason = "memory size"; + if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) + reason = "system type"; + if (strcmp(swsusp_info.uts.release,system_utsname.release)) + reason = "kernel release"; + if (strcmp(swsusp_info.uts.version,system_utsname.version)) + reason = "version"; + if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) + reason = "machine"; + if (reason) { + printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); + return -EPERM; } - printk("\b\b\b\bdone\n"); - crypto_exit(tfm); - return error; + return 0; } /** - * read_pagedir - Read page backup list pages from swap + * load_image_data - load the image data using the swap map handle + * @handle and store them using the page backup list @pblist + * (assume there are @nr_pages pages to load) */ -static int read_pagedir(struct pbe *pblist) +static int load_image_data(struct pbe *pblist, + struct swap_map_handle *handle, + unsigned int nr_pages) { - struct pbe *pbpage, *p; - unsigned int i = 0; int error; + unsigned int m; + struct pbe *p; if (!pblist) - return -EFAULT; - - printk("swsusp: Reading pagedir (%lu pages)\n", - swsusp_info.pagedir_pages); - - for_each_pb_page (pbpage, pblist) { - unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); - - error = -EFAULT; - if (offset) { - p = (pbpage + PB_PAGE_SKIP)->next; - error = bio_read_page(offset, (void *)pbpage); - (pbpage + PB_PAGE_SKIP)->next = p; - } + return -EINVAL; + printk("Loading image data pages (%u pages) ... 
", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + p = pblist; + while (p) { + error = swap_map_read_page(handle, (void *)p->address); if (error) break; + p = p->next; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; } - if (!error) - BUG_ON(i != swsusp_info.pagedir_pages); - + printk("\b\b\b\bdone\n"); return error; } +/** + * unpack_orig_addresses - copy the elements of @buf[] (1 page) to + * the PBEs in the list starting at @pbe + */ -static int check_suspend_image(void) +static inline struct pbe *unpack_orig_addresses(unsigned long *buf, + struct pbe *pbe) { - int error = 0; + int j; - if ((error = check_sig())) - return error; - - if ((error = check_header())) - return error; - - return 0; + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + pbe->orig_address = buf[j]; + pbe = pbe->next; + } + return pbe; } -static int read_suspend_image(void) +/** + * load_image_metadata - load the image metadata using the swap map + * handle @handle and put them into the PBEs in the list @pblist + */ + +static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle) { - int error = 0; struct pbe *p; + unsigned long *buf; + unsigned int n = 0; + int error = 0; - if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) + printk("Loading image metadata ... "); + buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); + if (!buf) return -ENOMEM; - - if ((error = read_pagedir(p))) - return error; - create_pbe_list(p, nr_copy_pages); - mark_unsafe_pages(p); - pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); - if (pagedir_nosave) { - create_pbe_list(pagedir_nosave, nr_copy_pages); - copy_page_backup_list(pagedir_nosave, p); + p = pblist; + while (p) { + error = swap_map_read_page(handle, buf); + if (error) + break; + p = unpack_orig_addresses(buf, p); + n++; } - free_pagedir(p); - if (!pagedir_nosave) - return -ENOMEM; + free_page((unsigned long)buf); + if (!error) + printk("done (%u pages loaded)\n", n); + return error; +} - /* Allocate memory for the image and read the data from swap */ +int swsusp_read(struct pbe **pblist_ptr) +{ + int error; + struct pbe *p, *pblist; + struct swap_map_handle handle; + unsigned int nr_pages; - error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return PTR_ERR(resume_bdev); + } + error = get_swap_map_reader(&handle, swsusp_header.image); if (!error) - error = data_read(pagedir_nosave); + error = swap_map_read_page(&handle, &swsusp_info); + if (!error) + error = check_header(); + if (error) + return error; + nr_pages = swsusp_info.image_pages; + p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); + if (!p) + return -ENOMEM; + error = load_image_metadata(p, &handle); + if (!error) { + mark_unsafe_pages(p); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); + if (pblist) + copy_page_backup_list(pblist, p); + free_pagedir(p); + if (!pblist) + error = -ENOMEM; + + /* Allocate memory for the image and read the data from swap */ + if (!error) + error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + if (!error) { + release_eaten_pages(); + error = load_image_data(pblist, &handle, nr_pages); + } + if (!error) + *pblist_ptr = pblist; + } + release_swap_map_reader(&handle); + blkdev_put(resume_bdev); + + if (!error) + pr_debug("swsusp: Reading resume file was successful\n"); + else + pr_debug("swsusp: Error %d resuming\n", error); return error; } /** - * swsusp_check - Check for saved image in swap + * swsusp_check - 
Check for swsusp signature in the resume device */ int swsusp_check(void) @@ -958,40 +1003,27 @@ int swsusp_check(void) resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); - error = check_suspend_image(); + memset(&swsusp_header, 0, sizeof(swsusp_header)); + if ((error = bio_read_page(0, &swsusp_header))) + return error; + if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { + memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + /* Reset swap signature now */ + error = bio_write_page(0, &swsusp_header); + } else { + return -EINVAL; + } if (error) - blkdev_put(resume_bdev); - } else + blkdev_put(resume_bdev); + else + pr_debug("swsusp: Signature found, resuming\n"); + } else { error = PTR_ERR(resume_bdev); - - if (!error) - pr_debug("swsusp: resume file found\n"); - else - pr_debug("swsusp: Error %d check for resume file\n", error); - return error; -} - -/** - * swsusp_read - Read saved image from swap. - */ - -int swsusp_read(void) -{ - int error; - - if (IS_ERR(resume_bdev)) { - pr_debug("swsusp: block device not initialised\n"); - return PTR_ERR(resume_bdev); } - error = read_suspend_image(); - blkdev_put(resume_bdev); - memset(key_iv, 0, MAXKEY+MAXIV); + if (error) + pr_debug("swsusp: Error %d check for resume file\n", error); - if (!error) - pr_debug("swsusp: Reading resume file was successful\n"); - else - pr_debug("swsusp: Error %d resuming\n", error); return error; } diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c4d159a..48d3bce 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -116,6 +116,10 @@ void fastcall call_rcu(struct rcu_head *head, local_irq_restore(flags); } +static atomic_t rcu_barrier_cpu_count; +static struct semaphore rcu_barrier_sema; +static struct completion rcu_barrier_completion; + /** * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. @@ -162,6 +166,42 @@ long rcu_batches_completed(void) return rcu_ctrlblk.completed; } +static void rcu_barrier_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_func(void *notused) +{ + int cpu = smp_processor_id(); + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_head *head; + + head = &rdp->barrier; + atomic_inc(&rcu_barrier_cpu_count); + call_rcu(head, rcu_barrier_callback); +} + +/** + * rcu_barrier - Wait until all the in-flight RCUs are complete. + */ +void rcu_barrier(void) +{ + BUG_ON(in_interrupt()); + /* Take cpucontrol semaphore to protect against CPU hotplug */ + down(&rcu_barrier_sema); + init_completion(&rcu_barrier_completion); + atomic_set(&rcu_barrier_cpu_count, 0); + on_each_cpu(rcu_barrier_func, NULL, 0, 1); + wait_for_completion(&rcu_barrier_completion); + up(&rcu_barrier_sema); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + /* * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. @@ -217,15 +257,23 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, if (rcp->next_pending && rcp->completed == rcp->cur) { - /* Can't change, since spin lock held. */ - cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); - rcp->next_pending = 0; - /* next_pending == 0 must be visible in __rcu_process_callbacks() - * before it can see new value of cur. 
+ /* + * next_pending == 0 must be visible in + * __rcu_process_callbacks() before it can see new value of cur. */ smp_wmb(); rcp->cur++; + + /* + * Accessing nohz_cpu_mask before incrementing rcp->cur needs a + * Barrier Otherwise it can cause tickless idle CPUs to be + * included in rsp->cpumask, which will extend graceperiods + * unnecessarily. + */ + smp_mb(); + cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); + } } @@ -457,6 +505,7 @@ static struct notifier_block __devinitdata rcu_nb = { */ void __init rcu_init(void) { + sema_init(&rcu_barrier_sema, 1); rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 88c28d4..49fbbef 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -409,9 +409,8 @@ rcu_torture_cleanup(void) stats_task = NULL; /* Wait for all RCU callbacks to fire. */ + rcu_barrier(); - for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) - synchronize_rcu(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ printk(KERN_ALERT TORTURE_FLAG "--- End of test: %s\n", diff --git a/kernel/sys.c b/kernel/sys.c index bce933e..eecf845 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -32,6 +32,7 @@ #include <linux/compat.h> #include <linux/syscalls.h> +#include <linux/kprobes.h> #include <asm/uaccess.h> #include <asm/io.h> @@ -168,7 +169,7 @@ EXPORT_SYMBOL(notifier_chain_unregister); * of the last notifier function called. */ -int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) +int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) { int ret=NOTIFY_DONE; struct notifier_block *nb = *n; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9990e10..a85047b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -31,6 +31,7 @@ #include <linux/smp_lock.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/kobject.h> #include <linux/net.h> #include <linux/sysrq.h> #include <linux/highuid.h> @@ -83,9 +84,6 @@ static int ngroups_max = NGROUPS_MAX; #ifdef CONFIG_KMOD extern char modprobe_path[]; #endif -#ifdef CONFIG_HOTPLUG -extern char hotplug_path[]; -#endif #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; #endif @@ -110,7 +108,7 @@ extern int pwrsw_enabled; extern int unaligned_enabled; #endif -#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU extern int sysctl_ieee_emulation_warnings; #endif @@ -397,8 +395,8 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_HOTPLUG, .procname = "hotplug", - .data = &hotplug_path, - .maxlen = HOTPLUG_PATH_LEN, + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, @@ -544,7 +542,7 @@ static ctl_table kern_table[] = { .extra1 = &minolduid, .extra2 = &maxolduid, }, -#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU { .ctl_name = KERN_IEEE_EMULATION_WARNINGS, @@ -646,7 +644,7 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) { .ctl_name = KERN_SPIN_RETRY, .procname = "spin_retry", @@ -2192,29 +2190,32 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, void **context) { - size_t l, len; - if (!table->data || !table->maxlen) return -ENOTDIR; if (oldval && oldlenp) { - if (get_user(len, oldlenp)) + size_t bufsize; + if 
(get_user(bufsize, oldlenp)) return -EFAULT; - if (len) { - l = strlen(table->data); - if (len > l) len = l; - if (len >= table->maxlen) + if (bufsize) { + size_t len = strlen(table->data), copied; + + /* This shouldn't trigger for a well-formed sysctl */ + if (len > table->maxlen) len = table->maxlen; - if(copy_to_user(oldval, table->data, len)) - return -EFAULT; - if(put_user(0, ((char __user *) oldval) + len)) + + /* Copy up to a max of bufsize-1 bytes of the string */ + copied = (len >= bufsize) ? bufsize - 1 : len; + + if (copy_to_user(oldval, table->data, copied) || + put_user(0, (char __user *)(oldval + copied))) return -EFAULT; - if(put_user(len, oldlenp)) + if (put_user(len, oldlenp)) return -EFAULT; } } if (newval && newlen) { - len = newlen; + size_t len = newlen; if (len > table->maxlen) len = table->maxlen; if(copy_from_user(table->data, newval, len)) @@ -2223,7 +2224,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, len--; ((char *) table->data)[len] = 0; } - return 0; + return 1; } /* diff --git a/kernel/time.c b/kernel/time.c index 245d595..b94bfa8 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -561,6 +561,28 @@ void getnstimeofday(struct timespec *tv) EXPORT_SYMBOL_GPL(getnstimeofday); #endif +void getnstimestamp(struct timespec *ts) +{ + unsigned int seq; + struct timespec wall2mono; + + /* synchronize with settimeofday() changes */ + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(ts); + wall2mono = wall_to_monotonic; + } while(unlikely(read_seqretry(&xtime_lock, seq))); + + /* adjust to monotonicaly-increasing values */ + ts->tv_sec += wall2mono.tv_sec; + ts->tv_nsec += wall2mono.tv_nsec; + while (unlikely(ts->tv_nsec >= NSEC_PER_SEC)) { + ts->tv_nsec -= NSEC_PER_SEC; + ts->tv_sec++; + } +} +EXPORT_SYMBOL_GPL(getnstimestamp); + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { |