diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 6 | ||||
-rw-r--r-- | mm/backing-dev.c | 16 | ||||
-rw-r--r-- | mm/compaction.c | 10 | ||||
-rw-r--r-- | mm/failslab.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 15 | ||||
-rw-r--r-- | mm/huge_memory.c | 16 | ||||
-rw-r--r-- | mm/hugetlb.c | 39 | ||||
-rw-r--r-- | mm/memblock.c | 961 | ||||
-rw-r--r-- | mm/memcontrol.c | 103 | ||||
-rw-r--r-- | mm/mempolicy.c | 11 | ||||
-rw-r--r-- | mm/migrate.c | 2 | ||||
-rw-r--r-- | mm/nobootmem.c | 45 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 9 | ||||
-rw-r--r-- | mm/page-writeback.c | 57 | ||||
-rw-r--r-- | mm/page_alloc.c | 520 | ||||
-rw-r--r-- | mm/percpu-vm.c | 17 | ||||
-rw-r--r-- | mm/percpu.c | 68 | ||||
-rw-r--r-- | mm/shmem.c | 17 | ||||
-rw-r--r-- | mm/slab.c | 5 | ||||
-rw-r--r-- | mm/slub.c | 46 | ||||
-rw-r--r-- | mm/swap_state.c | 1 | ||||
-rw-r--r-- | mm/vmalloc.c | 60 | ||||
-rw-r--r-- | mm/vmscan.c | 40 |
24 files changed, 1005 insertions, 1063 deletions
@@ -131,6 +131,12 @@ config SPARSEMEM_VMEMMAP config HAVE_MEMBLOCK boolean +config HAVE_MEMBLOCK_NODE_MAP + boolean + +config ARCH_DISCARD_MEMBLOCK + boolean + config NO_BOOTMEM boolean diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a086064..7ba8fea 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -600,14 +600,10 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) /* * Finally, kill the kernel thread. We don't need to be RCU - * safe anymore, since the bdi is gone from visibility. Force - * unfreeze of the thread before calling kthread_stop(), otherwise - * it would never exet if it is currently stuck in the refrigerator. + * safe anymore, since the bdi is gone from visibility. */ - if (bdi->wb.task) { - thaw_process(bdi->wb.task); + if (bdi->wb.task) kthread_stop(bdi->wb.task); - } } /* @@ -724,6 +720,14 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); + /* + * If bdi_unregister() had already been called earlier, the + * wakeup_timer could still be armed because bdi_prune_sb() + * can race with the bdi_wakeup_thread_delayed() calls from + * __mark_inode_dirty(). + */ + del_timer_sync(&bdi->wb.wakeup_timer); + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); diff --git a/mm/compaction.c b/mm/compaction.c index 899d956..1253d7a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -721,23 +721,23 @@ int sysctl_extfrag_handler(struct ctl_table *table, int write, } #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) -ssize_t sysfs_compact_node(struct sys_device *dev, - struct sysdev_attribute *attr, +ssize_t sysfs_compact_node(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { compact_node(dev->id); return count; } -static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); +static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); int compaction_register_node(struct node *node) { - return sysdev_create_file(&node->sysdev, &attr_compact); + return device_create_file(&node->dev, &dev_attr_compact); } void compaction_unregister_node(struct node *node) { - return sysdev_remove_file(&node->sysdev, &attr_compact); + return device_remove_file(&node->dev, &dev_attr_compact); } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ diff --git a/mm/failslab.c b/mm/failslab.c index 0dd7b8f..fefaaba 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -35,7 +35,7 @@ __setup("failslab=", setup_failslab); static int __init failslab_debugfs_init(void) { struct dentry *dir; - mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr); if (IS_ERR(dir)) diff --git a/mm/filemap.c b/mm/filemap.c index c0018f2..a0701e6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1828,7 +1828,7 @@ repeat: page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); - err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + err = add_to_page_cache_lru(page, mapping, index, gfp); if (unlikely(err)) { page_cache_release(page); if (err == -EEXIST) @@ -1925,10 +1925,7 @@ static struct page *wait_on_page_read(struct page *page) * @gfp: the page allocator flags to use if allocating * * This is the same as "read_mapping_page(mapping, index, NULL)", but with - * any new page allocations done using the specified allocation flags. Note - * that the Radix tree operations will still use GFP_KERNEL, so you can't - * expect to do this atomically or anything like that - but you can pass in - * other page requirements. + * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. */ @@ -1971,7 +1968,7 @@ EXPORT_SYMBOL(read_cache_page); */ int should_remove_suid(struct dentry *dentry) { - mode_t mode = dentry->d_inode->i_mode; + umode_t mode = dentry->d_inode->i_mode; int kill = 0; /* suid always must be killed */ @@ -2407,7 +2404,6 @@ static ssize_t generic_perform_write(struct file *file, iov_iter_count(i)); again: - /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the @@ -2463,7 +2459,10 @@ again: written += copied; balance_dirty_pages_ratelimited(mapping); - + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } } while (iov_iter_count(i)); return written ? written : status; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4298aba..36b3d98 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2259,12 +2259,8 @@ static void khugepaged_do_scan(struct page **hpage) static void khugepaged_alloc_sleep(void) { - DEFINE_WAIT(wait); - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); } #ifndef CONFIG_NUMA @@ -2313,14 +2309,10 @@ static void khugepaged_loop(void) if (unlikely(kthread_should_stop())) break; if (khugepaged_has_work()) { - DEFINE_WAIT(wait); if (!khugepaged_scan_sleep_millisecs) continue; - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_scan_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); } else if (khugepaged_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dae27ba..7acd125 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -576,6 +576,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + set_page_count(p, 0); p->first_page = page; } } @@ -900,7 +901,6 @@ retry: h->resv_huge_pages += delta; ret = 0; - spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) @@ -914,6 +914,7 @@ retry: VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } + spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ free: @@ -1591,9 +1592,9 @@ static void __init hugetlb_sysfs_init(void) /* * node_hstate/s - associate per node hstate attributes, via their kobjects, - * with node sysdevs in node_devices[] using a parallel array. The array - * index of a node sysdev or _hstate == node id. - * This is here to avoid any static dependency of the node sysdev driver, in + * with node devices in node_devices[] using a parallel array. The array + * index of a node device or _hstate == node id. + * This is here to avoid any static dependency of the node device driver, in * the base kernel, on the hugetlb module. */ struct node_hstate { @@ -1603,7 +1604,7 @@ struct node_hstate { struct node_hstate node_hstates[MAX_NUMNODES]; /* - * A subset of global hstate attributes for node sysdevs + * A subset of global hstate attributes for node devices */ static struct attribute *per_node_hstate_attrs[] = { &nr_hugepages_attr.attr, @@ -1617,7 +1618,7 @@ static struct attribute_group per_node_hstate_attr_group = { }; /* - * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. + * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. * Returns node id via non-NULL nidp. */ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) @@ -1640,13 +1641,13 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) } /* - * Unregister hstate attributes from a single node sysdev. + * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ void hugetlb_unregister_node(struct node *node) { struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + struct node_hstate *nhs = &node_hstates[node->dev.id]; if (!nhs->hugepages_kobj) return; /* no hstate attributes */ @@ -1662,7 +1663,7 @@ void hugetlb_unregister_node(struct node *node) } /* - * hugetlb module exit: unregister hstate attributes from node sysdevs + * hugetlb module exit: unregister hstate attributes from node devices * that have them. */ static void hugetlb_unregister_all_nodes(void) @@ -1670,7 +1671,7 @@ static void hugetlb_unregister_all_nodes(void) int nid; /* - * disable node sysdev registrations. + * disable node device registrations. */ register_hugetlbfs_with_node(NULL, NULL); @@ -1682,20 +1683,20 @@ static void hugetlb_unregister_all_nodes(void) } /* - * Register hstate attributes for a single node sysdev. + * Register hstate attributes for a single node device. * No-op if attributes already registered. */ void hugetlb_register_node(struct node *node) { struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; if (nhs->hugepages_kobj) return; /* already allocated */ nhs->hugepages_kobj = kobject_create_and_add("hugepages", - &node->sysdev.kobj); + &node->dev.kobj); if (!nhs->hugepages_kobj) return; @@ -1706,7 +1707,7 @@ void hugetlb_register_node(struct node *node) if (err) { printk(KERN_ERR "Hugetlb: Unable to add hstate %s" " for node %d\n", - h->name, node->sysdev.id); + h->name, node->dev.id); hugetlb_unregister_node(node); break; } @@ -1715,8 +1716,8 @@ void hugetlb_register_node(struct node *node) /* * hugetlb init time: register hstate attributes for all registered node - * sysdevs of nodes that have memory. All on-line nodes should have - * registered their associated sysdev by this time. + * devices of nodes that have memory. All on-line nodes should have + * registered their associated device by this time. */ static void hugetlb_register_all_nodes(void) { @@ -1724,12 +1725,12 @@ static void hugetlb_register_all_nodes(void) for_each_node_state(nid, N_HIGH_MEMORY) { struct node *node = &node_devices[nid]; - if (node->sysdev.id == nid) + if (node->dev.id == nid) hugetlb_register_node(node); } /* - * Let the node sysdev driver know we're here so it can + * Let the node device driver know we're here so it can * [un]register hstate attributes on node hotplug. */ register_hugetlbfs_with_node(hugetlb_register_node, @@ -2422,6 +2423,8 @@ retry_avoidcopy: * anon_vma prepared. */ if (unlikely(anon_vma_prepare(vma))) { + page_cache_release(new_page); + page_cache_release(old_page); /* Caller expects lock to be held */ spin_lock(&mm->page_table_lock); return VM_FAULT_OOM; diff --git a/mm/memblock.c b/mm/memblock.c index 84bec49..2f55f19 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,12 +20,23 @@ #include <linux/seq_file.h> #include <linux/memblock.h> -struct memblock memblock __initdata_memblock; +static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; +static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; + +struct memblock memblock __initdata_memblock = { + .memory.regions = memblock_memory_init_regions, + .memory.cnt = 1, /* empty dummy entry */ + .memory.max = INIT_MEMBLOCK_REGIONS, + + .reserved.regions = memblock_reserved_init_regions, + .reserved.cnt = 1, /* empty dummy entry */ + .reserved.max = INIT_MEMBLOCK_REGIONS, + + .current_limit = MEMBLOCK_ALLOC_ANYWHERE, +}; int memblock_debug __initdata_memblock; -int memblock_can_resize __initdata_memblock; -static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; -static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; +static int memblock_can_resize __initdata_memblock; /* inline so we don't get a warning when pr_debug is compiled out */ static inline const char *memblock_type_name(struct memblock_type *type) @@ -38,20 +49,15 @@ static inline const char *memblock_type_name(struct memblock_type *type) return "unknown"; } -/* - * Address comparison utilities - */ - -static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size) -{ - return addr & ~(size - 1); -} - -static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size) +/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ +static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { - return (addr + (size - 1)) & ~(size - 1); + return *size = min(*size, (phys_addr_t)ULLONG_MAX - base); } +/* + * Address comparison utilities + */ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2, phys_addr_t size2) { @@ -73,83 +79,66 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, return (i < type->cnt) ? i : -1; } -/* - * Find, allocate, deallocate or reserve unreserved regions. All allocations - * are top-down. +/** + * memblock_find_in_range_node - find free area in given range and node + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * + * Find @size free area aligned to @align in the specified range and node. + * + * RETURNS: + * Found address on success, %0 on failure. */ - -static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align) +phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, + phys_addr_t end, phys_addr_t size, + phys_addr_t align, int nid) { - phys_addr_t base, res_base; - long j; - - /* In case, huge size is requested */ - if (end < size) - return MEMBLOCK_ERROR; - - base = memblock_align_down((end - size), align); + phys_addr_t this_start, this_end, cand; + u64 i; - /* Prevent allocations returning 0 as it's also used to - * indicate an allocation failure - */ - if (start == 0) - start = PAGE_SIZE; - - while (start <= base) { - j = memblock_overlaps_region(&memblock.reserved, base, size); - if (j < 0) - return base; - res_base = memblock.reserved.regions[j].base; - if (res_base < size) - break; - base = memblock_align_down(res_base - size, align); - } + /* align @size to avoid excessive fragmentation on reserved array */ + size = round_up(size, align); - return MEMBLOCK_ERROR; -} - -static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, - phys_addr_t align, phys_addr_t start, phys_addr_t end) -{ - long i; - - BUG_ON(0 == size); - - /* Pump up max_addr */ + /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; - /* We do a top-down search, this tends to limit memory - * fragmentation by keeping early boot allocs near the - * top of memory - */ - for (i = memblock.memory.cnt - 1; i >= 0; i--) { - phys_addr_t memblockbase = memblock.memory.regions[i].base; - phys_addr_t memblocksize = memblock.memory.regions[i].size; - phys_addr_t bottom, top, found; + /* adjust @start to avoid underflow and allocating the first page */ + start = max3(start, size, (phys_addr_t)PAGE_SIZE); + end = max(start, end); - if (memblocksize < size) - continue; - if ((memblockbase + memblocksize) <= start) - break; - bottom = max(memblockbase, start); - top = min(memblockbase + memblocksize, end); - if (bottom >= top) - continue; - found = memblock_find_region(bottom, top, size, align); - if (found != MEMBLOCK_ERROR) - return found; + for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); + + cand = round_down(this_end - size, align); + if (cand >= this_start) + return cand; } - return MEMBLOCK_ERROR; + return 0; } -/* - * Find a free area with specified alignment in a specific range. +/** + * memblock_find_in_range - find free area in given range + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * + * Find @size free area aligned to @align in the specified range. + * + * RETURNS: + * Found address on success, %0 on failure. */ -u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align) +phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, + phys_addr_t end, phys_addr_t size, + phys_addr_t align) { - return memblock_find_base(size, align, start, end); + return memblock_find_in_range_node(start, end, size, align, + MAX_NUMNODES); } /* @@ -178,25 +167,21 @@ int __init_memblock memblock_reserve_reserved_regions(void) static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { - unsigned long i; - - for (i = r; i < type->cnt - 1; i++) { - type->regions[i].base = type->regions[i + 1].base; - type->regions[i].size = type->regions[i + 1].size; - } + type->total_size -= type->regions[r].size; + memmove(&type->regions[r], &type->regions[r + 1], + (type->cnt - (r + 1)) * sizeof(type->regions[r])); type->cnt--; /* Special case for empty arrays */ if (type->cnt == 0) { + WARN_ON(type->total_size != 0); type->cnt = 1; type->regions[0].base = 0; type->regions[0].size = 0; + memblock_set_region_node(&type->regions[0], MAX_NUMNODES); } } -/* Defined below but needed now */ -static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); - static int __init_memblock memblock_double_array(struct memblock_type *type) { struct memblock_region *new_array, *old_array; @@ -226,10 +211,10 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) */ if (use_slab) { new_array = kmalloc(new_size, GFP_KERNEL); - addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array); + addr = new_array ? __pa(new_array) : 0; } else - addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE); - if (addr == MEMBLOCK_ERROR) { + addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); + if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", memblock_type_name(type), type->max, type->max * 2); return -1; @@ -254,7 +239,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) return 0; /* Add the new reserved region now. Should not fail ! */ - BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size)); + BUG_ON(memblock_reserve(addr, new_size)); /* If the array wasn't our static init one, then free it. We only do * that before SLAB is available as later on, we don't know whether @@ -268,343 +253,514 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) return 0; } -int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, - phys_addr_t addr2, phys_addr_t size2) -{ - return 1; -} - -static long __init_memblock memblock_add_region(struct memblock_type *type, - phys_addr_t base, phys_addr_t size) +/** + * memblock_merge_regions - merge neighboring compatible regions + * @type: memblock type to scan + * + * Scan @type and merge neighboring compatible regions. + */ +static void __init_memblock memblock_merge_regions(struct memblock_type *type) { - phys_addr_t end = base + size; - int i, slot = -1; - - /* First try and coalesce this MEMBLOCK with others */ - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; - phys_addr_t rend = rgn->base + rgn->size; + int i = 0; - /* Exit if there's no possible hits */ - if (rgn->base > end || rgn->size == 0) - break; + /* cnt never goes below 1 */ + while (i < type->cnt - 1) { + struct memblock_region *this = &type->regions[i]; + struct memblock_region *next = &type->regions[i + 1]; - /* Check if we are fully enclosed within an existing - * block - */ - if (rgn->base <= base && rend >= end) - return 0; + if (this->base + this->size != next->base || + memblock_get_region_node(this) != + memblock_get_region_node(next)) { + BUG_ON(this->base + this->size > next->base); + i++; + continue; + } - /* Check if we overlap or are adjacent with the bottom - * of a block. - */ - if (base < rgn->base && end >= rgn->base) { - /* If we can't coalesce, create a new block */ - if (!memblock_memory_can_coalesce(base, size, - rgn->base, - rgn->size)) { - /* Overlap & can't coalesce are mutually - * exclusive, if you do that, be prepared - * for trouble - */ - WARN_ON(end != rgn->base); - goto new_block; - } - /* We extend the bottom of the block down to our - * base - */ - rgn->base = base; - rgn->size = rend - base; + this->size += next->size; + memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); + type->cnt--; + } +} - /* Return if we have nothing else to allocate - * (fully coalesced) - */ - if (rend >= end) - return 0; +/** + * memblock_insert_region - insert new memblock region + * @type: memblock type to insert into + * @idx: index for the insertion point + * @base: base address of the new region + * @size: size of the new region + * + * Insert new memblock region [@base,@base+@size) into @type at @idx. + * @type must already have extra room to accomodate the new region. + */ +static void __init_memblock memblock_insert_region(struct memblock_type *type, + int idx, phys_addr_t base, + phys_addr_t size, int nid) +{ + struct memblock_region *rgn = &type->regions[idx]; - /* We continue processing from the end of the - * coalesced block. - */ - base = rend; - size = end - base; - } + BUG_ON(type->cnt >= type->max); + memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); + rgn->base = base; + rgn->size = size; + memblock_set_region_node(rgn, nid); + type->cnt++; + type->total_size += size; +} - /* Now check if we overlap or are adjacent with the - * top of a block - */ - if (base <= rend && end >= rend) { - /* If we can't coalesce, create a new block */ - if (!memblock_memory_can_coalesce(rgn->base, - rgn->size, - base, size)) { - /* Overlap & can't coalesce are mutually - * exclusive, if you do that, be prepared - * for trouble - */ - WARN_ON(rend != base); - goto new_block; - } - /* We adjust our base down to enclose the - * original block and destroy it. It will be - * part of our new allocation. Since we've - * freed an entry, we know we won't fail - * to allocate one later, so we won't risk - * losing the original block allocation. - */ - size += (base - rgn->base); - base = rgn->base; - memblock_remove_region(type, i--); - } - } +/** + * memblock_add_region - add new memblock region + * @type: memblock type to add new region into + * @base: base address of the new region + * @size: size of the new region + * @nid: nid of the new region + * + * Add new memblock region [@base,@base+@size) into @type. The new region + * is allowed to overlap with existing ones - overlaps don't affect already + * existing regions. @type is guaranteed to be minimal (all neighbouring + * compatible regions are merged) after the addition. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int __init_memblock memblock_add_region(struct memblock_type *type, + phys_addr_t base, phys_addr_t size, int nid) +{ + bool insert = false; + phys_addr_t obase = base; + phys_addr_t end = base + memblock_cap_size(base, &size); + int i, nr_new; - /* If the array is empty, special case, replace the fake - * filler region and return - */ - if ((type->cnt == 1) && (type->regions[0].size == 0)) { + /* special case for empty array */ + if (type->regions[0].size == 0) { + WARN_ON(type->cnt != 1 || type->total_size); type->regions[0].base = base; type->regions[0].size = size; + memblock_set_region_node(&type->regions[0], nid); + type->total_size = size; return 0; } - - new_block: - /* If we are out of space, we fail. It's too late to resize the array - * but then this shouldn't have happened in the first place. +repeat: + /* + * The following is executed twice. Once with %false @insert and + * then with %true. The first counts the number of regions needed + * to accomodate the new area. The second actually inserts them. */ - if (WARN_ON(type->cnt >= type->max)) - return -1; + base = obase; + nr_new = 0; - /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ - for (i = type->cnt - 1; i >= 0; i--) { - if (base < type->regions[i].base) { - type->regions[i+1].base = type->regions[i].base; - type->regions[i+1].size = type->regions[i].size; - } else { - type->regions[i+1].base = base; - type->regions[i+1].size = size; - slot = i + 1; + for (i = 0; i < type->cnt; i++) { + struct memblock_region *rgn = &type->regions[i]; + phys_addr_t rbase = rgn->base; + phys_addr_t rend = rbase + rgn->size; + + if (rbase >= end) break; + if (rend <= base) + continue; + /* + * @rgn overlaps. If it separates the lower part of new + * area, insert that portion. + */ + if (rbase > base) { + nr_new++; + if (insert) + memblock_insert_region(type, i++, base, + rbase - base, nid); } + /* area below @rend is dealt with, forget about it */ + base = min(rend, end); } - if (base < type->regions[0].base) { - type->regions[0].base = base; - type->regions[0].size = size; - slot = 0; + + /* insert the remaining portion */ + if (base < end) { + nr_new++; + if (insert) + memblock_insert_region(type, i, base, end - base, nid); } - type->cnt++; - /* The array is full ? Try to resize it. If that fails, we undo - * our allocation and return an error + /* + * If this was the first round, resize array and repeat for actual + * insertions; otherwise, merge and return. */ - if (type->cnt == type->max && memblock_double_array(type)) { - BUG_ON(slot < 0); - memblock_remove_region(type, slot); - return -1; + if (!insert) { + while (type->cnt + nr_new > type->max) + if (memblock_double_array(type) < 0) + return -ENOMEM; + insert = true; + goto repeat; + } else { + memblock_merge_regions(type); + return 0; } - - return 0; } -long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, + int nid) { - return memblock_add_region(&memblock.memory, base, size); + return memblock_add_region(&memblock.memory, base, size, nid); +} +int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) +{ + return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); } -static long __init_memblock __memblock_remove(struct memblock_type *type, - phys_addr_t base, phys_addr_t size) +/** + * memblock_isolate_range - isolate given range into disjoint memblocks + * @type: memblock type to isolate range for + * @base: base of range to isolate + * @size: size of range to isolate + * @start_rgn: out parameter for the start of isolated region + * @end_rgn: out parameter for the end of isolated region + * + * Walk @type and ensure that regions don't cross the boundaries defined by + * [@base,@base+@size). Crossing regions are split at the boundaries, + * which may create at most two more regions. The index of the first + * region inside the range is returned in *@start_rgn and end in *@end_rgn. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int __init_memblock memblock_isolate_range(struct memblock_type *type, + phys_addr_t base, phys_addr_t size, + int *start_rgn, int *end_rgn) { - phys_addr_t end = base + size; + phys_addr_t end = base + memblock_cap_size(base, &size); int i; - /* Walk through the array for collisions */ + *start_rgn = *end_rgn = 0; + + /* we'll create at most two more regions */ + while (type->cnt + 2 > type->max) + if (memblock_double_array(type) < 0) + return -ENOMEM; + for (i = 0; i < type->cnt; i++) { struct memblock_region *rgn = &type->regions[i]; - phys_addr_t rend = rgn->base + rgn->size; + phys_addr_t rbase = rgn->base; + phys_addr_t rend = rbase + rgn->size; - /* Nothing more to do, exit */ - if (rgn->base > end || rgn->size == 0) + if (rbase >= end) break; - - /* If we fully enclose the block, drop it */ - if (base <= rgn->base && end >= rend) { - memblock_remove_region(type, i--); + if (rend <= base) continue; - } - /* If we are fully enclosed within a block - * then we need to split it and we are done - */ - if (base > rgn->base && end < rend) { - rgn->size = base - rgn->base; - if (!memblock_add_region(type, end, rend - end)) - return 0; - /* Failure to split is bad, we at least - * restore the block before erroring + if (rbase < base) { + /* + * @rgn intersects from below. Split and continue + * to process the next region - the new top half. + */ + rgn->base = base; + rgn->size -= base - rbase; + type->total_size -= base - rbase; + memblock_insert_region(type, i, rbase, base - rbase, + memblock_get_region_node(rgn)); + } else if (rend > end) { + /* + * @rgn intersects from above. Split and redo the + * current region - the new bottom half. */ - rgn->size = rend - rgn->base; - WARN_ON(1); - return -1; - } - - /* Check if we need to trim the bottom of a block */ - if (rgn->base < end && rend > end) { - rgn->size -= end - rgn->base; rgn->base = end; - break; + rgn->size -= end - rbase; + type->total_size -= end - rbase; + memblock_insert_region(type, i--, rbase, end - rbase, + memblock_get_region_node(rgn)); + } else { + /* @rgn is fully contained, record it */ + if (!*end_rgn) + *start_rgn = i; + *end_rgn = i + 1; } + } - /* And check if we need to trim the top of a block */ - if (base < rend) - rgn->size -= rend - base; + return 0; +} - } +static int __init_memblock __memblock_remove(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) +{ + int start_rgn, end_rgn; + int i, ret; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = end_rgn - 1; i >= start_rgn; i--) + memblock_remove_region(type, i); return 0; } -long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { return __memblock_remove(&memblock.memory, base, size); } -long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { + memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", + (unsigned long long)base, + (unsigned long long)base + size, + (void *)_RET_IP_); + return __memblock_remove(&memblock.reserved, base, size); } -long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { struct memblock_type *_rgn = &memblock.reserved; + memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", + (unsigned long long)base, + (unsigned long long)base + size, + (void *)_RET_IP_); BUG_ON(0 == size); - return memblock_add_region(_rgn, base, size); + return memblock_add_region(_rgn, base, size, MAX_NUMNODES); } -phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +/** + * __next_free_mem_range - next function for for_each_free_mem_range() + * @idx: pointer to u64 loop variable + * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @p_nid: ptr to int for nid of the range, can be %NULL + * + * Find the first free area from *@idx which matches @nid, fill the out + * parameters, and update *@idx for the next iteration. The lower 32bit of + * *@idx contains index into memory region and the upper 32bit indexes the + * areas before each reserved region. For example, if reserved regions + * look like the following, + * + * 0:[0-16), 1:[32-48), 2:[128-130) + * + * The upper 32bit indexes the following regions. + * + * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX) + * + * As both region arrays are sorted, the function advances the two indices + * in lockstep and returns each intersection. + */ +void __init_memblock __next_free_mem_range(u64 *idx, int nid, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { - phys_addr_t found; + struct memblock_type *mem = &memblock.memory; + struct memblock_type *rsv = &memblock.reserved; + int mi = *idx & 0xffffffff; + int ri = *idx >> 32; - /* We align the size to limit fragmentation. Without this, a lot of - * small allocs quickly eat up the whole reserve array on sparc - */ - size = memblock_align_up(size, align); + for ( ; mi < mem->cnt; mi++) { + struct memblock_region *m = &mem->regions[mi]; + phys_addr_t m_start = m->base; + phys_addr_t m_end = m->base + m->size; - found = memblock_find_base(size, align, 0, max_addr); - if (found != MEMBLOCK_ERROR && - !memblock_add_region(&memblock.reserved, found, size)) - return found; + /* only memory regions are associated with nodes, check it */ + if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + continue; - return 0; + /* scan areas before each reservation for intersection */ + for ( ; ri < rsv->cnt + 1; ri++) { + struct memblock_region *r = &rsv->regions[ri]; + phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; + phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; + + /* if ri advanced past mi, break out to advance mi */ + if (r_start >= m_end) + break; + /* if the two regions intersect, we're done */ + if (m_start < r_end) { + if (out_start) + *out_start = max(m_start, r_start); + if (out_end) + *out_end = min(m_end, r_end); + if (out_nid) + *out_nid = memblock_get_region_node(m); + /* + * The region which ends first is advanced + * for the next iteration. + */ + if (m_end <= r_end) + mi++; + else + ri++; + *idx = (u32)mi | (u64)ri << 32; + return; + } + } + } + + /* signal end of iteration */ + *idx = ULLONG_MAX; } -phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +/** + * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() + * @idx: pointer to u64 loop variable + * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @p_nid: ptr to int for nid of the range, can be %NULL + * + * Reverse of __next_free_mem_range(). + */ +void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { - phys_addr_t alloc; + struct memblock_type *mem = &memblock.memory; + struct memblock_type *rsv = &memblock.reserved; + int mi = *idx & 0xffffffff; + int ri = *idx >> 32; - alloc = __memblock_alloc_base(size, align, max_addr); + if (*idx == (u64)ULLONG_MAX) { + mi = mem->cnt - 1; + ri = rsv->cnt; + } - if (alloc == 0) - panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", - (unsigned long long) size, (unsigned long long) max_addr); + for ( ; mi >= 0; mi--) { + struct memblock_region *m = &mem->regions[mi]; + phys_addr_t m_start = m->base; + phys_addr_t m_end = m->base + m->size; - return alloc; -} + /* only memory regions are associated with nodes, check it */ + if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + continue; -phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) -{ - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); -} + /* scan areas before each reservation for intersection */ + for ( ; ri >= 0; ri--) { + struct memblock_region *r = &rsv->regions[ri]; + phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; + phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; + + /* if ri advanced past mi, break out to advance mi */ + if (r_end <= m_start) + break; + /* if the two regions intersect, we're done */ + if (m_end > r_start) { + if (out_start) + *out_start = max(m_start, r_start); + if (out_end) + *out_end = min(m_end, r_end); + if (out_nid) + *out_nid = memblock_get_region_node(m); + + if (m_start >= r_start) + mi--; + else + ri--; + *idx = (u32)mi | (u64)ri << 32; + return; + } + } + } + *idx = ULLONG_MAX; +} +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * Additional node-local allocators. Search for node memory is bottom up - * and walks memblock regions within that node bottom-up as well, but allocation - * within an memblock region is top-down. XXX I plan to fix that at some stage - * - * WARNING: Only available after early_node_map[] has been populated, - * on some architectures, that is after all the calls to add_active_range() - * have been done to populate it. + * Common iterator interface used to define for_each_mem_range(). */ - -phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid) +void __init_memblock __next_mem_pfn_range(int *idx, int nid, + unsigned long *out_start_pfn, + unsigned long *out_end_pfn, int *out_nid) { -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP - /* - * This code originates from sparc which really wants use to walk by addresses - * and returns the nid. This is not very convenient for early_pfn_map[] users - * as the map isn't sorted yet, and it really wants to be walked by nid. - * - * For now, I implement the inefficient method below which walks the early - * map multiple times. Eventually we may want to use an ARCH config option - * to implement a completely different method for both case. - */ - unsigned long start_pfn, end_pfn; - int i; + struct memblock_type *type = &memblock.memory; + struct memblock_region *r; - for (i = 0; i < MAX_NUMNODES; i++) { - get_pfn_range_for_nid(i, &start_pfn, &end_pfn); - if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn)) + while (++*idx < type->cnt) { + r = &type->regions[*idx]; + + if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - *nid = i; - return min(end, PFN_PHYS(end_pfn)); + if (nid == MAX_NUMNODES || nid == r->nid) + break; + } + if (*idx >= type->cnt) { + *idx = -1; + return; } -#endif - *nid = 0; - return end; + if (out_start_pfn) + *out_start_pfn = PFN_UP(r->base); + if (out_end_pfn) + *out_end_pfn = PFN_DOWN(r->base + r->size); + if (out_nid) + *out_nid = r->nid; } -static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, - phys_addr_t size, - phys_addr_t align, int nid) +/** + * memblock_set_node - set node ID on memblock regions + * @base: base of area to set node ID for + * @size: size of area to set node ID for + * @nid: node ID to set + * + * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. + * Regions which cross the area boundaries are split as necessary. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, + int nid) { - phys_addr_t start, end; + struct memblock_type *type = &memblock.memory; + int start_rgn, end_rgn; + int i, ret; - start = mp->base; - end = start + mp->size; + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; - start = memblock_align_up(start, align); - while (start < end) { - phys_addr_t this_end; - int this_nid; + for (i = start_rgn; i < end_rgn; i++) + type->regions[i].nid = nid; - this_end = memblock_nid_range(start, end, &this_nid); - if (this_nid == nid) { - phys_addr_t ret = memblock_find_region(start, this_end, size, align); - if (ret != MEMBLOCK_ERROR && - !memblock_add_region(&memblock.reserved, ret, size)) - return ret; - } - start = this_end; - } + memblock_merge_regions(type); + return 0; +} +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + +static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t max_addr, + int nid) +{ + phys_addr_t found; - return MEMBLOCK_ERROR; + found = memblock_find_in_range_node(0, max_addr, size, align, nid); + if (found && !memblock_reserve(found, size)) + return found; + + return 0; } phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - struct memblock_type *mem = &memblock.memory; - int i; + return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); +} - BUG_ON(0 == size); +phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +{ + return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); +} - /* We align the size to limit fragmentation. Without this, a lot of - * small allocs quickly eat up the whole reserve array on sparc - */ - size = memblock_align_up(size, align); +phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +{ + phys_addr_t alloc; - /* We do a bottom-up search for a region with the right - * nid since that's easier considering how memblock_nid_range() - * works - */ - for (i = 0; i < mem->cnt; i++) { - phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i], - size, align, nid); - if (ret != MEMBLOCK_ERROR) - return ret; - } + alloc = __memblock_alloc_base(size, align, max_addr); - return 0; + if (alloc == 0) + panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", + (unsigned long long) size, (unsigned long long) max_addr); + + return alloc; +} + +phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) +{ + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) @@ -613,7 +769,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i if (res) return res; - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } @@ -621,10 +777,9 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * Remaining API functions */ -/* You must call memblock_analyze() before this. */ phys_addr_t __init memblock_phys_mem_size(void) { - return memblock.memory_size; + return memblock.memory.total_size; } /* lowest address */ @@ -640,45 +795,28 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); } -/* You must call memblock_analyze() after this. */ -void __init memblock_enforce_memory_limit(phys_addr_t memory_limit) +void __init memblock_enforce_memory_limit(phys_addr_t limit) { unsigned long i; - phys_addr_t limit; - struct memblock_region *p; + phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; - if (!memory_limit) + if (!limit) return; - /* Truncate the memblock regions to satisfy the memory limit. */ - limit = memory_limit; + /* find out max address */ for (i = 0; i < memblock.memory.cnt; i++) { - if (limit > memblock.memory.regions[i].size) { - limit -= memblock.memory.regions[i].size; - continue; - } - - memblock.memory.regions[i].size = limit; - memblock.memory.cnt = i + 1; - break; - } - - memory_limit = memblock_end_of_DRAM(); + struct memblock_region *r = &memblock.memory.regions[i]; - /* And truncate any reserves above the limit also. */ - for (i = 0; i < memblock.reserved.cnt; i++) { - p = &memblock.reserved.regions[i]; - - if (p->base > memory_limit) - p->size = 0; - else if ((p->base + p->size) > memory_limit) - p->size = memory_limit - p->base; - - if (p->size == 0) { - memblock_remove_region(&memblock.reserved, i); - i--; + if (limit <= r->size) { + max_addr = r->base + limit; + break; } + limit -= r->size; } + + /* truncate both memory and reserved regions */ + __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); + __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX); } static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) @@ -712,16 +850,18 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) { int idx = memblock_search(&memblock.memory, base); + phys_addr_t end = base + memblock_cap_size(base, &size); if (idx == -1) return 0; return memblock.memory.regions[idx].base <= base && (memblock.memory.regions[idx].base + - memblock.memory.regions[idx].size) >= (base + size); + memblock.memory.regions[idx].size) >= end; } int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { + memblock_cap_size(base, &size); return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; } @@ -731,86 +871,45 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) memblock.current_limit = limit; } -static void __init_memblock memblock_dump(struct memblock_type *region, char *name) +static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { unsigned long long base, size; int i; - pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); + pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); - for (i = 0; i < region->cnt; i++) { - base = region->regions[i].base; - size = region->regions[i].size; - - pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n", - name, i, base, base + size - 1, size); + for (i = 0; i < type->cnt; i++) { + struct memblock_region *rgn = &type->regions[i]; + char nid_buf[32] = ""; + + base = rgn->base; + size = rgn->size; +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + if (memblock_get_region_node(rgn) != MAX_NUMNODES) + snprintf(nid_buf, sizeof(nid_buf), " on node %d", + memblock_get_region_node(rgn)); +#endif + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", + name, i, base, base + size - 1, size, nid_buf); } } -void __init_memblock memblock_dump_all(void) +void __init_memblock __memblock_dump_all(void) { - if (!memblock_debug) - return; - pr_info("MEMBLOCK configuration:\n"); - pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size); + pr_info(" memory size = %#llx reserved size = %#llx\n", + (unsigned long long)memblock.memory.total_size, + (unsigned long long)memblock.reserved.total_size); memblock_dump(&memblock.memory, "memory"); memblock_dump(&memblock.reserved, "reserved"); } -void __init memblock_analyze(void) +void __init memblock_allow_resize(void) { - int i; - - /* Check marker in the unused last array entry */ - WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base - != MEMBLOCK_INACTIVE); - WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base - != MEMBLOCK_INACTIVE); - - memblock.memory_size = 0; - - for (i = 0; i < memblock.memory.cnt; i++) - memblock.memory_size += memblock.memory.regions[i].size; - - /* We allow resizing from there */ memblock_can_resize = 1; } -void __init memblock_init(void) -{ - static int init_done __initdata = 0; - - if (init_done) - return; - init_done = 1; - - /* Hookup the initial arrays */ - memblock.memory.regions = memblock_memory_init_regions; - memblock.memory.max = INIT_MEMBLOCK_REGIONS; - memblock.reserved.regions = memblock_reserved_init_regions; - memblock.reserved.max = INIT_MEMBLOCK_REGIONS; - - /* Write a marker in the unused last array entry */ - memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; - memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; - - /* Create a dummy zero size MEMBLOCK which will get coalesced away later. - * This simplifies the memblock_add() code below... - */ - memblock.memory.regions[0].base = 0; - memblock.memory.regions[0].size = 0; - memblock.memory.cnt = 1; - - /* Ditto. */ - memblock.reserved.regions[0].base = 0; - memblock.reserved.regions[0].size = 0; - memblock.reserved.cnt = 1; - - memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE; -} - static int __init early_memblock(char *p) { if (p && strstr(p, "debug")) @@ -819,7 +918,7 @@ static int __init early_memblock(char *p) } early_param("memblock", early_memblock); -#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK) +#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6aff93c..94da8ee 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -50,6 +50,8 @@ #include <linux/cpu.h> #include <linux/oom.h> #include "internal.h" +#include <net/sock.h> +#include <net/tcp_memcontrol.h> #include <asm/uaccess.h> @@ -286,6 +288,10 @@ struct mem_cgroup { */ struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; + +#ifdef CONFIG_INET + struct tcp_memcontrol tcp_mem; +#endif }; /* Stuffs for move charges at task migration. */ @@ -365,7 +371,58 @@ enum charge_type { static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); + +/* Writing them here to avoid exposing memcg's inner layout */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_INET +#include <net/sock.h> +#include <net/ip.h> + +static bool mem_cgroup_is_root(struct mem_cgroup *memcg); +void sock_update_memcg(struct sock *sk) +{ + /* A socket spends its whole life in the same cgroup */ + if (sk->sk_cgrp) { + WARN_ON(1); + return; + } + if (static_branch(&memcg_socket_limit_enabled)) { + struct mem_cgroup *memcg; + + BUG_ON(!sk->sk_prot->proto_cgroup); + + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + if (!mem_cgroup_is_root(memcg)) { + mem_cgroup_get(memcg); + sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); + } + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(sock_update_memcg); + +void sock_release_memcg(struct sock *sk) +{ + if (static_branch(&memcg_socket_limit_enabled) && sk->sk_cgrp) { + struct mem_cgroup *memcg; + WARN_ON(!sk->sk_cgrp->memcg); + memcg = sk->sk_cgrp->memcg; + mem_cgroup_put(memcg); + } +} + +struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) +{ + if (!memcg || mem_cgroup_is_root(memcg)) + return NULL; + + return &memcg->tcp_mem.cg_proto; +} +EXPORT_SYMBOL(tcp_proto_cgroup); +#endif /* CONFIG_INET */ +#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -745,7 +802,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); } -static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) +struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { return container_of(cgroup_subsys_state(cont, mem_cgroup_subsys_id), struct mem_cgroup, @@ -4612,6 +4669,36 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) } #endif /* CONFIG_NUMA */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + /* + * Part of this would be better living in a separate allocation + * function, leaving us with just the cgroup tree population work. + * We, however, depend on state such as network's proto_list that + * is only initialized after cgroup creation. I found the less + * cumbersome way to deal with it to defer it all to populate time + */ + return mem_cgroup_sockets_init(cont, ss); +}; + +static void kmem_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + mem_cgroup_sockets_destroy(cont, ss); +} +#else +static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + return 0; +} + +static void kmem_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ +} +#endif + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4843,12 +4930,13 @@ static void mem_cgroup_put(struct mem_cgroup *memcg) /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) +struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { if (!memcg->res.parent) return NULL; return mem_cgroup_from_res_counter(memcg->res.parent, res); } +EXPORT_SYMBOL(parent_mem_cgroup); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP static void __init enable_swap_cgroup(void) @@ -4907,9 +4995,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) int cpu; enable_swap_cgroup(); parent = NULL; - root_mem_cgroup = memcg; if (mem_cgroup_soft_limit_tree_init()) goto free_out; + root_mem_cgroup = memcg; for_each_possible_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); @@ -4948,7 +5036,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return &memcg->css; free_out: __mem_cgroup_free(memcg); - root_mem_cgroup = NULL; return ERR_PTR(error); } @@ -4965,6 +5052,8 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + kmem_cgroup_destroy(ss, cont); + mem_cgroup_put(memcg); } @@ -4978,6 +5067,10 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, if (!ret) ret = register_memsw_files(cont, ss); + + if (!ret) + ret = register_kmem_files(cont, ss); + return ret; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index adc3954..c3fdbcb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -636,6 +636,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; + pgoff_t pgoff; unsigned long vmstart; unsigned long vmend; @@ -643,13 +644,21 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (!vma || vma->vm_start > start) return -EFAULT; + if (start > vma->vm_start) + prev = vma; + for (; vma && vma->vm_start < end; prev = vma, vma = next) { next = vma->vm_next; vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); + if (mpol_equal(vma_policy(vma), new_pol)) + continue; + + pgoff = vma->vm_pgoff + + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma->anon_vma, vma->vm_file, pgoff, new_pol); if (prev) { vma = prev; diff --git a/mm/migrate.c b/mm/migrate.c index 578e291..177aca4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -871,9 +871,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (anon_vma) put_anon_vma(anon_vma); -out: unlock_page(hpage); +out: if (rc != -EAGAIN) { list_del(&hpage->lru); put_page(hpage); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 7fa41b4..24f0fc1 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -41,14 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (limit > memblock.current_limit) limit = memblock.current_limit; - addr = find_memory_core_early(nid, size, align, goal, limit); - - if (addr == MEMBLOCK_ERROR) + addr = memblock_find_in_range_node(goal, limit, size, align, nid); + if (!addr) return NULL; ptr = phys_to_virt(addr); memset(ptr, 0, size); - memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); + memblock_reserve(addr, size); /* * The min_count is set to 0 so that bootmem allocated blocks * are never reported as leaks. @@ -107,23 +106,27 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) __free_pages_bootmem(pfn_to_page(i), 0); } -unsigned long __init free_all_memory_core_early(int nodeid) +unsigned long __init free_low_memory_core_early(int nodeid) { - int i; - u64 start, end; unsigned long count = 0; - struct range *range = NULL; - int nr_range; - - nr_range = get_free_all_memory_range(&range, nodeid); - - for (i = 0; i < nr_range; i++) { - start = range[i].start; - end = range[i].end; - count += end - start; - __free_pages_memory(start, end); + phys_addr_t start, end; + u64 i; + + /* free reserved array temporarily so that it's treated as free area */ + memblock_free_reserved_regions(); + + for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + if (start_pfn < end_pfn) { + __free_pages_memory(start_pfn, end_pfn); + count += end_pfn - start_pfn; + } } + /* put region array back? */ + memblock_reserve_reserved_regions(); return count; } @@ -137,7 +140,7 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); - /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ return 0; } @@ -155,7 +158,7 @@ unsigned long __init free_all_bootmem(void) * Use MAX_NUMNODES will make sure all ranges in early_node_map[] * will be used instead of only Node0 related */ - return free_all_memory_core_early(MAX_NUMNODES); + return free_low_memory_core_early(MAX_NUMNODES); } /** @@ -172,7 +175,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { kmemleak_free_part(__va(physaddr), size); - memblock_x86_free_range(physaddr, physaddr + size); + memblock_free(physaddr, size); } /** @@ -187,7 +190,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, void __init free_bootmem(unsigned long addr, unsigned long size) { kmemleak_free_part(__va(addr), size); - memblock_x86_free_range(addr, addr + size); + memblock_free(addr, size); } static void * __init ___alloc_bootmem_nopanic(unsigned long size, @@ -454,7 +454,7 @@ void __attribute__((weak)) vmalloc_sync_all(void) * between processes, it syncs the pagetable across all * processes. */ -struct vm_struct *alloc_vm_area(size_t size) +struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); return NULL; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 471dedb..eeb27e2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -176,7 +176,7 @@ static bool oom_unkillable_task(struct task_struct *p, unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, const nodemask_t *nodemask, unsigned long totalpages) { - int points; + long points; if (oom_unkillable_task(p, mem, nodemask)) return 0; @@ -185,6 +185,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, if (!p) return 0; + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + task_unlock(p); + return 0; + } + /* * The memory controller may have a limit of 0 bytes, so avoid a divide * by zero, if necessary. @@ -323,7 +328,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, */ if (test_tsk_thread_flag(p, TIF_MEMDIE)) { if (unlikely(frozen(p))) - thaw_process(p); + __thaw_task(p); return ERR_PTR(-1UL); } if (!p->mm) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a3278f0..8616ef3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -32,7 +32,7 @@ #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> -#include <linux/buffer_head.h> +#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ #include <linux/pagevec.h> #include <trace/events/writeback.h> @@ -128,7 +128,6 @@ unsigned long global_dirty_limit; * */ static struct prop_descriptor vm_completions; -static struct prop_descriptor vm_dirties; /* * couple the period to the dirty_ratio: @@ -154,7 +153,6 @@ static void update_completion_period(void) { int shift = calc_period_shift(); prop_change_shift(&vm_completions, shift); - prop_change_shift(&vm_dirties, shift); writeback_set_ratelimit(); } @@ -235,11 +233,6 @@ void bdi_writeout_inc(struct backing_dev_info *bdi) } EXPORT_SYMBOL_GPL(bdi_writeout_inc); -void task_dirty_inc(struct task_struct *tsk) -{ - prop_inc_single(&vm_dirties, &tsk->dirties); -} - /* * Obtain an accurate fraction of the BDI's portion. */ @@ -418,8 +411,13 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) * * Returns @bdi's dirty limit in pages. The term "dirty" in the context of * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. - * And the "limit" in the name is not seriously taken as hard limit in - * balance_dirty_pages(). + * + * Note that balance_dirty_pages() will only seriously take it as a hard limit + * when sleeping max_pause per page is not enough to keep the dirty pages under + * control. For example, when the device is completely stalled due to some error + * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. + * In the other normal situations, it acts more gently by throttling the tasks + * more (rather than completely block them) when the bdi dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices @@ -601,6 +599,13 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, */ if (unlikely(bdi_thresh > thresh)) bdi_thresh = thresh; + /* + * It's very possible that bdi_thresh is close to 0 not because the + * device is slow, but that it has remained inactive for long time. + * Honour such devices a reasonable good (hopefully IO efficient) + * threshold, so that the occasional writes won't be blocked and active + * writes can rampup the threshold quickly. + */ bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); /* * scale global setpoint to bdi's: @@ -984,8 +989,7 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi, * * 8 serves as the safety ratio. */ - if (bdi_dirty) - t = min(t, bdi_dirty * HZ / (8 * bw + 1)); + t = min(t, bdi_dirty * HZ / (8 * bw + 1)); /* * The pause time will be settled within range (max_pause/4, max_pause). @@ -1133,17 +1137,30 @@ pause: pages_dirtied, pause, start_time); - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_KILLABLE); io_schedule_timeout(pause); - dirty_thresh = hard_dirty_limit(dirty_thresh); /* - * max-pause area. If dirty exceeded but still within this - * area, no need to sleep for more than 200ms: (a) 8 pages per - * 200ms is typically more than enough to curb heavy dirtiers; - * (b) the pause time limit makes the dirtiers more responsive. + * This is typically equal to (nr_dirty < dirty_thresh) and can + * also keep "1000+ dd on a slow USB stick" under control. */ - if (nr_dirty < dirty_thresh) + if (task_ratelimit) + break; + + /* + * In the case of an unresponding NFS server and the NFS dirty + * pages exceeds dirty_thresh, give the other good bdi's a pipe + * to go through, so that tasks on them still remain responsive. + * + * In theory 1 page is enough to keep the comsumer-producer + * pipe going: the flusher cleans 1 page => the task dirties 1 + * more page. However bdi_dirty has accounting errors. So use + * the larger and more IO friendly bdi_stat_error. + */ + if (bdi_dirty <= bdi_stat_error(bdi)) + break; + + if (fatal_signal_pending(current)) break; } @@ -1395,7 +1412,6 @@ void __init page_writeback_init(void) shift = calc_period_shift(); prop_descriptor_init(&vm_completions, shift); - prop_descriptor_init(&vm_dirties, shift); } /** @@ -1724,7 +1740,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); - task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e51802d..7990ca1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -181,39 +181,17 @@ static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP - /* - * MAX_ACTIVE_REGIONS determines the maximum number of distinct - * ranges of memory (RAM) that may be registered with add_active_range(). - * Ranges passed to add_active_range() will be merged if possible - * so the number of times add_active_range() can be called is - * related to the number of nodes and the number of holes - */ - #ifdef CONFIG_MAX_ACTIVE_REGIONS - /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ - #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS - #else - #if MAX_NUMNODES >= 32 - /* If there can be many nodes, allow up to 50 holes per node */ - #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) - #else - /* By default, allow up to 256 distinct regions */ - #define MAX_ACTIVE_REGIONS 256 - #endif - #endif - - static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; - static int __meminitdata nr_nodemap_entries; - static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; - static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; - static unsigned long __initdata required_kernelcore; - static unsigned long __initdata required_movablecore; - static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; - - /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ - int movable_zone; - EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; +static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; +static unsigned long __initdata required_kernelcore; +static unsigned long __initdata required_movablecore; +static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; + +/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ +int movable_zone; +EXPORT_SYMBOL(movable_zone); +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -356,8 +334,8 @@ void prep_compound_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageTail(p); + set_page_count(p, 0); p->first_page = page; } } @@ -706,10 +684,10 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) int loop; prefetchw(page); - for (loop = 0; loop < BITS_PER_LONG; loop++) { + for (loop = 0; loop < (1 << order); loop++) { struct page *p = &page[loop]; - if (loop + 1 < BITS_PER_LONG) + if (loop + 1 < (1 << order)) prefetchw(p + 1); __ClearPageReserved(p); set_page_count(p, 0); @@ -1408,7 +1386,7 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) static int __init fail_page_alloc_debugfs(void) { - mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_page_alloc", NULL, @@ -3377,9 +3355,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) unsigned long block_migratetype; int reserve; - /* Get the start pfn, end pfn and the number of blocks to reserve */ + /* + * Get the start pfn, end pfn and the number of blocks to reserve + * We have to be careful to be aligned to pageblock_nr_pages to + * make sure that we always check pfn_valid for the first page in + * the block. + */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; + start_pfn = roundup(start_pfn, pageblock_nr_pages); reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; @@ -3731,35 +3715,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, return 0; } -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP -/* - * Basic iterator support. Return the first range of PFNs for a node - * Note: nid == MAX_NUMNODES returns first region regardless of node - */ -static int __meminit first_active_region_index_in_nid(int nid) -{ - int i; - - for (i = 0; i < nr_nodemap_entries; i++) - if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) - return i; - - return -1; -} - -/* - * Basic iterator support. Return the next active range of PFNs for a node - * Note: nid == MAX_NUMNODES returns next region regardless of node - */ -static int __meminit next_active_region_index_in_nid(int index, int nid) -{ - for (index = index + 1; index < nr_nodemap_entries; index++) - if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) - return index; - - return -1; -} - +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. @@ -3769,15 +3725,12 @@ static int __meminit next_active_region_index_in_nid(int index, int nid) */ int __meminit __early_pfn_to_nid(unsigned long pfn) { - int i; - - for (i = 0; i < nr_nodemap_entries; i++) { - unsigned long start_pfn = early_node_map[i].start_pfn; - unsigned long end_pfn = early_node_map[i].end_pfn; + unsigned long start_pfn, end_pfn; + int i, nid; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) if (start_pfn <= pfn && pfn < end_pfn) - return early_node_map[i].nid; - } + return nid; /* This is a memory hole */ return -1; } @@ -3806,11 +3759,6 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) } #endif -/* Basic iterator support to walk early_node_map[] */ -#define for_each_active_range_index_in_nid(i, nid) \ - for (i = first_active_region_index_in_nid(nid); i != -1; \ - i = next_active_region_index_in_nid(i, nid)) - /** * free_bootmem_with_active_regions - Call free_bootmem_node for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. @@ -3820,122 +3768,34 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) * add_active_ranges() contain no holes and may be freed, this * this function may be used instead of calling free_bootmem() manually. */ -void __init free_bootmem_with_active_regions(int nid, - unsigned long max_low_pfn) +void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { - int i; - - for_each_active_range_index_in_nid(i, nid) { - unsigned long size_pages = 0; - unsigned long end_pfn = early_node_map[i].end_pfn; - - if (early_node_map[i].start_pfn >= max_low_pfn) - continue; - - if (end_pfn > max_low_pfn) - end_pfn = max_low_pfn; - - size_pages = end_pfn - early_node_map[i].start_pfn; - free_bootmem_node(NODE_DATA(early_node_map[i].nid), - PFN_PHYS(early_node_map[i].start_pfn), - size_pages << PAGE_SHIFT); - } -} - -#ifdef CONFIG_HAVE_MEMBLOCK -/* - * Basic iterator support. Return the last range of PFNs for a node - * Note: nid == MAX_NUMNODES returns last region regardless of node - */ -static int __meminit last_active_region_index_in_nid(int nid) -{ - int i; - - for (i = nr_nodemap_entries - 1; i >= 0; i--) - if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) - return i; - - return -1; -} - -/* - * Basic iterator support. Return the previous active range of PFNs for a node - * Note: nid == MAX_NUMNODES returns next region regardless of node - */ -static int __meminit previous_active_region_index_in_nid(int index, int nid) -{ - for (index = index - 1; index >= 0; index--) - if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) - return index; - - return -1; -} - -#define for_each_active_range_index_in_nid_reverse(i, nid) \ - for (i = last_active_region_index_in_nid(nid); i != -1; \ - i = previous_active_region_index_in_nid(i, nid)) - -u64 __init find_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - int i; - - /* Need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid_reverse(i, nid) { - u64 addr; - u64 ei_start, ei_last; - u64 final_start, final_end; - - ei_last = early_node_map[i].end_pfn; - ei_last <<= PAGE_SHIFT; - ei_start = early_node_map[i].start_pfn; - ei_start <<= PAGE_SHIFT; - - final_start = max(ei_start, goal); - final_end = min(ei_last, limit); - - if (final_start >= final_end) - continue; - - addr = memblock_find_in_range(final_start, final_end, size, align); + unsigned long start_pfn, end_pfn; + int i, this_nid; - if (addr == MEMBLOCK_ERROR) - continue; + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { + start_pfn = min(start_pfn, max_low_pfn); + end_pfn = min(end_pfn, max_low_pfn); - return addr; + if (start_pfn < end_pfn) + free_bootmem_node(NODE_DATA(this_nid), + PFN_PHYS(start_pfn), + (end_pfn - start_pfn) << PAGE_SHIFT); } - - return MEMBLOCK_ERROR; } -#endif int __init add_from_early_node_map(struct range *range, int az, int nr_range, int nid) { + unsigned long start_pfn, end_pfn; int i; - u64 start, end; /* need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { - start = early_node_map[i].start_pfn; - end = early_node_map[i].end_pfn; - nr_range = add_range(range, az, nr_range, start, end); - } + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) + nr_range = add_range(range, az, nr_range, start_pfn, end_pfn); return nr_range; } -void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) -{ - int i; - int ret; - - for_each_active_range_index_in_nid(i, nid) { - ret = work_fn(early_node_map[i].start_pfn, - early_node_map[i].end_pfn, data); - if (ret) - break; - } -} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. @@ -3946,12 +3806,11 @@ void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) */ void __init sparse_memory_present_with_active_regions(int nid) { - int i; + unsigned long start_pfn, end_pfn; + int i, this_nid; - for_each_active_range_index_in_nid(i, nid) - memory_present(early_node_map[i].nid, - early_node_map[i].start_pfn, - early_node_map[i].end_pfn); + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) + memory_present(this_nid, start_pfn, end_pfn); } /** @@ -3968,13 +3827,15 @@ void __init sparse_memory_present_with_active_regions(int nid) void __meminit get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { + unsigned long this_start_pfn, this_end_pfn; int i; + *start_pfn = -1UL; *end_pfn = 0; - for_each_active_range_index_in_nid(i, nid) { - *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); - *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); + for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { + *start_pfn = min(*start_pfn, this_start_pfn); + *end_pfn = max(*end_pfn, this_end_pfn); } if (*start_pfn == -1UL) @@ -4077,46 +3938,16 @@ unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { - int i = 0; - unsigned long prev_end_pfn = 0, hole_pages = 0; - unsigned long start_pfn; - - /* Find the end_pfn of the first active range of pfns in the node */ - i = first_active_region_index_in_nid(nid); - if (i == -1) - return 0; - - prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); - - /* Account for ranges before physical memory on this node */ - if (early_node_map[i].start_pfn > range_start_pfn) - hole_pages = prev_end_pfn - range_start_pfn; - - /* Find all holes for the zone within the node */ - for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { - - /* No need to continue if prev_end_pfn is outside the zone */ - if (prev_end_pfn >= range_end_pfn) - break; - - /* Make sure the end of the zone is not within the hole */ - start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); - prev_end_pfn = max(prev_end_pfn, range_start_pfn); + unsigned long nr_absent = range_end_pfn - range_start_pfn; + unsigned long start_pfn, end_pfn; + int i; - /* Update the hole size cound and move on */ - if (start_pfn > range_start_pfn) { - BUG_ON(prev_end_pfn > start_pfn); - hole_pages += start_pfn - prev_end_pfn; - } - prev_end_pfn = early_node_map[i].end_pfn; + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); + end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + nr_absent -= end_pfn - start_pfn; } - - /* Account for ranges past physical memory on this node */ - if (range_end_pfn > prev_end_pfn) - hole_pages += range_end_pfn - - max(range_start_pfn, prev_end_pfn); - - return hole_pages; + return nr_absent; } /** @@ -4137,14 +3968,14 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long *ignored) { + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); - zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], - node_start_pfn); - zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], - node_end_pfn); + zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, @@ -4152,7 +3983,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); } -#else +#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long *zones_size) @@ -4170,7 +4001,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, return zholes_size[zone_type]; } -#endif +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) @@ -4393,10 +4224,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif #endif /* CONFIG_FLAT_NODE_MEM_MAP */ @@ -4421,7 +4252,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat, zones_size, zholes_size); } -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #if MAX_NUMNODES > 1 /* @@ -4443,170 +4274,6 @@ static inline void setup_nr_node_ids(void) #endif /** - * add_active_range - Register a range of PFNs backed by physical memory - * @nid: The node ID the range resides on - * @start_pfn: The start PFN of the available physical memory - * @end_pfn: The end PFN of the available physical memory - * - * These ranges are stored in an early_node_map[] and later used by - * free_area_init_nodes() to calculate zone sizes and holes. If the - * range spans a memory hole, it is up to the architecture to ensure - * the memory is not freed by the bootmem allocator. If possible - * the range being registered will be merged with existing ranges. - */ -void __init add_active_range(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - int i; - - mminit_dprintk(MMINIT_TRACE, "memory_register", - "Entering add_active_range(%d, %#lx, %#lx) " - "%d entries of %d used\n", - nid, start_pfn, end_pfn, - nr_nodemap_entries, MAX_ACTIVE_REGIONS); - - mminit_validate_memmodel_limits(&start_pfn, &end_pfn); - - /* Merge with existing active regions if possible */ - for (i = 0; i < nr_nodemap_entries; i++) { - if (early_node_map[i].nid != nid) - continue; - - /* Skip if an existing region covers this new one */ - if (start_pfn >= early_node_map[i].start_pfn && - end_pfn <= early_node_map[i].end_pfn) - return; - - /* Merge forward if suitable */ - if (start_pfn <= early_node_map[i].end_pfn && - end_pfn > early_node_map[i].end_pfn) { - early_node_map[i].end_pfn = end_pfn; - return; - } - - /* Merge backward if suitable */ - if (start_pfn < early_node_map[i].start_pfn && - end_pfn >= early_node_map[i].start_pfn) { - early_node_map[i].start_pfn = start_pfn; - return; - } - } - - /* Check that early_node_map is large enough */ - if (i >= MAX_ACTIVE_REGIONS) { - printk(KERN_CRIT "More than %d memory regions, truncating\n", - MAX_ACTIVE_REGIONS); - return; - } - - early_node_map[i].nid = nid; - early_node_map[i].start_pfn = start_pfn; - early_node_map[i].end_pfn = end_pfn; - nr_nodemap_entries = i + 1; -} - -/** - * remove_active_range - Shrink an existing registered range of PFNs - * @nid: The node id the range is on that should be shrunk - * @start_pfn: The new PFN of the range - * @end_pfn: The new PFN of the range - * - * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. - * The map is kept near the end physical page range that has already been - * registered. This function allows an arch to shrink an existing registered - * range. - */ -void __init remove_active_range(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - int i, j; - int removed = 0; - - printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", - nid, start_pfn, end_pfn); - - /* Find the old active region end and shrink */ - for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= start_pfn && - early_node_map[i].end_pfn <= end_pfn) { - /* clear it */ - early_node_map[i].start_pfn = 0; - early_node_map[i].end_pfn = 0; - removed = 1; - continue; - } - if (early_node_map[i].start_pfn < start_pfn && - early_node_map[i].end_pfn > start_pfn) { - unsigned long temp_end_pfn = early_node_map[i].end_pfn; - early_node_map[i].end_pfn = start_pfn; - if (temp_end_pfn > end_pfn) - add_active_range(nid, end_pfn, temp_end_pfn); - continue; - } - if (early_node_map[i].start_pfn >= start_pfn && - early_node_map[i].end_pfn > end_pfn && - early_node_map[i].start_pfn < end_pfn) { - early_node_map[i].start_pfn = end_pfn; - continue; - } - } - - if (!removed) - return; - - /* remove the blank ones */ - for (i = nr_nodemap_entries - 1; i > 0; i--) { - if (early_node_map[i].nid != nid) - continue; - if (early_node_map[i].end_pfn) - continue; - /* we found it, get rid of it */ - for (j = i; j < nr_nodemap_entries - 1; j++) - memcpy(&early_node_map[j], &early_node_map[j+1], - sizeof(early_node_map[j])); - j = nr_nodemap_entries - 1; - memset(&early_node_map[j], 0, sizeof(early_node_map[j])); - nr_nodemap_entries--; - } -} - -/** - * remove_all_active_ranges - Remove all currently registered regions - * - * During discovery, it may be found that a table like SRAT is invalid - * and an alternative discovery method must be used. This function removes - * all currently registered regions. - */ -void __init remove_all_active_ranges(void) -{ - memset(early_node_map, 0, sizeof(early_node_map)); - nr_nodemap_entries = 0; -} - -/* Compare two active node_active_regions */ -static int __init cmp_node_active_region(const void *a, const void *b) -{ - struct node_active_region *arange = (struct node_active_region *)a; - struct node_active_region *brange = (struct node_active_region *)b; - - /* Done this way to avoid overflows */ - if (arange->start_pfn > brange->start_pfn) - return 1; - if (arange->start_pfn < brange->start_pfn) - return -1; - - return 0; -} - -/* sort the node_map by start_pfn */ -void __init sort_node_map(void) -{ - sort(early_node_map, (size_t)nr_nodemap_entries, - sizeof(struct node_active_region), - cmp_node_active_region, NULL); -} - -/** * node_map_pfn_alignment - determine the maximum internode alignment * * This function should be called after node map is populated and sorted. @@ -4628,15 +4295,11 @@ void __init sort_node_map(void) unsigned long __init node_map_pfn_alignment(void) { unsigned long accl_mask = 0, last_end = 0; + unsigned long start, end, mask; int last_nid = -1; - int i; - - for_each_active_range_index_in_nid(i, MAX_NUMNODES) { - int nid = early_node_map[i].nid; - unsigned long start = early_node_map[i].start_pfn; - unsigned long end = early_node_map[i].end_pfn; - unsigned long mask; + int i, nid; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { if (!start || last_nid < 0 || last_nid == nid) { last_nid = nid; last_end = end; @@ -4663,12 +4326,12 @@ unsigned long __init node_map_pfn_alignment(void) /* Find the lowest pfn for a node */ static unsigned long __init find_min_pfn_for_node(int nid) { - int i; unsigned long min_pfn = ULONG_MAX; + unsigned long start_pfn; + int i; - /* Assuming a sorted map, the first range found has the starting pfn */ - for_each_active_range_index_in_nid(i, nid) - min_pfn = min(min_pfn, early_node_map[i].start_pfn); + for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) + min_pfn = min(min_pfn, start_pfn); if (min_pfn == ULONG_MAX) { printk(KERN_WARNING @@ -4697,15 +4360,16 @@ unsigned long __init find_min_pfn_with_active_regions(void) */ static unsigned long __init early_calculate_totalpages(void) { - int i; unsigned long totalpages = 0; + unsigned long start_pfn, end_pfn; + int i, nid; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + unsigned long pages = end_pfn - start_pfn; - for (i = 0; i < nr_nodemap_entries; i++) { - unsigned long pages = early_node_map[i].end_pfn - - early_node_map[i].start_pfn; totalpages += pages; if (pages) - node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); + node_set_state(nid, N_HIGH_MEMORY); } return totalpages; } @@ -4760,6 +4424,8 @@ restart: /* Spread kernelcore memory as evenly as possible throughout nodes */ kernelcore_node = required_kernelcore / usable_nodes; for_each_node_state(nid, N_HIGH_MEMORY) { + unsigned long start_pfn, end_pfn; + /* * Recalculate kernelcore_node if the division per node * now exceeds what is necessary to satisfy the requested @@ -4776,13 +4442,10 @@ restart: kernelcore_remaining = kernelcore_node; /* Go through each range of PFNs within this node */ - for_each_active_range_index_in_nid(i, nid) { - unsigned long start_pfn, end_pfn; + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { unsigned long size_pages; - start_pfn = max(early_node_map[i].start_pfn, - zone_movable_pfn[nid]); - end_pfn = early_node_map[i].end_pfn; + start_pfn = max(start_pfn, zone_movable_pfn[nid]); if (start_pfn >= end_pfn) continue; @@ -4884,11 +4547,8 @@ static void check_for_regular_memory(pg_data_t *pgdat) */ void __init free_area_init_nodes(unsigned long *max_zone_pfn) { - unsigned long nid; - int i; - - /* Sort early_node_map as initialisation assumes it is sorted */ - sort_node_map(); + unsigned long start_pfn, end_pfn; + int i, nid; /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, @@ -4935,11 +4595,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) } /* Print out the early_node_map[] */ - printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); - for (i = 0; i < nr_nodemap_entries; i++) - printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, - early_node_map[i].start_pfn, - early_node_map[i].end_pfn); + printk("Early memory PFN ranges\n"); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) + printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); /* Initialise every node */ mminit_verify_pageflags_layout(); @@ -4992,7 +4650,7 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ /** * set_dma_reserve - set the specified number of pages reserved in the first zone diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index ea53496..12a48a88 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -50,14 +50,13 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, if (!pages || !bitmap) { if (may_alloc && !pages) - pages = pcpu_mem_alloc(pages_size); + pages = pcpu_mem_zalloc(pages_size); if (may_alloc && !bitmap) - bitmap = pcpu_mem_alloc(bitmap_size); + bitmap = pcpu_mem_zalloc(bitmap_size); if (!pages || !bitmap) return NULL; } - memset(pages, 0, pages_size); bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); *bitmapp = bitmap; @@ -143,8 +142,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vunmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) @@ -206,8 +205,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_tlb_kernel_range( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, @@ -284,8 +283,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } /** diff --git a/mm/percpu.c b/mm/percpu.c index bf80e55..716eb4a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; -/* cpus with the lowest and highest unit numbers */ -static unsigned int pcpu_first_unit_cpu __read_mostly; -static unsigned int pcpu_last_unit_cpu __read_mostly; +/* cpus with the lowest and highest unit addresses */ +static unsigned int pcpu_low_unit_cpu __read_mostly; +static unsigned int pcpu_high_unit_cpu __read_mostly; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; @@ -273,11 +273,11 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) /** - * pcpu_mem_alloc - allocate memory + * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, - * kzalloc() is used; otherwise, vmalloc() is used. The returned + * kzalloc() is used; otherwise, vzalloc() is used. The returned * memory is always zeroed. * * CONTEXT: @@ -286,7 +286,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_mem_alloc(size_t size) +static void *pcpu_mem_zalloc(size_t size) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; @@ -302,7 +302,7 @@ static void *pcpu_mem_alloc(size_t size) * @ptr: memory to free * @size: size of the area * - * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc(). + * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ static void pcpu_mem_free(void *ptr, size_t size) { @@ -384,7 +384,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); unsigned long flags; - new = pcpu_mem_alloc(new_size); + new = pcpu_mem_zalloc(new_size); if (!new) return -ENOMEM; @@ -604,11 +604,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) { struct pcpu_chunk *chunk; - chunk = pcpu_mem_alloc(pcpu_chunk_struct_size); + chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); if (!chunk) return NULL; - chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * + sizeof(chunk->map[0])); if (!chunk->map) { kfree(chunk); return NULL; @@ -977,6 +978,17 @@ bool is_kernel_percpu_address(unsigned long addr) * address. The caller is responsible for ensuring @addr stays valid * until this function finishes. * + * percpu allocator has special setup for the first chunk, which currently + * supports either embedding in linear address space or vmalloc mapping, + * and, from the second one, the backing allocator (currently either vm or + * km) provides translation. + * + * The addr can be tranlated simply without checking if it falls into the + * first chunk. But the current code reflects better how percpu allocator + * actually works, and the verification can discover both bugs in percpu + * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current + * code. + * * RETURNS: * The physical address for @addr. */ @@ -984,19 +996,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) { void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); bool in_first_chunk = false; - unsigned long first_start, first_end; + unsigned long first_low, first_high; unsigned int cpu; /* - * The following test on first_start/end isn't strictly + * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. */ - first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); - first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, - pcpu_unit_pages); - if ((unsigned long)addr >= first_start && - (unsigned long)addr < first_end) { + first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); + first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, + pcpu_unit_pages); + if ((unsigned long)addr >= first_low && + (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); @@ -1011,9 +1023,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) if (!is_vmalloc_addr(addr)) return __pa(addr); else - return page_to_phys(vmalloc_to_page(addr)); + return page_to_phys(vmalloc_to_page(addr)) + + offset_in_page(addr); } else - return page_to_phys(pcpu_addr_to_page(addr)); + return page_to_phys(pcpu_addr_to_page(addr)) + + offset_in_page(addr); } /** @@ -1233,7 +1247,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; - pcpu_first_unit_cpu = NR_CPUS; + + pcpu_low_unit_cpu = NR_CPUS; + pcpu_high_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { const struct pcpu_group_info *gi = &ai->groups[group]; @@ -1253,9 +1269,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; - if (pcpu_first_unit_cpu == NR_CPUS) - pcpu_first_unit_cpu = cpu; - pcpu_last_unit_cpu = cpu; + /* determine low/high unit_cpu */ + if (pcpu_low_unit_cpu == NR_CPUS || + unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) + pcpu_low_unit_cpu = cpu; + if (pcpu_high_unit_cpu == NR_CPUS || + unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) + pcpu_high_unit_cpu = cpu; } } pcpu_nr_units = unit; @@ -1889,7 +1909,7 @@ void __init percpu_init_late(void) BUILD_BUG_ON(size > PAGE_SIZE); - map = pcpu_mem_alloc(size); + map = pcpu_mem_zalloc(size); BUG_ON(!map); spin_lock_irqsave(&pcpu_lock, flags); @@ -1092,7 +1092,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) } static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, - int mode, dev_t dev, unsigned long flags) + umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; @@ -1456,7 +1456,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) * File creation. Allocate an inode, and we're done.. */ static int -shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error = -ENOSPC; @@ -1489,7 +1489,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) return error; } -static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int error; @@ -1499,7 +1499,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) return 0; } -static int shmem_create(struct inode *dir, struct dentry *dentry, int mode, +static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return shmem_mknod(dir, dentry, mode | S_IFREG, 0); @@ -2118,9 +2118,9 @@ out: return error; } -static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int shmem_show_options(struct seq_file *seq, struct dentry *root) { - struct shmem_sb_info *sbinfo = SHMEM_SB(vfs->mnt_sb); + struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); if (sbinfo->max_blocks != shmem_default_max_blocks()) seq_printf(seq, ",size=%luk", @@ -2128,7 +2128,7 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) - seq_printf(seq, ",mode=%03o", sbinfo->mode); + seq_printf(seq, ",mode=%03ho", sbinfo->mode); if (sbinfo->uid != 0) seq_printf(seq, ",uid=%u", sbinfo->uid); if (sbinfo->gid != 0) @@ -2234,13 +2234,12 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } static void shmem_destroy_inode(struct inode *inode) { - if ((inode->i_mode & S_IFMT) == S_IFREG) + if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); call_rcu(&inode->i_rcu, shmem_destroy_callback); } @@ -595,6 +595,7 @@ static enum { PARTIAL_AC, PARTIAL_L3, EARLY, + LATE, FULL } g_cpucache_up; @@ -671,7 +672,7 @@ static void init_node_lock_keys(int q) { struct cache_sizes *s = malloc_sizes; - if (g_cpucache_up != FULL) + if (g_cpucache_up < LATE) return; for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { @@ -1666,6 +1667,8 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; + g_cpucache_up = LATE; + /* Annotate slab for lockdep -- annotate the malloc caches */ init_lock_keys(); @@ -368,7 +368,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page VM_BUG_ON(!irqs_disabled()); #ifdef CONFIG_CMPXCHG_DOUBLE if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&page->freelist, + if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, freelist_new, counters_new)) return 1; @@ -402,7 +402,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, { #ifdef CONFIG_CMPXCHG_DOUBLE if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&page->freelist, + if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, freelist_new, counters_new)) return 1; @@ -1862,7 +1862,7 @@ static void unfreeze_partials(struct kmem_cache *s) { struct kmem_cache_node *n = NULL; struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); - struct page *page; + struct page *page, *discard_page = NULL; while ((page = c->partial)) { enum slab_modes { M_PARTIAL, M_FREE }; @@ -1904,7 +1904,8 @@ static void unfreeze_partials(struct kmem_cache *s) if (l == M_PARTIAL) remove_partial(n, page); else - add_partial(n, page, 1); + add_partial(n, page, + DEACTIVATE_TO_TAIL); l = m; } @@ -1915,14 +1916,22 @@ static void unfreeze_partials(struct kmem_cache *s) "unfreezing slab")); if (m == M_FREE) { - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, page); - stat(s, FREE_SLAB); + page->next = discard_page; + discard_page = page; } } if (n) spin_unlock(&n->list_lock); + + while (discard_page) { + page = discard_page; + discard_page = discard_page->next; + + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } } /* @@ -1969,7 +1978,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) page->pobjects = pobjects; page->next = oldpage; - } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + } while (irqsafe_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); stat(s, CPU_PARTIAL_FREE); return pobjects; } @@ -4435,30 +4444,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s, for_each_possible_cpu(cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + int node = ACCESS_ONCE(c->node); struct page *page; - if (!c || c->node < 0) + if (node < 0) continue; - - if (c->page) { - if (flags & SO_TOTAL) - x = c->page->objects; + page = ACCESS_ONCE(c->page); + if (page) { + if (flags & SO_TOTAL) + x = page->objects; else if (flags & SO_OBJECTS) - x = c->page->inuse; + x = page->inuse; else x = 1; total += x; - nodes[c->node] += x; + nodes[node] += x; } page = c->partial; if (page) { x = page->pobjects; - total += x; - nodes[c->node] += x; + total += x; + nodes[node] += x; } - per_cpu[c->node]++; + per_cpu[node]++; } } diff --git a/mm/swap_state.c b/mm/swap_state.c index 78cc4d1..ea6b32d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -13,7 +13,6 @@ #include <linux/swapops.h> #include <linux/init.h> #include <linux/pagemap.h> -#include <linux/buffer_head.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/migrate.h> diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b669aa6..21fdf46 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1118,6 +1118,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro EXPORT_SYMBOL(vm_map_ram); /** + * vm_area_add_early - add vmap area early during boot + * @vm: vm_struct to add + * + * This function is used to add fixed kernel vm area to vmlist before + * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags + * should contain proper values and the other fields should be zero. + * + * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. + */ +void __init vm_area_add_early(struct vm_struct *vm) +{ + struct vm_struct *tmp, **p; + + BUG_ON(vmap_initialized); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= vm->addr) { + BUG_ON(tmp->addr < vm->addr + vm->size); + break; + } else + BUG_ON(tmp->addr + tmp->size > vm->addr); + } + vm->next = *p; + *p = vm; +} + +/** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register * @align: requested alignment @@ -1139,8 +1165,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) vm->addr = (void *)addr; - vm->next = vmlist; - vmlist = vm; + vm_area_add_early(vm); } void __init vmalloc_init(void) @@ -1290,7 +1315,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, void *caller) { - static struct vmap_area *va; + struct vmap_area *va; struct vm_struct *area; BUG_ON(in_interrupt()); @@ -1633,6 +1658,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); + if (!addr) + return NULL; /* * In this function, newly allocated vm_struct is not added @@ -2141,23 +2168,30 @@ void __attribute__((weak)) vmalloc_sync_all(void) static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) { - /* apply_to_page_range() does all the hard work. */ + pte_t ***p = data; + + if (p) { + *(*p) = pte; + (*p)++; + } return 0; } /** * alloc_vm_area - allocate a range of kernel address space * @size: size of the area + * @ptes: returns the PTEs for the address space * * Returns: NULL on failure, vm_struct on success * * This function reserves a range of kernel address space, and * allocates pagetables to map that range. No actual mappings - * are created. If the kernel address space is not shared - * between processes, it syncs the pagetable across all - * processes. + * are created. + * + * If @ptes is non-NULL, pointers to the PTEs (in init_mm) + * allocated for the VM area are returned. */ -struct vm_struct *alloc_vm_area(size_t size) +struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { struct vm_struct *area; @@ -2171,19 +2205,11 @@ struct vm_struct *alloc_vm_area(size_t size) * of kernel virtual address space and mapped into init_mm. */ if (apply_to_page_range(&init_mm, (unsigned long)area->addr, - area->size, f, NULL)) { + size, f, ptes ? &ptes : NULL)) { free_vm_area(area); return NULL; } - /* - * If the allocated address space is passed to a hypercall - * before being used then we cannot rely on a page fault to - * trigger an update of the page tables. So sync all the page - * tables here. - */ - vmalloc_sync_all(); - return area; } EXPORT_SYMBOL_GPL(alloc_vm_area); diff --git a/mm/vmscan.c b/mm/vmscan.c index a1893c0..11adc89 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -183,7 +183,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, */ void register_shrinker(struct shrinker *shrinker) { - shrinker->nr = 0; + atomic_long_set(&shrinker->nr_in_batch, 0); down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); @@ -247,25 +247,26 @@ unsigned long shrink_slab(struct shrink_control *shrink, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; - unsigned long total_scan; - unsigned long max_pass; + long total_scan; + long max_pass; int shrink_ret = 0; long nr; long new_nr; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; + max_pass = do_shrinker_shrink(shrinker, shrink, 0); + if (max_pass <= 0) + continue; + /* * copy the current shrinker scan count into a local variable * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ - do { - nr = shrinker->nr; - } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); total_scan = nr; - max_pass = do_shrinker_shrink(shrinker, shrink, 0); delta = (4 * nr_pages_scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); @@ -325,12 +326,11 @@ unsigned long shrink_slab(struct shrink_control *shrink, * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ - do { - nr = shrinker->nr; - new_nr = total_scan + nr; - if (total_scan <= 0) - break; - } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_in_batch); + else + new_nr = atomic_long_read(&shrinker->nr_in_batch); trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } @@ -3475,16 +3475,16 @@ int scan_unevictable_handler(struct ctl_table *table, int write, * a specified node's per zone unevictable lists for evictable pages. */ -static ssize_t read_scan_unevictable_node(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t read_scan_unevictable_node(struct device *dev, + struct device_attribute *attr, char *buf) { warn_scan_unevictable_pages(); return sprintf(buf, "0\n"); /* always zero; should fit... */ } -static ssize_t write_scan_unevictable_node(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t write_scan_unevictable_node(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { warn_scan_unevictable_pages(); @@ -3492,17 +3492,17 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev, } -static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, +static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, read_scan_unevictable_node, write_scan_unevictable_node); int scan_unevictable_register_node(struct node *node) { - return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); + return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); } void scan_unevictable_unregister_node(struct node *node) { - sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); + device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages); } #endif |