diff options
author | Sangwook Lee <sangwook.lee@linaro.org> | 2012-02-15 12:43:09 +0000 |
---|---|---|
committer | Sangwook Lee <sangwook.lee@linaro.org> | 2012-02-15 15:51:38 +0000 |
commit | bab8c75ca1f4e72b2a063e55b5043e6be73a447c (patch) | |
tree | a3da5698b93c2dc67000f3bb94d86090b16a2d92 | |
parent | 5532e0e70244177ecc750eac9ead8ad56c15dd38 (diff) | |
parent | bdf6ccf48442b76a112f1117e29a246adc82ee3c (diff) |
Merge remote-tracking branch 'origin/topic/cma_v18' into local-malilinux-mali-v3.3-rc3
35 files changed, 1773 insertions, 425 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 033d4e69b43b..ff97085085d9 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -508,6 +508,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Also note the kernel might malfunction if you disable some critical bits. + cma=nn[MG] [ARM,KNL] + Sets the size of kernel global memory area for contiguous + memory allocations. For more information, see + include/linux/dma-contiguous.h + cmo_free_hint= [PPC] Format: { yes | no } Specify whether pages are marked as being inactive when they are freed. This is used in CMO environments @@ -515,6 +520,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. a hypervisor. Default: yes + coherent_pool=nn[KMG] [ARM,KNL] + Sets the size of memory pool for coherent, atomic dma + allocations if Contiguous Memory Allocator (CMA) is used. + code_bytes [X86] How many bytes of object code to print in an oops report. Range: 0 - 8192 diff --git a/arch/Kconfig b/arch/Kconfig index 4f55c736be11..8ec200c79b2f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -128,6 +128,9 @@ config HAVE_ARCH_TRACEHOOK config HAVE_DMA_ATTRS bool +config HAVE_DMA_CONTIGUOUS + bool + config USE_GENERIC_SMP_HELPERS bool diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a48aecc17eac..56192fe52722 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -4,6 +4,8 @@ config ARM select HAVE_AOUT select HAVE_DMA_API_DEBUG select HAVE_IDE if PCI || ISA || PCMCIA + select HAVE_DMA_CONTIGUOUS if (CPU_V6 || CPU_V6K || CPU_V7) + select CMA if (CPU_V6 || CPU_V6K || CPU_V7) select HAVE_MEMBLOCK select RTC_LIB select SYS_SUPPORTS_APM_EMULATION diff --git a/arch/arm/include/asm/dma-contiguous.h b/arch/arm/include/asm/dma-contiguous.h new file mode 100644 index 000000000000..c7ba05ef6e69 --- /dev/null +++ b/arch/arm/include/asm/dma-contiguous.h @@ -0,0 +1,16 @@ +#ifndef ASMARM_DMA_CONTIGUOUS_H +#define ASMARM_DMA_CONTIGUOUS_H + +#ifdef __KERNEL__ + +#include <linux/device.h> +#include <linux/dma-contiguous.h> +#include <asm-generic/dma-contiguous.h> + +#ifdef CONFIG_CMA + +void dma_contiguous_early_fixup(phys_addr_t base, unsigned long size); + +#endif +#endif +#endif diff --git a/arch/arm/include/asm/mach/map.h b/arch/arm/include/asm/mach/map.h index b36f3654bf54..a6efcdd6fd25 100644 --- a/arch/arm/include/asm/mach/map.h +++ b/arch/arm/include/asm/mach/map.h @@ -30,6 +30,7 @@ struct map_desc { #define MT_MEMORY_DTCM 12 #define MT_MEMORY_ITCM 13 #define MT_MEMORY_SO 14 +#define MT_MEMORY_DMA_READY 15 #ifdef CONFIG_MMU extern void iotable_init(struct map_desc *, int); diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index a255c39612ca..a7d5fb74bb2c 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -79,6 +79,7 @@ __setup("fpe=", fpe_setup); extern void paging_init(struct machine_desc *desc); extern void sanity_check_meminfo(void); extern void reboot_setup(char *str); +extern void setup_dma_zone(struct machine_desc *desc); unsigned int processor_id; EXPORT_SYMBOL(processor_id); @@ -923,12 +924,8 @@ void __init setup_arch(char **cmdline_p) machine_desc = mdesc; machine_name = mdesc->name; -#ifdef CONFIG_ZONE_DMA - if (mdesc->dma_zone_size) { - extern unsigned long arm_dma_zone_size; - arm_dma_zone_size = mdesc->dma_zone_size; - } -#endif + setup_dma_zone(mdesc); + if (mdesc->restart_mode) reboot_setup(&mdesc->restart_mode); diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 1aa664a1999f..77e7755c9a4f 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -17,7 +17,9 @@ #include <linux/init.h> #include <linux/device.h> #include <linux/dma-mapping.h> +#include <linux/dma-contiguous.h> #include <linux/highmem.h> +#include <linux/memblock.h> #include <linux/slab.h> #include <asm/memory.h> @@ -26,6 +28,8 @@ #include <asm/tlbflush.h> #include <asm/sizes.h> #include <asm/mach/arch.h> +#include <asm/mach/map.h> +#include <asm/dma-contiguous.h> #include "mm.h" @@ -56,6 +60,19 @@ static u64 get_coherent_dma_mask(struct device *dev) return mask; } +static void __dma_clear_buffer(struct page *page, size_t size) +{ + void *ptr; + /* + * Ensure that the allocated pages are zeroed, and that any data + * lurking in the kernel direct-mapped region is invalidated. + */ + ptr = page_address(page); + memset(ptr, 0, size); + dmac_flush_range(ptr, ptr + size); + outer_flush_range(__pa(ptr), __pa(ptr) + size); +} + /* * Allocate a DMA buffer for 'dev' of size 'size' using the * specified gfp mask. Note that 'size' must be page aligned. @@ -64,23 +81,6 @@ static struct page *__dma_alloc_buffer(struct device *dev, size_t size, gfp_t gf { unsigned long order = get_order(size); struct page *page, *p, *e; - void *ptr; - u64 mask = get_coherent_dma_mask(dev); - -#ifdef CONFIG_DMA_API_DEBUG - u64 limit = (mask + 1) & ~mask; - if (limit && size >= limit) { - dev_warn(dev, "coherent allocation too big (requested %#x mask %#llx)\n", - size, mask); - return NULL; - } -#endif - - if (!mask) - return NULL; - - if (mask < 0xffffffffULL) - gfp |= GFP_DMA; page = alloc_pages(gfp, order); if (!page) @@ -93,14 +93,7 @@ static struct page *__dma_alloc_buffer(struct device *dev, size_t size, gfp_t gf for (p = page + (size >> PAGE_SHIFT), e = page + (1 << order); p < e; p++) __free_page(p); - /* - * Ensure that the allocated pages are zeroed, and that any data - * lurking in the kernel direct-mapped region is invalidated. - */ - ptr = page_address(page); - memset(ptr, 0, size); - dmac_flush_range(ptr, ptr + size); - outer_flush_range(__pa(ptr), __pa(ptr) + size); + __dma_clear_buffer(page, size); return page; } @@ -170,6 +163,9 @@ static int __init consistent_init(void) unsigned long base = consistent_base; unsigned long num_ptes = (CONSISTENT_END - base) >> PMD_SHIFT; + if (cpu_architecture() >= CPU_ARCH_ARMv6) + return 0; + consistent_pte = kmalloc(num_ptes * sizeof(pte_t), GFP_KERNEL); if (!consistent_pte) { pr_err("%s: no memory\n", __func__); @@ -210,9 +206,101 @@ static int __init consistent_init(void) return ret; } - core_initcall(consistent_init); +static void *__alloc_from_contiguous(struct device *dev, size_t size, + pgprot_t prot, struct page **ret_page); + +static struct arm_vmregion_head coherent_head = { + .vm_lock = __SPIN_LOCK_UNLOCKED(&coherent_head.vm_lock), + .vm_list = LIST_HEAD_INIT(coherent_head.vm_list), +}; + +size_t coherent_pool_size = DEFAULT_CONSISTENT_DMA_SIZE / 8; + +static int __init early_coherent_pool(char *p) +{ + coherent_pool_size = memparse(p, &p); + return 0; +} +early_param("coherent_pool", early_coherent_pool); + +/* + * Initialise the coherent pool for atomic allocations. + */ +static int __init coherent_init(void) +{ + pgprot_t prot = pgprot_dmacoherent(pgprot_kernel); + size_t size = coherent_pool_size; + struct page *page; + void *ptr; + + if (cpu_architecture() < CPU_ARCH_ARMv6) + return 0; + + ptr = __alloc_from_contiguous(NULL, size, prot, &page); + if (ptr) { + coherent_head.vm_start = (unsigned long) ptr; + coherent_head.vm_end = (unsigned long) ptr + size; + printk(KERN_INFO "DMA: preallocated %u KiB pool for atomic coherent allocations\n", + (unsigned)size / 1024); + return 0; + } + printk(KERN_ERR "DMA: failed to allocate %u KiB pool for atomic coherent allocation\n", + (unsigned)size / 1024); + return -ENOMEM; +} +/* + * CMA is activated by core_initcall, so we must be called after it + */ +postcore_initcall(coherent_init); + +struct dma_contig_early_reserve { + phys_addr_t base; + unsigned long size; +}; + +static struct dma_contig_early_reserve dma_mmu_remap[MAX_CMA_AREAS] __initdata; + +static int dma_mmu_remap_num __initdata; + +void __init dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) +{ + dma_mmu_remap[dma_mmu_remap_num].base = base; + dma_mmu_remap[dma_mmu_remap_num].size = size; + dma_mmu_remap_num++; +} + +void __init dma_contiguous_remap(void) +{ + int i; + for (i = 0; i < dma_mmu_remap_num; i++) { + phys_addr_t start = dma_mmu_remap[i].base; + phys_addr_t end = start + dma_mmu_remap[i].size; + struct map_desc map; + unsigned long addr; + + if (end > arm_lowmem_limit) + end = arm_lowmem_limit; + if (start >= end) + return; + + map.pfn = __phys_to_pfn(start); + map.virtual = __phys_to_virt(start); + map.length = end - start; + map.type = MT_MEMORY_DMA_READY; + + /* + * Clear previous low-memory mapping + */ + for (addr = __phys_to_virt(start); addr < __phys_to_virt(end); + addr += PGDIR_SIZE) + pmd_clear(pmd_off_k(addr)); + + iotable_init(&map, 1); + } +} + static void * __dma_alloc_remap(struct page *page, size_t size, gfp_t gfp, pgprot_t prot) { @@ -318,20 +406,172 @@ static void __dma_free_remap(void *cpu_addr, size_t size) arm_vmregion_free(&consistent_head, c); } +static int __dma_update_pte(pte_t *pte, pgtable_t token, unsigned long addr, + void *data) +{ + struct page *page = virt_to_page(addr); + pgprot_t prot = *(pgprot_t *)data; + + set_pte_ext(pte, mk_pte(page, prot), 0); + return 0; +} + +static void __dma_remap(struct page *page, size_t size, pgprot_t prot) +{ + unsigned long start = (unsigned long) page_address(page); + unsigned end = start + size; + + apply_to_page_range(&init_mm, start, size, __dma_update_pte, &prot); + dsb(); + flush_tlb_kernel_range(start, end); +} + +static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp, + pgprot_t prot, struct page **ret_page) +{ + struct page *page; + void *ptr; + page = __dma_alloc_buffer(dev, size, gfp); + if (!page) + return NULL; + + ptr = __dma_alloc_remap(page, size, gfp, prot); + if (!ptr) { + __dma_free_buffer(page, size); + return NULL; + } + + *ret_page = page; + return ptr; +} + +static void *__alloc_from_pool(struct device *dev, size_t size, + struct page **ret_page) +{ + struct arm_vmregion *c; + size_t align; + + if (!coherent_head.vm_start) { + printk(KERN_ERR "%s: coherent pool not initialised!\n", + __func__); + dump_stack(); + return NULL; + } + + /* + * Align the region allocation - allocations from pool are rather + * small, so align them to their order in pages, minimum is a page + * size. This helps reduce fragmentation of the DMA space. + */ + align = PAGE_SIZE << get_order(size); + c = arm_vmregion_alloc(&coherent_head, align, size, 0); + if (c) { + void *ptr = (void *)c->vm_start; + struct page *page = virt_to_page(ptr); + *ret_page = page; + return ptr; + } + return NULL; +} + +static int __free_from_pool(void *cpu_addr, size_t size) +{ + unsigned long start = (unsigned long)cpu_addr; + unsigned long end = start + size; + struct arm_vmregion *c; + + if (start < coherent_head.vm_start || end > coherent_head.vm_end) + return 0; + + c = arm_vmregion_find_remove(&coherent_head, (unsigned long)start); + + if ((c->vm_end - c->vm_start) != size) { + printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", + __func__, c->vm_end - c->vm_start, size); + dump_stack(); + size = c->vm_end - c->vm_start; + } + + arm_vmregion_free(&coherent_head, c); + return 1; +} + +static void *__alloc_from_contiguous(struct device *dev, size_t size, + pgprot_t prot, struct page **ret_page) +{ + unsigned long order = get_order(size); + size_t count = size >> PAGE_SHIFT; + struct page *page; + + page = dma_alloc_from_contiguous(dev, count, order); + if (!page) + return NULL; + + __dma_clear_buffer(page, size); + __dma_remap(page, size, prot); + + *ret_page = page; + return page_address(page); +} + +static void __free_from_contiguous(struct device *dev, struct page *page, + size_t size) +{ + __dma_remap(page, size, pgprot_kernel); + dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT); +} + +#define nommu() 0 + #else /* !CONFIG_MMU */ -#define __dma_alloc_remap(page, size, gfp, prot) page_address(page) -#define __dma_free_remap(addr, size) do { } while (0) +#define nommu() 1 + +#define __alloc_remap_buffer(dev, size, gfp, prot, ret) NULL +#define __alloc_from_pool(dev, size, ret_page) NULL +#define __alloc_from_contiguous(dev, size, prot, ret) NULL +#define __free_from_pool(cpu_addr, size) 0 +#define __free_from_contiguous(dev, page, size) do { } while (0) +#define __dma_free_remap(cpu_addr, size) do { } while (0) #endif /* CONFIG_MMU */ -static void * -__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, - pgprot_t prot) +static void *__alloc_simple_buffer(struct device *dev, size_t size, gfp_t gfp, + struct page **ret_page) { struct page *page; + page = __dma_alloc_buffer(dev, size, gfp); + if (!page) + return NULL; + + *ret_page = page; + return page_address(page); +} + + + +static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, + gfp_t gfp, pgprot_t prot) +{ + u64 mask = get_coherent_dma_mask(dev); + struct page *page; void *addr; +#ifdef CONFIG_DMA_API_DEBUG + u64 limit = (mask + 1) & ~mask; + if (limit && size >= limit) { + dev_warn(dev, "coherent allocation too big (requested %#x mask %#llx)\n", + size, mask); + return NULL; + } +#endif + + if (!mask) + return NULL; + + if (mask < 0xffffffffULL) + gfp |= GFP_DMA; + /* * Following is a work-around (a.k.a. hack) to prevent pages * with __GFP_COMP being passed to split_page() which cannot @@ -344,19 +584,17 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, *handle = ~0; size = PAGE_ALIGN(size); - page = __dma_alloc_buffer(dev, size, gfp); - if (!page) - return NULL; - - if (!arch_is_coherent()) - addr = __dma_alloc_remap(page, size, gfp, prot); + if (arch_is_coherent() || nommu()) + addr = __alloc_simple_buffer(dev, size, gfp, &page); + else if (cpu_architecture() < CPU_ARCH_ARMv6) + addr = __alloc_remap_buffer(dev, size, gfp, prot, &page); + else if (gfp & GFP_ATOMIC) + addr = __alloc_from_pool(dev, size, &page); else - addr = page_address(page); + addr = __alloc_from_contiguous(dev, size, prot, &page); if (addr) *handle = pfn_to_dma(dev, page_to_pfn(page)); - else - __dma_free_buffer(page, size); return addr; } @@ -365,8 +603,8 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, * Allocate DMA-coherent memory space and return both the kernel remapped * virtual and bus address for that space. */ -void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) +void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, + gfp_t gfp) { void *memory; @@ -395,25 +633,11 @@ static int dma_mmap(struct device *dev, struct vm_area_struct *vma, { int ret = -ENXIO; #ifdef CONFIG_MMU - unsigned long user_size, kern_size; - struct arm_vmregion *c; - - user_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - - c = arm_vmregion_find(&consistent_head, (unsigned long)cpu_addr); - if (c) { - unsigned long off = vma->vm_pgoff; - - kern_size = (c->vm_end - c->vm_start) >> PAGE_SHIFT; - - if (off < kern_size && - user_size <= (kern_size - off)) { - ret = remap_pfn_range(vma, vma->vm_start, - page_to_pfn(c->vm_pages) + off, - user_size << PAGE_SHIFT, - vma->vm_page_prot); - } - } + unsigned long pfn = dma_to_pfn(dev, dma_addr); + ret = remap_pfn_range(vma, vma->vm_start, + pfn + vma->vm_pgoff, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); #endif /* CONFIG_MMU */ return ret; @@ -435,23 +659,33 @@ int dma_mmap_writecombine(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL(dma_mmap_writecombine); + /* - * free a page as defined by the above mapping. - * Must not be called with IRQs disabled. + * Free a buffer as defined by the above mapping. */ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle) { - WARN_ON(irqs_disabled()); + struct page *page = pfn_to_page(dma_to_pfn(dev, handle)); if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) return; size = PAGE_ALIGN(size); - if (!arch_is_coherent()) + if (arch_is_coherent() || nommu()) { + __dma_free_buffer(page, size); + } else if (cpu_architecture() < CPU_ARCH_ARMv6) { __dma_free_remap(cpu_addr, size); - - __dma_free_buffer(pfn_to_page(dma_to_pfn(dev, handle)), size); + __dma_free_buffer(page, size); + } else { + if (__free_from_pool(cpu_addr, size)) + return; + /* + * Non-atomic allocations cannot be freed with IRQs disabled + */ + WARN_ON(irqs_disabled()); + __free_from_contiguous(dev, page, size); + } } EXPORT_SYMBOL(dma_free_coherent); diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 5dc7d127a40f..c2723ab9ca0c 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -20,6 +20,8 @@ #include <linux/highmem.h> #include <linux/gfp.h> #include <linux/memblock.h> +#include <linux/sort.h> +#include <linux/dma-contiguous.h> #include <asm/mach-types.h> #include <asm/memblock.h> @@ -227,6 +229,17 @@ static void __init arm_adjust_dma_zone(unsigned long *size, unsigned long *hole, } #endif +void __init setup_dma_zone(struct machine_desc *mdesc) +{ +#ifdef CONFIG_ZONE_DMA + if (mdesc->dma_zone_size) { + arm_dma_zone_size = mdesc->dma_zone_size; + arm_dma_limit = PHYS_OFFSET + arm_dma_zone_size - 1; + } else + arm_dma_limit = 0xffffffff; +#endif +} + static void __init arm_bootmem_free(unsigned long min, unsigned long max_low, unsigned long max_high) { @@ -274,12 +287,9 @@ static void __init arm_bootmem_free(unsigned long min, unsigned long max_low, * Adjust the sizes according to any special requirements for * this machine type. */ - if (arm_dma_zone_size) { + if (arm_dma_zone_size) arm_adjust_dma_zone(zone_size, zhole_size, arm_dma_zone_size >> PAGE_SHIFT); - arm_dma_limit = PHYS_OFFSET + arm_dma_zone_size - 1; - } else - arm_dma_limit = 0xffffffff; #endif free_area_init_node(0, zone_size, min, zhole_size); @@ -365,8 +375,14 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc) if (mdesc->reserve) mdesc->reserve(); + /* reserve memory for DMA contigouos allocations, + must come from DMA area inside low memory */ + dma_contiguous_reserve(arm_dma_limit < arm_lowmem_limit ? + arm_dma_limit : arm_lowmem_limit); + arm_memblock_steal_permitted = false; memblock_allow_resize(); + memblock_dump_all(); } diff --git a/arch/arm/mm/mm.h b/arch/arm/mm/mm.h index 70f6d3ea4834..398c4380586f 100644 --- a/arch/arm/mm/mm.h +++ b/arch/arm/mm/mm.h @@ -43,5 +43,8 @@ extern u32 arm_dma_limit; #define arm_dma_limit ((u32)~0) #endif +extern phys_addr_t arm_lowmem_limit; + void __init bootmem_init(void); void arm_mm_memblock_reserve(void); +void dma_contiguous_remap(void); diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 94c5a0c94f5e..b9fbec246e0d 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -286,6 +286,11 @@ static struct mem_type mem_types[] = { PMD_SECT_UNCACHED | PMD_SECT_XN, .domain = DOMAIN_KERNEL, }, + [MT_MEMORY_DMA_READY] = { + .prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY, + .prot_l1 = PMD_TYPE_TABLE, + .domain = DOMAIN_KERNEL, + }, }; const struct mem_type *get_mem_type(unsigned int type) @@ -427,6 +432,7 @@ static void __init build_mem_type_table(void) if (arch_is_coherent() && cpu_is_xsc3()) { mem_types[MT_MEMORY].prot_sect |= PMD_SECT_S; mem_types[MT_MEMORY].prot_pte |= L_PTE_SHARED; + mem_types[MT_MEMORY_DMA_READY].prot_pte |= L_PTE_SHARED; mem_types[MT_MEMORY_NONCACHED].prot_sect |= PMD_SECT_S; mem_types[MT_MEMORY_NONCACHED].prot_pte |= L_PTE_SHARED; } @@ -458,6 +464,7 @@ static void __init build_mem_type_table(void) mem_types[MT_DEVICE_CACHED].prot_pte |= L_PTE_SHARED; mem_types[MT_MEMORY].prot_sect |= PMD_SECT_S; mem_types[MT_MEMORY].prot_pte |= L_PTE_SHARED; + mem_types[MT_MEMORY_DMA_READY].prot_pte |= L_PTE_SHARED; mem_types[MT_MEMORY_NONCACHED].prot_sect |= PMD_SECT_S; mem_types[MT_MEMORY_NONCACHED].prot_pte |= L_PTE_SHARED; } @@ -509,6 +516,7 @@ static void __init build_mem_type_table(void) mem_types[MT_HIGH_VECTORS].prot_l1 |= ecc_mask; mem_types[MT_MEMORY].prot_sect |= ecc_mask | cp->pmd; mem_types[MT_MEMORY].prot_pte |= kern_pgprot; + mem_types[MT_MEMORY_DMA_READY].prot_pte |= kern_pgprot; mem_types[MT_MEMORY_NONCACHED].prot_sect |= ecc_mask; mem_types[MT_ROM].prot_sect |= cp->pmd; @@ -593,7 +601,7 @@ static void __init alloc_init_section(pud_t *pud, unsigned long addr, * L1 entries, whereas PGDs refer to a group of L1 entries making * up one logical pointer to an L2 table. */ - if (((addr | end | phys) & ~SECTION_MASK) == 0) { + if (type->prot_sect && ((addr | end | phys) & ~SECTION_MASK) == 0) { pmd_t *p = pmd; #ifndef CONFIG_ARM_LPAE @@ -811,7 +819,7 @@ static int __init early_vmalloc(char *arg) } early_param("vmalloc", early_vmalloc); -static phys_addr_t lowmem_limit __initdata = 0; +phys_addr_t arm_lowmem_limit __initdata = 0; void __init sanity_check_meminfo(void) { @@ -894,8 +902,8 @@ void __init sanity_check_meminfo(void) bank->size = newsize; } #endif - if (!bank->highmem && bank->start + bank->size > lowmem_limit) - lowmem_limit = bank->start + bank->size; + if (!bank->highmem && bank->start + bank->size > arm_lowmem_limit) + arm_lowmem_limit = bank->start + bank->size; j++; } @@ -920,8 +928,8 @@ void __init sanity_check_meminfo(void) } #endif meminfo.nr_banks = j; - high_memory = __va(lowmem_limit - 1) + 1; - memblock_set_current_limit(lowmem_limit); + high_memory = __va(arm_lowmem_limit - 1) + 1; + memblock_set_current_limit(arm_lowmem_limit); } static inline void prepare_page_table(void) @@ -946,8 +954,8 @@ static inline void prepare_page_table(void) * Find the end of the first block of lowmem. */ end = memblock.memory.regions[0].base + memblock.memory.regions[0].size; - if (end >= lowmem_limit) - end = lowmem_limit; + if (end >= arm_lowmem_limit) + end = arm_lowmem_limit; /* * Clear out all the kernel space mappings, except for the first @@ -1087,8 +1095,8 @@ static void __init map_lowmem(void) phys_addr_t end = start + reg->size; struct map_desc map; - if (end > lowmem_limit) - end = lowmem_limit; + if (end > arm_lowmem_limit) + end = arm_lowmem_limit; if (start >= end) break; @@ -1109,11 +1117,12 @@ void __init paging_init(struct machine_desc *mdesc) { void *zero_page; - memblock_set_current_limit(lowmem_limit); + memblock_set_current_limit(arm_lowmem_limit); build_mem_type_table(); prepare_page_table(); map_lowmem(); + dma_contiguous_remap(); devicemaps_init(mdesc); kmap_init(); diff --git a/arch/arm/plat-s5p/dev-mfc.c b/arch/arm/plat-s5p/dev-mfc.c index a30d36b7f61b..fcb84008d367 100644 --- a/arch/arm/plat-s5p/dev-mfc.c +++ b/arch/arm/plat-s5p/dev-mfc.c @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/platform_device.h> #include <linux/dma-mapping.h> +#include <linux/dma-contiguous.h> #include <linux/memblock.h> #include <linux/ioport.h> @@ -22,52 +23,14 @@ #include <plat/irqs.h> #include <plat/mfc.h> -struct s5p_mfc_reserved_mem { - phys_addr_t base; - unsigned long size; - struct device *dev; -}; - -static struct s5p_mfc_reserved_mem s5p_mfc_mem[2] __initdata; - void __init s5p_mfc_reserve_mem(phys_addr_t rbase, unsigned int rsize, phys_addr_t lbase, unsigned int lsize) { - int i; - - s5p_mfc_mem[0].dev = &s5p_device_mfc_r.dev; - s5p_mfc_mem[0].base = rbase; - s5p_mfc_mem[0].size = rsize; - - s5p_mfc_mem[1].dev = &s5p_device_mfc_l.dev; - s5p_mfc_mem[1].base = lbase; - s5p_mfc_mem[1].size = lsize; - - for (i = 0; i < ARRAY_SIZE(s5p_mfc_mem); i++) { - struct s5p_mfc_reserved_mem *area = &s5p_mfc_mem[i]; - if (memblock_remove(area->base, area->size)) { - printk(KERN_ERR "Failed to reserve memory for MFC device (%ld bytes at 0x%08lx)\n", - area->size, (unsigned long) area->base); - area->base = 0; - } - } -} - -static int __init s5p_mfc_memory_init(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(s5p_mfc_mem); i++) { - struct s5p_mfc_reserved_mem *area = &s5p_mfc_mem[i]; - if (!area->base) - continue; + if (dma_declare_contiguous(&s5p_device_mfc_r.dev, rsize, rbase, 0)) + printk(KERN_ERR "Failed to reserve memory for MFC device (%u bytes at 0x%08lx)\n", + rsize, (unsigned long) rbase); - if (dma_declare_coherent_memory(area->dev, area->base, - area->base, area->size, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE) == 0) - printk(KERN_ERR "Failed to declare coherent memory for MFC device (%ld bytes at 0x%08lx)\n", - area->size, (unsigned long) area->base); - } - return 0; + if (dma_declare_contiguous(&s5p_device_mfc_l.dev, lsize, lbase, 0)) + printk(KERN_ERR "Failed to reserve memory for MFC device (%u bytes at 0x%08lx)\n", + rsize, (unsigned long) rbase); } -device_initcall(s5p_mfc_memory_init); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189fa..de6e06935ae3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86 select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS + select HAVE_DMA_CONTIGUOUS if !SWIOTLB select HAVE_KRETPROBES select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h new file mode 100644 index 000000000000..8fb117d6f6de --- /dev/null +++ b/arch/x86/include/asm/dma-contiguous.h @@ -0,0 +1,13 @@ +#ifndef ASMX86_DMA_CONTIGUOUS_H +#define ASMX86_DMA_CONTIGUOUS_H + +#ifdef __KERNEL__ + +#include <linux/device.h> +#include <linux/dma-contiguous.h> +#include <asm-generic/dma-contiguous.h> + +static inline void dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { } + +#endif +#endif diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index ed3065fd6314..90ac6f025c4a 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -13,6 +13,7 @@ #include <asm/io.h> #include <asm/swiotlb.h> #include <asm-generic/dma-coherent.h> +#include <linux/dma-contiguous.h> #ifdef CONFIG_ISA # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) @@ -61,6 +62,9 @@ extern int dma_set_mask(struct device *dev, u64 mask); extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag); +extern void dma_generic_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr); + static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1c4d769e21ea..d3c372391ece 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -99,14 +99,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { unsigned long dma_mask; - struct page *page; + struct page *page = NULL; + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; dma_addr_t addr; dma_mask = dma_alloc_coherent_mask(dev, flag); flag |= __GFP_ZERO; again: - page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); + if (!(flag & GFP_ATOMIC)) + page = dma_alloc_from_contiguous(dev, count, get_order(size)); + if (!page) + page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); if (!page) return NULL; @@ -126,6 +130,16 @@ again: return page_address(page); } +void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct page *page = virt_to_page(vaddr); + + if (!dma_release_from_contiguous(dev, page, count)) + free_pages((unsigned long)vaddr, get_order(size)); +} + /* * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel * parameter documentation. diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 3af4af810c07..656566f8ca2f 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -74,12 +74,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } -static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr) -{ - free_pages((unsigned long)vaddr, get_order(size)); -} - static void nommu_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) @@ -97,7 +91,7 @@ static void nommu_sync_sg_for_device(struct device *dev, struct dma_map_ops nommu_dma_ops = { .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = nommu_free_coherent, + .free_coherent = dma_generic_free_coherent, .map_sg = nommu_map_sg, .map_page = nommu_map_page, .sync_single_for_device = nommu_sync_single_for_device, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d7d5099fe874..be6795fc3686 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -50,6 +50,7 @@ #include <asm/pci-direct.h> #include <linux/init_ohci1394_dma.h> #include <linux/kvm_para.h> +#include <linux/dma-contiguous.h> #include <linux/errno.h> #include <linux/kernel.h> @@ -938,6 +939,7 @@ void __init setup_arch(char **cmdline_p) } #endif memblock.current_limit = get_max_mapped(); + dma_contiguous_reserve(0); /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 7be9f79018e9..f56cb208418c 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -189,4 +189,93 @@ config DMA_SHARED_BUFFER APIs extension; the file's descriptor can then be passed on to other driver. +config CMA + bool "Contiguous Memory Allocator (EXPERIMENTAL)" + depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL + select MIGRATION + help + This enables the Contiguous Memory Allocator which allows drivers + to allocate big physically-contiguous blocks of memory for use with + hardware components that do not support I/O map nor scatter-gather. + + For more information see <include/linux/dma-contiguous.h>. + If unsure, say "n". + +if CMA + +config CMA_DEBUG + bool "CMA debug messages (DEVELOPMENT)" + depends on DEBUG_KERNEL + help + Turns on debug messages in CMA. This produces KERN_DEBUG + messages for every CMA call as well as various messages while + processing calls such as dma_alloc_from_contiguous(). + This option does not affect warning and error messages. + +comment "Default contiguous memory area size:" + +config CMA_SIZE_MBYTES + int "Size in Mega Bytes" + depends on !CMA_SIZE_SEL_PERCENTAGE + default 16 + help + Defines the size (in MiB) of the default memory area for Contiguous + Memory Allocator. + +config CMA_SIZE_PERCENTAGE + int "Percentage of total memory" + depends on !CMA_SIZE_SEL_MBYTES + default 10 + help + Defines the size of the default memory area for Contiguous Memory + Allocator as a percentage of the total memory in the system. + +choice + prompt "Selected region size" + default CMA_SIZE_SEL_ABSOLUTE + +config CMA_SIZE_SEL_MBYTES + bool "Use mega bytes value only" + +config CMA_SIZE_SEL_PERCENTAGE + bool "Use percentage value only" + +config CMA_SIZE_SEL_MIN + bool "Use lower value (minimum)" + +config CMA_SIZE_SEL_MAX + bool "Use higher value (maximum)" + +endchoice + +config CMA_ALIGNMENT + int "Maximum PAGE_SIZE order of alignment for contiguous buffers" + range 4 9 + default 8 + help + DMA mapping framework by default aligns all buffers to the smallest + PAGE_SIZE order which is greater than or equal to the requested buffer + size. This works well for buffers up to a few hundreds kilobytes, but + for larger buffers it just a memory waste. With this parameter you can + specify the maximum PAGE_SIZE order for contiguous buffers. Larger + buffers will be aligned only to this specified order. The order is + expressed as a power of two multiplied by the PAGE_SIZE. + + For example, if your system defaults to 4KiB pages, the order value + of 8 means that the buffers will be aligned up to 1MiB only. + + If unsure, leave the default value "8". + +config CMA_AREAS + int "Maximum count of the CMA device-private areas" + default 7 + help + CMA allows to create CMA areas for particular devices. This parameter + sets the maximum number of such device private CMA areas in the + system. + + If unsure, leave the default value "7". + +endif + endmenu diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 610f9997a403..ffd9dae1d411 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -6,6 +6,7 @@ obj-y := core.o bus.o dd.o syscore.o \ attribute_container.o transport_class.o \ topology.o obj-$(CONFIG_DEVTMPFS) += devtmpfs.o +obj-$(CONFIG_CMA) += dma-contiguous.o obj-y += power/ obj-$(CONFIG_HAS_DMA) += dma-mapping.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c new file mode 100644 index 000000000000..40c63417d6e3 --- /dev/null +++ b/drivers/base/dma-contiguous.c @@ -0,0 +1,404 @@ +/* + * Contiguous Memory Allocator for DMA mapping framework + * Copyright (c) 2010-2011 by Samsung Electronics. + * Written by: + * Marek Szyprowski <m.szyprowski@samsung.com> + * Michal Nazarewicz <mina86@mina86.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + */ + +#define pr_fmt(fmt) "cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif + +#include <asm/page.h> +#include <asm/dma-contiguous.h> + +#include <linux/memblock.h> +#include <linux/err.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/page-isolation.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/mm_types.h> +#include <linux/dma-contiguous.h> + +#ifndef SZ_1M +#define SZ_1M (1 << 20) +#endif + +struct cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; +}; + +struct cma *dma_contiguous_default_area; + +#ifdef CONFIG_CMA_SIZE_MBYTES +#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES +#else +#define CMA_SIZE_MBYTES 0 +#endif + +#ifdef CONFIG_CMA_SIZE_PERCENTAGE +#define CMA_SIZE_PERCENTAGE CONFIG_CMA_SIZE_PERCENTAGE +#else +#define CMA_SIZE_PERCENTAGE 0 +#endif + +/* + * Default global CMA area size can be defined in kernel's .config. + * This is usefull mainly for distro maintainers to create a kernel + * that works correctly for most supported systems. + * The size can be set in bytes or as a percentage of the total memory + * in the system. + * + * Users, who want to set the size of global CMA area for their system + * should use cma= kernel parameter. + */ +static unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M; +static unsigned long size_percent = CMA_SIZE_PERCENTAGE; +static long size_cmdline = -1; + +static int __init early_cma(char *p) +{ + pr_debug("%s(%s)\n", __func__, p); + size_cmdline = memparse(p, &p); + return 0; +} +early_param("cma", early_cma); + +static unsigned long __init cma_early_get_total_pages(void) +{ + struct memblock_region *reg; + unsigned long total_pages = 0; + + /* + * We cannot use memblock_phys_mem_size() here, because + * memblock_analyze() has not been called yet. + */ + for_each_memblock(memory, reg) + total_pages += memblock_region_memory_end_pfn(reg) - + memblock_region_memory_base_pfn(reg); + return total_pages; +} + +/** + * dma_contiguous_reserve() - reserve area for contiguous memory handling + * @limit: End address of the reserved memory (optional, 0 for any). + * + * This funtion reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. + */ +void __init dma_contiguous_reserve(phys_addr_t limit) +{ + unsigned long selected_size = 0; + unsigned long total_pages; + + pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); + + total_pages = cma_early_get_total_pages(); + size_percent *= (total_pages << PAGE_SHIFT) / 100; + + pr_debug("%s: total available: %ld MiB, size absolute: %ld MiB, size percentage: %ld MiB\n", + __func__, (total_pages << PAGE_SHIFT) / SZ_1M, + size_bytes / SZ_1M, size_percent / SZ_1M); + +#ifdef CONFIG_CMA_SIZE_SEL_MBYTES + selected_size = size_bytes; +#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE) + selected_size = size_percent; +#elif defined(CONFIG_CMA_SIZE_SEL_MIN) + selected_size = min(size_bytes, size_percent); +#elif defined(CONFIG_CMA_SIZE_SEL_MAX) + selected_size = max(size_bytes, size_percent); +#endif + + if (size_cmdline != -1) + selected_size = size_cmdline; + + if (!selected_size) + return; + + pr_debug("%s: reserving %ld MiB for global area\n", __func__, + selected_size / SZ_1M); + + dma_declare_contiguous(NULL, selected_size, 0, limit); +}; + +static DEFINE_MUTEX(cma_mutex); + +static int cma_activate_area(unsigned long base_pfn, unsigned long count) +{ + unsigned long pfn = base_pfn; + unsigned i = count >> pageblock_order; + struct zone *zone; + + WARN_ON_ONCE(!pfn_valid(pfn)); + zone = page_zone(pfn_to_page(pfn)); + + do { + unsigned j; + base_pfn = pfn; + for (j = pageblock_nr_pages; j; --j, pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + if (page_zone(pfn_to_page(pfn)) != zone) + return -EINVAL; + } + init_cma_reserved_pageblock(pfn_to_page(base_pfn)); + } while (--i); + return 0; +} + +static struct cma *cma_create_area(unsigned long base_pfn, + unsigned long count) +{ + int bitmap_size = BITS_TO_LONGS(count) * sizeof(long); + struct cma *cma; + int ret = -ENOMEM; + + pr_debug("%s(base %08lx, count %lx)\n", __func__, base_pfn, count); + + cma = kmalloc(sizeof *cma, GFP_KERNEL); + if (!cma) + return ERR_PTR(-ENOMEM); + + cma->base_pfn = base_pfn; + cma->count = count; + cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + + if (!cma->bitmap) + goto no_mem; + + ret = cma_activate_area(base_pfn, count); + if (ret) + goto error; + + pr_debug("%s: returned %p\n", __func__, (void *)cma); + return cma; + +error: + kfree(cma->bitmap); +no_mem: + kfree(cma); + return ERR_PTR(ret); +} + +static struct cma_reserved { + phys_addr_t start; + unsigned long size; + struct device *dev; +} cma_reserved[MAX_CMA_AREAS] __initdata; +static unsigned cma_reserved_count __initdata; + +static int __init cma_init_reserved_areas(void) +{ + struct cma_reserved *r = cma_reserved; + unsigned i = cma_reserved_count; + + pr_debug("%s()\n", __func__); + + for (; i; --i, ++r) { + struct cma *cma; + cma = cma_create_area(PFN_DOWN(r->start), + r->size >> PAGE_SHIFT); + if (!IS_ERR(cma)) + dev_set_cma_area(r->dev, cma); + } + return 0; +} +core_initcall(cma_init_reserved_areas); + +/** + * dma_declare_contiguous() - reserve area for contiguous memory handling + * for particular device + * @dev: Pointer to device structure. + * @size: Size of the reserved memory. + * @start: Start address of the reserved memory (optional, 0 for any). + * @limit: End address of the reserved memory (optional, 0 for any). + * + * This funtion reserves memory for specified device. It should be + * called by board specific code when early allocator (memblock or bootmem) + * is still activate. + */ +int __init dma_declare_contiguous(struct device *dev, unsigned long size, + phys_addr_t base, phys_addr_t limit) +{ + struct cma_reserved *r = &cma_reserved[cma_reserved_count]; + unsigned long alignment; + + pr_debug("%s(size %lx, base %08lx, limit %08lx)\n", __func__, + (unsigned long)size, (unsigned long)base, + (unsigned long)limit); + + /* Sanity checks */ + if (cma_reserved_count == ARRAY_SIZE(cma_reserved)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size) + return -EINVAL; + + /* Sanitise input arguments */ + alignment = PAGE_SIZE << max(MAX_ORDER, pageblock_order); + base = ALIGN(base, alignment); + size = ALIGN(size, alignment); + limit = ALIGN(limit, alignment); + + /* Reserve memory */ + if (base) { + if (memblock_is_region_reserved(base, size) || + memblock_reserve(base, size) < 0) { + base = -EBUSY; + goto err; + } + } else { + /* + * Use __memblock_alloc_base() since + * memblock_alloc_base() panic()s. + */ + phys_addr_t addr = __memblock_alloc_base(size, alignment, limit); + if (!addr) { + base = -ENOMEM; + goto err; + } else if (addr + size > ~(unsigned long)0) { + memblock_free(addr, size); + base = -EINVAL; + goto err; + } else { + base = addr; + } + } + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. + */ + r->start = base; + r->size = size; + r->dev = dev; + cma_reserved_count++; + pr_info("CMA: reserved %ld MiB at %08lx\n", size / SZ_1M, + (unsigned long)base); + + /* + * Architecture specific contiguous memory fixup. + */ + dma_contiguous_early_fixup(base, size); + return 0; +err: + pr_err("CMA: failed to reserve %ld MiB\n", size / SZ_1M); + return base; +} + +/** + * dma_alloc_from_contiguous() - allocate pages from contiguous area + * @dev: Pointer to device for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * + * This funtion allocates memory buffer for specified device. It uses + * device specific contiguous memory area if available or the default + * global one. Requires architecture specific get_dev_cma_area() helper + * function. + */ +struct page *dma_alloc_from_contiguous(struct device *dev, int count, + unsigned int align) +{ + struct cma *cma = dev_get_cma_area(dev); + unsigned long pfn, pageno, start = 0; + unsigned long mask = (1 << align) - 1; + int ret; + + if (!cma || !cma->count) + return NULL; + + if (align > CONFIG_CMA_ALIGNMENT) + align = CONFIG_CMA_ALIGNMENT; + + pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma, + count, align); + + if (!count) + return NULL; + + mutex_lock(&cma_mutex); + + for (;;) { + pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count, + start, count, mask); + if (pageno >= cma->count) { + ret = -ENOMEM; + goto error; + } + + pfn = cma->base_pfn + pageno; + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); + if (ret == 0) { + bitmap_set(cma->bitmap, pageno, count); + break; + } else if (ret != -EBUSY) { + goto error; + } + pr_debug("%s(): memory range at %p is busy, retrying\n", + __func__, pfn_to_page(pfn)); + /* try again with a bit different memory target */ + start = pageno + mask + 1; + } + + mutex_unlock(&cma_mutex); + + pr_debug("%s(): returned %p\n", __func__, pfn_to_page(pfn)); + return pfn_to_page(pfn); +error: + mutex_unlock(&cma_mutex); + return NULL; +} + +/** + * dma_release_from_contiguous() - release allocated pages + * @dev: Pointer to device for which the pages were allocated. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This funtion releases memory allocated by dma_alloc_from_contiguous(). + * It return 0 when provided pages doen't belongs to contiguous area and + * 1 on success. + */ +int dma_release_from_contiguous(struct device *dev, struct page *pages, + int count) +{ + struct cma *cma = dev_get_cma_area(dev); + unsigned long pfn; + + if (!cma || !pages) + return 0; + + pr_debug("%s(page %p)\n", __func__, (void *)pages); + + pfn = page_to_pfn(pages); + + if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) + return 0; + + mutex_lock(&cma_mutex); + + bitmap_clear(cma->bitmap, pfn - cma->base_pfn, count); + free_contig_range(pfn, count); + + mutex_unlock(&cma_mutex); + return 1; +} diff --git a/include/asm-generic/dma-contiguous.h b/include/asm-generic/dma-contiguous.h new file mode 100644 index 000000000000..bf2bcccffbfc --- /dev/null +++ b/include/asm-generic/dma-contiguous.h @@ -0,0 +1,27 @@ +#ifndef ASM_DMA_CONTIGUOUS_H +#define ASM_DMA_CONTIGUOUS_H + +#ifdef __KERNEL__ + +#include <linux/device.h> +#include <linux/dma-contiguous.h> + +#ifdef CONFIG_CMA + +static inline struct cma *dev_get_cma_area(struct device *dev) +{ + if (dev && dev->cma_area) + return dev->cma_area; + return dma_contiguous_default_area; +} + +static inline void dev_set_cma_area(struct device *dev, struct cma *cma) +{ + if (dev) + dev->cma_area = cma; + dma_contiguous_default_area = cma; +} + +#endif +#endif +#endif diff --git a/include/linux/device.h b/include/linux/device.h index b63fb393aa58..9559c183cb10 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -667,6 +667,10 @@ struct device { struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */ +#ifdef CONFIG_CMA + struct cma *cma_area; /* contiguous memory area for dma + allocations */ +#endif /* arch specific additions */ struct dev_archdata archdata; diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h new file mode 100644 index 000000000000..ffb4b40d728e --- /dev/null +++ b/include/linux/dma-contiguous.h @@ -0,0 +1,110 @@ +#ifndef __LINUX_CMA_H +#define __LINUX_CMA_H + +/* + * Contiguous Memory Allocator for DMA mapping framework + * Copyright (c) 2010-2011 by Samsung Electronics. + * Written by: + * Marek Szyprowski <m.szyprowski@samsung.com> + * Michal Nazarewicz <mina86@mina86.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + */ + +/* + * Contiguous Memory Allocator + * + * The Contiguous Memory Allocator (CMA) makes it possible to + * allocate big contiguous chunks of memory after the system has + * booted. + * + * Why is it needed? + * + * Various devices on embedded systems have no scatter-getter and/or + * IO map support and require contiguous blocks of memory to + * operate. They include devices such as cameras, hardware video + * coders, etc. + * + * Such devices often require big memory buffers (a full HD frame + * is, for instance, more then 2 mega pixels large, i.e. more than 6 + * MB of memory), which makes mechanisms such as kmalloc() or + * alloc_page() ineffective. + * + * At the same time, a solution where a big memory region is + * reserved for a device is suboptimal since often more memory is + * reserved then strictly required and, moreover, the memory is + * inaccessible to page system even if device drivers don't use it. + * + * CMA tries to solve this issue by operating on memory regions + * where only movable pages can be allocated from. This way, kernel + * can use the memory for pagecache and when device driver requests + * it, allocated pages can be migrated. + * + * Driver usage + * + * CMA should not be used by the device drivers directly. It is + * only a helper framework for dma-mapping subsystem. + * + * For more information, see kernel-docs in drivers/base/dma-contiguous.c + */ + +#ifdef __KERNEL__ + +struct cma; +struct page; +struct device; + +#ifdef CONFIG_CMA + +/* + * There is always at least global CMA area and a few optional device + * private areas configured in kernel .config. + */ +#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS) + +extern struct cma *dma_contiguous_default_area; + +void dma_contiguous_reserve(phys_addr_t addr_limit); +int dma_declare_contiguous(struct device *dev, unsigned long size, + phys_addr_t base, phys_addr_t limit); + +struct page *dma_alloc_from_contiguous(struct device *dev, int count, + unsigned int order); +int dma_release_from_contiguous(struct device *dev, struct page *pages, + int count); + +#else + +#define MAX_CMA_AREAS (0) + +static inline void dma_contiguous_reserve(phys_addr_t limit) { } + +static inline +int dma_declare_contiguous(struct device *dev, unsigned long size, + phys_addr_t base, phys_addr_t limit) +{ + return -ENOSYS; +} + +static inline +struct page *dma_alloc_from_contiguous(struct device *dev, int count, + unsigned int order) +{ + return NULL; +} + +static inline +int dma_release_from_contiguous(struct device *dev, struct page *pages, + int count) +{ + return 0; +} + +#endif + +#endif + +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 17b27cd269c4..5cb0312c89f4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -450,7 +450,7 @@ void put_page(struct page *page); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); -int split_free_page(struct page *page); +int split_free_page(struct page *page, int force_reclaim); /* * Compound pages have a destructor function. Provide a diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 650ba2fb3301..ba83aff8ae8a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -35,13 +35,35 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 -#define MIGRATE_UNMOVABLE 0 -#define MIGRATE_RECLAIMABLE 1 -#define MIGRATE_MOVABLE 2 -#define MIGRATE_PCPTYPES 3 /* the number of types on the pcp lists */ -#define MIGRATE_RESERVE 3 -#define MIGRATE_ISOLATE 4 /* can't allocate from here */ -#define MIGRATE_TYPES 5 +enum { + MIGRATE_UNMOVABLE, + MIGRATE_RECLAIMABLE, + MIGRATE_MOVABLE, + MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ + MIGRATE_RESERVE = MIGRATE_PCPTYPES, + /* + * MIGRATE_CMA migration type is designed to mimic the way + * ZONE_MOVABLE works. Only movable pages can be allocated + * from MIGRATE_CMA pageblocks and page allocator never + * implicitly change migration type of MIGRATE_CMA pageblock. + * + * The way to use it is to change migratetype of a range of + * pageblocks to MIGRATE_CMA which can be done by + * __free_pageblock_cma() function. What is important though + * is that a range of pageblocks must be aligned to + * MAX_ORDER_NR_PAGES should biggest page be bigger then + * a single pageblock. + */ + MIGRATE_CMA, + MIGRATE_ISOLATE, /* can't allocate from here */ + MIGRATE_TYPES +}; + +#ifdef CONFIG_CMA +# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) +#else +# define is_migrate_cma(migratetype) false +#endif #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ @@ -54,6 +76,11 @@ static inline int get_pageblock_migratetype(struct page *page) return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); } +static inline bool is_pageblock_cma(struct page *page) +{ + return is_migrate_cma(get_pageblock_migratetype(page)); +} + struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 051c1b1ede4e..c8004dc397f5 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -3,7 +3,7 @@ /* * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. - * If specified range includes migrate types other than MOVABLE, + * If specified range includes migrate types other than MOVABLE or CMA, * this will fail with -EBUSY. * * For isolating all pages in the range finally, the caller have to @@ -11,27 +11,34 @@ * test it. */ extern int -start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn); +start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + unsigned migratetype); /* * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. * target range is [start_pfn, end_pfn) */ extern int -undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn); +undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + unsigned migratetype); /* - * test all pages in [start_pfn, end_pfn)are isolated or not. + * Test all pages in [start_pfn, end_pfn) are isolated or not. */ -extern int -test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn); +int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn); /* - * Internal funcs.Changes pageblock's migrate type. - * Please use make_pagetype_isolated()/make_pagetype_movable(). + * Internal functions. Changes pageblock's migrate type. */ -extern int set_migratetype_isolate(struct page *page); -extern void unset_migratetype_isolate(struct page *page); +int set_migratetype_isolate(struct page *page); +void unset_migratetype_isolate(struct page *page, unsigned migratetype); + +/* The below functions must be run on a range from a single zone. */ +int alloc_contig_range(unsigned long start, unsigned long end, + unsigned migratetype); +void free_contig_range(unsigned long pfn, unsigned nr_pages); +/* CMA stuff */ +extern void init_cma_reserved_pageblock(struct page *page); #endif diff --git a/mm/Kconfig b/mm/Kconfig index e338407f1225..39220026c797 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -198,7 +198,7 @@ config COMPACTION config MIGRATION bool "Page migration" def_bool y - depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION + depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful in diff --git a/mm/Makefile b/mm/Makefile index 50ec00ef2a0e..8aada89efbbb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -13,7 +13,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o mmu_context.o percpu.o \ - $(mmu-y) + compaction.o $(mmu-y) obj-y += init-mm.o ifdef CONFIG_NO_BOOTMEM @@ -32,7 +32,6 @@ obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o -obj-$(CONFIG_COMPACTION) += compaction.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o diff --git a/mm/compaction.c b/mm/compaction.c index d9ebebe1a2aa..a2b256c01fd9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -16,203 +16,96 @@ #include <linux/sysfs.h> #include "internal.h" +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + #define CREATE_TRACE_POINTS #include <trace/events/compaction.h> -/* - * compact_control is used to track pages being migrated and the free pages - * they are being migrated to during memory compaction. The free_pfn starts - * at the end of a zone and migrate_pfn begins at the start. Movable pages - * are moved to the end of a zone during a compaction run and the run - * completes when free_pfn <= migrate_pfn - */ -struct compact_control { - struct list_head freepages; /* List of free pages to migrate to */ - struct list_head migratepages; /* List of pages being migrated */ - unsigned long nr_freepages; /* Number of isolated free pages */ - unsigned long nr_migratepages; /* Number of pages to migrate */ - unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long migrate_pfn; /* isolate_migratepages search base */ - bool sync; /* Synchronous migration */ - - unsigned int order; /* order a direct compactor needs */ - int migratetype; /* MOVABLE, RECLAIMABLE etc */ - struct zone *zone; -}; - -static unsigned long release_freepages(struct list_head *freelist) +static inline bool is_migrate_cma_or_movable(int migratetype) { - struct page *page, *next; - unsigned long count = 0; - - list_for_each_entry_safe(page, next, freelist, lru) { - list_del(&page->lru); - __free_page(page); - count++; - } - - return count; + return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } -/* Isolate free pages onto a private freelist. Must hold zone->lock */ -static unsigned long isolate_freepages_block(struct zone *zone, - unsigned long blockpfn, - struct list_head *freelist) +/** + * isolate_freepages_range() - isolate free pages, must hold zone->lock. + * @zone: Zone pages are in. + * @start_pfn: The first PFN to start isolating. + * @end_pfn: The one-past-last PFN. + * @freelist: A list to save isolated pages to. + * + * If @freelist is not provided, holes in range (either non-free pages + * or invalid PFNs) are considered an error and function undos its + * actions and returns zero. + * + * If @freelist is provided, function will simply skip non-free and + * missing pages and put only the ones isolated on the list. + * + * Returns number of isolated pages. This may be more then end_pfn-start_pfn + * if end fell in a middle of a free page. + */ +unsigned long +isolate_freepages_range(struct zone *zone, + unsigned long start_pfn, unsigned long end_pfn, + struct list_head *freelist, int force_reclaim) { - unsigned long zone_end_pfn, end_pfn; - int nr_scanned = 0, total_isolated = 0; - struct page *cursor; - - /* Get the last PFN we should scan for free pages at */ - zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; - end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); + unsigned long nr_scanned = 0, total_isolated = 0; + unsigned long pfn = start_pfn; + struct page *page; - /* Find the first usable PFN in the block to initialse page cursor */ - for (; blockpfn < end_pfn; blockpfn++) { - if (pfn_valid_within(blockpfn)) - break; - } - cursor = pfn_to_page(blockpfn); + VM_BUG_ON(!pfn_valid(pfn)); + page = pfn_to_page(pfn); /* Isolate free pages. This assumes the block is valid */ - for (; blockpfn < end_pfn; blockpfn++, cursor++) { - int isolated, i; - struct page *page = cursor; + while (pfn < end_pfn) { + int n = 0, i; - if (!pfn_valid_within(blockpfn)) - continue; - nr_scanned++; + if (!pfn_valid_within(pfn)) + goto next; + ++nr_scanned; if (!PageBuddy(page)) - continue; + goto next; /* Found a free page, break it into order-0 pages */ - isolated = split_free_page(page); - total_isolated += isolated; - for (i = 0; i < isolated; i++) { - list_add(&page->lru, freelist); - page++; + n = split_free_page(page, force_reclaim); + total_isolated += n; + if (freelist) { + struct page *p = page; + for (i = n; i; --i, ++p) + list_add(&p->lru, freelist); } - /* If a page was split, advance to the end of it */ - if (isolated) { - blockpfn += isolated - 1; - cursor += isolated - 1; +next: + if (!n) { + /* If n == 0, we have isolated no pages. */ + if (!freelist) + goto cleanup; + n = 1; } + + /* + * If we pass max order page, we might end up in a different + * vmemmap, so account for that. + */ + pfn += n; + if (pfn & (MAX_ORDER_NR_PAGES - 1)) + page += n; + else + page = pfn_to_page(pfn); } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); return total_isolated; -} - -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) -{ - - int migratetype = get_pageblock_migratetype(page); - - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return false; - - /* If the page is a large free page, then allow migration */ - if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; - - /* If the block is MIGRATE_MOVABLE, allow migration */ - if (migratetype == MIGRATE_MOVABLE) - return true; - - /* Otherwise skip the block */ - return false; -} - -/* - * Based on information in the current compact_control, find blocks - * suitable for isolating free pages from and then isolate them. - */ -static void isolate_freepages(struct zone *zone, - struct compact_control *cc) -{ - struct page *page; - unsigned long high_pfn, low_pfn, pfn; - unsigned long flags; - int nr_freepages = cc->nr_freepages; - struct list_head *freelist = &cc->freepages; - - /* - * Initialise the free scanner. The starting point is where we last - * scanned from (or the end of the zone if starting). The low point - * is the end of the pageblock the migration scanner is using. - */ - pfn = cc->free_pfn; - low_pfn = cc->migrate_pfn + pageblock_nr_pages; +cleanup: /* - * Take care that if the migration scanner is at the end of the zone - * that the free scanner does not accidentally move to the next zone - * in the next isolation cycle. + * Undo what we have done so far, and return. We know all pages from + * [start_pfn, pfn) are free because we have just freed them. If one of + * the page in the range was not freed, we would end up here earlier. */ - high_pfn = min(low_pfn, pfn); - - /* - * Isolate free pages until enough are available to migrate the - * pages on cc->migratepages. We stop searching if the migrate - * and free page scanners meet or enough free pages are isolated. - */ - for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; - pfn -= pageblock_nr_pages) { - unsigned long isolated; - - if (!pfn_valid(pfn)) - continue; - - /* - * Check for overlapping nodes/zones. It's possible on some - * configurations to have a setup like - * node0 node1 node0 - * i.e. it's possible that all pages within a zones range of - * pages do not belong to a single zone. - */ - page = pfn_to_page(pfn); - if (page_zone(page) != zone) - continue; - - /* Check the block is suitable for migration */ - if (!suitable_migration_target(page)) - continue; - - /* - * Found a block suitable for isolating free pages from. Now - * we disabled interrupts, double check things are ok and - * isolate the pages. This is to minimise the time IRQs - * are disabled - */ - isolated = 0; - spin_lock_irqsave(&zone->lock, flags); - if (suitable_migration_target(page)) { - isolated = isolate_freepages_block(zone, pfn, freelist); - nr_freepages += isolated; - } - spin_unlock_irqrestore(&zone->lock, flags); - - /* - * Record the highest PFN we isolated pages from. When next - * looking for free pages, the search will restart here as - * page migration may have returned some pages to the allocator - */ - if (isolated) - high_pfn = max(high_pfn, pfn); - } - - /* split_free_page does not map the pages */ - list_for_each_entry(page, freelist, lru) { - arch_alloc_page(page, 0); - kernel_map_pages(page, 1, 1); - } - - cc->free_pfn = high_pfn; - cc->nr_freepages = nr_freepages; + for (; start_pfn < pfn; ++start_pfn) + __free_page(pfn_to_page(start_pfn)); + return 0; } /* Update the number of anon and file isolated pages in the zone */ @@ -243,38 +136,34 @@ static bool too_many_isolated(struct zone *zone) return isolated > (inactive + active) / 2; } -/* possible outcome of isolate_migratepages */ -typedef enum { - ISOLATE_ABORT, /* Abort compaction now */ - ISOLATE_NONE, /* No pages isolated, continue scanning */ - ISOLATE_SUCCESS, /* Pages isolated, migrate */ -} isolate_migrate_t; - -/* - * Isolate all pages that can be migrated from the block pointed to by - * the migrate scanner within compact_control. +/** + * isolate_migratepages_range() - isolate all migrate-able pages in range. + * @zone: Zone pages are in. + * @cc: Compaction control structure. + * @low_pfn: The first PFN of the range. + * @end_pfn: The one-past-the-last PFN of the range. + * + * Isolate all pages that can be migrated from the range specified by + * [low_pfn, end_pfn). Returns zero if there is a fatal signal + * pending), otherwise PFN of the first page that was not scanned + * (which may be both less, equal to or more then end_pfn). + * + * Assumes that cc->migratepages is empty and cc->nr_migratepages is + * zero. + * + * Other then cc->migratepages and cc->nr_migratetypes this function + * does not modify any cc's fields, ie. it does not modify (or read + * for that matter) cc->migrate_pfn. */ -static isolate_migrate_t isolate_migratepages(struct zone *zone, - struct compact_control *cc) +unsigned long +isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + unsigned long low_pfn, unsigned long end_pfn) { - unsigned long low_pfn, end_pfn; unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; - /* Do not scan outside zone boundaries */ - low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); - - /* Only scan within a pageblock boundary */ - end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); - - /* Do not cross the free scanner or scan within a memory hole */ - if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { - cc->migrate_pfn = end_pfn; - return ISOLATE_NONE; - } - /* * Ensure that there are not too many pages isolated from the LRU * list by either parallel reclaimers or compaction. If there are, @@ -283,12 +172,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ if (!cc->sync) - return ISOLATE_ABORT; + return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); if (fatal_signal_pending(current)) - return ISOLATE_ABORT; + return 0; } /* Time to isolate some pages for migration */ @@ -351,7 +240,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, */ pageblock_nr = low_pfn >> pageblock_order; if (!cc->sync && last_pageblock_nr != pageblock_nr && - get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { + is_migrate_cma_or_movable(get_pageblock_migratetype(page))) { low_pfn += pageblock_nr_pages; low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; last_pageblock_nr = pageblock_nr; @@ -396,11 +285,141 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, acct_isolated(zone, cc); spin_unlock_irq(&zone->lru_lock); - cc->migrate_pfn = low_pfn; trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); - return ISOLATE_SUCCESS; + return low_pfn; +} + +#endif /* CONFIG_COMPACTION || CONFIG_CMA */ +#ifdef CONFIG_COMPACTION + +static unsigned long release_freepages(struct list_head *freelist) +{ + struct page *page, *next; + unsigned long count = 0; + + list_for_each_entry_safe(page, next, freelist, lru) { + list_del(&page->lru); + __free_page(page); + count++; + } + + return count; +} + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (is_migrate_cma_or_movable(migratetype)) + return true; + + /* Otherwise skip the block */ + return false; +} + +/* + * Based on information in the current compact_control, find blocks + * suitable for isolating free pages from and then isolate them. + */ +static void isolate_freepages(struct zone *zone, + struct compact_control *cc) +{ + struct page *page; + unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; + unsigned long flags; + int nr_freepages = cc->nr_freepages; + struct list_head *freelist = &cc->freepages; + + /* + * Initialise the free scanner. The starting point is where we last + * scanned from (or the end of the zone if starting). The low point + * is the end of the pageblock the migration scanner is using. + */ + pfn = cc->free_pfn; + low_pfn = cc->migrate_pfn + pageblock_nr_pages; + + /* + * Take care that if the migration scanner is at the end of the zone + * that the free scanner does not accidentally move to the next zone + * in the next isolation cycle. + */ + high_pfn = min(low_pfn, pfn); + + zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + + /* + * Isolate free pages until enough are available to migrate the + * pages on cc->migratepages. We stop searching if the migrate + * and free page scanners meet or enough free pages are isolated. + */ + for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; + pfn -= pageblock_nr_pages) { + unsigned long isolated; + + if (!pfn_valid(pfn)) + continue; + + /* + * Check for overlapping nodes/zones. It's possible on some + * configurations to have a setup like + * node0 node1 node0 + * i.e. it's possible that all pages within a zones range of + * pages do not belong to a single zone. + */ + page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + + /* Check the block is suitable for migration */ + if (!suitable_migration_target(page)) + continue; + + /* + * Found a block suitable for isolating free pages from. Now + * we disabled interrupts, double check things are ok and + * isolate the pages. This is to minimise the time IRQs + * are disabled + */ + isolated = 0; + spin_lock_irqsave(&zone->lock, flags); + if (suitable_migration_target(page)) { + end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); + isolated = isolate_freepages_range(zone, pfn, + end_pfn, freelist, false); + nr_freepages += isolated; + } + spin_unlock_irqrestore(&zone->lock, flags); + + /* + * Record the highest PFN we isolated pages from. When next + * looking for free pages, the search will restart here as + * page migration may have returned some pages to the allocator + */ + if (isolated) + high_pfn = max(high_pfn, pfn); + } + + /* split_free_page does not map the pages */ + list_for_each_entry(page, freelist, lru) { + arch_alloc_page(page, 0); + kernel_map_pages(page, 1, 1); + } + + cc->free_pfn = high_pfn; + cc->nr_freepages = nr_freepages; } /* @@ -449,6 +468,44 @@ static void update_nr_listpages(struct compact_control *cc) cc->nr_freepages = nr_freepages; } +/* possible outcome of isolate_migratepages */ +typedef enum { + ISOLATE_ABORT, /* Abort compaction now */ + ISOLATE_NONE, /* No pages isolated, continue scanning */ + ISOLATE_SUCCESS, /* Pages isolated, migrate */ +} isolate_migrate_t; + +/* + * Isolate all pages that can be migrated from the block pointed to by + * the migrate scanner within compact_control. + */ +static isolate_migrate_t isolate_migratepages(struct zone *zone, + struct compact_control *cc) +{ + unsigned long low_pfn, end_pfn; + + /* Do not scan outside zone boundaries */ + low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); + + /* Only scan within a pageblock boundary */ + end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); + + /* Do not cross the free scanner or scan within a memory hole */ + if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { + cc->migrate_pfn = end_pfn; + return ISOLATE_NONE; + } + + /* Perform the isolation */ + low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); + if (!low_pfn) + return ISOLATE_ABORT; + + cc->migrate_pfn = low_pfn; + + return ISOLATE_SUCCESS; +} + static int compact_finished(struct zone *zone, struct compact_control *cc) { @@ -766,3 +823,5 @@ void compaction_unregister_node(struct node *node) return device_remove_file(&node->dev, &dev_attr_compact); } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ + +#endif /* CONFIG_COMPACTION */ diff --git a/mm/internal.h b/mm/internal.h index 2189af491783..8b800275e955 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -100,6 +100,41 @@ extern void prep_compound_page(struct page *page, unsigned long order); extern bool is_free_buddy_page(struct page *page); #endif +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + +/* + * in mm/compaction.c + */ +/* + * compact_control is used to track pages being migrated and the free pages + * they are being migrated to during memory compaction. The free_pfn starts + * at the end of a zone and migrate_pfn begins at the start. Movable pages + * are moved to the end of a zone during a compaction run and the run + * completes when free_pfn <= migrate_pfn + */ +struct compact_control { + struct list_head freepages; /* List of free pages to migrate to */ + struct list_head migratepages; /* List of pages being migrated */ + unsigned long nr_freepages; /* Number of isolated free pages */ + unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ + + unsigned int order; /* order a direct compactor needs */ + int migratetype; /* MOVABLE, RECLAIMABLE etc */ + struct zone *zone; +}; + +unsigned long +isolate_freepages_range(struct zone *zone, + unsigned long start, unsigned long end, + struct list_head *freelist, int force_reclaim); +unsigned long +isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + unsigned long low_pfn, unsigned long end_pfn); + +#endif /* * function for dealing with page's order in buddy system. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 56080ea36140..76b01bf02529 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1400,7 +1400,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) /* Not a free page */ ret = 1; } - unset_migratetype_isolate(p); + unset_migratetype_isolate(p, MIGRATE_MOVABLE); unlock_memory_hotplug(); return ret; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6629fafd6ce4..fc898cb4fe8f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -891,7 +891,7 @@ static int __ref offline_pages(unsigned long start_pfn, nr_pages = end_pfn - start_pfn; /* set above range as isolated */ - ret = start_isolate_page_range(start_pfn, end_pfn); + ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); if (ret) goto out; @@ -956,7 +956,7 @@ repeat: We cannot do rollback at this point. */ offline_isolated_pages(start_pfn, end_pfn); /* reset pagetype flags and makes migrate type to be MOVABLE */ - undo_isolate_page_range(start_pfn, end_pfn); + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); /* removal success */ zone->present_pages -= offlined_pages; zone->zone_pgdat->node_present_pages -= offlined_pages; @@ -981,7 +981,7 @@ failed_removal: start_pfn, end_pfn); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ - undo_isolate_page_range(start_pfn, end_pfn); + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); out: unlock_memory_hotplug(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2186ecb36f7..516774881d47 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -58,6 +58,7 @@ #include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/page-debug-flags.h> +#include <linux/migrate.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -628,15 +629,16 @@ static inline int free_pages_check(struct page *page) * * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. + * + * Caller must hold zone->lock. */ -static void free_pcppages_bulk(struct zone *zone, int count, +static void __free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp) { int migratetype = 0; int batch_free = 0; int to_free = count; - spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; @@ -666,13 +668,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, page_private(page)); trace_mm_page_pcpu_drain(page, 0, page_private(page)); } while (--to_free && --batch_free && !list_empty(list)); } __mod_zone_page_state(zone, NR_FREE_PAGES, count); - spin_unlock(&zone->lock); } static void free_one_page(struct zone *zone, struct page *page, int order, @@ -749,6 +751,26 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) __free_pages(page, order); } +#ifdef CONFIG_CMA +/* + * Free whole pageblock and set it's migration type to MIGRATE_CMA. + */ +void __init init_cma_reserved_pageblock(struct page *page) +{ + unsigned i = pageblock_nr_pages; + struct page *p = page; + + do { + __ClearPageReserved(p); + set_page_count(p, 0); + } while (++p, --i); + + set_page_refcounted(page); + set_pageblock_migratetype(page, MIGRATE_CMA); + __free_pages(page, pageblock_order); + totalram_pages += pageblock_nr_pages; +} +#endif /* * The order of subdivision here is critical for the IO subsystem. @@ -874,11 +896,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted */ -static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { +static int fallbacks[MIGRATE_PCPTYPES][4] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_CMA , MIGRATE_RESERVE }, }; /* @@ -973,12 +994,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order; --current_order) { - for (i = 0; i < MIGRATE_TYPES - 1; i++) { + for (i = 0; i < ARRAY_SIZE(fallbacks[0]); i++) { migratetype = fallbacks[start_migratetype][i]; /* MIGRATE_RESERVE handled later if necessary */ if (migratetype == MIGRATE_RESERVE) - continue; + break; area = &(zone->free_area[current_order]); if (list_empty(&area->free_list[migratetype])) @@ -993,11 +1014,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * pages to the preferred allocation list. If falling * back for a reclaimable kernel allocation, be more * aggressive about taking ownership of free pages + * + * On the other hand, never change migration + * type of MIGRATE_CMA pageblocks nor move CMA + * pages on different free lists. We don't + * want unmovable pages to be allocated from + * MIGRATE_CMA areas. */ - if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE || - page_group_by_mobility_disabled) { - unsigned long pages; + if (!is_pageblock_cma(page) && + (unlikely(current_order >= pageblock_order / 2) || + start_migratetype == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled)) { + int pages; pages = move_freepages_block(zone, page, start_migratetype); @@ -1015,11 +1043,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) rmv_page_order(page); /* Take ownership for orders >= pageblock_order */ - if (current_order >= pageblock_order) + if (current_order >= pageblock_order && + !is_pageblock_cma(page)) change_pageblock_range(page, current_order, start_migratetype); - expand(zone, page, order, current_order, area, migratetype); + expand(zone, page, order, current_order, area, + is_migrate_cma(start_migratetype) + ? start_migratetype : migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype); @@ -1091,7 +1122,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, list_add(&page->lru, list); else list_add_tail(&page->lru, list); - set_page_private(page, migratetype); + if (is_pageblock_cma(page)) + set_page_private(page, MIGRATE_CMA); + else + set_page_private(page, migratetype); list = &page->lru; } __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); @@ -1113,14 +1147,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) unsigned long flags; int to_drain; - local_irq_save(flags); + spin_lock_irqsave(&zone->lock, flags); if (pcp->count >= pcp->batch) to_drain = pcp->batch; else to_drain = pcp->count; - free_pcppages_bulk(zone, to_drain, pcp); + __free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; - local_irq_restore(flags); + spin_unlock_irqrestore(&zone->lock, flags); } #endif @@ -1145,7 +1179,9 @@ static void drain_pages(unsigned int cpu) pcp = &pset->pcp; if (pcp->count) { - free_pcppages_bulk(zone, pcp->count, pcp); + spin_lock(&zone->lock); + __free_pcppages_bulk(zone, pcp->count, pcp); + spin_unlock(&zone->lock); pcp->count = 0; } local_irq_restore(flags); @@ -1168,6 +1204,32 @@ void drain_all_pages(void) on_each_cpu(drain_local_pages, NULL, 1); } +/* Caller must hold zone->lock. */ +static void __zone_drain_local_pages(void *arg) +{ + struct per_cpu_pages *pcp; + struct zone *zone = arg; + unsigned long flags; + + local_irq_save(flags); + pcp = &per_cpu_ptr(zone->pageset, smp_processor_id())->pcp; + if (pcp->count) { + /* Caller holds zone->lock, no need to grab it. */ + __free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } + local_irq_restore(flags); +} + +/* + * Like drain_all_pages() but operates on a single zone. Caller must + * hold zone->lock. + */ +static void __zone_drain_all_pages(struct zone *zone) +{ + on_each_cpu(__zone_drain_local_pages, zone, 1); +} + #ifdef CONFIG_HIBERNATION void mark_free_pages(struct zone *zone) @@ -1248,7 +1310,9 @@ void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); + spin_lock(&zone->lock); + __free_pcppages_bulk(zone, pcp->batch, pcp); + spin_unlock(&zone->lock); pcp->count -= pcp->batch; } @@ -1297,17 +1361,65 @@ void split_page(struct page *page, unsigned int order) set_page_refcounted(page + i); } +static inline +void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, + enum zone_type high_zoneidx, + enum zone_type classzone_idx) +{ + struct zoneref *z; + struct zone *zone; + + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order, classzone_idx); +} + +/* + * Trigger memory pressure bump to reclaim at least 2^order of free pages. + * Does similar work as it is done in __alloc_pages_slowpath(). Used only + * by migration code to allocate contiguous memory range. + */ +static int __force_memory_reclaim(int order, struct zone *zone) +{ + gfp_t gfp_mask = GFP_HIGHUSER_MOVABLE; + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zonelist *zonelist = node_zonelist(0, gfp_mask); + struct reclaim_state reclaim_state; + int did_some_progress = 0; + + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); + current->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(gfp_mask); + reclaim_state.reclaimed_slab = 0; + current->reclaim_state = &reclaim_state; + + did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, NULL); + + current->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + current->flags &= ~PF_MEMALLOC; + + cond_resched(); + + if (!did_some_progress) { + /* Exhausted what can be done so it's blamo time */ + out_of_memory(zonelist, gfp_mask, order, NULL); + } + return order; +} + /* * Similar to split_page except the page is already free. As this is only * being used for migration, the migratetype of the block also changes. - * As this is called with interrupts disabled, the caller is responsible - * for calling arch_alloc_page() and kernel_map_page() after interrupts - * are enabled. + * The caller is responsible for calling arch_alloc_page() and kernel_map_page() + * after interrupts are enabled. * * Note: this is probably too low level an operation for use in drivers. * Please consult with lkml before using this in your driver. */ -int split_free_page(struct page *page) +int split_free_page(struct page *page, int force_reclaim) { unsigned int order; unsigned long watermark; @@ -1318,10 +1430,15 @@ int split_free_page(struct page *page) zone = page_zone(page); order = page_order(page); +try_again: /* Obey watermarks as if the page was being allocated */ watermark = low_wmark_pages(zone) + (1 << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - return 0; + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) { + if (force_reclaim && __force_memory_reclaim(order, zone)) + goto try_again; + else + return 0; + } /* Remove page from free list */ list_del(&page->lru); @@ -1335,8 +1452,12 @@ int split_free_page(struct page *page) if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; - for (; page < endpage; page += pageblock_nr_pages) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + for (; page < endpage; page += pageblock_nr_pages) { + int mt = get_pageblock_migratetype(page); + if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) + set_pageblock_migratetype(page, + MIGRATE_MOVABLE); + } } return 1 << order; @@ -2126,18 +2247,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static inline -void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx, - enum zone_type classzone_idx) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order, classzone_idx); -} - static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { @@ -3803,10 +3912,10 @@ static int __zone_pcp_update(void *data) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - local_irq_save(flags); - free_pcppages_bulk(zone, pcp->count, pcp); + spin_lock_irqsave(&zone->lock, flags); + __free_pcppages_bulk(zone, pcp->count, pcp); setup_pageset(pset, batch); - local_irq_restore(flags); + spin_unlock_irqrestore(&zone->lock, flags); } return 0; } @@ -5373,8 +5482,8 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) */ if (zone_idx(zone) == ZONE_MOVABLE) return true; - - if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || + is_pageblock_cma(page)) return true; pfn = page_to_pfn(page); @@ -5481,17 +5590,16 @@ int set_migratetype_isolate(struct page *page) out: if (!ret) { + __zone_drain_all_pages(zone); set_pageblock_migratetype(page, MIGRATE_ISOLATE); move_freepages_block(zone, page, MIGRATE_ISOLATE); } spin_unlock_irqrestore(&zone->lock, flags); - if (!ret) - drain_all_pages(); return ret; } -void unset_migratetype_isolate(struct page *page) +void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags; @@ -5499,12 +5607,207 @@ void unset_migratetype_isolate(struct page *page) spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(zone, page, MIGRATE_MOVABLE); + set_pageblock_migratetype(page, migratetype); + move_freepages_block(zone, page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); } +static unsigned long pfn_align_to_maxpage_down(unsigned long pfn) +{ + return pfn & ~(MAX_ORDER_NR_PAGES - 1); +} + +static unsigned long pfn_align_to_maxpage_up(unsigned long pfn) +{ + return ALIGN(pfn, MAX_ORDER_NR_PAGES); +} + +static struct page * +__cma_migrate_alloc(struct page *page, unsigned long private, int **resultp) +{ + return alloc_page(GFP_HIGHUSER_MOVABLE); +} + +static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) +{ + /* This function is based on compact_zone() from compaction.c. */ + + unsigned long pfn = start; + int ret = -EBUSY; + unsigned tries = 0; + + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(start)), + .sync = true, + }; + INIT_LIST_HEAD(&cc.migratepages); + + migrate_prep_local(); + + while (pfn < end || cc.nr_migratepages) { + /* Abort on signal */ + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto done; + } + + /* Get some pages to migrate. */ + if (list_empty(&cc.migratepages)) { + cc.nr_migratepages = 0; + pfn = isolate_migratepages_range(cc.zone, &cc, + pfn, end); + if (!pfn) { + ret = -EINTR; + goto done; + } + tries = 0; + } + + /* Try to migrate. */ + ret = migrate_pages(&cc.migratepages, __cma_migrate_alloc, + 0, false, true); + + /* Migrated all of them? Great! */ + if (list_empty(&cc.migratepages)) + continue; + + /* Try five times. */ + if (++tries == 5) { + ret = ret < 0 ? ret : -EBUSY; + goto done; + } + + /* Before each time drain everything and reschedule. */ + lru_add_drain_all(); + drain_all_pages(); + cond_resched(); + } + ret = 0; + +done: + /* Make sure all pages are isolated. */ + if (!ret) { + lru_add_drain_all(); + drain_all_pages(); + if (WARN_ON(test_pages_isolated(start, end))) + ret = -EBUSY; + } + + /* Release pages */ + putback_lru_pages(&cc.migratepages); + + return ret; +} + +/** + * alloc_contig_range() -- tries to allocate given range of pages + * @start: start PFN to allocate + * @end: one-past-the-last PFN to allocate + * @migratetype: migratetype of the underlaying pageblocks (either + * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks + * in range must have the same migratetype and it must + * be either of the two. + * + * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES + * aligned, hovewer it's callers responsibility to guarantee that we + * are the only thread that changes migrate type of pageblocks the + * pages fall in. + * + * Returns zero on success or negative error code. On success all + * pages which PFN is in (start, end) are allocated for the caller and + * need to be freed with free_contig_range(). + */ +int alloc_contig_range(unsigned long start, unsigned long end, + unsigned migratetype) +{ + unsigned long outer_start, outer_end; + int ret; + + /* + * What we do here is we mark all pageblocks in range as + * MIGRATE_ISOLATE. Because of the way page allocator work, we + * align the range to MAX_ORDER pages so that page allocator + * won't try to merge buddies from different pageblocks and + * change MIGRATE_ISOLATE to some other migration type. + * + * Once the pageblocks are marked as MIGRATE_ISOLATE, we + * migrate the pages from an unaligned range (ie. pages that + * we are interested in). This will put all the pages in + * range back to page allocator as MIGRATE_ISOLATE. + * + * When this is done, we take the pages in range from page + * allocator removing them from the buddy system. This way + * page allocator will never consider using them. + * + * This lets us mark the pageblocks back as + * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the + * MAX_ORDER aligned range but not in the unaligned, original + * range are put back to page allocator so that buddy can use + * them. + */ + + ret = start_isolate_page_range(pfn_align_to_maxpage_down(start), + pfn_align_to_maxpage_up(end), + migratetype); + if (ret) + goto done; + + ret = __alloc_contig_migrate_range(start, end); + if (ret) + goto done; + + /* + * Pages from [start, end) are within a MAX_ORDER_NR_PAGES + * aligned blocks that are marked as MIGRATE_ISOLATE. What's + * more, all pages in [start, end) are free in page allocator. + * What we are going to do is to allocate all pages from + * [start, end) (that is remove them from page allocater). + * + * The only problem is that pages at the beginning and at the + * end of interesting range may be not aligned with pages that + * page allocator holds, ie. they can be part of higher order + * pages. Because of this, we reserve the bigger range and + * once this is done free the pages we are not interested in. + */ + + ret = 0; + while (!PageBuddy(pfn_to_page(start & (~0UL << ret)))) + if (WARN_ON(++ret >= MAX_ORDER)) { + ret = -EINVAL; + goto done; + } + + outer_start = start & (~0UL << ret); + outer_end = isolate_freepages_range(page_zone(pfn_to_page(outer_start)), + outer_start, end, NULL, true); + if (!outer_end) { + ret = -EBUSY; + goto done; + } + outer_end += outer_start; + + /* Free head and tail (if any) */ + if (start != outer_start) + free_contig_range(outer_start, start - outer_start); + if (end != outer_end) + free_contig_range(end, outer_end - end); + + ret = 0; +done: + undo_isolate_page_range(pfn_align_to_maxpage_down(start), + pfn_align_to_maxpage_up(end), migratetype); + return ret; +} + +void free_contig_range(unsigned long pfn, unsigned nr_pages) +{ + for (; nr_pages--; ++pfn) + __free_page(pfn_to_page(pfn)); +} + #ifdef CONFIG_MEMORY_HOTREMOVE /* * All pages in the range must be isolated before calling this. diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 4ae42bb40892..c9f04774f2b8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -24,6 +24,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * to be MIGRATE_ISOLATE. * @start_pfn: The lower PFN of the range to be isolated. * @end_pfn: The upper PFN of the range to be isolated. + * @migratetype: migrate type to set in error recovery. * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the @@ -32,8 +33,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * start_pfn/end_pfn must be aligned to pageblock_order. * Returns 0 on success and -EBUSY if any part of range cannot be isolated. */ -int -start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) +int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + unsigned migratetype) { unsigned long pfn; unsigned long undo_pfn; @@ -56,7 +57,7 @@ undo: for (pfn = start_pfn; pfn < undo_pfn; pfn += pageblock_nr_pages) - unset_migratetype_isolate(pfn_to_page(pfn)); + unset_migratetype_isolate(pfn_to_page(pfn), migratetype); return -EBUSY; } @@ -64,8 +65,8 @@ undo: /* * Make isolated pages available again. */ -int -undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) +int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + unsigned migratetype) { unsigned long pfn; struct page *page; @@ -77,7 +78,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) page = __first_valid_page(pfn, pageblock_nr_pages); if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) continue; - unset_migratetype_isolate(page); + unset_migratetype_isolate(page, migratetype); } return 0; } @@ -86,7 +87,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) * all pages in [start_pfn...end_pfn) must be in the same zone. * zone->lock must be held before call this. * - * Returns 1 if all pages in the range is isolated. + * Returns 1 if all pages in the range are isolated. */ static int __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) diff --git a/mm/vmstat.c b/mm/vmstat.c index f600557a7659..ad1d919013a3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -613,6 +613,7 @@ static char * const migratetype_names[MIGRATE_TYPES] = { "Reclaimable", "Movable", "Reserve", + "Cma", "Isolate", }; |