diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2015-05-11 16:32:10 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2015-05-11 16:32:10 +1000 |
commit | 76e72e7cc2fa5ca221530d80e0595a95f36c6c4b (patch) | |
tree | 4b89bc3c3dc272e0646975c676cb7170a8c9659c | |
parent | 2013fdf7d6eac8ceb513c22ac5a05570f29ca734 (diff) | |
parent | a3980f9598c603bdfe65a109d2557d53b838cc1c (diff) |
Merge branch 'akpm-current/current'
Conflicts:
Documentation/devicetree/bindings/rtc/abracon,abx80x.txt
arch/x86/kernel/machine_kexec_64.c
drivers/block/zram/zram_drv.c
221 files changed, 4561 insertions, 1979 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-zram b/Documentation/ABI/testing/sysfs-class-zram new file mode 100644 index 000000000000..74c885073d0e --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-zram @@ -0,0 +1,24 @@ +What: /sys/class/zram-control/ +Date: August 2015 +KernelVersion: 4.2 +Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> +Description: + The zram-control/ class sub-directory belongs to zram + device class + +What: /sys/class/zram-control/zram_add +Date: August 2015 +KernelVersion: 4.2 +Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> +Description: + RO attribute. Read operation will cause zram to add a new + device and return its device id back to user (so one can + use /dev/zram<id>), or error code. + +What: /sys/class/zram-control/zram_remove +Date: August 2015 +KernelVersion: 4.2 +Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> +Description: + Remove a specific /dev/zramX device, where X is a device_id + provided by user diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 48a183e29988..fc686d456f44 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -19,7 +19,9 @@ Following shows a typical sequence of steps for using zram. 1) Load Module: modprobe zram num_devices=4 This creates 4 devices: /dev/zram{0,1,2,3} - (num_devices parameter is optional. Default: 1) + +num_devices parameter is optional and tells zram how many devices should be +pre-created. Default: 1. 2) Set max number of compression streams Compression backend may use up to max_comp_streams compression streams, @@ -97,7 +99,24 @@ size of the disk when not in use so a huge zram is wasteful. mkfs.ext4 /dev/zram1 mount /dev/zram1 /tmp -7) Stats: +7) Add/remove zram devices + +zram provides a control interface, which enables dynamic (on-demand) device +addition and removal. + +In order to add a new /dev/zramX device, perform read operation on zram_add +attribute. 
This will return either new device's device id (meaning that you +can use /dev/zram<id>) or error code. + +Example: + cat /sys/class/zram-control/zram_add + 1 + +To remove the existing /dev/zramX device (where X is a device id) +execute + echo X > /sys/class/zram-control/zram_remove + +8) Stats: Per-device statistics are exported as various nodes under /sys/block/zram<id>/ A brief description of exported device attritbutes. For more details please @@ -126,7 +145,7 @@ mem_used_max RW the maximum amount memory zram have consumed to mem_limit RW the maximum amount of memory ZRAM can use to store the compressed data num_migrated RO the number of objects migrated migrated by compaction - +compact WO trigger memory compaction WARNING ======= @@ -172,11 +191,11 @@ line of text and contains the following stats separated by whitespace: zero_pages num_migrated -8) Deactivate: +9) Deactivate: swapoff /dev/zram0 umount /dev/zram1 -9) Reset: +10) Reset: Write any positive value to 'reset' sysfs node echo 1 > /sys/block/zram0/reset echo 1 > /sys/block/zram1/reset diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index ce1126aceed8..223c32171dcc 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block <bool>: 0,1,yes,no,true,false +LIMITATION +--------------------------------------------------------------------- +* The fallocated region of file is discarded at umount/evict time + when using fallocate with FALLOC_FL_KEEP_SIZE. + So, User should assume that fallocated region can be discarded at + last close if there is memory pressure resulting in eviction of + the inode from the memory. As a result, for any dependency on + the fallocated region, user should make sure to recheck fallocate + after reopening the file. 
+ TODO ---------------------------------------------------------------------- * Need to get rid of the raw scanning stuff. Instead, always use diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt index ab0baa692c13..22dd6af2e4bd 100644 --- a/Documentation/lockup-watchdogs.txt +++ b/Documentation/lockup-watchdogs.txt @@ -61,3 +61,21 @@ As explained above, a kernel knob is provided that allows administrators to configure the period of the hrtimer and the perf event. The right value for a particular environment is a trade-off between fast response to lockups and detection overhead. + +By default, the watchdog runs on all online cores. However, on a +kernel configured with NO_HZ_FULL, by default the watchdog runs only +on the housekeeping cores, not the cores specified in the "nohz_full" +boot argument. If we allowed the watchdog to run by default on +the "nohz_full" cores, we would have to run timer ticks to activate +the scheduler, which would prevent the "nohz_full" functionality +from protecting the user code on those cores from the kernel. +Of course, disabling it by default on the nohz_full cores means that +when those cores do enter the kernel, by default we will not be +able to detect if they lock up. However, allowing the watchdog +to continue to run on the housekeeping (non-tickless) cores means +that we will continue to detect lockups properly on those cores. + +In either case, the set of cores excluded from running the watchdog +may be adjusted via the kernel.watchdog_cpumask sysctl. For +nohz_full cores, this may be useful for debugging a case where the +kernel seems to be hanging on the nohz_full cores. diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 2216eb187c21..2ec6d84f391c 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -244,6 +244,14 @@ dentry names: Passed by reference. +task_struct comm name: + + %pT + + For printing task_struct->comm. 
+ + Passed by reference (NULL for "current"). + struct va_format: %pV diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index c831001c45f1..e5d528e0c46e 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -923,6 +923,27 @@ and nmi_watchdog. ============================================================== +watchdog_cpumask: + +This value can be used to control on which cpus the watchdog may run. +The default cpumask is all possible cores, but if NO_HZ_FULL is +enabled in the kernel config, and cores are specified with the +nohz_full= boot argument, those cores are excluded by default. +Offline cores can be included in this mask, and if the core is later +brought online, the watchdog will be started based on the mask value. + +Typically this value would only be touched in the nohz_full case +to re-enable cores that by default were not running the watchdog, +if a kernel lockup was suspected on those cores. + +The argument value is the standard cpulist format for cpumasks, +so for example to enable the watchdog on cores 0, 2, 3, and 4 you +might say: + + echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask + +============================================================== + watchdog_thresh: This value can be used to control the frequency of hrtimer and NMI diff --git a/MAINTAINERS b/MAINTAINERS index df657f543e67..ab739926bcc1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2155,14 +2155,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/rpi/linux-rpi.git S: Maintained N: bcm2835 -BROADCOM BCM33XX MIPS ARCHITECTURE -M: Kevin Cernekee <cernekee@gmail.com> -L: linux-mips@linux-mips.org -S: Maintained -F: arch/mips/bcm3384/* -F: arch/mips/include/asm/mach-bcm3384/* -F: arch/mips/kernel/*bmips* - BROADCOM BCM5301X ARM ARCHITECTURE M: Hauke Mehrtens <hauke@hauke-m.de> L: linux-arm-kernel@lists.infradead.org @@ -2205,7 +2197,7 @@ S: Maintained F: arch/mips/bmips/* F: arch/mips/include/asm/mach-bmips/* F: 
arch/mips/kernel/*bmips* -F: arch/mips/boot/dts/bcm*.dts* +F: arch/mips/boot/dts/brcm/bcm*.dts* F: drivers/irqchip/irq-bcm7* F: drivers/irqchip/irq-brcmstb* @@ -2257,7 +2249,7 @@ M: Ray Jui <rjui@broadcom.com> L: bcm-kernel-feedback-list@broadcom.com S: Supported F: drivers/gpio/gpio-bcm-kona.c -F: Documentation/devicetree/bindings/gpio/gpio-bcm-kona.txt +F: Documentation/devicetree/bindings/gpio/brcm,kona-gpio.txt BROADCOM SPECIFIC AMBA DRIVER (BCMA) M: Rafał Miłecki <zajec5@gmail.com> @@ -5116,11 +5108,10 @@ INTEL ASoC BDW/HSW DRIVERS M: Jie Yang <yang.jie@linux.intel.com> L: alsa-devel@alsa-project.org S: Supported -F: sound/soc/intel/sst-haswell* -F: sound/soc/intel/sst-dsp* -F: sound/soc/intel/sst-firmware.c -F: sound/soc/intel/broadwell.c -F: sound/soc/intel/haswell.c +F: sound/soc/intel/common/sst-dsp* +F: sound/soc/intel/common/sst-firmware.c +F: sound/soc/intel/boards/broadwell.c +F: sound/soc/intel/haswell/ INTEL C600 SERIES SAS CONTROLLER DRIVER M: Intel SCU Linux support <intel-linux-scu@intel.com> @@ -6814,7 +6805,6 @@ L: nbd-general@lists.sourceforge.net T: git git://git.pengutronix.de/git/mpa/linux-nbd.git F: Documentation/blockdev/nbd.txt F: drivers/block/nbd.c -F: include/linux/nbd.h F: include/uapi/linux/nbd.h NETWORK DROP MONITOR @@ -7439,7 +7429,6 @@ F: arch/*/include/asm/paravirt.h PARIDE DRIVERS FOR PARALLEL PORT IDE DEVICES M: Tim Waugh <tim@cyberelk.net> L: linux-parport@lists.infradead.org (subscribers-only) -W: http://www.torque.net/linux-pp.html S: Maintained F: Documentation/blockdev/paride.txt F: drivers/block/paride/ diff --git a/arch/alpha/include/asm/mm-arch-hooks.h b/arch/alpha/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b07fd862fec3 --- /dev/null +++ b/arch/alpha/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it 
and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_ALPHA_MM_ARCH_HOOKS_H +#define _ASM_ALPHA_MM_ARCH_HOOKS_H + +#endif /* _ASM_ALPHA_MM_ARCH_HOOKS_H */ diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 0086b472bc2b..836fbd44f65b 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -44,6 +44,7 @@ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_SPACEAVAIL 5 /* ensure resources are available */ #define MADV_DONTNEED 6 /* don't need these pages */ +#define MADV_FREE 7 /* free pages only if memory pressure */ /* common/generic parameters */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h index 45b8e0cea176..f787894613ad 100644 --- a/arch/arc/include/asm/dma-mapping.h +++ b/arch/arc/include/asm/dma-mapping.h @@ -178,22 +178,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, } static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction dir) { int i; + struct scatterlist *sg; - for (i = 0; i < nelems; i++, sg++) + for_each_sg(sglist, sg, nelems, i) _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir); } static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction dir) +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, + int nelems, enum dma_data_direction dir) { int i; + struct scatterlist *sg; - for (i = 0; i < nelems; i++, sg++) + for_each_sg(sglist, sg, nelems, i) _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir); } diff --git a/arch/arc/include/asm/mm-arch-hooks.h b/arch/arc/include/asm/mm-arch-hooks.h new file 
mode 100644 index 000000000000..c37541c5f8ba --- /dev/null +++ b/arch/arc/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_ARC_MM_ARCH_HOOKS_H +#define _ASM_ARC_MM_ARCH_HOOKS_H + +#endif /* _ASM_ARC_MM_ARCH_HOOKS_H */ diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 1f1b1cd112f3..31bb7dccb971 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -53,10 +53,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline int huge_pte_none(pte_t pte) { return pte_none(pte); diff --git a/arch/arm/include/asm/mm-arch-hooks.h b/arch/arm/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..7056660c7cc4 --- /dev/null +++ b/arch/arm/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_ARM_MM_ARCH_HOOKS_H +#define _ASM_ARM_MM_ARCH_HOOKS_H + +#endif /* _ASM_ARM_MM_ARCH_HOOKS_H */ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index a745a2a53853..6d6012a320b2 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -249,6 +249,7 @@ PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF); PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING); PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY); PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY); +PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY); PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c index c72412415093..fcafb521f14e 100644 --- a/arch/arm/mm/hugetlbpage.c +++ b/arch/arm/mm/hugetlbpage.c @@ -41,11 +41,6 @@ int pud_huge(pud_t pud) return 0; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - int pmd_huge(pmd_t pmd) { return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 5b7ca8ace95f..734c17e89e94 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -86,10 +86,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline int huge_pte_none(pte_t pte) { return pte_none(pte); diff --git a/arch/arm64/include/asm/mm-arch-hooks.h b/arch/arm64/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..562b655f5ba9 --- /dev/null +++ b/arch/arm64/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU 
General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_ARM64_MM_ARCH_HOOKS_H +#define _ASM_ARM64_MM_ARCH_HOOKS_H + +#endif /* _ASM_ARM64_MM_ARCH_HOOKS_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 56283f8a675c..bd5db28324ba 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -285,10 +285,12 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) +#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) #define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd))) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) +#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK)) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2de9d2e59d96..cccc4af87a03 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -31,13 +31,6 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> -#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} -#endif - int pmd_huge(pmd_t pmd) { return !(pmd_val(pmd) & PMD_TABLE_BIT); diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h index b3d18f9f3e8d..ae7ac9205d20 100644 --- a/arch/avr32/include/asm/dma-mapping.h +++ b/arch/avr32/include/asm/dma-mapping.h @@ -209,17 +209,18 @@ dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, * the same here. 
*/ static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, +dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; - for (i = 0; i < nents; i++) { + for_each_sg(sglist, sg, nents, i) { char *virt; - sg[i].dma_address = page_to_bus(sg_page(&sg[i])) + sg[i].offset; - virt = sg_virt(&sg[i]); - dma_cache_sync(dev, virt, sg[i].length, direction); + sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset; + virt = sg_virt(sg); + dma_cache_sync(dev, virt, sg->length, direction); } return nents; @@ -321,14 +322,14 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, } static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; - for (i = 0; i < nents; i++) { - dma_cache_sync(dev, sg_virt(&sg[i]), sg[i].length, direction); - } + for_each_sg(sglist, sg, nents, i) + dma_cache_sync(dev, sg_virt(sg), sg->length, direction); } /* Now for the API extensions over the pci_ one */ diff --git a/arch/avr32/include/asm/mm-arch-hooks.h b/arch/avr32/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..145452ffbdad --- /dev/null +++ b/arch/avr32/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_AVR32_MM_ARCH_HOOKS_H +#define _ASM_AVR32_MM_ARCH_HOOKS_H + +#endif /* _ASM_AVR32_MM_ARCH_HOOKS_H */ diff --git a/arch/blackfin/include/asm/mm-arch-hooks.h b/arch/blackfin/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..1c5211ec338f --- /dev/null +++ b/arch/blackfin/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_BLACKFIN_MM_ARCH_HOOKS_H +#define _ASM_BLACKFIN_MM_ARCH_HOOKS_H + +#endif /* _ASM_BLACKFIN_MM_ARCH_HOOKS_H */ diff --git a/arch/c6x/include/asm/mm-arch-hooks.h b/arch/c6x/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..bb3c4a6ce8e9 --- /dev/null +++ b/arch/c6x/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_C6X_MM_ARCH_HOOKS_H +#define _ASM_C6X_MM_ARCH_HOOKS_H + +#endif /* _ASM_C6X_MM_ARCH_HOOKS_H */ diff --git a/arch/cris/include/asm/mm-arch-hooks.h b/arch/cris/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..314f774db2b0 --- /dev/null +++ b/arch/cris/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_CRIS_MM_ARCH_HOOKS_H +#define _ASM_CRIS_MM_ARCH_HOOKS_H + +#endif /* _ASM_CRIS_MM_ARCH_HOOKS_H */ diff --git a/arch/frv/include/asm/mm-arch-hooks.h b/arch/frv/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..51d13a870404 --- /dev/null +++ b/arch/frv/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_FRV_MM_ARCH_HOOKS_H +#define _ASM_FRV_MM_ARCH_HOOKS_H + +#endif /* _ASM_FRV_MM_ARCH_HOOKS_H */ diff --git a/arch/frv/include/asm/sections.h b/arch/frv/include/asm/sections.h index 17d0fb171bba..d03fb64e93e9 100644 --- a/arch/frv/include/asm/sections.h +++ b/arch/frv/include/asm/sections.h @@ -35,12 +35,6 @@ extern unsigned long __nongprelbss memory_start; extern unsigned long __nongprelbss memory_end; extern unsigned long __nongprelbss rom_length; -/* determine if we're running from ROM */ -static inline int is_in_rom(unsigned long addr) -{ - return 0; /* default case: not in ROM */ -} - #endif #endif #endif /* _ASM_SECTIONS_H */ diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c index b99c2a7cc7a4..8eeea0d77aad 100644 --- a/arch/frv/mb93090-mb00/pci-dma-nommu.c +++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c @@ -119,14 +119,16 @@ dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, EXPORT_SYMBOL(dma_map_single); -int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, +int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; - for (i=0; i<nents; i++) - frv_cache_wback_inv(sg_dma_address(&sg[i]), - sg_dma_address(&sg[i]) + sg_dma_len(&sg[i])); + for_each_sg(sglist, sg, nents, i) { + frv_cache_wback_inv(sg_dma_address(sg), + sg_dma_address(sg) + sg_dma_len(sg)); + } BUG_ON(direction == DMA_NONE); diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c index 82478979ac9a..4d1f01dc46e5 100644 --- a/arch/frv/mb93090-mb00/pci-dma.c +++ b/arch/frv/mb93090-mb00/pci-dma.c @@ -50,19 +50,20 @@ dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, EXPORT_SYMBOL(dma_map_single); -int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, +int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { unsigned 
long dampr2; void *vaddr; int i; + struct scatterlist *sg; BUG_ON(direction == DMA_NONE); dampr2 = __get_DAMPR(2); - for (i = 0; i < nents; i++) { - vaddr = kmap_atomic_primary(sg_page(&sg[i])); + for_each_sg(sglist, sg, nents, i) { + vaddr = kmap_atomic_primary(sg_page(sg)); frv_dcache_writeback((unsigned long) vaddr, (unsigned long) vaddr + PAGE_SIZE); diff --git a/arch/hexagon/include/asm/mm-arch-hooks.h b/arch/hexagon/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..05e8b939e416 --- /dev/null +++ b/arch/hexagon/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_HEXAGON_MM_ARCH_HOOKS_H +#define _ASM_HEXAGON_MM_ARCH_HOOKS_H + +#endif /* _ASM_HEXAGON_MM_ARCH_HOOKS_H */ diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index aa910054b8e7..ff1377bc02a6 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -20,10 +20,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { diff --git a/arch/ia64/include/asm/mm-arch-hooks.h b/arch/ia64/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..ab4b5c698322 --- /dev/null +++ b/arch/ia64/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under 
the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_IA64_MM_ARCH_HOOKS_H +#define _ASM_IA64_MM_ARCH_HOOKS_H + +#endif /* _ASM_IA64_MM_ARCH_HOOKS_H */ diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 52b7604b5215..f50d4b3f501a 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -65,11 +65,6 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr) return pte; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } /* diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index ea21d4cad540..aa19b7ac8222 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -58,27 +58,22 @@ paddr_to_nid(unsigned long paddr) * SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where * the section resides. */ -int __meminit __early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec; - /* - * NOTE: The following SMP-unsafe globals are only used early in boot - * when the kernel is running single-threaded. 
- */ - static int __meminitdata last_ssec, last_esec; - static int __meminitdata last_nid; - if (section >= last_ssec && section < last_esec) - return last_nid; + if (section >= state->last_start && section < state->last_end) + return state->last_nid; for (i = 0; i < num_node_memblks; i++) { ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT; esec = (node_memblk[i].start_paddr + node_memblk[i].size + ((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT; if (section >= ssec && section < esec) { - last_ssec = ssec; - last_esec = esec; - last_nid = node_memblk[i].nid; + state->last_start = ssec; + state->last_end = esec; + state->last_nid = node_memblk[i].nid; return node_memblk[i].nid; } } diff --git a/arch/m32r/include/asm/mm-arch-hooks.h b/arch/m32r/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..6d60b4750f41 --- /dev/null +++ b/arch/m32r/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_M32R_MM_ARCH_HOOKS_H +#define _ASM_M32R_MM_ARCH_HOOKS_H + +#endif /* _ASM_M32R_MM_ARCH_HOOKS_H */ diff --git a/arch/m68k/include/asm/mm-arch-hooks.h b/arch/m68k/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..7e8709bc90ae --- /dev/null +++ b/arch/m68k/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_M68K_MM_ARCH_HOOKS_H +#define _ASM_M68K_MM_ARCH_HOOKS_H + +#endif /* _ASM_M68K_MM_ARCH_HOOKS_H */ diff --git a/arch/metag/include/asm/hugetlb.h b/arch/metag/include/asm/hugetlb.h index 471f481e67f3..f730b396d79b 100644 --- a/arch/metag/include/asm/hugetlb.h +++ b/arch/metag/include/asm/hugetlb.h @@ -14,10 +14,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len); -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, diff --git a/arch/metag/include/asm/mm-arch-hooks.h b/arch/metag/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b0072b2eb0de --- /dev/null +++ b/arch/metag/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_METAG_MM_ARCH_HOOKS_H +#define _ASM_METAG_MM_ARCH_HOOKS_H + +#endif /* _ASM_METAG_MM_ARCH_HOOKS_H */ diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 7ca80ac42ed5..53f0f6c47027 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -89,11 +89,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return pte; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - int pmd_huge(pmd_t pmd) { return pmd_page_shift(pmd) > PAGE_SHIFT; diff --git a/arch/microblaze/include/asm/mm-arch-hooks.h b/arch/microblaze/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..5c4065911bda --- /dev/null +++ b/arch/microblaze/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_MICROBLAZE_MM_ARCH_HOOKS_H +#define _ASM_MICROBLAZE_MM_ARCH_HOOKS_H + +#endif /* _ASM_MICROBLAZE_MM_ARCH_HOOKS_H */ diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index fe0d15d32660..4a5bb5453408 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -38,10 +38,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, diff --git a/arch/mips/include/asm/mm-arch-hooks.h b/arch/mips/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b5609fe8e475 --- /dev/null +++ b/arch/mips/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_MIPS_MM_ARCH_HOOKS_H +#define _ASM_MIPS_MM_ARCH_HOOKS_H + +#endif /* _ASM_MIPS_MM_ARCH_HOOKS_H */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index cfcb876cae6b..106e741aa7ee 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -67,6 +67,7 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ +#define MADV_FREE 5 /* free pages only if memory pressure */ /* common parameters: try to keep these consistent across architectures */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c index 609d1241b0c4..eeaf0245c3b1 100644 --- a/arch/mips/mm/dma-default.c +++ b/arch/mips/mm/dma-default.c @@ -262,12 +262,13 @@ static void mips_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, plat_unmap_dma_mem(dev, dma_addr, size, direction); } -static int mips_dma_map_sg(struct device *dev, struct scatterlist *sg, +static int mips_dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction, struct dma_attrs *attrs) { int i; + struct scatterlist *sg; - for (i = 0; i < nents; i++, sg++) { + for_each_sg(sglist, sg, nents, i) { if (!plat_device_is_coherent(dev)) __dma_sync(sg_page(sg), sg->offset, sg->length, direction); @@ -291,13 +292,14 @@ static dma_addr_t mips_dma_map_page(struct device *dev, struct page *page, return plat_map_dma_mem_page(dev, page) + offset; } -static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sg, +static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nhwentries, enum dma_data_direction direction, struct dma_attrs *attrs) { int i; + struct scatterlist *sg; - for (i = 0; i < nhwentries; i++, sg++) { + for_each_sg(sglist, sg, nhwentries, i) { if (!plat_device_is_coherent(dev) && 
direction != DMA_TO_DEVICE) __dma_sync(sg_page(sg), sg->offset, sg->length, @@ -324,26 +326,34 @@ static void mips_dma_sync_single_for_device(struct device *dev, } static void mips_dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sg, int nelems, enum dma_data_direction direction) + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) { int i; + struct scatterlist *sg; - if (cpu_needs_post_dma_flush(dev)) - for (i = 0; i < nelems; i++, sg++) + if (cpu_needs_post_dma_flush(dev)) { + for_each_sg(sglist, sg, nelems, i) { __dma_sync(sg_page(sg), sg->offset, sg->length, direction); + } + } plat_post_dma_flush(dev); } static void mips_dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sg, int nelems, enum dma_data_direction direction) + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) { int i; + struct scatterlist *sg; - if (!plat_device_is_coherent(dev)) - for (i = 0; i < nelems; i++, sg++) + if (!plat_device_is_coherent(dev)) { + for_each_sg(sglist, sg, nelems, i) { __dma_sync(sg_page(sg), sg->offset, sg->length, direction); + } + } } int mips_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 06e0f421b41b..74aa6f62468f 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -51,11 +51,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - /* * This function checks for proper alignment of input addr and len parameters. 
*/ diff --git a/arch/mn10300/include/asm/mm-arch-hooks.h b/arch/mn10300/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..e2029a652f4c --- /dev/null +++ b/arch/mn10300/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_MN10300_MM_ARCH_HOOKS_H +#define _ASM_MN10300_MM_ARCH_HOOKS_H + +#endif /* _ASM_MN10300_MM_ARCH_HOOKS_H */ diff --git a/arch/nios2/include/asm/mm-arch-hooks.h b/arch/nios2/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..d7290dc68558 --- /dev/null +++ b/arch/nios2/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_NIOS2_MM_ARCH_HOOKS_H +#define _ASM_NIOS2_MM_ARCH_HOOKS_H + +#endif /* _ASM_NIOS2_MM_ARCH_HOOKS_H */ diff --git a/arch/openrisc/include/asm/mm-arch-hooks.h b/arch/openrisc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..6d33cb555fe1 --- /dev/null +++ b/arch/openrisc/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_OPENRISC_MM_ARCH_HOOKS_H +#define _ASM_OPENRISC_MM_ARCH_HOOKS_H + +#endif /* _ASM_OPENRISC_MM_ARCH_HOOKS_H */ diff --git a/arch/parisc/include/asm/mm-arch-hooks.h b/arch/parisc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..654ec63b0ee9 --- /dev/null +++ b/arch/parisc/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_PARISC_MM_ARCH_HOOKS_H +#define _ASM_PARISC_MM_ARCH_HOOKS_H + +#endif /* _ASM_PARISC_MM_ARCH_HOOKS_H */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 294d251ca7b2..6cb8db76fd4e 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -40,6 +40,7 @@ #define MADV_SPACEAVAIL 5 /* insure that resources are reserved */ #define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */ #define MADV_VPS_INHERIT 7 /* Inherit parents page size */ +#define MADV_FREE 8 /* free pages only if memory pressure */ /* common/generic parameters */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index ff834fd67478..b9402c9b3454 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -478,14 +478,16 @@ static void pa11_dma_unmap_single(struct device *dev, dma_addr_t dma_handle, siz static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; BUG_ON(direction == DMA_NONE); - for (i = 0; i < nents; i++, sglist++ ) { - unsigned long vaddr = (unsigned long)sg_virt(sglist); - sg_dma_address(sglist) = (dma_addr_t) 
virt_to_phys(vaddr); - sg_dma_len(sglist) = sglist->length; - flush_kernel_dcache_range(vaddr, sglist->length); + for_each_sg(sglist, sg, nents, i) { + unsigned long vaddr = (unsigned long)sg_virt(sg); + + sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr); + sg_dma_len(sg) = sg->length; + flush_kernel_dcache_range(vaddr, sg->length); } return nents; } @@ -493,6 +495,7 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int n static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; BUG_ON(direction == DMA_NONE); @@ -501,8 +504,8 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, in /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ - for (i = 0; i < nents; i++, sglist++ ) - flush_kernel_vmap_range(sg_virt(sglist), sglist->length); + for_each_sg(sglist, sg, nents, i) + flush_kernel_vmap_range(sg_virt(sg), sg->length); return; } @@ -523,21 +526,23 @@ static void pa11_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_h static void pa11_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ - for (i = 0; i < nents; i++, sglist++ ) - flush_kernel_vmap_range(sg_virt(sglist), sglist->length); + for_each_sg(sglist, sg, nents, i) + flush_kernel_vmap_range(sg_virt(sg), sg->length); } static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ - for (i = 0; i < nents; i++, sglist++ ) - flush_kernel_vmap_range(sg_virt(sglist), sglist->length); + for_each_sg(sglist, sg, nents, i) + flush_kernel_vmap_range(sg_virt(sg), 
sg->length); } struct hppa_dma_ops pcxl_dma_ops = { diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 1d53a65b4ec1..4bbd3c8c2888 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -112,11 +112,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - - static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { diff --git a/arch/powerpc/include/asm/mm-arch-hooks.h b/arch/powerpc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..f2a2da895897 --- /dev/null +++ b/arch/powerpc/include/asm/mm-arch-hooks.h @@ -0,0 +1,28 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_POWERPC_MM_ARCH_HOOKS_H +#define _ASM_POWERPC_MM_ARCH_HOOKS_H + +static inline void arch_remap(struct mm_struct *mm, + unsigned long old_start, unsigned long old_end, + unsigned long new_start, unsigned long new_end) +{ + /* + * mremap() doesn't allow moving multiple vmas so we can limit the + * check to old_start == vdso_base. 
+ */ + if (old_start == mm->context.vdso_base) + mm->context.vdso_base = new_start; +} +#define arch_remap arch_remap + +#endif /* _ASM_POWERPC_MM_ARCH_HOOKS_H */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 73382eba02dc..825cb232eab6 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -8,7 +8,6 @@ #include <linux/spinlock.h> #include <asm/mmu.h> #include <asm/cputable.h> -#include <asm-generic/mm_hooks.h> #include <asm/cputhreads.h> /* @@ -109,5 +108,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, #endif } +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ +} + +static inline void arch_unmap(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (start <= mm->context.vdso_base && mm->context.vdso_base < end) + mm->context.vdso_base = 0; +} + +static inline void arch_bprm_mm_init(struct mm_struct *mm, + struct vm_area_struct *vma) +{ +} + #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_MMU_CONTEXT_H */ diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 43e6ad424c7f..1bc103a86160 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -491,9 +491,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) +#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) +#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mkwrite(pmd) 
pte_pmd(pte_mkwrite(pmd_pte(pmd))) diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 5bfdab9047be..b7a68442b767 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -557,11 +557,11 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, struct vio_dev *viodev = to_vio_dev(dev); struct iommu_table *tbl; struct scatterlist *sgl; - int ret, count = 0; + int ret, count; size_t alloc_size = 0; tbl = get_iommu_table_base(dev); - for (sgl = sglist; count < nelems; count++, sgl++) + for_each_sg(sglist, sgl, nelems, count) alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); if (vio_cmo_alloc(viodev, alloc_size)) { @@ -577,7 +577,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, return ret; } - for (sgl = sglist, count = 0; count < ret; count++, sgl++) + for_each_sg(sglist, sgl, ret, count) alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); if (alloc_size) vio_cmo_dealloc(viodev, alloc_size); @@ -594,10 +594,10 @@ static void vio_dma_iommu_unmap_sg(struct device *dev, struct iommu_table *tbl; struct scatterlist *sgl; size_t alloc_size = 0; - int count = 0; + int count; tbl = get_iommu_table_base(dev); - for (sgl = sglist; count < nelems; count++, sgl++) + for_each_sg(sglist, sgl, nelems, count) alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0ce968b00b7c..1b88b1c57d08 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -439,11 +439,6 @@ int alloc_bootmem_huge_page(struct hstate *hstate) } #endif -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - #ifdef CONFIG_PPC_FSL_BOOK3E #define HUGEPD_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) diff --git a/arch/s390/include/asm/hugetlb.h 
b/arch/s390/include/asm/hugetlb.h index 11eae5f55b70..dfb542ade6b1 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -35,7 +35,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -#define hugetlb_prefault_arch_hook(mm) do { } while (0) #define arch_clear_hugepage_flags(page) do { } while (0) int arch_prepare_hugepage(struct page *page); diff --git a/arch/s390/include/asm/mm-arch-hooks.h b/arch/s390/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..07680b2f3c59 --- /dev/null +++ b/arch/s390/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_S390_MM_ARCH_HOOKS_H +#define _ASM_S390_MM_ARCH_HOOKS_H + +#endif /* _ASM_S390_MM_ARCH_HOOKS_H */ diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index e617e74b7be2..c3f8e3df92ff 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -193,11 +193,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmdp; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - int pmd_huge(pmd_t pmd) { if (!MACHINE_HAS_HPAGE) diff --git a/arch/score/include/asm/mm-arch-hooks.h b/arch/score/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..5e38689f189a --- /dev/null +++ b/arch/score/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public 
License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_SCORE_MM_ARCH_HOOKS_H +#define _ASM_SCORE_MM_ARCH_HOOKS_H + +#endif /* _ASM_SCORE_MM_ARCH_HOOKS_H */ diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 699255d6d1c6..b788a9bc8918 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, diff --git a/arch/sh/include/asm/mm-arch-hooks.h b/arch/sh/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..18087298b728 --- /dev/null +++ b/arch/sh/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_SH_MM_ARCH_HOOKS_H +#define _ASM_SH_MM_ARCH_HOOKS_H + +#endif /* _ASM_SH_MM_ARCH_HOOKS_H */ diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 534bc978af8a..6385f60209b6 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -62,11 +62,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return pte; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - int pmd_huge(pmd_t pmd) { return 0; diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index e4cab465b81f..3130d7636312 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -11,10 +11,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { diff --git a/arch/sparc/include/asm/mm-arch-hooks.h b/arch/sparc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b89ba44c16f1 --- /dev/null +++ b/arch/sparc/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_SPARC_MM_ARCH_HOOKS_H +#define _ASM_SPARC_MM_ARCH_HOOKS_H + +#endif /* _ASM_SPARC_MM_ARCH_HOOKS_H */ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index dc165ebdf05a..5c02174828ed 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -697,6 +697,15 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) return __pmd(pte_val(pte)); } +static inline pmd_t pmd_mkclean(pmd_t pmd) +{ + pte_t pte = __pte(pmd_val(pmd)); + + pte = pte_mkclean(pte); + + return __pmd(pte_val(pte)); +} + static inline pmd_t pmd_mkyoung(pmd_t pmd) { pte_t pte = __pte(pmd_val(pmd)); diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c index 7d3ca30fcd15..1ae5eb1bb045 100644 --- a/arch/sparc/kernel/ldc.c +++ b/arch/sparc/kernel/ldc.c @@ -2086,6 +2086,7 @@ int ldc_map_sg(struct ldc_channel *lp, struct cookie_state state; struct ldc_iommu *iommu; int err; + struct scatterlist *s; if (map_perm & ~LDC_MAP_ALL) return -EINVAL; @@ -2112,9 +2113,10 @@ int ldc_map_sg(struct ldc_channel *lp, state.pte_idx = (base - iommu->page_table); state.nc = 0; - for (i = 0; i < num_sg; i++) - fill_cookies(&state, page_to_pfn(sg_page(&sg[i])) << PAGE_SHIFT, - sg[i].offset, sg[i].length); + for_each_sg(sg, s, num_sg, i) { + fill_cookies(&state, page_to_pfn(sg_page(s)) << PAGE_SHIFT, + s->offset, s->length); + } return state.nc; } diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 4242eab12e10..131eaf4ad7f5 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -172,11 +172,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return pte; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h index 3257733003f8..1abd00c55236 100644 --- 
a/arch/tile/include/asm/hugetlb.h +++ b/arch/tile/include/asm/hugetlb.h @@ -40,10 +40,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, diff --git a/arch/tile/include/asm/mm-arch-hooks.h b/arch/tile/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..d1709ea774f7 --- /dev/null +++ b/arch/tile/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_TILE_MM_ARCH_HOOKS_H +#define _ASM_TILE_MM_ARCH_HOOKS_H + +#endif /* _ASM_TILE_MM_ARCH_HOOKS_H */ diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 8416240c322c..c034dc3fe2d4 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -160,11 +160,6 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_HUGE_PAGE); } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, diff --git a/arch/um/include/asm/mm-arch-hooks.h b/arch/um/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..a7c8b0dfdd4e --- /dev/null +++ b/arch/um/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under 
the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_UM_MM_ARCH_HOOKS_H +#define _ASM_UM_MM_ARCH_HOOKS_H + +#endif /* _ASM_UM_MM_ARCH_HOOKS_H */ diff --git a/arch/unicore32/include/asm/mm-arch-hooks.h b/arch/unicore32/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..4d79a850c509 --- /dev/null +++ b/arch/unicore32/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_UNICORE32_MM_ARCH_HOOKS_H +#define _ASM_UNICORE32_MM_ARCH_HOOKS_H + +#endif /* _ASM_UNICORE32_MM_ARCH_HOOKS_H */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0c73110d1260..879a1125b691 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -18,6 +18,7 @@ config X86_64 select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF select HAVE_LIVEPATCH + select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT ### Arch settings config X86 @@ -100,7 +101,7 @@ config X86 select IRQ_FORCED_THREADING select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE) + select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select ARCH_HAS_SG_CHAIN select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 68c05398bba9..dab7a3a750bf 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long 
addr, unsigned long end, unsigned long floor, diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..4e881a342236 --- /dev/null +++ b/arch/x86/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_X86_MM_ARCH_HOOKS_H +#define _ASM_X86_MM_ARCH_HOOKS_H + +#endif /* _ASM_X86_MM_ARCH_HOOKS_H */ diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index f768f6298419..da8dff10f632 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -31,7 +31,7 @@ * arch_phys_wc_add and arch_phys_wc_del. */ # ifdef CONFIG_MTRR -extern u8 mtrr_type_lookup(u64 addr, u64 end); +extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); extern int mtrr_add(unsigned long base, unsigned long size, @@ -50,12 +50,13 @@ extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); extern int phys_wc_to_mtrr_index(int handle); # else -static inline u8 mtrr_type_lookup(u64 addr, u64 end) +static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { /* * Return no-MTRRs: */ - return 0xff; + *uniform = 1; + return MTRR_TYPE_INVALID; } #define mtrr_save_fixed_ranges(arg) do {} while (0) #define mtrr_save_state() do {} while (0) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index fe57e7a98839..affcb3459847 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -267,6 +267,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_ACCESSED); 
} +static inline pmd_t pmd_mkclean(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_DIRTY); +} + static inline pmd_t pmd_wrprotect(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_RW); diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h index d0acb658c8f4..0bc86c6fcae0 100644 --- a/arch/x86/include/uapi/asm/mtrr.h +++ b/arch/x86/include/uapi/asm/mtrr.h @@ -88,6 +88,10 @@ struct mtrr_state_type { mtrr_type def_type; }; +/* Bit fields for enabled in struct mtrr_state_type */ +#define MTRR_STATE_MTRR_FIXED_ENABLED 0x01 +#define MTRR_STATE_MTRR_ENABLED 0x02 + #define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) @@ -103,7 +107,7 @@ struct mtrr_state_type { #define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry) #define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry) -/* These are the region types */ +/* MTRR memory types, which are defined in SDM */ #define MTRR_TYPE_UNCACHABLE 0 #define MTRR_TYPE_WRCOMB 1 /*#define MTRR_TYPE_ 2*/ @@ -113,5 +117,11 @@ struct mtrr_state_type { #define MTRR_TYPE_WRBACK 6 #define MTRR_NUM_TYPES 7 +/* + * Invalid MTRR memory type. mtrr_type_lookup() returns this value when + * MTRRs are disabled. Note, this value is allocated from the reserved + * values (0x7-0xff) of the MTRR memory types. + */ +#define MTRR_TYPE_INVALID 0xff #endif /* _UAPI_ASM_X86_MTRR_H */ diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7d74f7b3c6ba..a83f27a89cd3 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -102,59 +102,75 @@ static int check_type_overlap(u8 *prev, u8 *curr) return 0; } -/* - * Error/Semi-error returns: - * 0xFF - when MTRR is not enabled - * *repeat == 1 implies [start:end] spanned across MTRR range and type returned - * corresponds only to [start:*partial_end]. - * Caller has to lookup again for [*partial_end:end]. 
+/** + * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries + * + * MTRR fixed entries are divided into the following ways: + * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges + * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges + * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges + * + * Return Values: + * MTRR_TYPE_(type) - Matched memory type + * MTRR_TYPE_INVALID - Unmatched or fixed entries are disabled + */ +static u8 mtrr_type_lookup_fixed(u64 start, u64 end) +{ + int idx; + + if (start >= 0x100000) + return MTRR_TYPE_INVALID; + + if (!(mtrr_state.have_fixed) || + !(mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) + return MTRR_TYPE_INVALID; + + if (start < 0x80000) { /* 0x0 - 0x7FFFF */ + idx = 0; + idx += (start >> 16); + return mtrr_state.fixed_ranges[idx]; + + } else if (start < 0xC0000) { /* 0x80000 - 0xBFFFF */ + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state.fixed_ranges[idx]; + } + + /* 0xC0000 - 0xFFFFF */ + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state.fixed_ranges[idx]; +} + +/** + * mtrr_type_lookup_variable - look up memory type in MTRR variable entries + * + * Return Value: + * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) + * + * Output Arguments: + * repeat - Set to 1 when [start:end] spanned across MTRR range and type + * returned corresponds only to [start:*partial_end]. Caller has + * to lookup again for [*partial_end:end]. + * uniform - Set to 1 when MTRR covers the region uniformly, i.e. the region + * is fully covered by a single MTRR entry or the default type. 
*/ -static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) +static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, + int *repeat, u8 *uniform) { int i; u64 base, mask; u8 prev_match, curr_match; *repeat = 0; - if (!mtrr_state_set) - return 0xFF; - - if (!mtrr_state.enabled) - return 0xFF; + *uniform = 1; /* Make end inclusive end, instead of exclusive */ end--; - /* Look in fixed ranges. Just return the type as per start */ - if (mtrr_state.have_fixed && (start < 0x100000)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state.fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state.fixed_ranges[idx]; - } else if (start < 0x1000000) { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state.fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ - if (!(mtrr_state.enabled & 2)) - return mtrr_state.def_type; - - prev_match = 0xFF; + prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; + unsigned short start_state, end_state, inclusive; if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) continue; @@ -166,20 +182,22 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); + inclusive = ((start < base) && (end > base)); - if (start_state != end_state) { + if ((start_state != end_state) || inclusive) { /* * We have start:end spanning across an MTRR. 
- * We split the region into - * either - * (start:mtrr_end) (mtrr_end:end) - * or - * (start:mtrr_start) (mtrr_start:end) + * We split the region into either + * - start_state:1 + * (start:mtrr_end) (mtrr_end:end) + * - end_state:1 or inclusive:1 + * (start:mtrr_start) (mtrr_start:end) * depending on kind of overlap. * Return the type for first region and a pointer to * the start of second region so that caller will * lookup again on the second region. - * Note: This way we handle multiple overlaps as well. + * Note: This way we handle overlaps with multiple + * entries and the default type properly. */ if (start_state) *partial_end = base + get_mtrr_size(mask); @@ -193,59 +211,95 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) end = *partial_end - 1; /* end is inclusive */ *repeat = 1; + *uniform = 0; } - if ((start & mask) != (base & mask)) + if (!start_state) continue; curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { + if (prev_match == MTRR_TYPE_INVALID) { prev_match = curr_match; continue; } + *uniform = 0; if (check_type_overlap(&prev_match, &curr_match)) return curr_match; } - if (mtrr_tom2) { - if (start >= (1ULL<<32) && (end < mtrr_tom2)) - return MTRR_TYPE_WRBACK; - } - - if (prev_match != 0xFF) + if (prev_match != MTRR_TYPE_INVALID) return prev_match; return mtrr_state.def_type; } -/* - * Returns the effective MTRR type for the region - * Error return: - * 0xFF - when MTRR is not enabled +/** + * mtrr_type_lookup - look up memory type in MTRR + * + * Return Values: + * MTRR_TYPE_(type) - The effective MTRR type for the region + * MTRR_TYPE_INVALID - MTRR is disabled + * + * Output Argument: + * uniform - Set to 1 when MTRR covers the region uniformly, i.e. the region + * is fully covered by a single MTRR entry or the default type. 
*/ -u8 mtrr_type_lookup(u64 start, u64 end) +u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { - u8 type, prev_type; + u8 type, prev_type, is_uniform, dummy; int repeat; u64 partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + *uniform = 1; + + if (!mtrr_state_set) + return MTRR_TYPE_INVALID; + + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) + return MTRR_TYPE_INVALID; + + /* + * Look up the fixed ranges first, which take priority over + * the variable ranges. + */ + type = mtrr_type_lookup_fixed(start, end); + if (type != MTRR_TYPE_INVALID) { + *uniform = 0; + return type; + } + + /* + * Look up the variable ranges. Look of multiple ranges matching + * this address and pick type as per MTRR precedence. + */ + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &is_uniform); /* * Common path is with repeat = 0. * However, we can have cases where [start:end] spans across some - * MTRR range. Do repeated lookups for that case here. + * MTRR ranges and/or the default type. Do repeated lookups for + * that case here. */ while (repeat) { prev_type = type; start = partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + is_uniform = 0; + + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &dummy); - if (check_type_overlap(&prev_type, &type)) + if (check_type_overlap(&prev_type, &type)) { + *uniform = 0; return type; + } } + if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) + return MTRR_TYPE_WRBACK; + + *uniform = is_uniform; return type; } @@ -347,7 +401,9 @@ static void __init print_mtrr_state(void) mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { pr_debug("MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? 
+ "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, @@ -360,7 +416,7 @@ static void __init print_mtrr_state(void) print_fixed_last(); } pr_debug("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); + mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index e1029633f664..5e7047263b35 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -336,6 +336,7 @@ void arch_crash_save_vmcoreinfo(void) #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 35af6771a95a..372ad422c2c3 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -267,9 +267,9 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, * request is for WB. */ if (req_type == _PAGE_CACHE_MODE_WB) { - u8 mtrr_type; + u8 mtrr_type, uniform; - mtrr_type = mtrr_type_lookup(start, end); + mtrr_type = mtrr_type_lookup(start, end, &uniform); if (mtrr_type != MTRR_TYPE_WRBACK) return _PAGE_CACHE_MODE_UC_MINUS; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0b97d2c75df3..3d6edea8087e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -563,16 +563,22 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +/** + * pud_set_huge - setup kernel PUD mapping + * + * MTRR can override PAT memory types with 4KB granularity. Therefore, + * it only sets up a huge page when the range is mapped uniformly by MTRR + * (i.e. 
the range is fully covered by a single MTRR entry or the default + * type) or the MTRR memory type is WB. + * + * Return 1 on success, and 0 when no PUD was set. + */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. - */ - mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); + if ((!uniform) && (mtrr != MTRR_TYPE_WRBACK)) return 0; prot = pgprot_4k_2_large(prot); @@ -584,17 +590,26 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pmd_set_huge - setup kernel PMD mapping + * + * MTRR can override PAT memory types with 4KB granularity. Therefore, + * it only sets up a huge page when the range is mapped uniformly by MTRR + * (i.e. the range is fully covered by a single MTRR entry or the default + * type) or the MTRR memory type is WB. + * + * Return 1 on success, and 0 when no PMD was set. + */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. - */ - mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); + if ((!uniform) && (mtrr != MTRR_TYPE_WRBACK)) { + pr_warn("pmd_set_huge: requesting [mem %#010llx-%#010llx], which spans more than a single MTRR entry\n", + addr, addr + PMD_SIZE); return 0; + } prot = pgprot_4k_2_large(prot); @@ -605,6 +620,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pud_clear_huge - clear kernel PUD mapping when it is set + * + * Return 1 on success, and 0 when no PUD map was found. 
+ */ int pud_clear_huge(pud_t *pud) { if (pud_large(*pud)) { @@ -615,6 +635,11 @@ int pud_clear_huge(pud_t *pud) return 0; } +/** + * pmd_clear_huge - clear kernel PMD mapping when it is set + * + * Return 1 on success, and 0 when no PMD map was found. + */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_large(*pmd)) { diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h index ba78ccf651e7..1f5f6dc09736 100644 --- a/arch/xtensa/include/asm/dma-mapping.h +++ b/arch/xtensa/include/asm/dma-mapping.h @@ -52,14 +52,15 @@ dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, } static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, +dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; BUG_ON(direction == DMA_NONE); - for (i = 0; i < nents; i++, sg++ ) { + for_each_sg(sglist, sg, nents, i) { BUG_ON(!sg_page(sg)); sg->dma_address = sg_phys(sg); @@ -124,20 +125,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, consistent_sync((void *)bus_to_virt(dma_handle)+offset,size,direction); } static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction dir) { int i; - for (i = 0; i < nelems; i++, sg++) + struct scatterlist *sg; + + for_each_sg(sglist, sg, nelems, i) consistent_sync(sg_virt(sg), sg->length, dir); } static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction dir) +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, + int nelems, enum dma_data_direction dir) { int i; - for (i = 0; i < nelems; i++, sg++) + struct scatterlist *sg; + + for_each_sg(sglist, sg, nelems, i) consistent_sync(sg_virt(sg), sg->length, dir); } static inline int diff --git 
a/arch/xtensa/include/asm/mm-arch-hooks.h b/arch/xtensa/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..d2e5cfd3dd02 --- /dev/null +++ b/arch/xtensa/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_XTENSA_MM_ARCH_HOOKS_H +#define _ASM_XTENSA_MM_ARCH_HOOKS_H + +#endif /* _ASM_XTENSA_MM_ARCH_HOOKS_H */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 201aec0e0446..1b19f25bc567 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -80,6 +80,7 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ +#define MADV_FREE 5 /* free pages only if memory pressure */ /* common parameters: try to keep these consistent across architectures */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/block/genhd.c b/block/genhd.c index 0a536dc05f3b..64600e911aaa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -850,7 +850,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/drivers/base/node.c b/drivers/base/node.c index a2aa65b4215d..31df474d72f4 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -359,12 +359,16 @@ int unregister_cpu_under_node(unsigned int cpu, 
unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #define page_initialized(page) (page->lru.next) -static int get_nid_for_pfn(unsigned long pfn) +static int __init_refok get_nid_for_pfn(unsigned long pfn) { struct page *page; if (!pfn_valid_within(pfn)) return -1; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + if (system_state == SYSTEM_BOOTING) + return early_pfn_to_nid(pfn); +#endif page = pfn_to_page(pfn); if (!page_initialized(page)) return -1; diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 6489c0fd0ea6..386ba3d1a6ee 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -23,12 +23,4 @@ config ZRAM_LZ4_COMPRESS default n help This option enables LZ4 compression algorithm support. Compression - algorithm can be changed using `comp_algorithm' device attribute. - -config ZRAM_DEBUG - bool "Compressed RAM block device debug support" - depends on ZRAM - default n - help - This option adds additional debugging code to the compressed - RAM block device driver. + algorithm can be changed using `comp_algorithm' device attribute.
\ No newline at end of file diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8dcbced0eafd..e6c431691dde 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -15,10 +15,6 @@ #define KMSG_COMPONENT "zram" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#ifdef CONFIG_ZRAM_DEBUG -#define DEBUG -#endif - #include <linux/module.h> #include <linux/kernel.h> #include <linux/bio.h> @@ -32,12 +28,16 @@ #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/err.h> +#include <linux/idr.h> +#include <linux/sysfs.h> #include "zram_drv.h" -/* Globals */ +static DEFINE_IDR(zram_index_idr); +/* idr index must be protected */ +static DEFINE_MUTEX(zram_index_mutex); + static int zram_major; -static struct zram *zram_devices; static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ @@ -53,7 +53,7 @@ static inline void deprecated_attr_warn(const char *name) } #define ZRAM_ATTR_RO(name) \ -static ssize_t name##_show(struct device *d, \ +static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ @@ -74,33 +74,117 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } -static ssize_t compact_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +/* flag operations require table entry bit_spin_lock() being held */ +static int zram_test_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) { - unsigned long nr_migrated; - struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta; + return meta->table[index].value & BIT(flag); +} - down_read(&zram->init_lock); - if (!init_done(zram)) { - up_read(&zram->init_lock); - return -EINVAL; - } +static void zram_set_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value |= BIT(flag); +} - meta = zram->meta; - nr_migrated 
= zs_compact(meta->mem_pool); - atomic64_add(nr_migrated, &zram->stats.num_migrated); - up_read(&zram->init_lock); +static void zram_clear_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value &= ~BIT(flag); +} - return len; +static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +{ + return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); } -static ssize_t disksize_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void zram_set_obj_size(struct zram_meta *meta, + u32 index, size_t size) { - struct zram *zram = dev_to_zram(dev); + unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; - return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); + meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; +} + +static inline int is_partial_io(struct bio_vec *bvec) +{ + return bvec->bv_len != PAGE_SIZE; +} + +/* + * Check if request is within bounds and aligned on zram logical blocks. + */ +static inline int valid_io_request(struct zram *zram, + sector_t start, unsigned int size) +{ + u64 end, bound; + + /* unaligned request */ + if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + return 0; + if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + return 0; + + end = start + (size >> SECTOR_SHIFT); + bound = zram->disksize >> SECTOR_SHIFT; + /* out of range range */ + if (unlikely(start >= bound || end > bound || start > end)) + return 0; + + /* I/O request is valid */ + return 1; +} + +static void update_position(u32 *index, int *offset, struct bio_vec *bvec) +{ + if (*offset + bvec->bv_len >= PAGE_SIZE) + (*index)++; + *offset = (*offset + bvec->bv_len) % PAGE_SIZE; +} + +static inline void update_used_max(struct zram *zram, + const unsigned long pages) +{ + unsigned long old_max, cur_max; + + old_max = atomic_long_read(&zram->stats.max_used_pages); + + do { + cur_max = old_max; + if (pages > cur_max) + old_max = atomic_long_cmpxchg( + 
&zram->stats.max_used_pages, cur_max, pages); + } while (old_max != cur_max); +} + +static int page_zero_filled(void *ptr) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + + for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos]) + return 0; + } + + return 1; +} + +static void handle_zero_page(struct bio_vec *bvec) +{ + struct page *page = bvec->bv_page; + void *user_mem; + + user_mem = kmap_atomic(page); + if (is_partial_io(bvec)) + memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); + else + clear_page(user_mem); + kunmap_atomic(user_mem); + + flush_dcache_page(page); } static ssize_t initstate_show(struct device *dev, @@ -116,6 +200,14 @@ static ssize_t initstate_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%u\n", val); } +static ssize_t disksize_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); +} + static ssize_t orig_data_size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -143,19 +235,6 @@ static ssize_t mem_used_total_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); } -static ssize_t max_comp_streams_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - int val; - struct zram *zram = dev_to_zram(dev); - - down_read(&zram->init_lock); - val = zram->max_comp_streams; - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%d\n", val); -} - static ssize_t mem_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -225,6 +304,19 @@ static ssize_t mem_used_max_store(struct device *dev, return len; } +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->max_comp_streams; + up_read(&zram->init_lock); + + return 
scnprintf(buf, PAGE_SIZE, "%d\n", val); +} + static ssize_t max_comp_streams_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -282,65 +374,95 @@ static ssize_t comp_algorithm_store(struct device *dev, return len; } -/* flag operations needs meta->tb_lock */ -static int zram_test_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) +static ssize_t compact_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { - return meta->table[index].value & BIT(flag); -} + unsigned long nr_migrated; + struct zram *zram = dev_to_zram(dev); + struct zram_meta *meta; -static void zram_set_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].value |= BIT(flag); -} + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } -static void zram_clear_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].value &= ~BIT(flag); + meta = zram->meta; + nr_migrated = zs_compact(meta->mem_pool); + atomic64_add(nr_migrated, &zram->stats.num_migrated); + up_read(&zram->init_lock); + + return len; } -static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +static ssize_t io_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { - return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8llu\n", + (u64)atomic64_read(&zram->stats.failed_reads), + (u64)atomic64_read(&zram->stats.failed_writes), + (u64)atomic64_read(&zram->stats.invalid_io), + (u64)atomic64_read(&zram->stats.notify_free)); + up_read(&zram->init_lock); + + return ret; } -static void zram_set_obj_size(struct zram_meta *meta, - u32 index, size_t size) +static ssize_t mm_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { - 
unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + struct zram *zram = dev_to_zram(dev); + u64 orig_size, mem_used = 0; + long max_used; + ssize_t ret; - meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; + down_read(&zram->init_lock); + if (init_done(zram)) + mem_used = zs_get_total_pages(zram->meta->mem_pool); + + orig_size = atomic64_read(&zram->stats.pages_stored); + max_used = atomic_long_read(&zram->stats.max_used_pages); + + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n", + orig_size << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.compr_data_size), + mem_used << PAGE_SHIFT, + zram->limit_pages << PAGE_SHIFT, + max_used << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.zero_pages), + (u64)atomic64_read(&zram->stats.num_migrated)); + up_read(&zram->init_lock); + + return ret; } -static inline int is_partial_io(struct bio_vec *bvec) +static DEVICE_ATTR_RO(io_stat); +static DEVICE_ATTR_RO(mm_stat); +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); + +static inline bool zram_meta_get(struct zram *zram) { - return bvec->bv_len != PAGE_SIZE; + if (atomic_inc_not_zero(&zram->refcount)) + return true; + return false; } -/* - * Check if request is within bounds and aligned on zram logical blocks. 
- */ -static inline int valid_io_request(struct zram *zram, - sector_t start, unsigned int size) +static inline void zram_meta_put(struct zram *zram) { - u64 end, bound; - - /* unaligned request */ - if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) - return 0; - if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) - return 0; - - end = start + (size >> SECTOR_SHIFT); - bound = zram->disksize >> SECTOR_SHIFT; - /* out of range range */ - if (unlikely(start >= bound || end > bound || start > end)) - return 0; - - /* I/O request is valid */ - return 1; + atomic_dec(&zram->refcount); } static void zram_meta_free(struct zram_meta *meta, u64 disksize) @@ -394,56 +516,6 @@ out_error: return NULL; } -static inline bool zram_meta_get(struct zram *zram) -{ - if (atomic_inc_not_zero(&zram->refcount)) - return true; - return false; -} - -static inline void zram_meta_put(struct zram *zram) -{ - atomic_dec(&zram->refcount); -} - -static void update_position(u32 *index, int *offset, struct bio_vec *bvec) -{ - if (*offset + bvec->bv_len >= PAGE_SIZE) - (*index)++; - *offset = (*offset + bvec->bv_len) % PAGE_SIZE; -} - -static int page_zero_filled(void *ptr) -{ - unsigned int pos; - unsigned long *page; - - page = (unsigned long *)ptr; - - for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { - if (page[pos]) - return 0; - } - - return 1; -} - -static void handle_zero_page(struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - void *user_mem; - - user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) - memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); - else - clear_page(user_mem); - kunmap_atomic(user_mem); - - flush_dcache_page(page); -} - - /* * To protect concurrent access to the same index entry, * caller should hold this table index entry's bit_spinlock to @@ -561,21 +633,6 @@ out_cleanup: return ret; } -static inline void update_used_max(struct zram *zram, - const unsigned long pages) -{ - unsigned long old_max, cur_max; - - old_max = 
atomic_long_read(&zram->stats.max_used_pages); - - do { - cur_max = old_max; - if (pages > cur_max) - old_max = atomic_long_cmpxchg( - &zram->stats.max_used_pages, cur_max, pages); - } while (old_max != cur_max); -} - static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, int offset) { @@ -703,35 +760,6 @@ out: return ret; } -static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, int rw) -{ - unsigned long start_time = jiffies; - int ret; - - generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, - &zram->disk->part0); - - if (rw == READ) { - atomic64_inc(&zram->stats.num_reads); - ret = zram_bvec_read(zram, bvec, index, offset); - } else { - atomic64_inc(&zram->stats.num_writes); - ret = zram_bvec_write(zram, bvec, index, offset); - } - - generic_end_io_acct(rw, &zram->disk->part0, start_time); - - if (unlikely(ret)) { - if (rw == READ) - atomic64_inc(&zram->stats.failed_reads); - else - atomic64_inc(&zram->stats.failed_writes); - } - - return ret; -} - /* * zram_bio_discard - handler on discard request * @index: physical block index in PAGE_SIZE units @@ -771,149 +799,32 @@ static void zram_bio_discard(struct zram *zram, u32 index, } } -static void zram_reset_device(struct zram *zram) -{ - struct zram_meta *meta; - struct zcomp *comp; - u64 disksize; - - down_write(&zram->init_lock); - - zram->limit_pages = 0; - - if (!init_done(zram)) { - up_write(&zram->init_lock); - return; - } - - meta = zram->meta; - comp = zram->comp; - disksize = zram->disksize; - /* - * Refcount will go down to 0 eventually and r/w handler - * cannot handle further I/O so it will bail out by - * check zram_meta_get. - */ - zram_meta_put(zram); - /* - * We want to free zram_meta in process context to avoid - * deadlock between reclaim path and any other locks. 
- */ - wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); - - /* Reset stats */ - memset(&zram->stats, 0, sizeof(zram->stats)); - zram->disksize = 0; - zram->max_comp_streams = 1; - set_capacity(zram->disk, 0); - - up_write(&zram->init_lock); - /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(meta, disksize); - zcomp_destroy(comp); -} - -static ssize_t disksize_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - u64 disksize; - struct zcomp *comp; - struct zram_meta *meta; - struct zram *zram = dev_to_zram(dev); - int err; - - disksize = memparse(buf, NULL); - if (!disksize) - return -EINVAL; - - disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(zram->disk->first_minor, disksize); - if (!meta) - return -ENOMEM; - - comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (IS_ERR(comp)) { - pr_info("Cannot initialise %s compressing backend\n", - zram->compressor); - err = PTR_ERR(comp); - goto out_free_meta; - } - - down_write(&zram->init_lock); - if (init_done(zram)) { - pr_info("Cannot change disksize for initialized device\n"); - err = -EBUSY; - goto out_destroy_comp; - } - - init_waitqueue_head(&zram->io_done); - atomic_set(&zram->refcount, 1); - zram->meta = meta; - zram->comp = comp; - zram->disksize = disksize; - set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - up_write(&zram->init_lock); - - /* - * Revalidate disk out of the init_lock to avoid lockdep splat. - * It's okay because disk's capacity is protected by init_lock - * so that revalidate_disk always sees up-to-date capacity. 
- */ - revalidate_disk(zram->disk); - - return len; - -out_destroy_comp: - up_write(&zram->init_lock); - zcomp_destroy(comp); -out_free_meta: - zram_meta_free(meta, disksize); - return err; -} - -static ssize_t reset_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, + int offset, int rw) { + unsigned long start_time = jiffies; int ret; - unsigned short do_reset; - struct zram *zram; - struct block_device *bdev; - - zram = dev_to_zram(dev); - bdev = bdget_disk(zram->disk, 0); - if (!bdev) - return -ENOMEM; + generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, + &zram->disk->part0); - mutex_lock(&bdev->bd_mutex); - /* Do not reset an active device! */ - if (bdev->bd_openers) { - ret = -EBUSY; - goto out; + if (rw == READ) { + atomic64_inc(&zram->stats.num_reads); + ret = zram_bvec_read(zram, bvec, index, offset); + } else { + atomic64_inc(&zram->stats.num_writes); + ret = zram_bvec_write(zram, bvec, index, offset); } - ret = kstrtou16(buf, 10, &do_reset); - if (ret) - goto out; + generic_end_io_acct(rw, &zram->disk->part0, start_time); - if (!do_reset) { - ret = -EINVAL; - goto out; + if (unlikely(ret)) { + if (rw == READ) + atomic64_inc(&zram->stats.failed_reads); + else + atomic64_inc(&zram->stats.failed_writes); } - /* Make sure all pending I/O is finished */ - fsync_bdev(bdev); - zram_reset_device(zram); - - mutex_unlock(&bdev->bd_mutex); - revalidate_disk(zram->disk); - bdput(bdev); - - return len; - -out: - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); return ret; } @@ -1053,80 +964,183 @@ out: return err; } -static const struct block_device_operations zram_devops = { - .swap_slot_free_notify = zram_slot_free_notify, - .rw_page = zram_rw_page, - .owner = THIS_MODULE -}; +static void zram_reset_device(struct zram *zram) +{ + struct zram_meta *meta; + struct zcomp *comp; + u64 disksize; -static DEVICE_ATTR_WO(compact); -static 
DEVICE_ATTR_RW(disksize); -static DEVICE_ATTR_RO(initstate); -static DEVICE_ATTR_WO(reset); -static DEVICE_ATTR_RO(orig_data_size); -static DEVICE_ATTR_RO(mem_used_total); -static DEVICE_ATTR_RW(mem_limit); -static DEVICE_ATTR_RW(mem_used_max); -static DEVICE_ATTR_RW(max_comp_streams); -static DEVICE_ATTR_RW(comp_algorithm); + down_write(&zram->init_lock); -static ssize_t io_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) + zram->limit_pages = 0; + + if (!init_done(zram)) { + up_write(&zram->init_lock); + return; + } + + meta = zram->meta; + comp = zram->comp; + disksize = zram->disksize; + /* + * Refcount will go down to 0 eventually and r/w handler + * cannot handle further I/O so it will bail out by + * check zram_meta_get. + */ + zram_meta_put(zram); + /* + * We want to free zram_meta in process context to avoid + * deadlock between reclaim path and any other locks. + */ + wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); + + /* Reset stats */ + memset(&zram->stats, 0, sizeof(zram->stats)); + zram->disksize = 0; + zram->max_comp_streams = 1; + set_capacity(zram->disk, 0); + + up_write(&zram->init_lock); + /* I/O operation under all of CPU are done so let's free */ + zram_meta_free(meta, disksize); + zcomp_destroy(comp); +} + +static ssize_t disksize_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { + u64 disksize; + struct zcomp *comp; + struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - ssize_t ret; + int err; - down_read(&zram->init_lock); - ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8llu\n", - (u64)atomic64_read(&zram->stats.failed_reads), - (u64)atomic64_read(&zram->stats.failed_writes), - (u64)atomic64_read(&zram->stats.invalid_io), - (u64)atomic64_read(&zram->stats.notify_free)); - up_read(&zram->init_lock); + disksize = memparse(buf, NULL); + if (!disksize) + return -EINVAL; - return ret; + disksize = PAGE_ALIGN(disksize); + meta = 
zram_meta_alloc(zram->disk->first_minor, disksize); + if (!meta) + return -ENOMEM; + + comp = zcomp_create(zram->compressor, zram->max_comp_streams); + if (IS_ERR(comp)) { + pr_info("Cannot initialise %s compressing backend\n", + zram->compressor); + err = PTR_ERR(comp); + goto out_free_meta; + } + + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Cannot change disksize for initialized device\n"); + err = -EBUSY; + goto out_destroy_comp; + } + + init_waitqueue_head(&zram->io_done); + atomic_set(&zram->refcount, 1); + zram->meta = meta; + zram->comp = comp; + zram->disksize = disksize; + set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + up_write(&zram->init_lock); + + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock + * so that revalidate_disk always sees up-to-date capacity. + */ + revalidate_disk(zram->disk); + + return len; + +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: + zram_meta_free(meta, disksize); + return err; } -static ssize_t mm_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t reset_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { - struct zram *zram = dev_to_zram(dev); - u64 orig_size, mem_used = 0; - long max_used; - ssize_t ret; + int ret; + unsigned short do_reset; + struct zram *zram; + struct block_device *bdev; - down_read(&zram->init_lock); - if (init_done(zram)) - mem_used = zs_get_total_pages(zram->meta->mem_pool); + ret = kstrtou16(buf, 10, &do_reset); + if (ret) + return ret; - orig_size = atomic64_read(&zram->stats.pages_stored); - max_used = atomic_long_read(&zram->stats.max_used_pages); + if (!do_reset) + return -EINVAL; - ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n", - orig_size << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.compr_data_size), - mem_used << PAGE_SHIFT, - 
zram->limit_pages << PAGE_SHIFT, - max_used << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.zero_pages), - (u64)atomic64_read(&zram->stats.num_migrated)); - up_read(&zram->init_lock); + zram = dev_to_zram(dev); + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + /* Do not reset an active device or claimed device */ + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return -EBUSY; + } + + /* From now on, anyone can't open /dev/zram[0-9] */ + zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* Make sure all the pending I/O are finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + revalidate_disk(zram->disk); + bdput(bdev); + + mutex_lock(&bdev->bd_mutex); + zram->claim = false; + mutex_unlock(&bdev->bd_mutex); + + return len; +} + +static int zram_open(struct block_device *bdev, fmode_t mode) +{ + int ret = 0; + struct zram *zram; + + WARN_ON(!mutex_is_locked(&bdev->bd_mutex)); + + zram = bdev->bd_disk->private_data; + /* zram was claimed to reset so open request fails */ + if (zram->claim) + ret = -EBUSY; return ret; } -static DEVICE_ATTR_RO(io_stat); -static DEVICE_ATTR_RO(mm_stat); -ZRAM_ATTR_RO(num_reads); -ZRAM_ATTR_RO(num_writes); -ZRAM_ATTR_RO(failed_reads); -ZRAM_ATTR_RO(failed_writes); -ZRAM_ATTR_RO(invalid_io); -ZRAM_ATTR_RO(notify_free); -ZRAM_ATTR_RO(zero_pages); -ZRAM_ATTR_RO(compr_data_size); +static const struct block_device_operations zram_devops = { + .open = zram_open, + .swap_slot_free_notify = zram_slot_free_notify, + .rw_page = zram_rw_page, + .owner = THIS_MODULE +}; + +static DEVICE_ATTR_WO(compact); +static DEVICE_ATTR_RW(disksize); +static DEVICE_ATTR_RO(initstate); +static DEVICE_ATTR_WO(reset); +static DEVICE_ATTR_RO(orig_data_size); +static DEVICE_ATTR_RO(mem_used_total); +static DEVICE_ATTR_RW(mem_limit); +static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_RW(max_comp_streams); +static DEVICE_ATTR_RW(comp_algorithm); static 
struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -1156,10 +1170,24 @@ static struct attribute_group zram_disk_attr_group = { .attrs = zram_disk_attrs, }; -static int create_device(struct zram *zram, int device_id) +/* + * Allocate and initialize new zram device. the function returns + * '>= 0' device_id upon success, and negative value otherwise. + */ +static int zram_add(void) { + struct zram *zram; struct request_queue *queue; - int ret = -ENOMEM; + int ret, device_id; + + zram = kzalloc(sizeof(struct zram), GFP_KERNEL); + if (!zram) + return -ENOMEM; + + ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL); + if (ret < 0) + goto out_free_dev; + device_id = ret; init_rwsem(&zram->init_lock); @@ -1167,12 +1195,13 @@ static int create_device(struct zram *zram, int device_id) if (!queue) { pr_err("Error allocating disk queue for device %d\n", device_id); - goto out; + ret = -ENOMEM; + goto out_free_idr; } blk_queue_make_request(queue, zram_make_request); - /* gendisk structure */ + /* gendisk structure */ zram->disk = alloc_disk(1); if (!zram->disk) { pr_warn("Error allocating disk structure for device %d\n", @@ -1230,90 +1259,177 @@ static int create_device(struct zram *zram, int device_id) strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; zram->max_comp_streams = 1; - return 0; + + pr_info("Added device: %s\n", zram->disk->disk_name); + return device_id; out_free_disk: del_gendisk(zram->disk); put_disk(zram->disk); out_free_queue: blk_cleanup_queue(queue); -out: +out_free_idr: + idr_remove(&zram_index_idr, device_id); +out_free_dev: + kfree(zram); return ret; } -static void destroy_devices(unsigned int nr) +static int zram_remove(struct zram *zram) +{ + struct block_device *bdev; + + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return -EBUSY; + } + + 
zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* + * Remove sysfs first, so no one will perform a disksize + * store while we destroy the devices. This also helps during + * zram_remove() -- zram_reset_device() is the last holder of + * ->init_lock, no later/concurrent disksize_store() or any + * other sysfs handlers are possible. + */ + sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, + &zram_disk_attr_group); + + /* Make sure all the pending I/O are finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + bdput(bdev); + + pr_info("Removed device: %s\n", zram->disk->disk_name); + + idr_remove(&zram_index_idr, zram->disk->first_minor); + blk_cleanup_queue(zram->disk->queue); + del_gendisk(zram->disk); + put_disk(zram->disk); + kfree(zram); + return 0; +} + +/* zram module control sysfs attributes */ +static ssize_t zram_add_show(struct class *class, + struct class_attribute *attr, + char *buf) +{ + int ret; + + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + + if (ret < 0) + return ret; + return scnprintf(buf, PAGE_SIZE, "%d\n", ret); +} + +static ssize_t zram_remove_store(struct class *class, + struct class_attribute *attr, + const char *buf, + size_t count) { struct zram *zram; - unsigned int i; + int ret, dev_id; - for (i = 0; i < nr; i++) { - zram = &zram_devices[i]; - /* - * Remove sysfs first, so no one will perform a disksize - * store while we destroy the devices - */ - sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, - &zram_disk_attr_group); + /* dev_id is gendisk->first_minor, which is `int' */ + ret = kstrtoint(buf, 10, &dev_id); + if (ret) + return ret; + if (dev_id < 0) + return -EINVAL; - zram_reset_device(zram); + mutex_lock(&zram_index_mutex); - blk_cleanup_queue(zram->disk->queue); - del_gendisk(zram->disk); - put_disk(zram->disk); - } + zram = idr_find(&zram_index_idr, dev_id); + if (zram) + ret = zram_remove(zram); + else + ret = -ENODEV; + + mutex_unlock(&zram_index_mutex); + return ret 
? ret : count; +} + +static struct class_attribute zram_control_class_attrs[] = { + __ATTR_RO(zram_add), + __ATTR_WO(zram_remove), + __ATTR_NULL, +}; + +static struct class zram_control_class = { + .name = "zram-control", + .owner = THIS_MODULE, + .class_attrs = zram_control_class_attrs, +}; + +static int zram_remove_cb(int id, void *ptr, void *data) +{ + zram_remove(ptr); + return 0; +} - kfree(zram_devices); +static void destroy_devices(void) +{ + class_unregister(&zram_control_class); + idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + idr_destroy(&zram_index_idr); unregister_blkdev(zram_major, "zram"); - pr_info("Destroyed %u device(s)\n", nr); } static int __init zram_init(void) { - int ret, dev_id; + int ret; - if (num_devices > max_num_devices) { - pr_warn("Invalid value for num_devices: %u\n", - num_devices); - return -EINVAL; + ret = class_register(&zram_control_class); + if (ret) { + pr_warn("Unable to register zram-control class\n"); + return ret; } zram_major = register_blkdev(0, "zram"); if (zram_major <= 0) { pr_warn("Unable to get major number\n"); + class_unregister(&zram_control_class); return -EBUSY; } - /* Allocate the device array and initialize each one */ - zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL); - if (!zram_devices) { - unregister_blkdev(zram_major, "zram"); - return -ENOMEM; - } - - for (dev_id = 0; dev_id < num_devices; dev_id++) { - ret = create_device(&zram_devices[dev_id], dev_id); - if (ret) + while (num_devices != 0) { + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + if (ret < 0) goto out_error; + num_devices--; } - pr_info("Created %u device(s)\n", num_devices); return 0; out_error: - destroy_devices(dev_id); + destroy_devices(); return ret; } static void __exit zram_exit(void) { - destroy_devices(num_devices); + destroy_devices(); } module_init(zram_init); module_exit(zram_exit); module_param(num_devices, uint, 0); -MODULE_PARM_DESC(num_devices, "Number of 
zram devices"); +MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>"); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 570c598f4ce9..6dbe2df506bf 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -20,12 +20,6 @@ #include "zcomp.h" -/* - * Some arbitrary value. This is just to catch - * invalid value for num_devices module parameter. - */ -static const unsigned max_num_devices = 32; - /*-- Configurable parameters */ /* @@ -121,5 +115,9 @@ struct zram { */ u64 disksize; /* bytes */ char compressor[10]; + /* + * zram is claimed so open request will be failed + */ + bool claim; /* Protected by bdev->bd_mutex */ }; #endif diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index aeabaa5aedf7..48db922075e2 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -419,10 +419,10 @@ static int jmb38x_ms_issue_cmd(struct memstick_host *msh) } if (host->cmd_flags & DMA_DATA) { - if (1 != pci_map_sg(host->chip->pdev, &host->req->sg, 1, + if (1 != dma_map_sg(&host->chip->pdev->dev, &host->req->sg, 1, host->req->data_dir == READ - ? PCI_DMA_FROMDEVICE - : PCI_DMA_TODEVICE)) { + ? DMA_FROM_DEVICE + : DMA_TO_DEVICE)) { host->req->error = -ENOMEM; return host->req->error; } @@ -487,9 +487,9 @@ static void jmb38x_ms_complete_cmd(struct memstick_host *msh, int last) writel(0, host->addr + DMA_CONTROL); if (host->cmd_flags & DMA_DATA) { - pci_unmap_sg(host->chip->pdev, &host->req->sg, 1, + dma_unmap_sg(&host->chip->pdev->dev, &host->req->sg, 1, host->req->data_dir == READ - ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); + ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { t_val = readl(host->addr + INT_STATUS_ENABLE); if (host->req->data_dir == READ) @@ -925,7 +925,7 @@ static int jmb38x_ms_probe(struct pci_dev *pdev, int pci_dev_busy = 0; int rc, cnt; - rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); if (rc) return rc; diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c index e2a4f5f415b2..ef09ba0289d7 100644 --- a/drivers/memstick/host/r592.c +++ b/drivers/memstick/host/r592.c @@ -754,7 +754,7 @@ static int r592_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto error2; pci_set_master(pdev); - error = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + error = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); if (error) goto error3; @@ -787,8 +787,8 @@ static int r592_probe(struct pci_dev *pdev, const struct pci_device_id *id) } /* This is just a precation, so don't fail */ - dev->dummy_dma_page = pci_alloc_consistent(pdev, PAGE_SIZE, - &dev->dummy_dma_page_physical_address); + dev->dummy_dma_page = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, + &dev->dummy_dma_page_physical_address, GFP_KERNEL); r592_stop_dma(dev , 0); if (request_irq(dev->irq, &r592_irq, IRQF_SHARED, @@ -805,7 +805,7 @@ error7: free_irq(dev->irq, dev); error6: if (dev->dummy_dma_page) - pci_free_consistent(pdev, PAGE_SIZE, dev->dummy_dma_page, + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->dummy_dma_page, dev->dummy_dma_page_physical_address); kthread_stop(dev->io_thread); @@ -845,7 +845,7 @@ static void r592_remove(struct pci_dev *pdev) memstick_free_host(dev->host); if (dev->dummy_dma_page) - pci_free_consistent(pdev, PAGE_SIZE, dev->dummy_dma_page, + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->dummy_dma_page, dev->dummy_dma_page_physical_address); } diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index ea2a315df6b7..272fc934ce6b 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -61,6 +61,8 @@ static int rtc_suspend(struct device 
*dev) if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0) return 0; + rtc->valid_alarm = !rtc_read_alarm(rtc, &rtc->alarm); + /* snapshot the current RTC and system time at suspend*/ err = rtc_read_time(rtc, &tm); if (err < 0) { @@ -105,6 +107,27 @@ static int rtc_resume(struct device *dev) if (timekeeping_rtc_skipresume()) return 0; + /* + * Ensure that the platform hasn't overwritten a pending alarm while + * suspended + */ + if (rtc->valid_alarm) { + long now, scheduled; + + rtc_read_time(rtc, &tm); + rtc_tm_to_time(&rtc->alarm.time, &scheduled); + rtc_tm_to_time(&tm, &now); + + /* Clear the alarm registers if it went off during suspend */ + if (scheduled <= now) { + rtc_time_to_tm(0, &rtc->alarm.time); + rtc->alarm.enabled = 0; + } + + if (rtc->ops && rtc->ops->set_alarm) + rtc->ops->set_alarm(rtc->dev.parent, &rtc->alarm); + } + rtc_hctosys_ret = -ENODEV; if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0) return 0; @@ -141,6 +164,7 @@ static int rtc_resume(struct device *dev) if (sleep_time.tv_sec >= 0) timekeeping_inject_sleeptime64(&sleep_time); rtc_hctosys_ret = 0; + return 0; } diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index a6b14c24d7e3..31f11ab214a0 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -494,7 +494,10 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) struct rtc_time tm; ktime_t now, onesec; - __rtc_read_time(rtc, &tm); + err = __rtc_read_time(rtc, &tm); + if (err < 0) + goto out; + onesec = ktime_set(1, 0); now = rtc_tm_to_ktime(tm); rtc->uie_rtctimer.node.expires = ktime_add(now, onesec); @@ -872,13 +875,17 @@ void rtc_timer_do_work(struct work_struct *work) struct timerqueue_node *next; ktime_t now; struct rtc_time tm; + int err = 0; struct rtc_device *rtc = container_of(work, struct rtc_device, irqwork); mutex_lock(&rtc->ops_lock); again: - __rtc_read_time(rtc, &tm); + err = __rtc_read_time(rtc, &tm); + if (err < 0) + goto out; + now = 
rtc_tm_to_ktime(tm); while ((next = timerqueue_getnext(&rtc->timerqueue))) { if (next->expires.tv64 > now.tv64) @@ -903,7 +910,6 @@ again: /* Set next alarm */ if (next) { struct rtc_wkalrm alarm; - int err; int retry = 3; alarm.time = rtc_ktime_to_tm(next->expires); @@ -925,6 +931,7 @@ reprogram: } else rtc_alarm_disable(rtc); +out: pm_relax(rtc->dev.parent); mutex_unlock(&rtc->ops_lock); } diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 4ffabb322a9a..6e76de1856fc 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -742,17 +742,17 @@ static int mcp794xx_set_alarm(struct device *dev, struct rtc_wkalrm *t) regs[6] &= ~MCP794XX_BIT_ALMX_IF; /* Set alarm match: second, minute, hour, day, date, month. */ regs[6] |= MCP794XX_MSK_ALMX_MATCH; - - if (t->enabled) - regs[0] |= MCP794XX_BIT_ALM0_EN; - else - regs[0] &= ~MCP794XX_BIT_ALM0_EN; + /* Disable interrupt. We will not enable until completely programmed */ + regs[0] &= ~MCP794XX_BIT_ALM0_EN; ret = ds1307->write_block_data(client, MCP794XX_REG_CONTROL, 10, regs); if (ret < 0) return ret; - return 0; + if (!t->enabled) + return 0; + regs[0] |= MCP794XX_BIT_ALM0_EN; + return i2c_smbus_write_byte_data(client, MCP794XX_REG_CONTROL, regs[0]); } static int mcp794xx_alarm_irq_enable(struct device *dev, unsigned int enabled) diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index 8b6355ffaff9..7fc666a0bb2e 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -107,6 +107,8 @@ /* OMAP_RTC_OSC_REG bit fields: */ #define OMAP_RTC_OSC_32KCLK_EN BIT(6) +#define OMAP_RTC_OSC_OSC32K_GZ BIT(4) +#define OMAP_RTC_OSC_EXT_32K BIT(3) /* OMAP_RTC_IRQWAKEEN bit fields: */ #define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN BIT(1) @@ -122,6 +124,7 @@ struct omap_rtc; struct omap_rtc_device_type { bool has_32kclk_en; + bool has_osc_ext_32k; bool has_irqwakeen; bool has_pmic_mode; bool has_power_up_reset; @@ -481,6 +484,7 @@ static const struct omap_rtc_device_type omap_rtc_default_type 
= { static const struct omap_rtc_device_type omap_rtc_am3352_type = { .has_32kclk_en = true, + .has_osc_ext_32k = true, .has_irqwakeen = true, .has_pmic_mode = true, .lock = am3352_rtc_lock, @@ -577,7 +581,15 @@ static int omap_rtc_probe(struct platform_device *pdev) if (rtc->type->has_32kclk_en) { reg = rtc_read(rtc, OMAP_RTC_OSC_REG); rtc_writel(rtc, OMAP_RTC_OSC_REG, - reg | OMAP_RTC_OSC_32KCLK_EN); + reg | OMAP_RTC_OSC_32KCLK_EN); + } + + /* Enable external clock as the source */ + if (rtc->type->has_osc_ext_32k) { + rtc_writel(rtc, OMAP_RTC_OSC_REG, + (OMAP_RTC_OSC_EXT_32K | + rtc_read(rtc, OMAP_RTC_OSC_REG)) & + (~OMAP_RTC_OSC_OSC32K_GZ)); } /* clear old status */ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index a19c31d3f369..4d4a0df8344f 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -242,7 +242,7 @@ static struct kmem_cache *adfs_inode_cachep; static struct inode *adfs_alloc_inode(struct super_block *sb) { struct adfs_inode_info *ei; - ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index a8f463c028ce..5fa92bc790ef 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -140,7 +140,7 @@ affs_remove_link(struct dentry *dentry) { struct inode *dir, *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; - struct buffer_head *bh = NULL, *link_bh = NULL; + struct buffer_head *bh, *link_bh = NULL; u32 link_ino, ino; int retval; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index a022f4accd76..17349500592d 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -346,7 +346,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 { struct super_block *sb = dir->i_sb; struct buffer_head *inode_bh = NULL; - struct buffer_head *bh = NULL; + struct buffer_head *bh; u32 block = 0; int retval; diff --git a/fs/befs/btree.c 
b/fs/befs/btree.c index 0826e91dacda..22c166280883 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -137,8 +137,8 @@ static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, befs_btree_super * sup) { - struct buffer_head *bh = NULL; - befs_disk_btree_super *od_sup = NULL; + struct buffer_head *bh; + befs_disk_btree_super *od_sup; befs_debug(sb, "---> %s", __func__); @@ -250,7 +250,7 @@ int befs_btree_find(struct super_block *sb, befs_data_stream * ds, const char *key, befs_off_t * value) { - struct befs_btree_node *this_node = NULL; + struct befs_btree_node *this_node; befs_btree_super bt_super; befs_off_t node_off; int res; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5cfa7129d876..fa173aa682bb 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3390,13 +3390,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, * should have access to this page, we're safe to simply set * PG_locked without checking it first. */ - __set_page_locked(page); + __SetPageLocked(page); rc = add_to_page_cache_locked(page, mapping, page->index, GFP_KERNEL); /* give up if we can't stick it in the cache */ if (rc) { - __clear_page_locked(page); + __ClearPageLocked(page); return rc; } @@ -3417,10 +3417,10 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, if (*bytes + PAGE_CACHE_SIZE > rsize) break; - __set_page_locked(page); + __SetPageLocked(page); if (add_to_page_cache_locked(page, mapping, page->index, GFP_KERNEL)) { - __clear_page_locked(page); + __ClearPageLocked(page); break; } list_move_tail(&page->lru, tmplist); diff --git a/fs/configfs/item.c b/fs/configfs/item.c index e65f9ffbb999..4d6a30e76168 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref); * config_item_init - initialize item. * @item: item in question. 
*/ -void config_item_init(struct config_item *item) +static void config_item_init(struct config_item *item) { kref_init(&item->ci_kref); INIT_LIST_HEAD(&item->ci_entry); } -EXPORT_SYMBOL(config_item_init); /** * config_item_set_name - Set the name of an item diff --git a/fs/efs/super.c b/fs/efs/super.c index 7fca462ea4e3..c8411a30f7da 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -67,7 +67,7 @@ static struct kmem_cache * efs_inode_cachep; static struct inode *efs_alloc_inode(struct super_block *sb) { struct efs_inode_info *ei; - ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 8850254136ae..7002467bfbac 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -106,7 +106,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } if (!journal) { - ret = generic_file_fsync(file, start, end, datasync); + if (test_opt(inode->i_sb, BARRIER)) + ret = generic_file_fsync(file, start, end, datasync); + else + ret = __generic_file_fsync(file, start, end, datasync); if (!ret && !hlist_empty(&inode->i_dentry)) ret = ext4_sync_parent(inode); goto out; diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 93fc62232ec2..5d384921524d 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return dclus; } -int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, - unsigned long *mapped_blocks, int create) +int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); + int cluster, offset; + + cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); + offset = sector & (sbi->sec_per_clus - 1); + cluster = fat_bmap_cluster(inode, 
cluster); + if (cluster < 0) + return cluster; + else if (cluster) { + *bmap = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } + + return 0; +} + +static int is_exceed_eof(struct inode *inode, sector_t sector, + sector_t *last_block, int create) +{ + struct super_block *sb = inode->i_sb; const unsigned long blocksize = sb->s_blocksize; const unsigned char blocksize_bits = sb->s_blocksize_bits; + + *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; + if (sector >= *last_block) { + if (!create) + return 1; + + /* + * ->mmu_private can access on only allocation path. + * (caller must hold ->i_mutex) + */ + *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + >> blocksize_bits; + if (sector >= *last_block) + return 1; + } + + return 0; +} + +int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks, int create, bool from_bmap) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); sector_t last_block; - int cluster, offset; *phys = 0; *mapped_blocks = 0; @@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, return 0; } - last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; - if (sector >= last_block) { - if (!create) + if (!from_bmap) { + if (is_exceed_eof(inode, sector, &last_block, create)) return 0; - - /* - * ->mmu_private can access on only allocation path. 
- * (caller must hold ->i_mutex) - */ - last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) - >> blocksize_bits; + } else { + last_block = inode->i_blocks >> + (inode->i_sb->s_blocksize_bits - 9); if (sector >= last_block) return 0; } - cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); - offset = sector & (sbi->sec_per_clus - 1); - cluster = fat_bmap_cluster(inode, cluster); - if (cluster < 0) - return cluster; - else if (cluster) { - *phys = fat_clus_to_blknr(sbi, cluster) + offset; - *mapped_blocks = sbi->sec_per_clus - offset; - if (*mapped_blocks > last_block - sector) - *mapped_blocks = last_block - sector; - } - return 0; + return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, + phys); } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 4afc4d9d2e41..4c71c8c76426 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -91,7 +91,7 @@ next: *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; - err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0); + err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false); if (err || !phys) return -1; /* beyond EOF or error */ diff --git a/fs/fat/fat.h b/fs/fat/fat.h index be5e15323bab..4307cd4f8da0 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) extern void fat_cache_inval_inode(struct inode *inode); extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); +extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap); extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, - unsigned long *mapped_blocks, int create); + unsigned long *mapped_blocks, int create, bool from_bmap); /* fat/dir.c */ extern const struct file_operations fat_dir_operations; @@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart) { return hash_32(logstart, FAT_HASH_BITS); } +extern 
int fat_add_cluster(struct inode *inode); /* fat/misc.c */ extern __printf(3, 4) __cold diff --git a/fs/fat/file.c b/fs/fat/file.c index 442d50a0e33e..b90cc275bfaa 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -13,8 +13,12 @@ #include <linux/blkdev.h> #include <linux/fsnotify.h> #include <linux/security.h> +#include <linux/falloc.h> #include "fat.h" +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len); + static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; @@ -176,6 +180,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode, loff_t size) @@ -214,6 +219,62 @@ out: return err; } +/* + * Preallocate space for a file. This implements fat's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. + */ +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + int nr_cluster; /* Number of clusters to be allocated */ + loff_t mm_bytes; /* Number of bytes to be allocated for file */ + loff_t ondisksize; /* block aligned on-disk size in bytes*/ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int err = 0; + + /* No support for hole punch or other fallocate flags. 
*/ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* No support for dir */ + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_KEEP_SIZE) { + ondisksize = inode->i_blocks << 9; + if ((offset + len) <= ondisksize) + goto error; + + /* First compute the number of clusters to be allocated */ + mm_bytes = offset + len - ondisksize; + nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >> + sbi->cluster_bits; + + /* Start the allocation.We are not zeroing out the clusters */ + while (nr_cluster-- > 0) { + err = fat_add_cluster(inode); + if (err) + goto error; + } + } else { + if ((offset + len) <= i_size_read(inode)) + goto error; + + /* This is just an expanding truncate */ + err = fat_cont_expand(inode, (offset + len)); + } + +error: + mutex_unlock(&inode->i_mutex); + return err; +} + /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index c06774658345..e69c84acad75 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -92,7 +92,7 @@ static struct fat_floppy_defaults { }, }; -static int fat_add_cluster(struct inode *inode) +int fat_add_cluster(struct inode *inode) { int err, cluster; @@ -114,10 +114,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); unsigned long mapped_blocks; - sector_t phys; + sector_t phys, last_block; int err, offset; - err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; if (phys) { @@ -134,8 +134,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, return -EIO; } + last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9); offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); - if (!offset) { + /* + * allocate a cluster according to the following. 
+ * 1) no more available blocks + * 2) not part of fallocate region + */ + if (!offset && !(iblock < last_block)) { /* TODO: multiple cluster allocation would be desirable. */ err = fat_add_cluster(inode); if (err) @@ -147,7 +153,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, *max_blocks = min(mapped_blocks, *max_blocks); MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; - err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; @@ -272,13 +278,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return ret; } +static int fat_get_block_bmap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; + sector_t bmap; + unsigned long mapped_blocks; + + BUG_ON(create != 0); + + err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true); + if (err) + return err; + + if (bmap) { + map_bh(bh_result, sb, bmap); + max_blocks = min(mapped_blocks, max_blocks); + } + + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + + return 0; +} + static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ down_read(&MSDOS_I(mapping->host)->truncate_lock); - blocknr = generic_block_bmap(mapping, block, fat_get_block); + blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap); up_read(&MSDOS_I(mapping->host)->truncate_lock); return blocknr; @@ -552,13 +583,43 @@ out: EXPORT_SYMBOL_GPL(fat_build_inode); +static int __fat_write_inode(struct inode *inode, int wait); + +static void fat_free_eofblocks(struct inode *inode) +{ + /* Release unwritten fallocated blocks on inode eviction. 
*/ + if ((inode->i_blocks << 9) > + round_up(MSDOS_I(inode)->mmu_private, + MSDOS_SB(inode->i_sb)->cluster_size)) { + int err; + + fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private); + /* Fallocate results in updating the i_start/iogstart + * for the zero byte file. So, make it return to + * original state during evict and commit it to avoid + * any corruption on the next access to the cluster + * chain for the file. + */ + err = __fat_write_inode(inode, inode_needs_sync(inode)); + if (err) { + fat_msg(inode->i_sb, KERN_WARNING, "Failed to " + "update on disk inode for unused " + "fallocated blocks, inode could be " + "corrupted. Please run fsck"); + } + + } +} + static void fat_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); - } + } else + fat_free_eofblocks(inode); + invalidate_inode_buffers(inode); clear_inode(inode); fat_cache_inval_inode(inode); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 87724c1d7be6..0cf74df68617 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -130,7 +130,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) goto out; ret = 0; - hugetlb_prefault_arch_hook(vma->vm_mm); if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b96bd8076b70..0bc333b4a594 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); -retry_alloc: - new_bh = alloc_buffer_head(GFP_NOFS); - if (!new_bh) { - /* - * Failure is not an option, but __GFP_NOFAIL is going - * away; so we retry ourselves here. 
- */ - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry_alloc; - } + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5f09370c90a8..dac4523fa142 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -278,22 +278,16 @@ static int start_this_handle(journal_t *journal, handle_t *handle, alloc_transaction: if (!journal->j_running_transaction) { + /* + * If __GFP_FS is not present, then we may be being called from + * inside the fs writeback layer, so we MUST NOT fail. + */ + if ((gfp_mask & __GFP_FS) == 0) + gfp_mask |= __GFP_NOFAIL; new_transaction = kmem_cache_zalloc(transaction_cache, gfp_mask); - if (!new_transaction) { - /* - * If __GFP_FS is not present, then we may be - * being called from inside the fs writeback - * layer, so we MUST NOT fail. Since - * __GFP_NOFAIL is going away, we will arrange - * to retry the allocation ourselves. - */ - if ((gfp_mask & __GFP_FS) == 0) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto alloc_transaction; - } + if (!new_transaction) return -ENOMEM; - } } jbd_debug(3, "New handle %p going live.\n", handle); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index f131fc23ffc4..fffca9517321 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -518,7 +518,14 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + /* + * If the ino of the sysfs entry created for a kmem cache gets + * allocated from an ida layer, which is accounted to the memcg that + * owns the cache, the memcg will get pinned forever. So do not account + * ino ida allocations. 
+ */ + ret = ida_simple_get(&root->ino_ida, 1, 0, + GFP_KERNEL | __GFP_NOACCOUNT); if (ret < 0) goto err_out2; kn->ino = ret; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 1182d1e26a9c..086cd0a61e80 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -62,7 +62,7 @@ static struct kmem_cache * minix_inode_cachep; static struct inode *minix_alloc_inode(struct super_block *sb) { struct minix_inode_info *ei; - ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/mpage.c b/fs/mpage.c index 3e79220babac..587c7ed4185d 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -482,6 +482,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; + int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -590,7 +591,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? 
*/ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); alloc_new: if (bio == NULL) { @@ -614,7 +615,7 @@ alloc_new: */ length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); goto alloc_new; } @@ -624,7 +625,7 @@ alloc_new: set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -636,7 +637,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); @@ -692,8 +693,11 @@ mpage_writepages(struct address_space *mapping, }; ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } } blk_finish_plug(&plug); return ret; @@ -710,8 +714,11 @@ int mpage_writepage(struct page *page, get_block_t get_block, .use_writepage = 0, }; int ret = __mpage_writepage(page, wbc, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? 
+ WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } return ret; } EXPORT_SYMBOL(mpage_writepage); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 2d7f76e52c37..0afb4cb7ce1b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - ocfs2_error(sb, - "Extent block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - eb->h_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + goto bail; } if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extent block #%llu has an invalid h_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(eb->h_blkno)); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has an invalid h_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + goto bail; } if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extent block #%llu has an invalid " - "h_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(eb->h_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has an invalid h_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); + goto bail; } - - return 0; +bail: + return rc; } int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, @@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty " - "extent list (next_free_rec == 0)", + "Owner %llu has empty extent list (next_free_rec == 0)\n", (unsigned long 
long)ocfs2_metadata_cache_owner(et->et_ci)); status = -EIO; goto bail; @@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has extent " - "list where extent # %d has no physical " - "block start", + "Owner %llu has extent list where extent # %d has no physical block start\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); status = -EIO; goto bail; @@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, while (el->l_tree_depth) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has empty extent list at " - "depth %u\n", + "Owner %llu has empty extent list at depth %u\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth)); ret = -EROFS; @@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (blkno == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad blkno in extent list " - "at depth %u (index %d)\n", + "Owner %llu has bad blkno in extent list at depth %u (index %d)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth), i); ret = -EROFS; @@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad count in extent list " - "at block %llu (next free=%u, count=%u)\n", + "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr, le16_to_cpu(el->l_next_free_rec), @@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, if (left_el->l_next_free_rec != left_el->l_count) 
{ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Inode %llu has non-full interior leaf node %llu" - "(next free = %u)", + "Inode %llu has non-full interior leaf node %llu (next free = %u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)left_leaf_bh->b_blocknr, le16_to_cpu(left_el->l_next_free_rec)); @@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. */ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -2526,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle, struct ocfs2_extent_block *eb; u32 range; - /* - * In normal tree rotation process, we will never touch the - * tree branch above subtree_index and ocfs2_extend_rotate_transaction - * doesn't reserve the credits for them either. - * - * But we do have a special case here which will update the rightmost - * records for all the bh in the path. - * So we have to allocate extra credits and access them. - */ - ret = ocfs2_extend_trans(handle, subtree_index); - if (ret) { - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); @@ -2872,8 +2847,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. 
*/ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -2925,7 +2899,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, struct ocfs2_path *right_path = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); - BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); + if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))) + return 0; *empty_extent_path = NULL; @@ -2966,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, right_path->p_node[subtree_root].bh->b_blocknr, right_path->p_tree_depth); - ret = ocfs2_extend_rotate_transaction(handle, subtree_root, + ret = ocfs2_extend_rotate_transaction(handle, 0, orig_credits, left_path); if (ret) { mlog_errno(ret); @@ -3039,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - ret = ocfs2_et_sanity_check(et); if (ret) goto out; - /* - * There's two ways we handle this depending on - * whether path is the only existing one. - */ - ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, - path); - if (ret) { - mlog_errno(ret); - goto out; - } ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { @@ -3130,6 +3093,30 @@ out: return ret; } +static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + handle_t *handle; + int ret; + int credits = path->p_tree_depth * 2 + 1; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + return ret; +} + /* * Left rotation of btree records. 
* @@ -3199,7 +3186,7 @@ rightmost_no_delete: if (le16_to_cpu(el->l_next_free_rec) == 0) { ret = -EIO; ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty extent block at %llu", + "Owner %llu has empty extent block at %llu\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; @@ -3627,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, */ if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && le16_to_cpu(el->l_next_free_rec) == 1) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } ret = ocfs2_remove_rightmost_path(handle, et, right_path, @@ -3665,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, BUG_ON(ctxt->c_contig_type == CONTIG_NONE); if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } /* * The merge code will need to create an empty * extent to take the place of the newly @@ -3713,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, */ BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + /* The merge left us with an empty extent, remove it. 
*/ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { @@ -3734,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, goto out; } + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); /* * Error from this last rotate is not critical, so @@ -3769,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, } if (ctxt->c_split_covers_rec) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + ret = 0; + goto out; + } + /* * The merge may have left an empty extent in * our leaf. Try to rotate it away. @@ -3929,7 +3960,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, next_free = le16_to_cpu(el->l_next_free_rec); if (next_free == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has a bad extent list", + "Owner %llu has a bad extent list\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); ret = -EIO; return; @@ -4311,13 +4342,13 @@ out: return ret; } -static enum ocfs2_contig_type -ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, +static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_extent_list *el, int index, - struct ocfs2_extent_rec *split_rec) + struct ocfs2_extent_rec *split_rec, + struct ocfs2_merge_ctxt *ctxt) { - int status; + int status = 0; enum ocfs2_contig_type ret = CONTIG_NONE; u32 left_cpos, right_cpos; struct ocfs2_extent_rec *rec = NULL; @@ -4336,8 +4367,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, if (left_cpos != 0) { left_path = ocfs2_new_path_from_path(path); - if (!left_path) + if (!left_path) { + status = -ENOMEM; + mlog_errno(status); goto exit; + } 
status = ocfs2_find_path(et->et_ci, left_path, left_cpos); @@ -4351,10 +4385,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of " - "%d. It should have " - "matched the l_count of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec), le16_to_cpu(new_el->l_count)); @@ -4392,8 +4423,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, goto free_left_path; right_path = ocfs2_new_path_from_path(path); - if (!right_path) + if (!right_path) { + status = -ENOMEM; + mlog_errno(status); goto free_left_path; + } status = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (status) @@ -4406,8 +4440,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec)); status = -EINVAL; @@ -4433,7 +4466,10 @@ free_right_path: free_left_path: ocfs2_free_path(left_path); exit: - return ret; + if (status == 0) + ctxt->c_contig_type = ret; + + return status; } static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, @@ -4960,10 +4996,9 @@ leftright: split_index = ocfs2_search_extent_list(el, cpos); if (split_index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u " - "which can no longer be found.\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), - cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + 
cpos); ret = -EROFS; goto out; } @@ -5039,9 +5074,14 @@ int ocfs2_split_extent(handle_t *handle, goto out; } - ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, - split_index, - split_rec); + ret = ocfs2_figure_merge_contig_type(et, path, el, + split_index, + split_rec, + &ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } /* * The core merge / split code wants to know how much room is @@ -5143,10 +5183,9 @@ int ocfs2_change_extent_flag(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(sb, - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long) - ocfs2_metadata_cache_owner(et->et_ci), cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5213,9 +5252,7 @@ int ocfs2_mark_extent_written(struct inode *inode, cpos, len, phys); if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " - "that are being written to, but the feature bit " - "is not set in the super block.", + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); ret = -EROFS; goto out; @@ -5322,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle, struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); @@ -5499,8 +5545,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { 
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", + "Owner %llu has an extent at cpos %u which can no longer be found\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5565,7 +5610,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: split at cpos %u lost record.", + "Owner %llu: split at cpos %u lost record\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5581,8 +5626,7 @@ int ocfs2_remove_extent(handle_t *handle, ocfs2_rec_clusters(el, rec); if (rec_range != trunc_range) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: error after split at cpos %u" - "trunc len %u, existing record is (%u,%u)", + "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos, len, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); @@ -5910,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, tl_bh); - /* TODO: Perhaps we can calculate the bulk of the - * credits up front rather than extending like - * this. 
*/ - status = ocfs2_extend_trans(handle, - OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (status < 0) { - mlog_errno(status); - goto bail; - } - rec = tl->tl_recs[i]; start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, le32_to_cpu(rec.t_start)); @@ -5940,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, goto bail; } } + + status = ocfs2_extend_trans(handle, + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (status < 0) { + mlog_errno(status); + goto bail; + } i--; } @@ -5998,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -7096,12 +7137,20 @@ start: ocfs2_error(inode->i_sb, "Inode %lu has an empty " "extent record, depth %u\n", inode->i_ino, le16_to_cpu(root_el->l_tree_depth)); - status = -EROFS; - goto bail; + status = ocfs2_remove_rightmost_empty_extent(osb, + &et, path, &dealloc); + if (status) { + mlog_errno(status); + goto bail; + } + + ocfs2_reinit_path(path, 1); + goto start; + } else { + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; } - trunc_cpos = le32_to_cpu(rec->e_cpos); - trunc_len = 0; - blkno = 0; } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { /* * Truncate entire record. @@ -7189,8 +7238,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || !ocfs2_supports_inline_data(osb)) { ocfs2_error(inode->i_sb, - "Inline data flags for inode %llu don't agree! " - "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", + "Inline data flags for inode %llu don't agree! 
Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, le16_to_cpu(di->i_dyn_features), OCFS2_I(inode)->ip_dyn_features, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f906a250da6a..f676b344fb30 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { - ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", + ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); return -EROFS; } @@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, if (size > PAGE_CACHE_SIZE || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, - "Inode %llu has with inline data has bad size: %Lu", + "Inode %llu has with inline data has bad size: %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)size); return -EROFS; @@ -925,13 +925,23 @@ clean_orphan: int update_isize = written > 0 ? 1 : 0; loff_t end = update_isize ? offset + written : 0; - tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, + tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (tmp_ret < 0) { + ret = tmp_ret; + mlog_errno(ret); + goto out; + } + + tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, update_isize, end); if (tmp_ret < 0) { ret = tmp_ret; + mlog_errno(ret); goto out; } + ocfs2_inode_unlock(inode, 1); + tmp_ret = jbd2_journal_force_commit(journal); if (tmp_ret < 0) { ret = tmp_ret; @@ -2176,16 +2186,6 @@ try_again: if (ret) goto out_commit; } - /* - * We don't want this to fail in ocfs2_write_end(), so do it - * here. - */ - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out_quota; - } /* * Fill our page array first. 
That way we've grabbed enough so @@ -2336,7 +2336,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - int i; + int i, ret; unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2386,6 +2386,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } } + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + copied = ret; + mlog_errno(ret); + goto out; + } + out_write_size: pos += copied; if (pos > i_size_read(inode)) { @@ -2407,6 +2415,7 @@ out_write_size: */ ocfs2_unlock_pages(wc); +out: ocfs2_commit_trans(osb, handle); ocfs2_run_deallocs(osb, &wc->w_dealloc); diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 1edcb141f639..fe50ded1b4ce 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { + if (status) { + /* Clear the rest of the buffers on error */ + put_bh(bh); + bhs[i] = NULL; + continue; + } /* We know this can't have changed as we hold the * owner sem. Avoid doing any work on the bh if the * journal has it. */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 16eff45727ee..3a60c83218db 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -36,7 +36,7 @@ #include <linux/debugfs.h> #include <linux/slab.h> #include <linux/bitmap.h> - +#include <linux/ktime.h> #include "heartbeat.h" #include "tcp.h" #include "nodemanager.h" @@ -1061,37 +1061,6 @@ bail: return ret; } -/* Subtract b from a, storing the result in a. a *must* have a larger - * value than b. 
*/ -static void o2hb_tv_subtract(struct timeval *a, - struct timeval *b) -{ - /* just return 0 when a is after b */ - if (a->tv_sec < b->tv_sec || - (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { - a->tv_sec = 0; - a->tv_usec = 0; - return; - } - - a->tv_sec -= b->tv_sec; - a->tv_usec -= b->tv_usec; - while ( a->tv_usec < 0 ) { - a->tv_sec--; - a->tv_usec += 1000000; - } -} - -static unsigned int o2hb_elapsed_msecs(struct timeval *start, - struct timeval *end) -{ - struct timeval res = *end; - - o2hb_tv_subtract(&res, start); - - return res.tv_sec * 1000 + res.tv_usec / 1000; -} - /* * we ride the region ref that the region dir holds. before the region * dir is removed and drops it ref it will wait to tear down this @@ -1102,7 +1071,7 @@ static int o2hb_thread(void *data) int i, ret; struct o2hb_region *reg = data; struct o2hb_bio_wait_ctxt write_wc; - struct timeval before_hb, after_hb; + ktime_t before_hb, after_hb; unsigned int elapsed_msec; mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); @@ -1119,18 +1088,18 @@ static int o2hb_thread(void *data) * hr_timeout_ms between disk writes. On busy systems * this should result in a heartbeat which is less * likely to time itself out. 
*/ - do_gettimeofday(&before_hb); + before_hb = ktime_get_real(); ret = o2hb_do_disk_heartbeat(reg); - do_gettimeofday(&after_hb); - elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + after_hb = ktime_get_real(); + + elapsed_msec = (unsigned int) + ktime_ms_delta(after_hb, before_hb); mlog(ML_HEARTBEAT, - "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", - before_hb.tv_sec, (unsigned long) before_hb.tv_usec, - after_hb.tv_sec, (unsigned long) after_hb.tv_usec, - elapsed_msec, ret); + "start = %lld, end = %lld, msec = %u, ret = %d\n", + before_hb.tv64, after_hb.tv64, elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index af7598bff1b5..dfe162f5fd4c 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -64,6 +64,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) return count; } +void __mlog_printk(const u64 *mask, const char *func, int line, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + const char *level; + const char *prefix = ""; + + if (!__mlog_test_u64(*mask, mlog_and_bits) || + __mlog_test_u64(*mask, mlog_not_bits)) + return; + + if (*mask & ML_ERROR) { + level = KERN_ERR; + prefix = "ERROR: "; + } else if (*mask & ML_NOTICE) { + level = KERN_NOTICE; + } else { + level = KERN_INFO; + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s(%s,%u,%u):%s:%d %s%pV", + level, current->comm, task_pid_nr(current), + raw_smp_processor_id(), func, line, prefix, &vaf); + + va_end(args); +} +EXPORT_SYMBOL_GPL(__mlog_printk); + struct mlog_attribute { struct attribute attr; u64 mask; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 7fdc25a4d8c0..308ea0eb35fd 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -162,38 +162,20 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; #endif -/* - * smp_processor_id() "helpfully" screams when called outside preemptible - * regions in current kernels. sles doesn't have the variants that don't - * scream. just do this instead of trying to guess which we're building - * against.. *sigh*. - */ -#define __mlog_cpu_guess ({ \ - unsigned long _cpu = get_cpu(); \ - put_cpu(); \ - _cpu; \ -}) +__printf(4, 5) +void __mlog_printk(const u64 *m, const char *func, int line, + const char *fmt, ...); -/* In the following two macros, the whitespace after the ',' just - * before ##args is intentional. Otherwise, gcc 2.95 will eat the - * previous token if args expands to nothing. +/* + * Testing before the __mlog_printk call lets the compiler eliminate the + * call completely when (m & ML_ALLOWED_BITS) is 0. */ -#define __mlog_printk(level, fmt, args...) \ - printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ - task_pid_nr(current), __mlog_cpu_guess, \ - __PRETTY_FUNCTION__, __LINE__ , ##args) - -#define mlog(mask, fmt, args...) 
do { \ - u64 __m = MLOG_MASK_PREFIX | (mask); \ - if ((__m & ML_ALLOWED_BITS) && \ - __mlog_test_u64(__m, mlog_and_bits) && \ - !__mlog_test_u64(__m, mlog_not_bits)) { \ - if (__m & ML_ERROR) \ - __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ - else if (__m & ML_NOTICE) \ - __mlog_printk(KERN_NOTICE, fmt , ##args); \ - else __mlog_printk(KERN_INFO, fmt , ##args); \ - } \ +#define mlog(mask, fmt, ...) \ +do { \ + u64 _m = MLOG_MASK_PREFIX | (mask); \ + if (_m & ML_ALLOWED_BITS) \ + __mlog_printk(&_m, __func__, __LINE__, fmt, \ + ##__VA_ARGS__); \ } while (0) #define mlog_errno(st) ({ \ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ccd4dcfc3645..a9513ff0a47f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Invalid dirblock #%llu: " - "signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - trailer->db_signature); + rc = ocfs2_error(dir->i_sb, + "Invalid dirblock #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); goto out; } if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu has an invalid " - "db_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu has an invalid db_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } if (le64_to_cpu(trailer->db_parent_dinode) != OCFS2_I(dir)->ip_blkno) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu on dinode " - "#%llu has an invalid parent_dinode " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)OCFS2_I(dir)->ip_blkno, - (unsigned long 
long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } out: @@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { - ocfs2_error(sb, - "Dir Index Root # %llu has bad signature %.*s", - (unsigned long long)le64_to_cpu(dx_root->dr_blkno), - 7, dx_root->dr_signature); - return -EINVAL; + ret = ocfs2_error(sb, + "Dir Index Root # %llu has bad signature %.*s\n", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); } - return 0; + return ret; } static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, @@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { - ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", - 7, dx_leaf->dl_signature); - return -EROFS; + ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n", + 7, dx_leaf->dl_signature); } - return 0; + return ret; } static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, @@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "btree tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in btree tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, } if (!found) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in btree", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); - ret = 
-EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in btree\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } @@ -1617,7 +1607,7 @@ int __ocfs2_add_entry(handle_t *handle, struct ocfs2_dir_entry *de, *de1; struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; struct super_block *sb = dir->i_sb; - int retval, status; + int retval; unsigned int size = sb->s_blocksize; struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; @@ -1695,25 +1685,25 @@ int __ocfs2_add_entry(handle_t *handle, } if (insert_bh == parent_fe_bh) - status = ocfs2_journal_access_di(handle, + retval = ocfs2_journal_access_di(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); else { - status = ocfs2_journal_access_db(handle, + retval = ocfs2_journal_access_db(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (ocfs2_dir_indexed(dir)) { - status = ocfs2_dx_dir_insert(dir, + if (!retval && ocfs2_dir_indexed(dir)) + retval = ocfs2_dx_dir_insert(dir, handle, lookup); - if (status) { - mlog_errno(status); - goto bail; - } - } + } + + if (retval) { + mlog_errno(retval); + goto bail; } /* By now the buffer is marked for journaling */ diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index fae17c640df3..e88ccf8c83ff 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1014,7 +1014,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, /* will exit holding res->spinlock, but may drop in function */ void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); -void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); /* will exit holding res->spinlock, but may drop in function */ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index fdf4b41d0609..46b8b2bbc95a 100644 --- 
a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); - spin_lock(&dlm->track_lock); - if (!list_empty(&res->tracking)) - list_del_init(&res->tracking); - else { - mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", - res->lockname.len, res->lockname.name); - dlm_print_one_lock_resource(res); - } - spin_unlock(&dlm->track_lock); - atomic_dec(&dlm->res_cur_count); if (!hlist_unhashed(&res->hash_node) || @@ -795,8 +785,18 @@ lookup: dlm_lockres_grab_inflight_ref(dlm, tmpres); spin_unlock(&tmpres->spinlock); - if (res) + if (res) { + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else + mlog(ML_ERROR, "Resource %.*s not " + "on the Tracking list\n", + res->lockname.len, + res->lockname.name); + spin_unlock(&dlm->track_lock); dlm_lockres_put(res); + } res = tmpres; goto leave; } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 69aac6f088ad..2e5e6d5fffe8 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, __dlm_unhash_lockres(dlm, res); + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", + res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. 
*/ if (!master) { diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 767370b656ca..e4719e0a3f99 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0)", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0)\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; @@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in xattr leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, 
le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b254416dc8d9..5e7e39a3e85e 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1191,17 +1191,19 @@ void ocfs2_evict_inode(struct inode *inode) int ocfs2_drop_inode(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); - int res; trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); - if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) - res = 1; - else - res = generic_drop_inode(inode); + assert_spin_locked(&inode->i_lock); + inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_NEW); + inode->i_state &= ~I_WILL_FREE; - return res; + return 1; } /* @@ -1350,32 +1352,32 @@ int ocfs2_validate_inode_block(struct super_block *sb, rc = -EINVAL; if (!OCFS2_IS_VALID_DINODE(di)) { - ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - di->i_signature); + rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); goto bail; } if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { - ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(di->i_blkno)); + rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); goto bail; } if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { - ocfs2_error(sb, - "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", - (unsigned long long)bh->b_blocknr); + rc = ocfs2_error(sb, + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); goto bail; } if (le32_to_cpu(di->i_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Invalid dinode #%llu: fs_generation is %u\n", - 
(unsigned long long)bh->b_blocknr, - le32_to_cpu(di->i_fs_generation)); + rc = ocfs2_error(sb, + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); goto bail; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ff531928269e..a8b1ce85a95a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) mlog_errno(PTR_ERR(handle)); if (is_journal_aborted(journal)) { - ocfs2_abort(osb->sb, "Detected aborted journal"); + ocfs2_abort(osb->sb, "Detected aborted journal\n"); handle = ERR_PTR(-EROFS); } } else { @@ -2137,6 +2137,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, struct inode *inode = NULL; struct inode *iter; struct ocfs2_inode_info *oi; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; trace_ocfs2_recover_orphans(slot); @@ -2157,16 +2159,22 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto next; + } /* * We need to take and drop the inode lock to * force read inode from disk. */ - ret = ocfs2_inode_lock(inode, NULL, 0); + ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { mlog_errno(ret); - goto next; + goto unlock_rw; } - ocfs2_inode_unlock(inode, 0); + + di = (struct ocfs2_dinode *)di_bh->b_data; if (inode->i_nlink == 0) { spin_lock(&oi->ip_lock); @@ -2174,43 +2182,30 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, * ocfs2_delete_inode. 
*/ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; spin_unlock(&oi->ip_lock); - } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { - struct buffer_head *di_bh = NULL; - - ret = ocfs2_rw_lock(inode, 1); - if (ret) { - mlog_errno(ret); - goto next; - } - - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - ocfs2_rw_unlock(inode, 1); - mlog_errno(ret); - goto next; - } - + } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && + (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { ret = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); - ocfs2_inode_unlock(inode, 1); - ocfs2_rw_unlock(inode, 1); - brelse(di_bh); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); - goto next; + goto unlock_inode; } - ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); + ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); if (ret) mlog_errno(ret); wake_up(&OCFS2_I(inode)->append_dio_wq); } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ - +unlock_inode: + ocfs2_inode_unlock(inode, 1); +unlock_rw: + ocfs2_rw_unlock(inode, 1); next: iput(inode); - + brelse(di_bh); + di_bh = NULL; inode = iter; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 857bbbcd39f3..0a4457fb0711 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, #ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { - ocfs2_error(osb->sb, "local alloc inode %llu says it has " - "%u used bits, but a count shows %u", + ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", (unsigned long long)le64_to_cpu(alloc->i_blkno), le32_to_cpu(alloc->id1.bitmap1.i_used), ocfs2_local_alloc_count_bits(alloc)); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 56a768d06aa6..124471d26a73 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -99,11 +99,9 @@ static int 
__ocfs2_move_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 176fe6afd94e..a1507488d521 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1291,6 +1291,15 @@ static int ocfs2_rename(struct inode *old_dir, } parents_locked = 1; + if (!new_dir->i_nlink) { + mlog(ML_ERROR, "new dir %llu has been removed, inode %llu " + "can not be moved into it.", + (unsigned long long)new_dir->i_ino, + (unsigned long long)old_inode->i_ino); + status = -EACCES; + goto bail; + } + /* make sure both dirs have bhs * get an extra ref on old_dir_bh if old==new */ if (!new_dir_bh) { @@ -1551,12 +1560,25 @@ static int ocfs2_rename(struct inode *old_dir, status = ocfs2_find_entry(old_dentry->d_name.name, old_dentry->d_name.len, old_dir, &old_entry_lookup); - if (status) + if (status) { + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; + } status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); if (status < 0) { mlog_errno(status); + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; } @@ -2670,30 +2692,22 @@ bail: } int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, - struct inode *inode, int update_isize, - loff_t end) + struct inode *inode, struct buffer_head 
*di_bh, + int update_isize, loff_t end) { struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; - struct buffer_head *di_bh = NULL; - struct ocfs2_dinode *di = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle = NULL; int status = 0; - status = ocfs2_inode_lock(inode, &di_bh, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - di = (struct ocfs2_dinode *) di_bh->b_data; - orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, le16_to_cpu(di->i_dio_orphaned_slot)); if (!orphan_dir_inode) { status = -ENOENT; mlog_errno(status); - goto bail_unlock_inode; + goto bail; } mutex_lock(&orphan_dir_inode->i_mutex); @@ -2702,7 +2716,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); mlog_errno(status); - goto bail_unlock_inode; + goto bail; } handle = ocfs2_start_trans(osb, @@ -2749,10 +2763,6 @@ bail_unlock_orphan: brelse(orphan_dir_bh); iput(orphan_dir_inode); -bail_unlock_inode: - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - bail: return status; } diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 5ddecce172fa..e173329eb830 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -42,8 +42,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, struct inode *inode); int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, - struct inode *inode, int update_isize, - loff_t end); + struct inode *inode, struct buffer_head *di_bh, + int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 460c6c37e683..2c923223ec11 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -286,6 +286,8 @@ enum ocfs2_mount_options OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 
15, /* Journal Async Commit */ + OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ + OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 3d0b63d34225..964b727f4ee2 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { ocfs2_error(inode->i_sb, - "Quota file %llu is probably corrupted! Requested " - "to read block %Lu but file has size only %Lu\n", + "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)v_block, (unsigned long long)i_size_read(inode)); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index d8c6af101f3f..6f66268d03ca 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb, if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { - ocfs2_error(sb, - "Refcount block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - rb->rf_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + goto out; } if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Refcount block #%llu has an invalid rf_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(rb->rf_blkno)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + goto out; } if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - 
"Refcount block #%llu has an invalid " - "rf_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(rb->rf_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + goto out; } - - return 0; +out: + return rc; } static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, @@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(sb, - "refcount tree %llu has non zero tree " - "depth in leaf btree tree block %llu\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(sb, + "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -2361,10 +2357,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2547,10 +2541,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2674,11 
+2666,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3108,11 +3099,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + ret = ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } @@ -3378,10 +3367,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - return -EROFS; + return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); } ocfs2_init_dealloc_ctxt(&context->dealloc); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4479029630bb..0456ae399bf7 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -167,12 +167,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) } #define do_error(fmt, ...) 
\ - do{ \ - if (resize) \ - mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ - else \ - ocfs2_error(sb, fmt, ##__VA_ARGS__); \ - } while (0) +do { \ + if (resize) \ + mlog(ML_ERROR, fmt, ##__VA_ARGS__); \ + else \ + return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ +} while (0) static int ocfs2_validate_gd_self(struct super_block *sb, struct buffer_head *bh, @@ -181,44 +181,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - do_error("Group descriptor #%llu has bad signature %.*s", + do_error("Group descriptor #%llu has bad signature %.*s\n", (unsigned long long)bh->b_blocknr, 7, gd->bg_signature); - return -EINVAL; } if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { - do_error("Group descriptor #%llu has an invalid bg_blkno " - "of %llu", + do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_blkno)); - return -EINVAL; } if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { - do_error("Group descriptor #%llu has an invalid " - "fs_generation of #%u", + do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(gd->bg_generation)); - return -EINVAL; } if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { - do_error("Group descriptor #%llu has bit count %u but " - "claims that %u are free", + do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), le16_to_cpu(gd->bg_free_bits_count)); - return -EINVAL; } if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { - do_error("Group descriptor #%llu has bit count %u but " - "max bitmap bits of %u", + do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), 8 
* le16_to_cpu(gd->bg_size)); - return -EINVAL; } return 0; @@ -233,20 +224,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (di->i_blkno != gd->bg_parent_dinode) { - do_error("Group descriptor #%llu has bad parent " - "pointer (%llu, expected %llu)", + do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), (unsigned long long)le64_to_cpu(di->i_blkno)); - return -EINVAL; } max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); if (le16_to_cpu(gd->bg_bits) > max_bits) { - do_error("Group descriptor #%llu has bit count of %u", + do_error("Group descriptor #%llu has bit count of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits)); - return -EINVAL; } /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ @@ -254,10 +242,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || ((le16_to_cpu(gd->bg_chain) == le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { - do_error("Group descriptor #%llu has bad chain %u", + do_error("Group descriptor #%llu has bad chain %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_chain)); - return -EINVAL; } return 0; @@ -384,11 +371,10 @@ static int ocfs2_block_group_fill(handle_t *handle, struct super_block * sb = alloc_inode->i_sb; if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { - ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " - "b_blocknr (%llu)", - (unsigned long long)group_blkno, - (unsigned long long) bg_bh->b_blocknr); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "group block (%llu) != b_blocknr (%llu)\n", + (unsigned long long)group_blkno, + (unsigned long long) bg_bh->b_blocknr); goto bail; } @@ -834,9 +820,9 @@ static int 
ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { - ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", - (unsigned long long)le64_to_cpu(fe->i_blkno)); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "Invalid chain allocator %llu\n", + (unsigned long long)le64_to_cpu(fe->i_blkno)); goto bail; } @@ -1370,12 +1356,11 @@ int ocfs2_block_group_set_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, -num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); @@ -1905,13 +1890,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { - ocfs2_error(ac->ac_inode->i_sb, - "Chain allocator dinode %llu has %u used " - "bits but only %u total.", - (unsigned long long)le64_to_cpu(fe->i_blkno), - le32_to_cpu(fe->id1.bitmap1.i_used), - le32_to_cpu(fe->id1.bitmap1.i_total)); - status = -EIO; + status = ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used bits but only %u total\n", + (unsigned long long)le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); goto bail; } @@ -2429,12 +2412,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if 
(le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } if (undo_fn) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 403c5660b306..2fc02f7c2949 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -192,6 +192,7 @@ enum { Opt_resv_level, Opt_dir_resv_level, Opt_journal_async_commit, + Opt_err_cont, Opt_err, }; @@ -224,6 +225,7 @@ static const match_table_t tokens = { {Opt_resv_level, "resv_level=%u"}, {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_err_cont, "errors=continue"}, {Opt_err, NULL} }; @@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_NOINTR; break; case Opt_err_panic: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; break; case Opt_err_ro: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS; + break; + case Opt_err_cont: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT; break; case Opt_data_ordered: mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; @@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_ERRORS_PANIC) seq_printf(s, ",errors=panic"); + else if (opts & OCFS2_MOUNT_ERRORS_CONT) + 
seq_printf(s, ",errors=continue"); else seq_printf(s, ",errors=remount-ro"); @@ -2541,31 +2554,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) memset(osb, 0, sizeof(struct ocfs2_super)); } -/* Put OCFS2 into a readonly state, or (if the user specifies it), - * panic(). We do not support continue-on-error operation. */ -static void ocfs2_handle_error(struct super_block *sb) +/* Depending on the mount option passed, perform one of the following: + * Put OCFS2 into a readonly state (default) + * Return EIO so that only the process errs + * Fix the error as if fsck.ocfs2 -y + * panic + */ +static int ocfs2_handle_error(struct super_block *sb) { struct ocfs2_super *osb = OCFS2_SB(sb); - - if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) - panic("OCFS2: (device %s): panic forced after error\n", - sb->s_id); + int rv = 0; ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + pr_crit("On-disk corruption discovered. " + "Please run fsck.ocfs2 once the filesystem is unmounted.\n"); - if (sb->s_flags & MS_RDONLY && - (ocfs2_is_soft_readonly(osb) || - ocfs2_is_hard_readonly(osb))) - return; - - printk(KERN_CRIT "File system is now read-only due to the potential " - "of on-disk corruption. 
Please run fsck.ocfs2 once the file " - "system is unmounted.\n"); - sb->s_flags |= MS_RDONLY; - ocfs2_set_ro_flag(osb, 0); + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) { + panic("OCFS2: (device %s): panic forced after error\n", + sb->s_id); + } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) { + pr_crit("OCFS2: Returning error to the calling process.\n"); + rv = -EIO; + } else { /* default option */ + rv = -EROFS; + if (sb->s_flags & MS_RDONLY && + (ocfs2_is_soft_readonly(osb) || + ocfs2_is_hard_readonly(osb))) + return rv; + + pr_crit("OCFS2: File system is now read-only.\n"); + sb->s_flags |= MS_RDONLY; + ocfs2_set_ro_flag(osb, 0); + } + + return rv; } -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; @@ -2577,12 +2602,12 @@ void __ocfs2_error(struct super_block *sb, const char *function, /* Not using mlog here because we want to show the actual * function the error came from. */ - printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); - ocfs2_handle_error(sb); + return ocfs2_handle_error(sb); } /* Handle critical errors. 
This is intentionally more drastic than @@ -2599,7 +2624,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 74ff74cf78fe..b477d0b1c7b6 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); __printf(3, 4) -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_error(sb, fmt, ...) \ + __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) __printf(3, 4) void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_abort(sb, fmt, ...) 
\ + __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) /* * Void signal blockers, because in-kernel sigprocmask() only fails diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d03bfbf3d27d..5613883c05c0 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ocfs2_error(sb, - "Extended attribute block #%llu has bad " - "signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - xb->xb_signature); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); } if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extended attribute block #%llu has an " - "invalid xb_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(xb->xb_blkno)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); } if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extended attribute block #%llu has an invalid " - "xb_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(xb->xb_fs_generation)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); } return 0; @@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in xattr tree block %llu\n", + inode->i_ino, + (unsigned 
long long)eb_bh->b_blocknr); goto out; } } @@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, } if (!e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } @@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; + if (!capable(CAP_SYS_ADMIN)) + return 0; + if (list && total_len <= list_size) { memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); memcpy(list + prefix_len, name, name_len); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 84bb65b83570..4fb17ded7d47 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -547,51 +547,45 @@ posix_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { struct posix_acl *p; + struct posix_acl *clone; int ret; + *acl = NULL; + *default_acl = NULL; + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) - goto no_acl; + return 0; p = get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(p)) { - if (p == ERR_PTR(-EOPNOTSUPP)) - goto apply_umask; - return PTR_ERR(p); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; } + if (IS_ERR(p)) + return PTR_ERR(p); - if (!p) - goto apply_umask; - - *acl = posix_acl_clone(p, GFP_NOFS); - if (!*acl) + clone = posix_acl_clone(p, GFP_NOFS); + if (!clone) goto no_mem; - ret = posix_acl_create_masq(*acl, mode); + ret = posix_acl_create_masq(clone, mode); if (ret < 0) goto no_mem_clone; - if (ret == 0) { - posix_acl_release(*acl); - *acl = NULL; - } + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; - if (!S_ISDIR(*mode)) { + if (!S_ISDIR(*mode)) 
posix_acl_release(p); - *default_acl = NULL; - } else { + else *default_acl = p; - } - return 0; -apply_umask: - *mode &= ~current_umask(); -no_acl: - *default_acl = NULL; - *acl = NULL; return 0; no_mem_clone: - posix_acl_release(*acl); + posix_acl_release(clone); no_mem: posix_acl_release(p); return -ENOMEM; diff --git a/fs/proc/array.c b/fs/proc/array.c index fd02a9ebfc30..3f57dac31ba6 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -126,6 +126,14 @@ static inline const char *get_task_state(struct task_struct *tsk) { unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; + /* + * Parked tasks do not run; they sit in __kthread_parkme(). + * Without this check, we would report them as running, which is + * clearly wrong, so we report them as sleeping instead. + */ + if (tsk->state == TASK_PARKED) + state = TASK_INTERRUPTIBLE; + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); return task_state_array[fls(state)]; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0111ad0466ed..d766bfac06cb 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -588,8 +588,7 @@ static struct kmem_cache *reiserfs_inode_cachep; static struct inode *reiserfs_alloc_inode(struct super_block *sb) { struct reiserfs_inode_info *ei; - ei = (struct reiserfs_inode_info *) - kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; atomic_set(&ei->openers, 0); diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 0995c2de8162..f589222bfa87 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename, /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. 
*/ -#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT) -#define HASHDIST_DEFAULT 1 +#ifdef CONFIG_NUMA +#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) +extern int hashdist; /* Distribute hashes across NUMA nodes? */ #else -#define HASHDIST_DEFAULT 0 +#define hashdist (0) #endif -extern int hashdist; /* Distribute hashes across NUMA nodes? */ #endif /* _LINUX_BOOTMEM_H */ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 371e560d13cf..dfaa7b3e9ae9 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -5,9 +5,9 @@ /* * Common definitions for all gcc versions go here. */ -#define GCC_VERSION (__GNUC__ * 10000 \ - + __GNUC_MINOR__ * 100 \ - + __GNUC_PATCHLEVEL__) +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) /* Optimization barrier */ @@ -46,55 +46,63 @@ * the inline assembly constraint from =g to =r, in this particular * case either is valid. */ -#define RELOC_HIDE(ptr, off) \ - ({ unsigned long __ptr; \ - __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ - (typeof(ptr)) (__ptr + (off)); }) +#define RELOC_HIDE(ptr, off) \ +({ \ + unsigned long __ptr; \ + __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ + (typeof(ptr)) (__ptr + (off)); \ +}) /* Make the optimizer believe the variable can be manipulated arbitrarily. 
*/ -#define OPTIMIZER_HIDE_VAR(var) __asm__ ("" : "=r" (var) : "0" (var)) +#define OPTIMIZER_HIDE_VAR(var) \ + __asm__ ("" : "=r" (var) : "0" (var)) #ifdef __CHECKER__ -#define __must_be_array(arr) 0 +#define __must_be_array(a) 0 #else /* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) #endif /* * Force always-inline if the user requests it so via the .config, * or if gcc is too old: */ -#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ +#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -# define inline inline __attribute__((always_inline)) notrace -# define __inline__ __inline__ __attribute__((always_inline)) notrace -# define __inline __inline __attribute__((always_inline)) notrace +#define inline inline __attribute__((always_inline)) notrace +#define __inline__ __inline__ __attribute__((always_inline)) notrace +#define __inline __inline __attribute__((always_inline)) notrace #else /* A lot of inline functions can cause havoc with function tracing */ -# define inline inline notrace -# define __inline__ __inline__ notrace -# define __inline __inline notrace +#define inline inline notrace +#define __inline__ __inline__ notrace +#define __inline __inline notrace #endif -#define __deprecated __attribute__((deprecated)) -#define __packed __attribute__((packed)) -#define __weak __attribute__((weak)) -#define __alias(symbol) __attribute__((alias(#symbol))) +#define __always_inline inline __attribute__((always_inline)) +#define noinline __attribute__((noinline)) + +#define __deprecated __attribute__((deprecated)) +#define __packed __attribute__((packed)) +#define __weak __attribute__((weak)) +#define __alias(symbol) __attribute__((alias(#symbol))) /* - * it doesn't make sense on ARM (currently the only user of __naked) to trace - * naked 
functions because then mcount is called without stack and frame pointer - * being set up and there is no chance to restore the lr register to the value - * before mcount was called. + * it doesn't make sense on ARM (currently the only user of __naked) + * to trace naked functions because then mcount is called without + * stack and frame pointer being set up and there is no chance to + * restore the lr register to the value before mcount was called. + * + * The asm() bodies of naked functions often depend on standard calling + * conventions, therefore they must be noinline and noclone. * - * The asm() bodies of naked functions often depend on standard calling conventions, - * therefore they must be noinline and noclone. GCC 4.[56] currently fail to enforce - * this, so we must do so ourselves. See GCC PR44290. + * GCC 4.[56] currently fail to enforce this, so we must do so ourselves. + * See GCC PR44290. */ -#define __naked __attribute__((naked)) noinline __noclone notrace +#define __naked __attribute__((naked)) noinline __noclone notrace -#define __noreturn __attribute__((noreturn)) +#define __noreturn __attribute__((noreturn)) /* * From the GCC manual: @@ -106,19 +114,130 @@ * would be. * [...] 
*/ -#define __pure __attribute__((pure)) -#define __aligned(x) __attribute__((aligned(x))) -#define __printf(a, b) __attribute__((format(printf, a, b))) -#define __scanf(a, b) __attribute__((format(scanf, a, b))) -#define noinline __attribute__((noinline)) -#define __attribute_const__ __attribute__((__const__)) -#define __maybe_unused __attribute__((unused)) -#define __always_unused __attribute__((unused)) - -#define __gcc_header(x) #x -#define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h) -#define gcc_header(x) _gcc_header(x) -#include gcc_header(__GNUC__) +#define __pure __attribute__((pure)) +#define __aligned(x) __attribute__((aligned(x))) +#define __printf(a, b) __attribute__((format(printf, a, b))) +#define __scanf(a, b) __attribute__((format(scanf, a, b))) +#define __attribute_const__ __attribute__((__const__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) + +/* gcc version specific checks */ + +#if GCC_VERSION < 30200 +# error Sorry, your compiler is too old - please upgrade it. +#endif + +#if GCC_VERSION < 30300 +# define __used __attribute__((__unused__)) +#else +# define __used __attribute__((__used__)) +#endif + +#ifdef CONFIG_GCOV_KERNEL +# if GCC_VERSION < 30400 +# error "GCOV profiling support for gcc versions below 3.4 not included" +# endif /* __GNUC_MINOR__ */ +#endif /* CONFIG_GCOV_KERNEL */ + +#if GCC_VERSION >= 30400 +#define __must_check __attribute__((warn_unused_result)) +#endif + +#if GCC_VERSION >= 40000 + +/* GCC 4.1.[01] miscompiles __weak */ +#ifdef __KERNEL__ +# if GCC_VERSION >= 40100 && GCC_VERSION <= 40101 +# error Your version of gcc miscompiles the __weak directive +# endif +#endif + +#define __used __attribute__((__used__)) +#define __compiler_offsetof(a, b) \ + __builtin_offsetof(a, b) + +#if GCC_VERSION >= 40100 && GCC_VERSION < 40600 +# define __compiletime_object_size(obj) __builtin_object_size(obj, 0) +#endif + +#if GCC_VERSION >= 40300 +/* Mark functions as cold. 
gcc will assume any path leading to a call + * to them will be unlikely. This means a lot of manual unlikely()s + * are unnecessary now for any paths leading to the usual suspects + * like BUG(), printk(), panic() etc. [but let's keep them for now for + * older compilers] + * + * Early snapshots of gcc 4.3 don't support this and we can't detect this + * in the preprocessor, but we can live with this because they're unreleased. + * Maketime probing would be overkill here. + * + * gcc also has a __attribute__((__hot__)) to move hot functions into + * a special section, but I don't see any sense in this right now in + * the kernel context + */ +#define __cold __attribute__((__cold__)) + +#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + +#ifndef __CHECKER__ +# define __compiletime_warning(message) __attribute__((warning(message))) +# define __compiletime_error(message) __attribute__((error(message))) +#endif /* __CHECKER__ */ +#endif /* GCC_VERSION >= 40300 */ + +#if GCC_VERSION >= 40500 +/* + * Mark a position in code as unreachable. This can be used to + * suppress control flow warnings after asm blocks that transfer + * control elsewhere. + * + * Early snapshots of gcc 4.5 don't support this and we can't detect + * this in the preprocessor, but we can live with this because they're + * unreleased. Really, we need to have autoconf for the kernel. + */ +#define unreachable() __builtin_unreachable() + +/* Mark a function definition as prohibited from being cloned. */ +#define __noclone __attribute__((__noclone__)) + +#endif /* GCC_VERSION >= 40500 */ + +#if GCC_VERSION >= 40600 +/* + * Tell the optimizer that something else uses this function or variable. + */ +#define __visible __attribute__((externally_visible)) +#endif + +/* + * GCC 'asm goto' miscompiles certain code sequences: + * + * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 + * + * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. 
+ * + * (asm goto is automatically volatile - the naming reflects this.) + */ +#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) + +#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP +#if GCC_VERSION >= 40400 +#define __HAVE_BUILTIN_BSWAP32__ +#define __HAVE_BUILTIN_BSWAP64__ +#endif +#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600) +#define __HAVE_BUILTIN_BSWAP16__ +#endif +#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ + +#if GCC_VERSION >= 50000 +#define KASAN_ABI_VERSION 4 +#elif GCC_VERSION >= 40902 +#define KASAN_ABI_VERSION 3 +#endif + +#endif /* gcc version >= 40000 specific checks */ #if !defined(__noclone) #define __noclone /* not needed */ @@ -129,5 +248,3 @@ * code */ #define uninitialized_var(x) x = x - -#define __always_inline inline __attribute__((always_inline)) diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h deleted file mode 100644 index 7d89febe4d79..000000000000 --- a/include/linux/compiler-gcc3.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include <linux/compiler-gcc3.h> directly, include <linux/compiler.h> instead." -#endif - -#if GCC_VERSION < 30200 -# error Sorry, your compiler is too old - please upgrade it. -#endif - -#if GCC_VERSION >= 30300 -# define __used __attribute__((__used__)) -#else -# define __used __attribute__((__unused__)) -#endif - -#if GCC_VERSION >= 30400 -#define __must_check __attribute__((warn_unused_result)) -#endif - -#ifdef CONFIG_GCOV_KERNEL -# if GCC_VERSION < 30400 -# error "GCOV profiling support for gcc versions below 3.4 not included" -# endif /* __GNUC_MINOR__ */ -#endif /* CONFIG_GCOV_KERNEL */ diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h deleted file mode 100644 index 769e19864632..000000000000 --- a/include/linux/compiler-gcc4.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include <linux/compiler-gcc4.h> directly, include <linux/compiler.h> instead." 
-#endif - -/* GCC 4.1.[01] miscompiles __weak */ -#ifdef __KERNEL__ -# if GCC_VERSION >= 40100 && GCC_VERSION <= 40101 -# error Your version of gcc miscompiles the __weak directive -# endif -#endif - -#define __used __attribute__((__used__)) -#define __must_check __attribute__((warn_unused_result)) -#define __compiler_offsetof(a,b) __builtin_offsetof(a,b) - -#if GCC_VERSION >= 40100 && GCC_VERSION < 40600 -# define __compiletime_object_size(obj) __builtin_object_size(obj, 0) -#endif - -#if GCC_VERSION >= 40300 -/* Mark functions as cold. gcc will assume any path leading to a call - to them will be unlikely. This means a lot of manual unlikely()s - are unnecessary now for any paths leading to the usual suspects - like BUG(), printk(), panic() etc. [but let's keep them for now for - older compilers] - - Early snapshots of gcc 4.3 don't support this and we can't detect this - in the preprocessor, but we can live with this because they're unreleased. - Maketime probing would be overkill here. - - gcc also has a __attribute__((__hot__)) to move hot functions into - a special section, but I don't see any sense in this right now in - the kernel context */ -#define __cold __attribute__((__cold__)) - -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -#ifndef __CHECKER__ -# define __compiletime_warning(message) __attribute__((warning(message))) -# define __compiletime_error(message) __attribute__((error(message))) -#endif /* __CHECKER__ */ -#endif /* GCC_VERSION >= 40300 */ - -#if GCC_VERSION >= 40500 -/* - * Mark a position in code as unreachable. This can be used to - * suppress control flow warnings after asm blocks that transfer - * control elsewhere. - * - * Early snapshots of gcc 4.5 don't support this and we can't detect - * this in the preprocessor, but we can live with this because they're - * unreleased. Really, we need to have autoconf for the kernel. 
- */ -#define unreachable() __builtin_unreachable() - -/* Mark a function definition as prohibited from being cloned. */ -#define __noclone __attribute__((__noclone__)) - -#endif /* GCC_VERSION >= 40500 */ - -#if GCC_VERSION >= 40600 -/* - * Tell the optimizer that something else uses this function or variable. - */ -#define __visible __attribute__((externally_visible)) -#endif - -/* - * GCC 'asm goto' miscompiles certain code sequences: - * - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 - * - * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * - * (asm goto is automatically volatile - the naming reflects this.) - */ -#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) - -#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP -#if GCC_VERSION >= 40400 -#define __HAVE_BUILTIN_BSWAP32__ -#define __HAVE_BUILTIN_BSWAP64__ -#endif -#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600) -#define __HAVE_BUILTIN_BSWAP16__ -#endif -#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ - -#if GCC_VERSION >= 40902 -#define KASAN_ABI_VERSION 3 -#endif diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h deleted file mode 100644 index efee493714eb..000000000000 --- a/include/linux/compiler-gcc5.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include <linux/compiler-gcc5.h> directly, include <linux/compiler.h> instead." -#endif - -#define __used __attribute__((__used__)) -#define __must_check __attribute__((warn_unused_result)) -#define __compiler_offsetof(a, b) __builtin_offsetof(a, b) - -/* Mark functions as cold. gcc will assume any path leading to a call - to them will be unlikely. This means a lot of manual unlikely()s - are unnecessary now for any paths leading to the usual suspects - like BUG(), printk(), panic() etc. 
[but let's keep them for now for - older compilers] - - Early snapshots of gcc 4.3 don't support this and we can't detect this - in the preprocessor, but we can live with this because they're unreleased. - Maketime probing would be overkill here. - - gcc also has a __attribute__((__hot__)) to move hot functions into - a special section, but I don't see any sense in this right now in - the kernel context */ -#define __cold __attribute__((__cold__)) - -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -#ifndef __CHECKER__ -# define __compiletime_warning(message) __attribute__((warning(message))) -# define __compiletime_error(message) __attribute__((error(message))) -#endif /* __CHECKER__ */ - -/* - * Mark a position in code as unreachable. This can be used to - * suppress control flow warnings after asm blocks that transfer - * control elsewhere. - * - * Early snapshots of gcc 4.5 don't support this and we can't detect - * this in the preprocessor, but we can live with this because they're - * unreleased. Really, we need to have autoconf for the kernel. - */ -#define unreachable() __builtin_unreachable() - -/* Mark a function definition as prohibited from being cloned. */ -#define __noclone __attribute__((__noclone__)) - -/* - * Tell the optimizer that something else uses this function or variable. - */ -#define __visible __attribute__((externally_visible)) - -/* - * GCC 'asm goto' miscompiles certain code sequences: - * - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 - * - * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * - * (asm goto is automatically volatile - the naming reflects this.) - */ -#define asm_volatile_goto(x...) 
do { asm goto(x); asm (""); } while (0) - -#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP -#define __HAVE_BUILTIN_BSWAP32__ -#define __HAVE_BUILTIN_BSWAP64__ -#define __HAVE_BUILTIN_BSWAP16__ -#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ - -#define KASAN_ABI_VERSION 4 diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 34025df61829..c9e5c57e4edf 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item) return item->ci_name; } -extern void config_item_init(struct config_item *); extern void config_item_init_type_name(struct config_item *item, const char *name, struct config_item_type *type); diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h new file mode 100644 index 000000000000..bba7a4d692b3 --- /dev/null +++ b/include/linux/crc64_ecma.h @@ -0,0 +1,56 @@ +/* + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. 
+ * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CRC64_ECMA_H_ +#define __CRC64_ECMA_H_ + +#include <linux/types.h> + + +#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL + + +/* + * crc64_ecma_seed - Initializes the CRC64 ECMA seed. + */ +u64 crc64_ecma_seed(void); + +/* + * crc64_ecma - Computes the 64 bit ECMA CRC. + * + * @pdata: pointer to the data to compute checksum for. + * @nbytes: number of bytes in data buffer. + * @seed: CRC seed. 
+ */ +u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed); + +#endif /* __CRC64_ECMA_H_ */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 97a9373e61e8..37c422df2a0f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -30,6 +30,7 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u +#define ___GFP_NOACCOUNT 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -87,6 +88,7 @@ struct vm_area_struct; #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ +#define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT) /* Don't account to memcg */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f10b20f05159..44a840a53974 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); +extern int madvise_free_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr); extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); @@ -56,6 +59,7 @@ extern pmd_t *page_check_address_pmd(struct page *page, unsigned long address, enum page_check_address_pmd_flag flag, spinlock_t **ptl); +extern int pmd_freeable(pmd_t pmd); #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index e804306ef5e8..e5fe4c1416a2 100644 --- 
a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -269,6 +269,8 @@ unsigned long paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 9497ec7c77ea..bcfa4b63311b 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -93,6 +93,9 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); +void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, + phys_addr_t *out_end); + /** * for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL. @@ -132,6 +135,21 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, __next_mem_range_rev(&i, nid, type_a, type_b, \ p_start, p_end, p_nid)) +/** + * for_each_reserved_mem_region - iterate over all reserved memblock areas + * @i: u64 used as loop variable + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over reserved areas of memblock. Available as soon as memblock + * is initialized. 
+ */ +#define for_each_reserved_mem_region(i, p_start, p_end) \ + for (i = 0UL, \ + __next_reserved_mem_region(&i, p_start, p_end); \ + i != (u64)ULLONG_MAX; \ + __next_reserved_mem_region(&i, p_start, p_end)) + #ifdef CONFIG_MOVABLE_NODE static inline bool memblock_is_hotpluggable(struct memblock_region *m) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 72dff5fb0d0c..6c8918114804 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -463,6 +463,8 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) if (!memcg_kmem_enabled()) return true; + if (gfp & __GFP_NOACCOUNT) + return true; /* * __GFP_NOFAIL allocations will move on even if charging is not * possible. Therefore we don't even try, and have this allocation @@ -522,6 +524,8 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { if (!memcg_kmem_enabled()) return cachep; + if (gfp & __GFP_NOACCOUNT) + return cachep; if (gfp & __GFP_NOFAIL) return cachep; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h new file mode 100644 index 000000000000..4efc3f56e6df --- /dev/null +++ b/include/linux/mm-arch-hooks.h @@ -0,0 +1,25 @@ +/* + * Generic mm no-op hooks. + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#ifndef _LINUX_MM_ARCH_HOOKS_H +#define _LINUX_MM_ARCH_HOOKS_H + +#include <asm/mm-arch-hooks.h> + +#ifndef arch_remap +static inline void arch_remap(struct mm_struct *mm, + unsigned long old_start, unsigned long old_end, + unsigned long new_start, unsigned long new_end) +{ +} +#define arch_remap arch_remap +#endif + +#endif /* _LINUX_MM_ARCH_HOOKS_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 0755b9fd03a7..be9247ca538b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -436,46 +436,6 @@ static inline void compound_unlock_irqrestore(struct page *page, #endif } -static inline struct page *compound_head_by_tail(struct page *tail) -{ - struct page *head = tail->first_page; - - /* - * page->first_page may be a dangling pointer to an old - * compound page, so recheck that it is still a tail - * page before returning. - */ - smp_rmb(); - if (likely(PageTail(tail))) - return head; - return tail; -} - -/* - * Since either compound page could be dismantled asynchronously in THP - * or we access asynchronously arbitrary positioned struct page, there - * would be tail flag race. To handle this race, we should call - * smp_rmb() before checking tail flag. compound_head_by_tail() did it. - */ -static inline struct page *compound_head(struct page *page) -{ - if (unlikely(PageTail(page))) - return compound_head_by_tail(page); - return page; -} - -/* - * If we access compound page synchronously such as access to - * allocated page, there is no need to handle tail flag race, so we can - * check tail flag directly without any synchronization primitive. 
- */ -static inline struct page *compound_head_fast(struct page *page) -{ - if (unlikely(PageTail(page))) - return page->first_page; - return page; -} - /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -1631,6 +1591,8 @@ extern void free_highmem_page(struct page *page); extern void adjust_managed_page_count(struct page *page, long count); extern void mem_init_print_info(const char *str); +extern void reserve_bootmem_region(unsigned long start, unsigned long end); + /* Free the reserved page into the buddy system, so it gets managed. */ static inline void __free_reserved_page(struct page *page) { @@ -1720,7 +1682,8 @@ extern void sparse_memory_present_with_active_regions(int nid); #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) -static inline int __early_pfn_to_nid(unsigned long pfn) +static inline int __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { return 0; } @@ -1728,7 +1691,8 @@ static inline int __early_pfn_to_nid(unsigned long pfn) /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); /* there is a per-arch backend function. */ -extern int __meminit __early_pfn_to_nid(unsigned long pfn); +extern int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 54d74f6eb233..754c25966a0a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -762,6 +762,14 @@ typedef struct pglist_data { /* Number of pages migrated during the rate limiting time interval */ unsigned long numabalancing_migrate_nr_pages; #endif + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * If memory initialisation on large machines is deferred then this + * is the first PFN that needs to be initialised. 
+ */ + unsigned long first_deferred_pfn; +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -1216,11 +1224,16 @@ void sparse_init(void); #define sparse_index_init(_sec, _nid) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool early_pfn_in_nid(unsigned long pfn, int nid); -#else -#define early_pfn_in_nid(pfn, nid) (1) -#endif +/* + * During memory init memblocks map pfns to nids. The search is expensive and + * this caches recent lookups. The implementation of __early_pfn_to_nid + * may treat start/end as pfns or sections. + */ +struct mminit_pfnnid_cache { + unsigned long last_start; + unsigned long last_end; + int last_nid; +}; #ifndef early_pfn_valid #define early_pfn_valid(pfn) (1) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 3d46fb4708e0..f94da0e65dea 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; +extern unsigned long *watchdog_cpumask_bits; extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , @@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); +extern int proc_watchdog_cpumask(struct ctl_table *, int, + void __user *, size_t *, loff_t *); #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f34e040b34e9..91b7f9b2b774 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -134,49 +134,68 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +/* Page flags policies wrt compound pages */ +#define PF_ANY(page, enforce) page +#define PF_HEAD(page, enforce) compound_head(page) +#define 
PF_NO_TAIL(page, enforce) ({ \ + if (enforce) \ + VM_BUG_ON_PAGE(PageTail(page), page); \ + else \ + page = compound_head(page); \ + page;}) +#define PF_NO_COMPOUND(page, enforce) ({ \ + if (enforce) \ + VM_BUG_ON_PAGE(PageCompound(page), page); \ + page;}) + /* * Macros to create function definitions for page flags */ -#define TESTPAGEFLAG(uname, lname) \ -static inline int Page##uname(const struct page *page) \ - { return test_bit(PG_##lname, &page->flags); } +#define TESTPAGEFLAG(uname, lname, policy) \ +static inline int Page##uname(struct page *page) \ + { return test_bit(PG_##lname, &policy(page, 0)->flags); } -#define SETPAGEFLAG(uname, lname) \ +#define SETPAGEFLAG(uname, lname, policy) \ static inline void SetPage##uname(struct page *page) \ - { set_bit(PG_##lname, &page->flags); } + { set_bit(PG_##lname, &policy(page, 1)->flags); } -#define CLEARPAGEFLAG(uname, lname) \ +#define CLEARPAGEFLAG(uname, lname, policy) \ static inline void ClearPage##uname(struct page *page) \ - { clear_bit(PG_##lname, &page->flags); } + { clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define __SETPAGEFLAG(uname, lname) \ +#define __SETPAGEFLAG(uname, lname, policy) \ static inline void __SetPage##uname(struct page *page) \ - { __set_bit(PG_##lname, &page->flags); } + { __set_bit(PG_##lname, &policy(page, 1)->flags); } -#define __CLEARPAGEFLAG(uname, lname) \ +#define __CLEARPAGEFLAG(uname, lname, policy) \ static inline void __ClearPage##uname(struct page *page) \ - { __clear_bit(PG_##lname, &page->flags); } + { __clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define TESTSETFLAG(uname, lname) \ +#define TESTSETFLAG(uname, lname, policy) \ static inline int TestSetPage##uname(struct page *page) \ - { return test_and_set_bit(PG_##lname, &page->flags); } + { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } -#define TESTCLEARFLAG(uname, lname) \ +#define TESTCLEARFLAG(uname, lname, policy) \ static inline int TestClearPage##uname(struct page *page) \ - { 
return test_and_clear_bit(PG_##lname, &page->flags); } + { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define __TESTCLEARFLAG(uname, lname) \ +#define __TESTCLEARFLAG(uname, lname, policy) \ static inline int __TestClearPage##uname(struct page *page) \ - { return __test_and_clear_bit(PG_##lname, &page->flags); } + { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ - SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname) +#define PAGEFLAG(uname, lname, policy) \ + TESTPAGEFLAG(uname, lname, policy) \ + SETPAGEFLAG(uname, lname, policy) \ + CLEARPAGEFLAG(uname, lname, policy) -#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ - __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname) +#define __PAGEFLAG(uname, lname, policy) \ + TESTPAGEFLAG(uname, lname, policy) \ + __SETPAGEFLAG(uname, lname, policy) \ + __CLEARPAGEFLAG(uname, lname, policy) -#define TESTSCFLAG(uname, lname) \ - TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname) +#define TESTSCFLAG(uname, lname, policy) \ + TESTSETFLAG(uname, lname, policy) \ + TESTCLEARFLAG(uname, lname, policy) #define TESTPAGEFLAG_FALSE(uname) \ static inline int Page##uname(const struct page *page) { return 0; } @@ -205,47 +224,100 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } #define TESTSCFLAG_FALSE(uname) \ TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) -struct page; /* forward declaration */ - -TESTPAGEFLAG(Locked, locked) -PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) -PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) - __SETPAGEFLAG(Referenced, referenced) -PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) -PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) -PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) - TESTCLEARFLAG(Active, active) -__PAGEFLAG(Slab, slab) -PAGEFLAG(Checked, checked) /* Used by some 
filesystems */ -PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ -PAGEFLAG(SavePinned, savepinned); /* Xen */ -PAGEFLAG(Foreign, foreign); /* Xen */ -PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) -PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) - __SETPAGEFLAG(SwapBacked, swapbacked) - -__PAGEFLAG(SlobFree, slob_free) +/* Forward declarations */ +struct page; +static inline int PageCompound(struct page *page); +static inline int PageTail(struct page *page); + +static inline struct page *compound_head_by_tail(struct page *tail) +{ + struct page *head = tail->first_page; + + /* + * page->first_page may be a dangling pointer to an old + * compound page, so recheck that it is still a tail + * page before returning. + */ + smp_rmb(); + if (likely(PageTail(tail))) + return head; + return tail; +} + +/* + * Since either compound page could be dismantled asynchronously in THP + * or we access asynchronously arbitrary positioned struct page, there + * would be tail flag race. To handle this race, we should call + * smp_rmb() before checking tail flag. compound_head_by_tail() did it. + */ +static inline struct page *compound_head(struct page *page) +{ + if (unlikely(PageTail(page))) + return compound_head_by_tail(page); + return page; +} + +/* + * If we access compound page synchronously such as access to + * allocated page, there is no need to handle tail flag race, so we can + * check tail flag directly without any synchronization primitive. 
+ */ +static inline struct page *compound_head_fast(struct page *page) +{ + if (unlikely(PageTail(page))) + return page->first_page; + return page; +} + +__PAGEFLAG(Locked, locked, PF_NO_TAIL) +PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND) +PAGEFLAG(Referenced, referenced, PF_HEAD) + TESTCLEARFLAG(Referenced, referenced, PF_HEAD) + __SETPAGEFLAG(Referenced, referenced, PF_HEAD) +PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) + __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) +PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) +PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) + TESTCLEARFLAG(Active, active, PF_HEAD) +__PAGEFLAG(Slab, slab, PF_NO_TAIL) +__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) +PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ + +/* Xen */ +PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) +PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND) +PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND) + +PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) + __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) +PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) + __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) + __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. 
* - PG_private and PG_private_2 cause releasepage() and co to be invoked */ -PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) - __CLEARPAGEFLAG(Private, private) -PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2) -PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) +PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY) + __CLEARPAGEFLAG(Private, private, PF_ANY) +PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) +PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) + TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ -TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) -PAGEFLAG(MappedToDisk, mappedtodisk) +TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) + TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) +PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ -PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) -PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) +PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND) + TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) + TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) #ifdef CONFIG_HIGHMEM /* @@ -258,31 +330,33 @@ PAGEFLAG_FALSE(HighMem) #endif #ifdef CONFIG_SWAP -PAGEFLAG(SwapCache, swapcache) +PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND) #else PAGEFLAG_FALSE(SwapCache) #endif -PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) - TESTCLEARFLAG(Unevictable, unevictable) +PAGEFLAG(Unevictable, unevictable, PF_HEAD) + __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD) + TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) #ifdef CONFIG_MMU -PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) - TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, 
mlocked) +PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) + TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) + __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL) #else PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED -PAGEFLAG(Uncached, uncached) +PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) #else PAGEFLAG_FALSE(Uncached) #endif #ifdef CONFIG_MEMORY_FAILURE -PAGEFLAG(HWPoison, hwpoison) -TESTSCFLAG(HWPoison, hwpoison) +PAGEFLAG(HWPoison, hwpoison, PF_ANY) +TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison) @@ -311,6 +385,7 @@ PAGEFLAG_FALSE(HWPoison) static inline int PageAnon(struct page *page) { + page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; } @@ -323,6 +398,7 @@ static inline int PageAnon(struct page *page) */ static inline int PageKsm(struct page *page) { + page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); } @@ -334,8 +410,9 @@ u64 stable_page_flags(struct page *page); static inline int PageUptodate(struct page *page) { - int ret = test_bit(PG_uptodate, &(page)->flags); - + int ret; + page = compound_head(page); + ret = test_bit(PG_uptodate, &(page)->flags); /* * Must ensure that the data we read out of the page is loaded * _after_ we've loaded page->flags to check for PageUptodate. 
@@ -352,22 +429,24 @@ static inline int PageUptodate(struct page *page) static inline void __SetPageUptodate(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); smp_wmb(); - __set_bit(PG_uptodate, &(page)->flags); + __set_bit(PG_uptodate, &page->flags); } static inline void SetPageUptodate(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the page * uptodate are actually visible before PageUptodate becomes true. */ smp_wmb(); - set_bit(PG_uptodate, &(page)->flags); + set_bit(PG_uptodate, &page->flags); } -CLEARPAGEFLAG(Uptodate, uptodate) +CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) int test_clear_page_writeback(struct page *page); int __test_set_page_writeback(struct page *page, bool keep_write); @@ -396,8 +475,8 @@ static inline void set_page_writeback_keepwrite(struct page *page) * and arch/powerpc/kvm/book3s_64_vio_hv.c which use it to detect huge pages * and avoid handling those in real mode. */ -__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) -__PAGEFLAG(Tail, tail) +__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +__PAGEFLAG(Tail, tail, PF_ANY) static inline int PageCompound(struct page *page) { @@ -421,8 +500,8 @@ static inline void ClearPageCompound(struct page *page) * because PageCompound is always set for compound pages and not for * pages on the LRU and/or pagecache. 
*/ -TESTPAGEFLAG(Compound, compound) -__SETPAGEFLAG(Head, compound) __CLEARPAGEFLAG(Head, compound) +TESTPAGEFLAG(Compound, compound, PF_ANY) +__SETPAGEFLAG(Head, compound, PF_ANY) __CLEARPAGEFLAG(Head, compound, PF_ANY) /* * PG_reclaim is used in combination with PG_compound to mark the @@ -518,21 +597,9 @@ static inline int PageTransTail(struct page *page) } #else - -static inline int PageTransHuge(struct page *page) -{ - return 0; -} - -static inline int PageTransCompound(struct page *page) -{ - return 0; -} - -static inline int PageTransTail(struct page *page) -{ - return 0; -} +TESTPAGEFLAG_FALSE(TransHuge) +TESTPAGEFLAG_FALSE(TransCompound) +TESTPAGEFLAG_FALSE(TransTail) #endif /* @@ -655,6 +722,10 @@ static inline int page_has_private(struct page *page) return !!(page->flags & PAGE_FLAGS_PRIVATE); } +#undef PF_ANY +#undef PF_HEAD +#undef PF_NO_TAIL +#undef PF_NO_COMPOUND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4b3736f7065c..7c3790764795 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -426,18 +426,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); -static inline void __set_page_locked(struct page *page) -{ - __set_bit(PG_locked, &page->flags); -} - -static inline void __clear_page_locked(struct page *page) -{ - __clear_bit(PG_locked, &page->flags); -} - static inline int trylock_page(struct page *page) { + page = compound_head(page); return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); } @@ -490,9 +481,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page, static inline int wait_on_page_locked_killable(struct page *page) { - if (PageLocked(page)) - return wait_on_page_bit_killable(page, PG_locked); - return 0; + if (!PageLocked(page)) + return 0; + return wait_on_page_bit_killable(compound_head(page), PG_locked); } extern 
wait_queue_head_t *page_waitqueue(struct page *page); @@ -511,7 +502,7 @@ static inline void wake_up_page(struct page *page, int bit) static inline void wait_on_page_locked(struct page *page) { if (PageLocked(page)) - wait_on_page_bit(page, PG_locked); + wait_on_page_bit(compound_head(page), PG_locked); } /* @@ -656,17 +647,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: - * the page is new, so we can just run __set_page_locked() against it. + * the page is new, so we can just run __SetPageLocked() against it. */ static inline int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { int error; - __set_page_locked(page); + __SetPageLocked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error)) - __clear_page_locked(page); + __ClearPageLocked(page); return error; } diff --git a/include/linux/poison.h b/include/linux/poison.h index 2110a81c5e2a..7b2a7fcde6a3 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -32,6 +32,10 @@ /********** mm/debug-pagealloc.c **********/ #define PAGE_POISON 0xaa +/********** mm/page_alloc.c ************/ + +#define TAIL_MAPPING ((void *) 0x01014A11 + POISON_POINTER_DELTA) + /********** mm/slab.c **********/ /* * Magic nums for obj red zoning. 
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c89c53a113a8..bf36b6e644c4 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -85,6 +85,7 @@ enum ttu_flags { TTU_UNMAP = 1, /* unmap mode */ TTU_MIGRATION = 2, /* migration mode */ TTU_MUNLOCK = 4, /* munlock mode */ + TTU_FREE = 8, /* free mode */ TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ @@ -183,7 +184,8 @@ static inline void page_dup_rmap(struct page *page) * Called from mm/vmscan.c to handle paging out */ int page_referenced(struct page *, int is_locked, - struct mem_cgroup *memcg, unsigned long *vm_flags); + struct mem_cgroup *memcg, unsigned long *vm_flags, + int *is_pte_dirty); #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) @@ -260,9 +262,12 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc); static inline int page_referenced(struct page *page, int is_locked, struct mem_cgroup *memcg, - unsigned long *vm_flags) + unsigned long *vm_flags, + int *is_pte_dirty) { *vm_flags = 0; + if (is_pte_dirty) + *is_pte_dirty = 0; return 0; } diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 587017e7939c..ad6e32f3eab2 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -133,6 +133,10 @@ struct rtc_device { /* Some hardware can't support UIE mode */ int uie_unsupported; +#ifdef CONFIG_PM_SLEEP + struct rtc_wkalrm alarm; + bool valid_alarm; +#endif #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL struct work_struct uie_task; struct timer_list uie_timer; diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index eca1ec93775c..e839025a81cb 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -265,12 +265,12 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, gfp_t gfp_mask); size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen); + const void *buf, size_t buflen); size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void 
*buf, size_t buflen); size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen, off_t skip); + const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); diff --git a/include/linux/slab.h b/include/linux/slab.h index ffd24c830151..ca761238b2a5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -153,8 +153,30 @@ size_t ksize(const void *); #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) +/* + * KMALLOC_LOOP_LOW is the index at which the for loop in + * create_kmalloc_caches() starts creating the kmalloc_caches objects. The + * first and second indices are the 96 and 192 byte caches: kmalloc_index() + * returns 1 (96) when KMALLOC_MIN_SIZE <= 32, and 2 (192) when + * KMALLOC_MIN_SIZE <= 64. If KMALLOC_MIN_SIZE is bigger than 64, 96 and 192 + * need not be initialized, so the loop starts directly at KMALLOC_SHIFT_LOW. + */ +#if KMALLOC_MIN_SIZE <= 32 +#define KMALLOC_LOOP_LOW 1 +#elif KMALLOC_MIN_SIZE <= 64 +#define KMALLOC_LOOP_LOW 2 +#else +#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW +#endif + #else #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +/* + * KMALLOC_MIN_SIZE for slub/slab/slob is 2^3/2^5/2^3, so even when slab is + * used KMALLOC_MIN_SIZE <= 32, and kmalloc-96 and kmalloc-192 must also be + * initialized. + */ +#define KMALLOC_LOOP_LOW 1 #endif /* @@ -240,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; * belongs to. * 0 = zero alloc * 1 = 65 .. 96 bytes - * 2 = 120 .. 192 bytes - * n = 2^(n-1) .. 2^n -1 + * 2 = 129 .. 192 bytes + * n = 2^(n-1)+1 .. 
2^n */ static __always_inline int kmalloc_index(size_t size) { @@ -290,6 +312,16 @@ void *__kmalloc(size_t size, gfp_t flags); void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); void kmem_cache_free(struct kmem_cache *, void *); +/* + * Bulk allocation and freeing operations. These are accelerated in an + * allocator-specific way to avoid taking locks repeatedly or building + * metadata structures unnecessarily. + * + * Note that interrupts must be enabled when calling these functions. + */ +void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node); void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index d600afb21926..da3c593f9845 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -27,6 +27,8 @@ struct smpboot_thread_data; * @pre_unpark: Optional unpark function, called before the thread is * unparked (cpu online). This is not guaranteed to be * called on the target cpu of the thread. Careful! + * @cpumask: Internal state. To update which threads are unparked, + * call smpboot_update_cpumask_percpu_thread(). * @selfparking: Thread is not parked by the park function. 
* @thread_comm: The base name of the thread */ @@ -41,11 +43,14 @@ struct smp_hotplug_thread { void (*park)(unsigned int cpu); void (*unpark)(unsigned int cpu); void (*pre_unpark)(unsigned int cpu); + cpumask_var_t cpumask; bool selfparking; const char *thread_comm; }; int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); +int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *); #endif diff --git a/include/linux/string.h b/include/linux/string.h index e40099e585c9..12a5a60f1f3b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -117,6 +117,7 @@ extern void kfree_const(const void *x); extern char *kstrdup(const char *s, gfp_t gfp); extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); +extern char *kstrimdup(const char *s, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); diff --git a/include/linux/swap.h b/include/linux/swap.h index cee108cbe2d5..0428e4c84e1d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -308,6 +308,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); +extern void deactivate_page(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9246d32dc973..2b1cef88b827 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), PGFREE, PGACTIVATE, PGDEACTIVATE, PGFAULT, PGMAJFAULT, + PGLAZYFREED, FOR_ALL_ZONES(PGREFILL), 
FOR_ALL_ZONES(PGSTEAL_KSWAPD), FOR_ALL_ZONES(PGSTEAL_DIRECT), diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index ddc3b36f1046..7a94102b7a02 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -34,6 +34,7 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ +#define MADV_FREE 5 /* free pages only if memory pressure */ /* common parameters: try to keep these consistent across architectures */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/ipc/msg.c b/ipc/msg.c index 2b6fdbb9e0e9..e0b5e216ace5 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -37,6 +37,7 @@ #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> +#include <linux/freezer.h> #include <asm/current.h> #include <linux/uaccess.h> @@ -915,7 +916,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); - schedule(); + freezable_schedule(); /* Lockless receive, part 1: * Disable preemption. 
We don't hold a reference to the queue diff --git a/kernel/exit.c b/kernel/exit.c index 22fcc05dec40..8a87bb43dbd0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -711,10 +711,10 @@ void do_exit(long code) current->comm, task_pid_nr(current), preempt_count()); - acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); + acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index c697f73d82d6..7c434c39f02a 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu) mutex_lock(&smpboot_threads_lock); list_for_each_entry(cur, &hotplug_threads, list) - smpboot_unpark_thread(cur, cpu); + if (cpumask_test_cpu(cpu, cur->cpumask)) + smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); } @@ -258,6 +259,15 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) { unsigned int cpu; + /* Unpark any threads that were voluntarily parked. 
*/ + for_each_cpu_not(cpu, ht->cpumask) { + if (cpu_online(cpu)) { + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + if (tsk) + kthread_unpark(tsk); + } + } + /* We need to destroy also the parked threads of offline cpus */ for_each_possible_cpu(cpu) { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); @@ -281,6 +291,10 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) unsigned int cpu; int ret = 0; + if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) + return -ENOMEM; + cpumask_copy(plug_thread->cpumask, cpu_possible_mask); + get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { @@ -313,9 +327,53 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) smpboot_destroy_threads(plug_thread); mutex_unlock(&smpboot_threads_lock); put_online_cpus(); + free_cpumask_var(plug_thread->cpumask); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); +/** + * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked + * @plug_thread: Hotplug thread descriptor + * @new: Revised mask to use + * + * The cpumask field in the smp_hotplug_thread must not be updated directly + * by the client, but only by calling this function. + * This function can only be called on a registered smp_hotplug_thread. + */ +int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *new) +{ + struct cpumask *old = plug_thread->cpumask; + cpumask_var_t tmp; + unsigned int cpu; + + if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) + return -ENOMEM; + + get_online_cpus(); + mutex_lock(&smpboot_threads_lock); + + /* Park threads that were exclusively enabled on the old mask. */ + cpumask_andnot(tmp, old, new); + for_each_cpu_and(cpu, tmp, cpu_online_mask) + smpboot_park_thread(plug_thread, cpu); + + /* Unpark threads that are exclusively enabled on the new mask. 
*/ + cpumask_andnot(tmp, new, old); + for_each_cpu_and(cpu, tmp, cpu_online_mask) + smpboot_unpark_thread(plug_thread, cpu); + + cpumask_copy(old, new); + + mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); + + free_cpumask_var(tmp); + + return 0; +} +EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread); + static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2082b1a88fb9..699571a74e3b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -881,6 +881,13 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, + { .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 2316f50b07a4..90299b393ff6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -19,6 +19,7 @@ #include <linux/sysctl.h> #include <linux/smpboot.h> #include <linux/sched/rt.h> +#include <linux/tick.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -56,6 +57,12 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace; #else #define sysctl_softlockup_all_cpu_backtrace 0 #endif +static struct cpumask watchdog_cpumask __read_mostly; +unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); + +/* Helper for online, unparked cpus. */ +#define for_each_watchdog_cpu(cpu) \ + for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -205,7 +212,7 @@ void touch_all_softlockup_watchdogs(void) * do we care if a 0 races with a timestamp? 
* all it means is the softlock check starts one cycle later */ - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; } @@ -608,11 +615,11 @@ void watchdog_nmi_enable_all(void) { int cpu; - if (!watchdog_user_enabled) + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) return; get_online_cpus(); - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) watchdog_nmi_enable(cpu); put_online_cpus(); } @@ -625,7 +632,7 @@ void watchdog_nmi_disable_all(void) return; get_online_cpus(); - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) watchdog_nmi_disable(cpu); put_online_cpus(); } @@ -684,7 +691,7 @@ static void update_watchdog_all_cpus(void) int cpu; get_online_cpus(); - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) update_watchdog(cpu); put_online_cpus(); } @@ -697,8 +704,12 @@ static int watchdog_enable_all_cpus(void) err = smpboot_register_percpu_thread(&watchdog_threads); if (err) pr_err("Failed to create watchdog threads, disabled\n"); - else + else { + if (smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask)) + pr_err("Failed to set cpumask for watchdog threads\n"); watchdog_running = 1; + } } else { /* * Enable/disable the lockup detectors or @@ -869,12 +880,55 @@ out: mutex_unlock(&watchdog_proc_mutex); return err; } + +/* + * The cpumask is the mask of possible cpus that the watchdog can run + * on, not the mask of cpus it is actually running on. This allows the + * user to specify a mask that will include cpus that have not yet + * been brought online, if desired. + */ +int proc_watchdog_cpumask(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + mutex_lock(&watchdog_proc_mutex); + err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); + if (!err && write) { + /* Remove impossible cpus to keep sysctl output cleaner. 
*/ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, + cpu_possible_mask); + + if (watchdog_running) { + /* + * Failure would be due to being unable to allocate + * a temporary cpumask, so we are likely not in a + * position to do much else to make things better. + */ + if (smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask) != 0) + pr_err("cpumask update failed\n"); + } + } + mutex_unlock(&watchdog_proc_mutex); + return err; +} + #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) { set_sample_period(); +#ifdef CONFIG_NO_HZ_FULL + if (!cpumask_empty(tick_nohz_full_mask)) + pr_info("Disabling watchdog on nohz_full cores by default\n"); + cpumask_andnot(&watchdog_cpumask, cpu_possible_mask, + tick_nohz_full_mask); +#else + cpumask_copy(&watchdog_cpumask, cpu_possible_mask); +#endif + if (watchdog_enabled) watchdog_enable_all_cpus(); } diff --git a/lib/Kconfig b/lib/Kconfig index 601965a948e8..4359dc34d82a 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -188,6 +188,13 @@ config CRC8 when they need to do cyclic redundancy check according CRC8 algorithm. Module will be called crc8. +config CRC64_ECMA + tristate "CRC64 ECMA function" + help + This option provides CRC64 ECMA function. Drivers may select this + when they need to do cyclic redundancy check according to the CRC64 + ECMA algorithm. 
+ config AUDIT_GENERIC bool depends on AUDIT && !AUDIT_ARCH diff --git a/lib/Makefile b/lib/Makefile index 6c37933336a0..5b52a4710446 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -76,6 +76,7 @@ obj-$(CONFIG_CRC32) += crc32.o obj-$(CONFIG_CRC7) += crc7.o obj-$(CONFIG_LIBCRC32C) += libcrc32c.o obj-$(CONFIG_CRC8) += crc8.o +obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ diff --git a/lib/bitmap.c b/lib/bitmap.c index 64c0926f5dd8..a578a0189199 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -462,19 +462,20 @@ EXPORT_SYMBOL(bitmap_parse_user); * Output format is a comma-separated list of decimal numbers and * ranges if list is specified or hex digits grouped into comma-separated * sets of 8 digits/set. Returns the number of characters written to buf. + * + * It is assumed that @buf is a pointer into a PAGE_SIZE area and that + * sufficient storage remains at @buf to accommodate the + * bitmap_print_to_pagebuf() output. */ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits) { - ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf - 2; + ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; int n = 0; - if (len > 1) { - n = list ? scnprintf(buf, len, "%*pbl", nmaskbits, maskp) : - scnprintf(buf, len, "%*pb", nmaskbits, maskp); - buf[n++] = '\n'; - buf[n] = '\0'; - } + if (len > 1) + n = list ? 
scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) : + scnprintf(buf, len, "%*pb\n", nmaskbits, maskp); return n; } EXPORT_SYMBOL(bitmap_print_to_pagebuf); @@ -506,12 +507,12 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, unsigned a, b; int c, old_c, totaldigits; const char __user __force *ubuf = (const char __user __force *)buf; - int exp_digit, in_range; + int at_start, in_range; totaldigits = c = 0; bitmap_zero(maskp, nmaskbits); do { - exp_digit = 1; + at_start = 1; in_range = 0; a = b = 0; @@ -540,11 +541,10 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, break; if (c == '-') { - if (exp_digit || in_range) + if (at_start || in_range) return -EINVAL; b = 0; in_range = 1; - exp_digit = 1; continue; } @@ -554,16 +554,18 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, b = b * 10 + (c - '0'); if (!in_range) a = b; - exp_digit = 0; + at_start = 0; totaldigits++; } if (!(a <= b)) return -EINVAL; if (b >= nmaskbits) return -ERANGE; - while (a <= b) { - set_bit(a, maskp); - a++; + if (!at_start) { + while (a <= b) { + set_bit(a, maskp); + a++; + } } } while (buflen && c == ','); return 0; diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c new file mode 100644 index 000000000000..41629ea5a60c --- /dev/null +++ b/lib/crc64_ecma.c @@ -0,0 +1,341 @@ +/* + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/module.h> +#include <linux/crc64_ecma.h> + + +#define CRC64_BYTE_MASK 0xFF +#define CRC64_TABLE_SIZE 256 + + +struct crc64_table { + u64 seed; + u64 table[CRC64_TABLE_SIZE]; +}; + + +static struct crc64_table CRC64_ECMA_182 = { + CRC64_DEFAULT_INITVAL, + { + 0x0000000000000000ULL, + 0xb32e4cbe03a75f6fULL, + 0xf4843657a840a05bULL, + 0x47aa7ae9abe7ff34ULL, + 0x7bd0c384ff8f5e33ULL, + 0xc8fe8f3afc28015cULL, + 0x8f54f5d357cffe68ULL, + 0x3c7ab96d5468a107ULL, + 0xf7a18709ff1ebc66ULL, + 0x448fcbb7fcb9e309ULL, + 0x0325b15e575e1c3dULL, + 0xb00bfde054f94352ULL, + 0x8c71448d0091e255ULL, + 0x3f5f08330336bd3aULL, + 0x78f572daa8d1420eULL, + 0xcbdb3e64ab761d61ULL, + 0x7d9ba13851336649ULL, + 0xceb5ed8652943926ULL, + 0x891f976ff973c612ULL, + 0x3a31dbd1fad4997dULL, + 0x064b62bcaebc387aULL, + 0xb5652e02ad1b6715ULL, + 0xf2cf54eb06fc9821ULL, + 0x41e11855055bc74eULL, + 0x8a3a2631ae2dda2fULL, + 0x39146a8fad8a8540ULL, + 0x7ebe1066066d7a74ULL, + 0xcd905cd805ca251bULL, + 0xf1eae5b551a2841cULL, + 0x42c4a90b5205db73ULL, + 0x056ed3e2f9e22447ULL, + 0xb6409f5cfa457b28ULL, + 0xfb374270a266cc92ULL, + 0x48190ecea1c193fdULL, + 0x0fb374270a266cc9ULL, + 0xbc9d3899098133a6ULL, + 0x80e781f45de992a1ULL, + 0x33c9cd4a5e4ecdceULL, + 0x7463b7a3f5a932faULL, + 0xc74dfb1df60e6d95ULL, + 0x0c96c5795d7870f4ULL, + 0xbfb889c75edf2f9bULL, + 0xf812f32ef538d0afULL, + 0x4b3cbf90f69f8fc0ULL, + 0x774606fda2f72ec7ULL, + 0xc4684a43a15071a8ULL, + 0x83c230aa0ab78e9cULL, + 0x30ec7c140910d1f3ULL, + 0x86ace348f355aadbULL, + 0x3582aff6f0f2f5b4ULL, + 0x7228d51f5b150a80ULL, + 0xc10699a158b255efULL, + 0xfd7c20cc0cdaf4e8ULL, + 0x4e526c720f7dab87ULL, + 0x09f8169ba49a54b3ULL, + 0xbad65a25a73d0bdcULL, + 0x710d64410c4b16bdULL, + 0xc22328ff0fec49d2ULL, + 0x85895216a40bb6e6ULL, + 0x36a71ea8a7ace989ULL, + 0x0adda7c5f3c4488eULL, + 0xb9f3eb7bf06317e1ULL, + 0xfe5991925b84e8d5ULL, + 0x4d77dd2c5823b7baULL, + 0x64b62bcaebc387a1ULL, + 0xd7986774e864d8ceULL, + 0x90321d9d438327faULL, + 0x231c512340247895ULL, + 
0x1f66e84e144cd992ULL, + 0xac48a4f017eb86fdULL, + 0xebe2de19bc0c79c9ULL, + 0x58cc92a7bfab26a6ULL, + 0x9317acc314dd3bc7ULL, + 0x2039e07d177a64a8ULL, + 0x67939a94bc9d9b9cULL, + 0xd4bdd62abf3ac4f3ULL, + 0xe8c76f47eb5265f4ULL, + 0x5be923f9e8f53a9bULL, + 0x1c4359104312c5afULL, + 0xaf6d15ae40b59ac0ULL, + 0x192d8af2baf0e1e8ULL, + 0xaa03c64cb957be87ULL, + 0xeda9bca512b041b3ULL, + 0x5e87f01b11171edcULL, + 0x62fd4976457fbfdbULL, + 0xd1d305c846d8e0b4ULL, + 0x96797f21ed3f1f80ULL, + 0x2557339fee9840efULL, + 0xee8c0dfb45ee5d8eULL, + 0x5da24145464902e1ULL, + 0x1a083bacedaefdd5ULL, + 0xa9267712ee09a2baULL, + 0x955cce7fba6103bdULL, + 0x267282c1b9c65cd2ULL, + 0x61d8f8281221a3e6ULL, + 0xd2f6b4961186fc89ULL, + 0x9f8169ba49a54b33ULL, + 0x2caf25044a02145cULL, + 0x6b055fede1e5eb68ULL, + 0xd82b1353e242b407ULL, + 0xe451aa3eb62a1500ULL, + 0x577fe680b58d4a6fULL, + 0x10d59c691e6ab55bULL, + 0xa3fbd0d71dcdea34ULL, + 0x6820eeb3b6bbf755ULL, + 0xdb0ea20db51ca83aULL, + 0x9ca4d8e41efb570eULL, + 0x2f8a945a1d5c0861ULL, + 0x13f02d374934a966ULL, + 0xa0de61894a93f609ULL, + 0xe7741b60e174093dULL, + 0x545a57dee2d35652ULL, + 0xe21ac88218962d7aULL, + 0x5134843c1b317215ULL, + 0x169efed5b0d68d21ULL, + 0xa5b0b26bb371d24eULL, + 0x99ca0b06e7197349ULL, + 0x2ae447b8e4be2c26ULL, + 0x6d4e3d514f59d312ULL, + 0xde6071ef4cfe8c7dULL, + 0x15bb4f8be788911cULL, + 0xa6950335e42fce73ULL, + 0xe13f79dc4fc83147ULL, + 0x521135624c6f6e28ULL, + 0x6e6b8c0f1807cf2fULL, + 0xdd45c0b11ba09040ULL, + 0x9aefba58b0476f74ULL, + 0x29c1f6e6b3e0301bULL, + 0xc96c5795d7870f42ULL, + 0x7a421b2bd420502dULL, + 0x3de861c27fc7af19ULL, + 0x8ec62d7c7c60f076ULL, + 0xb2bc941128085171ULL, + 0x0192d8af2baf0e1eULL, + 0x4638a2468048f12aULL, + 0xf516eef883efae45ULL, + 0x3ecdd09c2899b324ULL, + 0x8de39c222b3eec4bULL, + 0xca49e6cb80d9137fULL, + 0x7967aa75837e4c10ULL, + 0x451d1318d716ed17ULL, + 0xf6335fa6d4b1b278ULL, + 0xb199254f7f564d4cULL, + 0x02b769f17cf11223ULL, + 0xb4f7f6ad86b4690bULL, + 0x07d9ba1385133664ULL, + 0x4073c0fa2ef4c950ULL, + 0xf35d8c442d53963fULL, + 
0xcf273529793b3738ULL, + 0x7c0979977a9c6857ULL, + 0x3ba3037ed17b9763ULL, + 0x888d4fc0d2dcc80cULL, + 0x435671a479aad56dULL, + 0xf0783d1a7a0d8a02ULL, + 0xb7d247f3d1ea7536ULL, + 0x04fc0b4dd24d2a59ULL, + 0x3886b22086258b5eULL, + 0x8ba8fe9e8582d431ULL, + 0xcc0284772e652b05ULL, + 0x7f2cc8c92dc2746aULL, + 0x325b15e575e1c3d0ULL, + 0x8175595b76469cbfULL, + 0xc6df23b2dda1638bULL, + 0x75f16f0cde063ce4ULL, + 0x498bd6618a6e9de3ULL, + 0xfaa59adf89c9c28cULL, + 0xbd0fe036222e3db8ULL, + 0x0e21ac88218962d7ULL, + 0xc5fa92ec8aff7fb6ULL, + 0x76d4de52895820d9ULL, + 0x317ea4bb22bfdfedULL, + 0x8250e80521188082ULL, + 0xbe2a516875702185ULL, + 0x0d041dd676d77eeaULL, + 0x4aae673fdd3081deULL, + 0xf9802b81de97deb1ULL, + 0x4fc0b4dd24d2a599ULL, + 0xfceef8632775faf6ULL, + 0xbb44828a8c9205c2ULL, + 0x086ace348f355aadULL, + 0x34107759db5dfbaaULL, + 0x873e3be7d8faa4c5ULL, + 0xc094410e731d5bf1ULL, + 0x73ba0db070ba049eULL, + 0xb86133d4dbcc19ffULL, + 0x0b4f7f6ad86b4690ULL, + 0x4ce50583738cb9a4ULL, + 0xffcb493d702be6cbULL, + 0xc3b1f050244347ccULL, + 0x709fbcee27e418a3ULL, + 0x3735c6078c03e797ULL, + 0x841b8ab98fa4b8f8ULL, + 0xadda7c5f3c4488e3ULL, + 0x1ef430e13fe3d78cULL, + 0x595e4a08940428b8ULL, + 0xea7006b697a377d7ULL, + 0xd60abfdbc3cbd6d0ULL, + 0x6524f365c06c89bfULL, + 0x228e898c6b8b768bULL, + 0x91a0c532682c29e4ULL, + 0x5a7bfb56c35a3485ULL, + 0xe955b7e8c0fd6beaULL, + 0xaeffcd016b1a94deULL, + 0x1dd181bf68bdcbb1ULL, + 0x21ab38d23cd56ab6ULL, + 0x9285746c3f7235d9ULL, + 0xd52f0e859495caedULL, + 0x6601423b97329582ULL, + 0xd041dd676d77eeaaULL, + 0x636f91d96ed0b1c5ULL, + 0x24c5eb30c5374ef1ULL, + 0x97eba78ec690119eULL, + 0xab911ee392f8b099ULL, + 0x18bf525d915feff6ULL, + 0x5f1528b43ab810c2ULL, + 0xec3b640a391f4fadULL, + 0x27e05a6e926952ccULL, + 0x94ce16d091ce0da3ULL, + 0xd3646c393a29f297ULL, + 0x604a2087398eadf8ULL, + 0x5c3099ea6de60cffULL, + 0xef1ed5546e415390ULL, + 0xa8b4afbdc5a6aca4ULL, + 0x1b9ae303c601f3cbULL, + 0x56ed3e2f9e224471ULL, + 0xe5c372919d851b1eULL, + 0xa26908783662e42aULL, + 0x114744c635c5bb45ULL, + 
0x2d3dfdab61ad1a42ULL, + 0x9e13b115620a452dULL, + 0xd9b9cbfcc9edba19ULL, + 0x6a978742ca4ae576ULL, + 0xa14cb926613cf817ULL, + 0x1262f598629ba778ULL, + 0x55c88f71c97c584cULL, + 0xe6e6c3cfcadb0723ULL, + 0xda9c7aa29eb3a624ULL, + 0x69b2361c9d14f94bULL, + 0x2e184cf536f3067fULL, + 0x9d36004b35545910ULL, + 0x2b769f17cf112238ULL, + 0x9858d3a9ccb67d57ULL, + 0xdff2a94067518263ULL, + 0x6cdce5fe64f6dd0cULL, + 0x50a65c93309e7c0bULL, + 0xe388102d33392364ULL, + 0xa4226ac498dedc50ULL, + 0x170c267a9b79833fULL, + 0xdcd7181e300f9e5eULL, + 0x6ff954a033a8c131ULL, + 0x28532e49984f3e05ULL, + 0x9b7d62f79be8616aULL, + 0xa707db9acf80c06dULL, + 0x14299724cc279f02ULL, + 0x5383edcd67c06036ULL, + 0xe0ada17364673f59ULL + } +}; + + +/* + * crc64_ecma_seed - Initializes the CRC64 ECMA seed. + */ +u64 crc64_ecma_seed(void) +{ + return CRC64_ECMA_182.seed; +} +EXPORT_SYMBOL(crc64_ecma_seed); + +/* + * crc64_ecma - Computes the 64 bit ECMA CRC. + * + * pdata: pointer to the data to compute checksum for. + * nbytes: number of bytes in data buffer. + * seed: CRC seed. 
+ */ +u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed) +{ + unsigned int i; + u64 crc = seed; + + for (i = 0; i < nbytes; i++) + crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^ + (crc >> 8); + + return crc; +} +EXPORT_SYMBOL(crc64_ecma); + +MODULE_DESCRIPTION("CRC64 ECMA function"); +MODULE_AUTHOR("Freescale Semiconductor Inc."); +MODULE_LICENSE("GPL"); diff --git a/lib/scatterlist.c b/lib/scatterlist.c index c9f2e8c6ccc9..e2374ec4e6a3 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -669,9 +669,9 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, * **/ size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen) + const void *buf, size_t buflen) { - return sg_copy_buffer(sgl, nents, buf, buflen, 0, false); + return sg_copy_buffer(sgl, nents, (void *)buf, buflen, 0, false); } EXPORT_SYMBOL(sg_copy_from_buffer); @@ -697,16 +697,16 @@ EXPORT_SYMBOL(sg_copy_to_buffer); * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy from - * @skip: Number of bytes to skip before copying * @buflen: The number of bytes to copy + * @skip: Number of bytes to skip before copying * * Returns the number of copied bytes. * **/ size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen, off_t skip) + const void *buf, size_t buflen, off_t skip) { - return sg_copy_buffer(sgl, nents, buf, buflen, skip, false); + return sg_copy_buffer(sgl, nents, (void *)buf, buflen, skip, false); } EXPORT_SYMBOL(sg_pcopy_from_buffer); @@ -715,8 +715,8 @@ EXPORT_SYMBOL(sg_pcopy_from_buffer); * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy to - * @skip: Number of bytes to skip before copying * @buflen: The number of bytes to copy + * @skip: Number of bytes to skip before copying * * Returns the number of copied bytes. 
* diff --git a/lib/sort.c b/lib/sort.c index 43c9fe73ae2e..86e3efeb35c6 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -15,6 +15,13 @@ static void u32_swap(void *a, void *b, int size) *(u32 *)b = t; } +static void u64_swap(void *a, void *b, int size) +{ + u64 t = *(u64 *)a; + *(u64 *)a = *(u64 *)b; + *(u64 *)b = t; +} + static void generic_swap(void *a, void *b, int size) { char t; @@ -50,8 +57,18 @@ void sort(void *base, size_t num, size_t size, /* pre-scale counters for performance */ int i = (num/2 - 1) * size, n = num * size, c, r; - if (!swap_func) - swap_func = (size == 4 ? u32_swap : generic_swap); + if (!swap_func) { + switch (size) { + case 4: + swap_func = u32_swap; + break; + case 8: + swap_func = u64_swap; + break; + default: + swap_func = generic_swap; + } + } /* heapify */ for ( ; i >= 0; i -= size) { diff --git a/lib/test-hexdump.c b/lib/test-hexdump.c index c227cc43ec0a..5241df36eedf 100644 --- a/lib/test-hexdump.c +++ b/lib/test-hexdump.c @@ -25,19 +25,19 @@ static const char * const test_data_1_le[] __initconst = { "4c", "d1", "19", "99", "43", "b1", "af", "0c", }; -static const char *test_data_2_le[] __initdata = { +static const char * const test_data_2_le[] __initconst = { "32be", "7bdb", "180a", "b293", "ba70", "24c4", "837d", "9b34", "9ca6", "ad31", "0f9c", "e9ac", "d14c", "9919", "b143", "0caf", }; -static const char *test_data_4_le[] __initdata = { +static const char * const test_data_4_le[] __initconst = { "7bdb32be", "b293180a", "24c4ba70", "9b34837d", "ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143", }; -static const char *test_data_8_le[] __initdata = { +static const char * const test_data_8_le[] __initconst = { "b293180a7bdb32be", "9b34837d24c4ba70", "e9ac0f9cad319ca6", "0cafb1439919d14c", }; diff --git a/lib/vsprintf.c b/lib/vsprintf.c index da39c608a28c..8243e2fb1e6b 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1360,6 +1360,21 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, } } +static 
noinline_for_stack +char *comm_name(char *buf, char *end, struct task_struct *tsk, + struct printf_spec spec, const char *fmt) +{ + char name[TASK_COMM_LEN]; + + /* Caller can pass NULL instead of current. */ + if (!tsk) + tsk = current; + /* Not using get_task_comm() in case I'm in IRQ context. */ + memcpy(name, tsk->comm, TASK_COMM_LEN); + name[sizeof(name) - 1] = '\0'; + return string(buf, end, name, spec); +} + int kptr_restrict __read_mostly; /* @@ -1447,6 +1462,7 @@ int kptr_restrict __read_mostly; * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'Cr' For a clock, it prints the current rate of the clock + * - 'T' task_struct->comm * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a @@ -1458,7 +1474,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, { int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0); - if (!ptr && *fmt != 'K') { + if (!ptr && *fmt != 'K' && *fmt != 'T') { /* * Print (null) with the same width as a pointer so it makes * tabular output look nice. @@ -1597,6 +1613,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, return dentry_name(buf, end, ((const struct file *)ptr)->f_path.dentry, spec, fmt); + case 'T': + return comm_name(buf, end, ptr, spec, fmt); } spec.flags |= SMALL; if (spec.field_width == -1) { diff --git a/mm/Kconfig b/mm/Kconfig index 390214da4546..52ffb863383c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -635,3 +635,21 @@ config MAX_STACK_SIZE_MB changed to a smaller value in which case that is used. A sane initial value is 80 MB. 
+ +# For architectures that support deferred memory initialisation +config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + bool + +config DEFERRED_STRUCT_PAGE_INIT + bool "Defer initialisation of struct pages to kswapd" + default n + depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + depends on MEMORY_HOTPLUG + help + Ordinarily all struct pages are initialised during early boot in a + single thread. On very large machines this can take a considerable + amount of time. If this option is set, large machines will bring up + a subset of memmap at boot and then initialise the rest in parallel + when kswapd starts. This has a potential performance impact on + processes running early in the lifetime of the system until kswapd + finishes the initialisation. diff --git a/mm/bootmem.c b/mm/bootmem.c index 477be696511d..a23dd1934654 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -164,7 +164,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) end = PFN_DOWN(physaddr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -172,7 +172,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; - unsigned long *map, start, end, pages, count = 0; + unsigned long *map, start, end, pages, cur, count = 0; if (!bdata->node_bootmem_map) return 0; @@ -210,17 +210,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { int order = ilog2(BITS_PER_LONG); - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { - unsigned long cur = start; + cur = start; start = ALIGN(start + 1, BITS_PER_LONG); while (vec && cur != start) { if (vec & 1) { page = 
pfn_to_page(cur); - __free_pages_bootmem(page, 0); + __free_pages_bootmem(page, cur, 0); count++; } vec >>= 1; @@ -229,12 +229,13 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) } } + cur = bdata->node_min_pfn; page = virt_to_page(bdata->node_bootmem_map); pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; while (pages--) - __free_pages_bootmem(page++, 0); + __free_pages_bootmem(page++, cur++, 0); bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/compaction.c b/mm/compaction.c index 018f08da99a2..6ef2fdf1d6b6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -732,18 +732,18 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * splitting and collapsing (collapsing has already happened * if PageLRU is set) but the lock is not necessarily taken * here and it is wasteful to take it just to check transhuge. - * Check TransHuge without lock and skip the whole pageblock if - * it's either a transhuge or hugetlbfs page, as calling + * Check PageCompound without lock and skip the whole pageblock + * if it's either a transhuge or hugetlbfs page, as calling * compound_order() without preventing THP from splitting the * page underneath us may return surprising results. 
*/ - if (PageTransHuge(page)) { - if (!locked) - low_pfn = ALIGN(low_pfn + 1, - pageblock_nr_pages) - 1; + if (PageCompound(page)) { + int nr; + if (locked) + nr = 1 << compound_order(page); else - low_pfn += (1 << compound_order(page)) - 1; - + nr = pageblock_nr_pages; + low_pfn = ALIGN(low_pfn + 1, nr) - 1; continue; } @@ -763,11 +763,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!locked) break; - /* Recheck PageLRU and PageTransHuge under lock */ + /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) continue; - if (PageTransHuge(page)) { - low_pfn += (1 << compound_order(page)) - 1; + if (PageCompound(page)) { + int nr = 1 << compound_order(page); + low_pfn = ALIGN(low_pfn + 1, nr) - 1; continue; } } @@ -778,7 +779,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (__isolate_lru_page(page, isolate_mode) != 0) continue; - VM_BUG_ON_PAGE(PageTransCompound(page), page); + VM_BUG_ON_PAGE(PageCompound(page), page); /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); diff --git a/mm/filemap.c b/mm/filemap.c index 6bf5e42d560a..6ad0a8053b96 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -615,11 +615,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, void *shadow = NULL; int ret; - __set_page_locked(page); + __SetPageLocked(page); ret = __add_to_page_cache_locked(page, mapping, offset, gfp_mask, &shadow); if (unlikely(ret)) - __clear_page_locked(page); + __ClearPageLocked(page); else { /* * The page might have been evicted from cache only @@ -742,6 +742,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); */ void unlock_page(struct page *page) { + page = compound_head(page); VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); smp_mb__after_atomic(); @@ -806,18 +807,20 @@ EXPORT_SYMBOL_GPL(page_endio); */ void __lock_page(struct page *page) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 
+ struct page *page_head = compound_head(page); + DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, + __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); int __lock_page_killable(struct page *page) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + struct page *page_head = compound_head(page); + DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - return __wait_on_bit_lock(page_waitqueue(page), &wait, + return __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 078832cf3636..87509bc6c5fc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1384,6 +1384,36 @@ out: return 0; } +int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr) + +{ + spinlock_t *ptl; + struct mm_struct *mm = tlb->mm; + int ret = 1; + + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + struct page *page; + pmd_t orig_pmd; + + orig_pmd = pmdp_get_and_clear(mm, addr, pmd); + + /* No hugepage in swapcache */ + page = pmd_page(orig_pmd); + VM_BUG_ON_PAGE(PageSwapCache(page), page); + + orig_pmd = pmd_mkold(orig_pmd); + orig_pmd = pmd_mkclean(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + spin_unlock(ptl); + ret = 0; + } + + return ret; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { @@ -1599,6 +1629,11 @@ unlock: return NULL; } +int pmd_freeable(pmd_t pmd) +{ + return !pmd_dirty(pmd); +} + static int __split_huge_page_splitting(struct page *page, struct vm_area_struct *vma, unsigned long address) @@ -1710,7 +1745,7 @@ static void __split_huge_page_refcount(struct page *page, */ page_tail->_mapcount = page->_mapcount; - BUG_ON(page_tail->mapping); + 
BUG_ON(page_tail->mapping != TAIL_MAPPING); page_tail->mapping = page->mapping; page_tail->index = page->index + i; @@ -2799,7 +2834,7 @@ static void khugepaged_do_scan(void) cond_resched(); - if (unlikely(kthread_should_stop() || freezing(current))) + if (unlikely(kthread_should_stop() || try_to_freeze())) break; spin_lock(&khugepaged_mm_lock); @@ -2820,8 +2855,6 @@ static void khugepaged_do_scan(void) static void khugepaged_wait_work(void) { - try_to_freeze(); - if (khugepaged_has_work()) { if (!khugepaged_scan_sleep_millisecs) return; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 271e4432734c..716465ae57aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3789,6 +3789,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { return NULL; } + +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +{ + return 0; +} #define want_pmd_share() (0) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ diff --git a/mm/internal.h b/mm/internal.h index a25e359a4039..a48cbefde8ca 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -155,7 +155,8 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) } extern int __isolate_free_page(struct page *page, unsigned int order); -extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); @@ -361,10 +362,7 @@ do { \ } while (0) extern void mminit_verify_pageflags_layout(void); -extern void mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn); extern void mminit_verify_zonelist(void); - #else static inline void mminit_dprintk(enum mminit_level level, @@ -376,15 +374,34 @@ static inline void mminit_verify_pageflags_layout(void) { } -static inline void 
mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn) +static inline void mminit_verify_zonelist(void) { } +#endif /* CONFIG_DEBUG_MEMORY_INIT */ -static inline void mminit_verify_zonelist(void) +/* + * Deferred struct page initialisation requires init functions that are freed + * before kswapd is available. Reuse the memory hotplug section annotation + * to mark the required code. + * + * __defermem_init is code that always exists but is annotated __meminit to + * avoid section warnings. + * __defer_init code gets marked __meminit when deferring struct page + * initialisation but is otherwise in the init section. + */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +#define __defermem_init __meminit +#define __defer_init __meminit + +void deferred_init_memmap(int nid); +#else +#define __defermem_init +#define __defer_init __init + +static inline void deferred_init_memmap(int nid) { } -#endif /* CONFIG_DEBUG_MEMORY_INIT */ +#endif /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ #if defined(CONFIG_SPARSEMEM) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4986b0acab21..c242adf6bc85 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -7,7 +7,6 @@ #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) #define KASAN_FREE_PAGE 0xFF /* page was freed */ -#define KASAN_FREE_PAGE 0xFF /* page was freed */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5405aff5a590..f0fe4f2c1fa7 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -115,7 +115,8 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | 
GFP_ATOMIC | \ + __GFP_NOACCOUNT)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ __GFP_NOWARN) @@ -1884,7 +1884,7 @@ struct page *ksm_might_need_to_copy(struct page *page, SetPageDirty(new_page); __SetPageUptodate(new_page); - __set_page_locked(new_page); + __SetPageLocked(new_page); } return new_page; diff --git a/mm/madvise.c b/mm/madvise.c index d551475517bf..22e8f0ca7040 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -19,6 +19,14 @@ #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> + +#include <asm/tlb.h> + +struct madvise_free_private { + struct vm_area_struct *vma; + struct mmu_gather *tlb; +}; /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -31,6 +39,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -254,6 +263,164 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + +{ + struct madvise_free_private *fp = walk->private; + struct mmu_gather *tlb = fp->tlb; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = fp->vma; + spinlock_t *ptl; + pte_t *pte, ptent; + struct page *page; + swp_entry_t entry; + unsigned long next; + int nr_swap = 0; + + next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma, addr, pmd); + else if (!madvise_free_huge_pmd(tlb, vma, pmd, addr)) + goto next; + /* fall through */ + } + + if (pmd_trans_unstable(pmd)) + return 0; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (pte_none(ptent)) + continue; + /* + * If the pte has swp_entry, just clear page table to + * prevent swap-in which 
is more expensive rather than + * (page allocation + zeroing). + */ + if (!pte_present(ptent)) { + entry = pte_to_swp_entry(ptent); + if (non_swap_entry(entry)) + continue; + nr_swap--; + free_swap_and_cache(entry); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + continue; + } + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + if (PageSwapCache(page)) { + if (!trylock_page(page)) + continue; + + if (!try_to_free_swap(page)) { + unlock_page(page); + continue; + } + + ClearPageDirty(page); + unlock_page(page); + } + + /* + * Some of architecture(ex, PPC) don't update TLB + * with set_pte_at and tlb_remove_tlb_entry so for + * the portability, remap the pte with old|clean + * after pte clearing. + */ + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + ptent = pte_mkold(ptent); + ptent = pte_mkclean(ptent); + set_pte_at(mm, addr, pte, ptent); + if (PageActive(page)) + deactivate_page(page); + tlb_remove_tlb_entry(tlb, pte, addr); + } + + if (nr_swap) { + if (current->mm == mm) + sync_mm_rss(mm); + + add_mm_counter(mm, MM_SWAPENTS, nr_swap); + } + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); +next: + cond_resched(); + return 0; +} + +static void madvise_free_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct madvise_free_private fp = { + .vma = vma, + .tlb = tlb, + }; + + struct mm_walk free_walk = { + .pmd_entry = madvise_free_pte_range, + .mm = vma->vm_mm, + .private = &fp, + }; + + BUG_ON(addr >= end); + tlb_start_vma(tlb, vma); + walk_page_range(addr, end, &free_walk); + tlb_end_vma(tlb, vma); +} + +static int madvise_free_single_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end; + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + return -EINVAL; + + /* MADV_FREE works for only anon vma at the moment 
*/ + if (vma->vm_file) + return -EINVAL; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return -EINVAL; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return -EINVAL; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start, end); + update_hiwater_rss(mm); + + mmu_notifier_invalidate_range_start(mm, start, end); + madvise_free_page_range(&tlb, vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); + + return 0; +} + +static long madvise_free(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + return madvise_free_single_vma(vma, start, end); +} + /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about @@ -377,6 +544,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); + case MADV_FREE: + /* + * XXX: In this implementation, MADV_FREE works like + * MADV_DONTNEED on swapless system or full swap. 
+ */ + if (get_nr_swap_pages() > 0) + return madvise_free(vma, prev, start, end); + /* fall through */ case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: @@ -396,6 +571,7 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/memblock.c b/mm/memblock.c index 9318b567ed79..0a988ed82886 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -779,6 +779,38 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) } /** + * __next_reserved_mem_region - next function for for_each_reserved_mem_region() + * @idx: pointer to u64 loop variable + * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL + * + * Iterate over all reserved memory regions. + */ +void __init_memblock __next_reserved_mem_region(u64 *idx, + phys_addr_t *out_start, + phys_addr_t *out_end) +{ + struct memblock_type *rsv = &memblock.reserved; + + if (*idx >= 0 && *idx < rsv->cnt) { + struct memblock_region *r = &rsv->regions[*idx]; + phys_addr_t base = r->base; + phys_addr_t size = r->size; + + if (out_start) + *out_start = base; + if (out_end) + *out_end = base + size - 1; + + *idx += 1; + return; + } + + /* signal end of iteration */ + *idx = ULLONG_MAX; +} + +/** * __next__mem_range - next function for for_each_free_mem_range() etc. 
* @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes @@ -1316,7 +1348,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 501820c815b3..d0fa01e4d956 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -20,6 +20,14 @@ * this code has to be extremely careful. Generally it tries to use * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. + * + * It can be very tempting to add handling for obscure cases here. + * In general any code for handling new cases should only be added iff: + * - You know how to test it. + * - You have a test that can be added to mce-test + * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ + * - The case actually shows up as a frequent (top 10) page state in + * tools/vm/page-types when running a real workload. * * There are several operations here with exponential complexity because * of unsuitable VM data structures. For example the operation to map back @@ -28,13 +36,6 @@ * are rare we hope to get away with this. This avoids impacting the core * VM. */ - -/* - * Notebook: - * - hugetlb needs more code - * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages - * - pass bad pages to kdump next kernel - */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> @@ -1182,7 +1183,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) /* * We ignore non-LRU pages for good reasons. 
* - PG_locked is only well defined for LRU pages and a few others - * - to avoid races with __set_page_locked() + * - to avoid races with __SetPageLocked() * - to avoid races with __SetPageSlab*() (and more non-atomic ops) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. diff --git a/mm/memory.c b/mm/memory.c index 22e037e3364e..d1fa0c1fad0b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3067,7 +3067,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * pinned by vma->vm_file's reference. We rely on unlock_page()'s * release semantics to prevent the compiler from undoing this copying. */ - mapping = fault_page->mapping; + mapping = page_rmapping(fault_page); unlock_page(fault_page); if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 457bde530cbe..c6a8d95c5dc7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1332,7 +1332,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) } /* - * Confirm all pages in a range [start, end) is belongs to the same zone. + * Confirm all pages in a range [start, end) belong to the same zone. 
*/ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) { @@ -1343,10 +1343,11 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) { - i = 0; - /* This is just a CONFIG_HOLES_IN_ZONE check.*/ - while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) - i++; + /* Find the first valid pfn in this pageblock */ + for (i = 0; i < MAX_ORDER_NR_PAGES; i++) { + if (pfn_valid(pfn + i)) + break; + } if (i == MAX_ORDER_NR_PAGES) continue; page = pfn_to_page(pfn + i); diff --git a/mm/migrate.c b/mm/migrate.c index f53838fe3dfe..022adc253cd4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1746,7 +1746,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, flush_tlb_range(vma, mmun_start, mmun_end); /* Prepare a page as a migration target */ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ diff --git a/mm/mm_init.c b/mm/mm_init.c index 5f420f7fafa1..fdadf918de76 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -11,6 +11,7 @@ #include <linux/export.h> #include <linux/memory.h> #include <linux/notifier.h> +#include <linux/sched.h> #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -130,14 +131,6 @@ void __init mminit_verify_pageflags_layout(void) BUG_ON(or_mask != add_mask); } -void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, - unsigned long nid, unsigned long pfn) -{ - BUG_ON(page_to_nid(page) != nid); - BUG_ON(page_zonenum(page) != zone); - BUG_ON(page_to_pfn(page) != pfn); -} - static __init int set_mminit_loglevel(char *str) { get_option(&str, &mminit_loglevel); diff --git a/mm/mprotect.c b/mm/mprotect.c index 88584838e704..e7d6f1171ecb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,6 +29,8 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include "internal.h" + /* * For a prot_numa update we 
only hold mmap_sem for read so there is a * potential race with faulting where a pmd was temporarily none. This @@ -322,6 +324,15 @@ success: change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + /* + * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major + * fault on access. + */ + if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && + (newflags & VM_WRITE)) { + populate_vma_page_range(vma, start, end, NULL); + } + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); perf_event_mmap(vma); diff --git a/mm/mremap.c b/mm/mremap.c index 034e2d360652..a7c93eceb1c8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,6 +22,7 @@ #include <linux/mmu_notifier.h> #include <linux/sched/sysctl.h> #include <linux/uaccess.h> +#include <linux/mm-arch-hooks.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -286,13 +287,17 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_len = new_len; old_addr = new_addr; new_addr = -ENOMEM; - } else if (vma->vm_file && vma->vm_file->f_op->mremap) { - err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); - if (err < 0) { - move_page_tables(new_vma, new_addr, vma, old_addr, - moved_len, true); - return err; + } else { + if (vma->vm_file && vma->vm_file->f_op->mremap) { + err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + if (err < 0) { + move_page_tables(new_vma, new_addr, vma, + old_addr, moved_len, true); + return err; + } } + arch_remap(mm, old_addr, old_addr + old_len, + new_addr, new_addr + new_len); } /* Conceal VM_ACCOUNT so old reservation is not undone */ diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 90b50468333e..bae652713ee5 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -77,7 +77,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + 
__free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -92,7 +92,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) while (start + (1UL << order) > end) order--; - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); start += (1UL << order); } @@ -121,6 +121,9 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); + for_each_reserved_mem_region(i, &start, &end) + reserve_bootmem_region(start, end); + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ebffa0e4a9c0..417cbe7e24b4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -235,6 +235,77 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ + pgdat->first_deferred_pfn = ULONG_MAX; +} + +/* Returns true if the struct page for the pfn is uninitialised */ +static inline bool __defermem_init early_page_uninitialised(unsigned long pfn) +{ + int nid = early_pfn_to_nid(pfn); + + if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; +} + +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; +} + +/* + * Returns false when the remaining initialisation should be deferred until + * later in the boot cycle when it can be parallelised. 
+ */ +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + /* Always populate low zones for address-constrained allocations */ + if (zone_end < pgdat_end_pfn(pgdat)) + return true; + + /* Initialise at least 2G of the highest zone */ + (*nr_initialised)++; + if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + pgdat->first_deferred_pfn = pfn; + return false; + } + + return true; +} +#else +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ +} + +static inline bool early_page_uninitialised(unsigned long pfn) +{ + return false; +} + +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + return false; +} + +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + return true; +} +#endif + + void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled && @@ -373,6 +444,7 @@ void prep_compound_page(struct page *page, unsigned long order) for (i = 1; i < nr_pages; i++) { struct page *p = page + i; set_page_count(p, 0); + p->mapping = TAIL_MAPPING; p->first_page = page; /* Make sure p->first_page is always valid for PageTail() */ smp_wmb(); @@ -765,6 +837,12 @@ static void free_one_page(struct zone *zone, static int free_tail_pages_check(struct page *head_page, struct page *page) { + if (page->mapping != TAIL_MAPPING) { + bad_page(page, "corrupted mapping in tail page", 0); + page->mapping = NULL; + return 1; + } + page->mapping = NULL; if (!IS_ENABLED(CONFIG_DEBUG_VM)) return 0; if (unlikely(!PageTail(page))) { @@ -778,6 +856,75 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) return 0; } +static void __meminit __init_single_page(struct page *page, unsigned long pfn, + unsigned long zone, int nid) +{ + set_page_links(page, zone, nid, pfn); + 
init_page_count(page); + page_mapcount_reset(page); + page_cpupid_reset_last(page); + + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. */ + if (!is_highmem_idx(zone)) + set_page_address(page, __va(pfn << PAGE_SHIFT)); +#endif +} + +static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, + int nid) +{ + return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void init_reserved_page(unsigned long pfn) +{ + pg_data_t *pgdat; + int nid, zid; + + if (!early_page_uninitialised(pfn)) + return; + + nid = early_pfn_to_nid(pfn); + pgdat = NODE_DATA(nid); + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) + break; + } + __init_single_pfn(pfn, zid, nid); +} +#else +static inline void init_reserved_page(unsigned long pfn) +{ +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + +/* + * Initialised pages do not have PageReserved set. This function is + * called for each range allocated by the bootmem allocator and + * marks the pages PageReserved. The remaining valid pages are later + * sent to the buddy page allocator. 
+ */ +void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); + + for (; start_pfn < end_pfn; start_pfn++) { + if (pfn_valid(start_pfn)) { + struct page *page = pfn_to_page(start_pfn); + + init_reserved_page(start_pfn); + SetPageReserved(page); + } + } +} + static bool free_pages_prepare(struct page *page, unsigned int order) { bool compound = PageCompound(page); @@ -832,7 +979,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -void __init __free_pages_bootmem(struct page *page, unsigned int order) +static void __defer_init __free_pages_boot_core(struct page *page, + unsigned long pfn, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -852,6 +1000,197 @@ void __init __free_pages_bootmem(struct page *page, unsigned int order) __free_pages(page, order); } +#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ + defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +/* Only safe to use early in boot when initialisation is single-threaded */ +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; + +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + int nid; + + /* The system will behave unpredictably otherwise */ + BUG_ON(system_state != SYSTEM_BOOTING); + + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); + if (nid >= 0) + return nid; + /* just returns 0 */ + return 0; +} +#endif + +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + int nid; + + nid = __early_pfn_to_nid(pfn, state); + if (nid >= 0 && nid != node) + return false; + return true; +} + +/* Only safe to use early in boot when initialisation is single-threaded */ +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); +} + 
+#else + +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return true; +} +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + return true; +} +#endif + + +void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order) +{ + if (early_page_uninitialised(pfn)) + return; + return __free_pages_boot_core(page, pfn, order); +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void __defermem_init deferred_free_range(struct page *page, + unsigned long pfn, int nr_pages) +{ + int i; + + if (!page) + return; + + /* Free a large naturally-aligned chunk if possible */ + if (nr_pages == MAX_ORDER_NR_PAGES && + (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + __free_pages_boot_core(page, pfn, MAX_ORDER-1); + return; + } + + for (i = 0; i < nr_pages; i++, page++, pfn++) + __free_pages_boot_core(page, pfn, 0); +} + +/* Initialise remaining memory on a node */ +void __defermem_init deferred_init_memmap(int nid) +{ + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long start = jiffies; + unsigned long nr_pages = 0; + unsigned long walk_start, walk_end; + int i, zid; + struct zone *zone; + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long first_init_pfn = pgdat->first_deferred_pfn; + + if (first_init_pfn == ULONG_MAX) + return; + + /* Sanity check boundaries */ + BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); + BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); + pgdat->first_deferred_pfn = ULONG_MAX; + + /* Only the highest zone is deferred so find it */ + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + zone = pgdat->node_zones + zid; + if (first_init_pfn < zone_end_pfn(zone)) + break; + } + + for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { + unsigned long pfn, end_pfn; + struct page *page = NULL; + struct page *free_base_page = NULL; + unsigned long 
free_base_pfn = 0; + int nr_to_free = 0; + + end_pfn = min(walk_end, zone_end_pfn(zone)); + pfn = first_init_pfn; + if (pfn < walk_start) + pfn = walk_start; + if (pfn < zone->zone_start_pfn) + pfn = zone->zone_start_pfn; + + for (; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + goto free_range; + + /* + * Ensure pfn_valid is checked every + * MAX_ORDER_NR_PAGES for memory holes + */ + if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if (!pfn_valid(pfn)) { + page = NULL; + goto free_range; + } + } + + if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + page = NULL; + goto free_range; + } + + /* Minimise pfn page lookups and scheduler checks */ + if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + page++; + } else { + nr_pages += nr_to_free; + deferred_free_range(free_base_page, + free_base_pfn, nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; + + page = pfn_to_page(pfn); + cond_resched(); + } + + if (page->flags) { + VM_BUG_ON(page_zone(page) != zone); + goto free_range; + } + + __init_single_page(page, pfn, zid, nid); + if (!free_base_page) { + free_base_page = page; + free_base_pfn = pfn; + nr_to_free = 0; + } + nr_to_free++; + + /* Where possible, batch up pages for a single free */ + continue; +free_range: + /* Free the current block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, + nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; + } + + first_init_pfn = max(end_pfn, first_init_pfn); + } + + /* Sanity check that the next zone really is unpopulated */ + WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); + + pr_info("kswapd %d initialised %lu pages in %ums\n", nid, nr_pages, + jiffies_to_msecs(jiffies - start)); +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. 
*/ void __init init_cma_reserved_pageblock(struct page *page) @@ -4105,6 +4444,9 @@ static void setup_zone_migrate_reserve(struct zone *zone) zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone))) + return; + if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); @@ -4167,15 +4509,16 @@ static void setup_zone_migrate_reserve(struct zone *zone) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { - struct page *page; + pg_data_t *pgdat = NODE_DATA(nid); unsigned long end_pfn = start_pfn + size; unsigned long pfn; struct zone *z; + unsigned long nr_initialised = 0; if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; - z = &NODE_DATA(nid)->node_zones[zone]; + z = &pgdat->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s @@ -4187,14 +4530,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; if (!early_pfn_in_nid(pfn, nid)) continue; + if (!update_defer_init(pgdat, pfn, end_pfn, + &nr_initialised)) + break; } - page = pfn_to_page(pfn); - set_page_links(page, zone, nid, pfn); - mminit_verify_page_links(page, zone, nid, pfn); - init_page_count(page); - page_mapcount_reset(page); - page_cpupid_reset_last(page); - SetPageReserved(page); + /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations @@ -4209,17 +4549,14 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * check here not to call set_pageblock_migratetype() against * pfn out of zone. 
*/ - if ((z->zone_start_pfn <= pfn) - && (pfn < zone_end_pfn(z)) - && !(pfn & (pageblock_nr_pages - 1))) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + if (!(pfn & (pageblock_nr_pages - 1))) { + struct page *page = pfn_to_page(pfn); - INIT_LIST_HEAD(&page->lru); -#ifdef WANT_PAGE_VIRTUAL - /* The shift won't overflow because ZONE_NORMAL is below 4G. */ - if (!is_highmem_idx(zone)) - set_page_address(page, __va(pfn << PAGE_SHIFT)); -#endif + __init_single_page(page, pfn, zone, nid); + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } else { + __init_single_pfn(pfn, zone, nid); + } } } @@ -4477,57 +4814,30 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. */ -int __meminit __early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { unsigned long start_pfn, end_pfn; int nid; - /* - * NOTE: The following SMP-unsafe globals are only used early in boot - * when the kernel is running single-threaded. 
- */ - static unsigned long __meminitdata last_start_pfn, last_end_pfn; - static int __meminitdata last_nid; - if (last_start_pfn <= pfn && pfn < last_end_pfn) - return last_nid; + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); if (nid != -1) { - last_start_pfn = start_pfn; - last_end_pfn = end_pfn; - last_nid = nid; + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; } return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ -int __meminit early_pfn_to_nid(unsigned long pfn) -{ - int nid; - - nid = __early_pfn_to_nid(pfn); - if (nid >= 0) - return nid; - /* just returns 0 */ - return 0; -} - -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - int nid; - - nid = __early_pfn_to_nid(pfn); - if (nid >= 0 && nid != node) - return false; - return true; -} -#endif - /** * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 
@@ -5046,6 +5356,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6013,9 +6324,9 @@ out: return ret; } +#ifdef CONFIG_NUMA int hashdist = HASHDIST_DEFAULT; -#ifdef CONFIG_NUMA static int __init set_hashdist(char *str) { if (!str) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 755a42c76eb4..563fd8c43731 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -101,7 +101,8 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); - if (!is_migrate_isolate_page(buddy)) { + if (!pfn_valid_within(page_to_pfn(buddy)) + || !is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); kernel_map_pages(page, (1 << order), 1); set_page_refcounted(page); @@ -177,8 +178,11 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, undo: for (pfn = start_pfn; pfn < undo_pfn; - pfn += pageblock_nr_pages) - unset_migratetype_isolate(pfn_to_page(pfn), migratetype); + pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (page) + unset_migratetype_isolate(page, migratetype); + } return -EBUSY; } diff --git a/mm/rmap.c b/mm/rmap.c index 24dd3f9fee27..dad23a43e42c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -712,6 +712,7 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) } struct page_referenced_arg { + int dirtied; int mapcount; int referenced; unsigned long vm_flags; @@ -726,6 +727,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; int referenced = 0; + int dirty = 0; struct page_referenced_arg *pra = arg; if 
(unlikely(PageTransHuge(page))) { @@ -749,6 +751,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, /* go ahead even if the pmd is pmd_trans_splitting() */ if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; + + /* + * Use pmd_freeable instead of raw pmd_dirty because on some + * architectures pmd_dirty is not defined unless + * CONFIG_TRANSPARENT_HUGEPAGE is enabled + */ + if (!pmd_freeable(*pmd)) + dirty++; + spin_unlock(ptl); } else { pte_t *pte; @@ -778,6 +789,10 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; } + + if (pte_dirty(*pte)) + dirty++; + pte_unmap_unlock(pte, ptl); } @@ -786,6 +801,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, pra->vm_flags |= vma->vm_flags; } + if (dirty) + pra->dirtied++; + pra->mapcount--; if (!pra->mapcount) return SWAP_SUCCESS; /* To break the loop */ @@ -810,6 +828,7 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) * @is_locked: caller holds lock on the page * @memcg: target memory cgroup * @vm_flags: collect encountered vma->vm_flags who actually referenced the page + * @is_pte_dirty: if non-NULL, set to the number of mappings found dirty - used for lazyfree pages * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. 
@@ -817,7 +836,8 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) int page_referenced(struct page *page, int is_locked, struct mem_cgroup *memcg, - unsigned long *vm_flags) + unsigned long *vm_flags, + int *is_pte_dirty) { int ret; int we_locked = 0; @@ -832,6 +852,9 @@ int page_referenced(struct page *page, }; *vm_flags = 0; + if (is_pte_dirty) + *is_pte_dirty = 0; + if (!page_mapped(page)) return 0; @@ -859,6 +882,9 @@ int page_referenced(struct page *page, if (we_locked) unlock_page(page); + if (is_pte_dirty) + *is_pte_dirty = pra.dirtied; + return pra.referenced; } @@ -1187,6 +1213,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spinlock_t *ptl; int ret = SWAP_AGAIN; enum ttu_flags flags = (enum ttu_flags)arg; + int dirty = 0; pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) @@ -1216,7 +1243,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pteval = ptep_clear_flush(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pteval)) + dirty = pte_dirty(pteval); + if (dirty) set_page_dirty(page); /* Update high watermark before we lower rss */ @@ -1245,6 +1273,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_entry_t entry = { .val = page_private(page) }; pte_t swp_pte; + if (flags & TTU_FREE) { + VM_BUG_ON_PAGE(PageSwapCache(page), page); + if (!dirty && !PageDirty(page)) { + /* It's a freeable page by MADV_FREE */ + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; + } else { + set_pte_at(mm, address, pte, pteval); + ret = SWAP_FAIL; + goto out_unmap; + } + } + if (PageSwapCache(page)) { /* * Store the swap location in the pte. 
@@ -1285,6 +1326,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } else dec_mm_counter(mm, MM_FILEPAGES); +discard: page_remove_rmap(page); page_cache_release(page); diff --git a/mm/shmem.c b/mm/shmem.c index e02682267046..3f974a1a4cda 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -981,7 +981,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); - __set_page_locked(newpage); + __SetPageLocked(newpage); SetPageUptodate(newpage); SetPageSwapBacked(newpage); set_page_private(newpage, swap_index); @@ -1173,7 +1173,7 @@ repeat: } __SetPageSwapBacked(page); - __set_page_locked(page); + __SetPageLocked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); diff --git a/mm/slab.c b/mm/slab.c index 7eb38dd1cefa..504adb18522d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1454,6 +1454,7 @@ void __init kmem_cache_init(void) kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); slab_state = PARTIAL_NODE; + setup_kmalloc_cache_index_table(); slab_early_init = 0; @@ -3415,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); /* generic fallback */ +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + #ifdef CONFIG_TRACING void * kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) diff --git a/mm/slab.h b/mm/slab.h index 4c3ac12dd644..88b55497738c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -71,6 +71,7 @@ unsigned long calculate_alignment(unsigned long flags, #ifndef CONFIG_SLOB /* Kmalloc array related functions */ +void setup_kmalloc_cache_index_table(void); void 
create_kmalloc_caches(unsigned long); /* Find the kmalloc slab corresponding for a certain size */ @@ -162,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); +/* + * Generic implementation of bulk operations + * These are useful for situations in which the allocator cannot + * perform optimizations. In that case segments of the objects listed + * may be allocated or freed using these operations. + */ +void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + #ifdef CONFIG_MEMCG_KMEM /* * Iterate over all memcg caches of the given root cache. The caller must hold diff --git a/mm/slab_common.c b/mm/slab_common.c index 999bb3424d44..8873985f905e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -105,6 +105,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) } #endif +void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) + kmem_cache_free(s, p[i]); +} + +bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, + void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) { + void *x = p[i] = kmem_cache_alloc(s, flags); + if (!x) { + __kmem_cache_free_bulk(s, i, p); + return false; + } + } + return true; +} + #ifdef CONFIG_MEMCG_KMEM void slab_init_memcg_params(struct kmem_cache *s) { @@ -784,25 +807,45 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) } /* - * Create the kmalloc array. Some of the regular kmalloc arrays - * may already have been created because they were needed to - * enable allocations for slab creation. + * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. + * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is + * kmalloc-67108864. 
*/ -void __init create_kmalloc_caches(unsigned long flags) +static struct { + const char *name; + unsigned long size; +} const kmalloc_info[] __initconst = { + {NULL, 0}, {"kmalloc-96", 96}, + {"kmalloc-192", 192}, {"kmalloc-8", 8}, + {"kmalloc-16", 16}, {"kmalloc-32", 32}, + {"kmalloc-64", 64}, {"kmalloc-128", 128}, + {"kmalloc-256", 256}, {"kmalloc-512", 512}, + {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, + {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, + {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, + {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, + {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, + {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, + {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, + {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, + {"kmalloc-67108864", 67108864} +}; + +/* + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * MIPS it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. + * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN + */ +void __init setup_kmalloc_cache_index_table(void) { int i; - /* - * Patch up the size_index table if we have strange large alignment - * requirements for the kmalloc array. This is only the case for - * MIPS it seems. The standard arches will not generate any code here. - * - * Largest permitted alignment is 256 bytes due to the way we - * handle the index determination for the smaller caches. 
- * - * Make sure that nothing crazy happens if someone starts tinkering - * around with ARCH_KMALLOC_MINALIGN - */ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); @@ -833,39 +876,41 @@ void __init create_kmalloc_caches(unsigned long flags) for (i = 128 + 8; i <= 192; i += 8) size_index[size_index_elem(i)] = 8; } - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { +} + +/* + * Create the kmalloc array. Some of the regular kmalloc arrays + * may already have been created because they were needed to + * enable allocations for slab creation. + */ +void __init create_kmalloc_caches(unsigned long flags) +{ + int i; + + for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { if (!kmalloc_caches[i]) { - kmalloc_caches[i] = create_kmalloc_cache(NULL, - 1 << i, flags); + kmalloc_caches[i] = create_kmalloc_cache( + kmalloc_info[i].name, + kmalloc_info[i].size, + flags); } /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches + * "i == 2" is the "kmalloc-192" case which is the last special + * case for initialization and it's the point to jump to + * allocate the minimize size of the object. In slab allocator, + * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4 + * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is + * defined, it may be larger than 2^5 and here is also the + * trick to skip the empty gap. 
*/ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) - kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); - - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) - kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); + if (i == 2) + i = (KMALLOC_SHIFT_LOW - 1); } /* Kmalloc array is now usable */ slab_state = UP; - for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[i]; - char *n; - - if (s) { - n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i)); - - BUG_ON(!n); - s->name = n; - } - } - #ifdef CONFIG_ZONE_DMA for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { struct kmem_cache *s = kmalloc_caches[i]; diff --git a/mm/slob.c b/mm/slob.c index 4765f65019c7..495df8e006ec 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + int __kmem_cache_shutdown(struct kmem_cache *c) { /* No way to check for remaining objects */ diff --git a/mm/slub.c b/mm/slub.c index 54c0876b43d5..f920dc583f5d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x) */ static __always_inline void slab_lock(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); bit_spin_lock(PG_locked, &page->flags); } static __always_inline void slab_unlock(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); __bit_spin_unlock(PG_locked, &page->flags); } @@ -2750,6 +2752,72 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + 
__kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + if (!kmem_cache_debug(s)) { + struct kmem_cache_cpu *c; + + /* Drain objects in the per cpu slab */ + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + while (size) { + void *object = c->freelist; + + if (unlikely(!object)) { + /* + * Check if there are remotely freed objects + * available in the page. + */ + object = get_freelist(s, c->page); + + if (!object) { + /* + * All objects in use. Let's check if + * we have other per cpu partial pages + * that have available objects. + */ + c->page = c->partial; + if (!c->page) { + /* No per cpu objects left */ + c->freelist = NULL; + break; + } + + /* Next per cpu partial page */ + c->partial = c->page->next; + c->freelist = get_freelist(s, c->page); + continue; + } + + } + + *p++ = object; + size--; + + if (unlikely(flags & __GFP_ZERO)) + memset(object, 0, s->object_size); + + c->freelist = get_freepointer(s, object); + + } + c->tid = next_tid(c->tid); + + local_irq_enable(); + } + + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + + /* * Object placement in a slab is made very easy because we always start at * offset 0. 
If we tune the size of the object to the alignment then we can @@ -3700,6 +3768,7 @@ void __init kmem_cache_init(void) kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ + setup_kmalloc_cache_index_table(); create_kmalloc_caches(0); #ifdef CONFIG_SMP diff --git a/mm/swap.c b/mm/swap.c index a7251a8ed532..8773de093171 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -44,6 +44,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -797,6 +798,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, update_page_reclaim_stat(lruvec, file, 0); } + +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + + del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(page, lruvec, lru); + + __count_vm_event(PGDEACTIVATE); + update_page_reclaim_stat(lruvec, file, 0); + } +} + /* * Drain pages out of the cpu's pagevecs. 
* Either "cpu" is the current CPU, and preemption has already been @@ -823,6 +842,10 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + activate_page_drain(cpu); } @@ -852,6 +875,26 @@ void deactivate_file_page(struct page *page) } } +/** + * deactivate_page - deactivate a page + * @page: page to deactivate + * + * deactivate_page() moves @page to the inactive list if @page was on the active + * list and was not an unevictable page. This is done to accelerate the reclaim + * of @page. + */ +void deactivate_page(struct page *page) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } +} + void lru_add_drain(void) { lru_add_drain_cpu(get_cpu()); @@ -881,6 +924,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); diff --git a/mm/swap_state.c b/mm/swap_state.c index 8bc8e66138da..a2611ce55413 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -357,7 +357,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* May fail (-ENOMEM) if radix-tree node allocation failed. 
*/ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { @@ -371,7 +371,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } radix_tree_preload_end(); ClearPageSwapBacked(new_page); - __clear_page_locked(new_page); + __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. diff --git a/mm/util.c b/mm/util.c index 68ff8a5361e7..c7434060039b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -3,6 +3,7 @@ #include <linux/string.h> #include <linux/compiler.h> #include <linux/export.h> +#include <linux/ctype.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/security.h> @@ -100,6 +101,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp) EXPORT_SYMBOL(kstrndup); /** + * kstrimdup - Trim and copy a %NUL terminated string. + * @s: the string to trim and duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Returns an address, which the caller must kfree, containing + * a duplicate of the passed string with leading and/or trailing + * whitespace (as defined by isspace) removed. 
+ */ +char *kstrimdup(const char *s, gfp_t gfp) +{ + char *buf; + char *begin = skip_spaces(s); + size_t len = strlen(begin); + + while (len && isspace(begin[len - 1])) + len--; + + buf = kmalloc_track_caller(len + 1, gfp); + if (!buf) + return NULL; + + memcpy(buf, begin, len); + buf[len] = '\0'; + + return buf; +} +EXPORT_SYMBOL(kstrimdup); + +/** * kmemdup - duplicate region of memory * * @src: memory region to duplicate @@ -355,7 +385,9 @@ struct anon_vma *page_anon_vma(struct page *page) struct address_space *page_mapping(struct page *page) { - unsigned long mapping; + struct address_space *mapping; + + page = compound_head(page); /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(PageSlab(page))) @@ -368,10 +400,10 @@ struct address_space *page_mapping(struct page *page) return swap_address_space(entry); } - mapping = (unsigned long)page->mapping; - if (mapping & PAGE_MAPPING_FLAGS) + mapping = page->mapping; + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; - return page->mapping; + return mapping; } int overcommit_ratio_handler(struct ctl_table *table, int write, diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e8eadd71bac..7d20d3656e21 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -754,13 +754,17 @@ enum page_references { }; static enum page_references page_check_references(struct page *page, - struct scan_control *sc) + struct scan_control *sc, + bool *freeable) { int referenced_ptes, referenced_page; unsigned long vm_flags; + int pte_dirty; + + VM_BUG_ON_PAGE(!PageLocked(page), page); referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, - &vm_flags); + &vm_flags, &pte_dirty); referenced_page = TestClearPageReferenced(page); /* @@ -801,6 +805,10 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_KEEP; } + if (PageAnon(page) && !pte_dirty && !PageSwapCache(page) && + !PageDirty(page)) + *freeable = true; + /* Reclaim if clean, defer dirty pages to writeback */ if 
(referenced_page && !PageSwapBacked(page)) return PAGEREF_RECLAIM_CLEAN; @@ -869,6 +877,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; + bool freeable = false; cond_resched(); @@ -992,7 +1001,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (!force_reclaim) - references = page_check_references(page, sc); + references = page_check_references(page, sc, + &freeable); switch (references) { case PAGEREF_ACTIVATE: @@ -1009,22 +1019,31 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page)) { - if (!(sc->gfp_mask & __GFP_IO)) - goto keep_locked; - if (!add_to_swap(page, page_list)) - goto activate_locked; - may_enter_fs = 1; - - /* Adding to swap updated mapping */ - mapping = page_mapping(page); + if (!freeable) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (!add_to_swap(page, page_list)) + goto activate_locked; + may_enter_fs = 1; + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } else { + if (likely(!PageTransHuge(page))) + goto unmap; + /* try_to_unmap isn't aware of THP page */ + if (unlikely(split_huge_page_to_list(page, + page_list))) + goto keep_locked; + } } - +unmap: /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ - if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { + if (page_mapped(page) && (mapping || freeable)) { + switch (try_to_unmap(page, + freeable ? 
TTU_FREE : ttu_flags)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1032,7 +1051,20 @@ static unsigned long shrink_page_list(struct list_head *page_list, case SWAP_MLOCK: goto cull_mlocked; case SWAP_SUCCESS: - ; /* try to free the page below */ + /* try to free the page below */ + if (!freeable) + break; + /* + * Freeable anon page doesn't have mapping + * due to skipping of swapcache so we free + * page in here rather than __remove_mapping. + */ + VM_BUG_ON_PAGE(PageSwapCache(page), page); + if (!page_freeze_refs(page, 1)) + goto keep_locked; + __ClearPageLocked(page); + count_vm_event(PGLAZYFREED); + goto free_it; } } @@ -1142,7 +1174,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * we obviously don't have to worry about waking up a process * waiting on the page lock, because there are no references. */ - __clear_page_locked(page); + __ClearPageLocked(page); free_it: nr_reclaimed++; @@ -1401,6 +1433,32 @@ int isolate_lru_page(struct page *page) return ret; } +static int __too_many_isolated(struct zone *zone, int file, + struct scan_control *sc, int safe) +{ + unsigned long inactive, isolated; + + if (safe) { + inactive = zone_page_state_snapshot(zone, + NR_INACTIVE_ANON + 2 * file); + isolated = zone_page_state_snapshot(zone, + NR_ISOLATED_ANON + file); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_ANON + 2 * file); + isolated = zone_page_state(zone, NR_ISOLATED_ANON + file); + } + + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; + + return isolated > inactive; +} + /* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and * then get resheduled. When there are massive number of tasks doing page @@ -1409,33 +1467,24 @@ int isolate_lru_page(struct page *page) * unnecessary swapping, thrashing and OOM. 
*/ static int too_many_isolated(struct zone *zone, int file, - struct scan_control *sc) + struct scan_control *sc) { - unsigned long inactive, isolated; - if (current_is_kswapd()) return 0; if (!global_reclaim(sc)) return 0; - if (file) { - inactive = zone_page_state(zone, NR_INACTIVE_FILE); - isolated = zone_page_state(zone, NR_ISOLATED_FILE); - } else { - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_ANON); - } - /* - * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they - * won't get blocked by normal direct-reclaimers, forming a circular - * deadlock. + * __too_many_isolated(safe=0) is fast but inaccurate, because it + * doesn't account for the vm_stat_diff[] counters. So if it looks + * like too_many_isolated() is about to return true, fall back to the + * slower, more accurate zone_page_state_snapshot(). */ - if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) - inactive >>= 3; + if (unlikely(__too_many_isolated(zone, file, sc, 0))) + return __too_many_isolated(zone, file, sc, 1); - return isolated > inactive; + return 0; } static noinline_for_stack void @@ -1772,7 +1821,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } if (page_referenced(page, 0, sc->target_mem_cgroup, - &vm_flags)) { + &vm_flags, NULL)) { nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and @@ -2646,7 +2695,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!populated_zone(zone)) + if (!populated_zone(zone) || + zone_reclaimable_pages(zone) == 0) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -3348,7 +3398,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. 
*/ -static int kswapd(void *p) +static int __defermem_init kswapd(void *p) { unsigned long order, new_order; unsigned balanced_order; @@ -3383,6 +3433,8 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); + deferred_init_memmap(pgdat->node_id); + order = new_order = 0; balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; @@ -3538,7 +3590,7 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action, * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ -int kswapd_run(int nid) +int __defermem_init kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); int ret = 0; diff --git a/mm/vmstat.c b/mm/vmstat.c index 4f5cd974e11a..1fd0886a389f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -759,6 +759,7 @@ const char * const vmstat_text[] = { "pgfault", "pgmajfault", + "pglazyfreed", TEXTS_FOR_ZONES("pgrefill") TEXTS_FOR_ZONES("pgsteal_kswapd") diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 08bd7a3d464a..33d512646379 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -45,10 +45,6 @@ * */ -#ifdef CONFIG_ZSMALLOC_DEBUG -#define DEBUG -#endif - #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> diff --git a/mm/zswap.c b/mm/zswap.c index 4249e82ff934..f8583f1fc938 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -490,7 +490,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, } /* May fail (-ENOMEM) if radix-tree node allocation failed. 
*/ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { @@ -501,7 +501,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, } radix_tree_preload_end(); ClearPageSwapBacked(new_page); - __clear_page_locked(new_page); + __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index d7016279ec2b..fc169fd2a3cc 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -186,6 +186,27 @@ if (-f $conf) { unshift(@ARGV, @conf_args) if @conf_args; } +my @ignore_emails = (); +my $ignore_file = which_conf(".get_maintainer.ignore"); +if (-f $ignore_file) { + open(my $ignore, '<', "$ignore_file") + or warn "$P: Can't find a readable .get_maintainer.ignore file $!\n"; + while (<$ignore>) { + my $line = $_; + + $line =~ s/\s*\n?$//; + $line =~ s/^\s*//; + $line =~ s/\s+$//; + $line =~ s/#.*$//; + + next if ($line =~ m/^\s*$/); + if (rfc822_valid($line)) { + push(@ignore_emails, $line); + } + } + close($ignore); +} + if (!GetOptions( 'email!' => \$email, 'git!' => \$email_git, @@ -513,6 +534,16 @@ if ($web) { exit($exit); +sub ignore_email_address { + my ($address) = @_; + + foreach my $ignore (@ignore_emails) { + return 1 if ($ignore eq $address); + } + + return 0; +} + sub range_is_maintained { my ($start, $end) = @_; @@ -1868,6 +1899,7 @@ sub vcs_assign { my $percent = $sign_offs * 100 / $divisor; $percent = 100 if ($percent > 100); + next if (ignore_email_address($line)); $count++; last if ($sign_offs < $email_git_min_signatures || $count > $email_git_max_maintainers || |