aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2017-05-29 11:03:18 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2017-05-29 11:03:18 +1000
commit136dd39c36ee340c9bf690c20fce2c37047fae38 (patch)
tree01fb6553a60d509bbcadd40dabc6a65d9c238a7e
parent64353942d9d7c5394baeb60724e8cc30a84f70b7 (diff)
parent7b329ab23c38522cdb0669e33da70bf4d8147c5a (diff)
Merge remote-tracking branch 'tip/auto-latest'
-rw-r--r--Documentation/IRQ-domain.txt41
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt9
-rw-r--r--arch/arm/kernel/smp.c3
-rw-r--r--arch/arm64/kernel/smp.c3
-rw-r--r--arch/metag/kernel/smp.c3
-rw-r--r--arch/powerpc/kernel/smp.c2
-rw-r--r--arch/x86/boot/compressed/cmdline.c2
-rw-r--r--arch/x86/boot/compressed/kaslr.c190
-rw-r--r--arch/x86/boot/string.c8
-rw-r--r--arch/x86/events/core.c22
-rw-r--r--arch/x86/events/intel/core.c63
-rw-r--r--arch/x86/events/perf_event.h3
-rw-r--r--arch/x86/include/asm/amd_nb.h3
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/timer.h8
-rw-r--r--arch/x86/include/asm/tlbbatch.h16
-rw-r--r--arch/x86/include/asm/tlbflush.h14
-rw-r--r--arch/x86/kernel/apic/io_apic.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c229
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/kernel/tsc.c206
-rw-r--r--arch/x86/mm/tlb.c78
-rw-r--r--arch/x86/platform/uv/tlb_uv.c14
-rw-r--r--drivers/acpi/apei/ghes.c39
-rw-r--r--drivers/acpi/pci_root.c2
-rw-r--r--drivers/base/node.c2
-rw-r--r--drivers/cpufreq/pasemi-cpufreq.c2
-rw-r--r--drivers/cpuidle/cpuidle.c1
-rw-r--r--drivers/iommu/intel-iommu.c4
-rw-r--r--drivers/iommu/of_iommu.c2
-rw-r--r--drivers/ras/ras.c2
-rw-r--r--drivers/xen/manage.c1
-rw-r--r--include/linux/clocksource.h1
-rw-r--r--include/linux/cpumask.h28
-rw-r--r--include/linux/kernel.h6
-rw-r--r--include/linux/llist.h19
-rw-r--r--include/linux/mm_types_task.h15
-rw-r--r--include/linux/sched/clock.h11
-rw-r--r--init/main.c27
-rw-r--r--kernel/async.c8
-rw-r--r--kernel/events/core.c10
-rw-r--r--kernel/extable.c2
-rw-r--r--kernel/irq/irqdomain.c81
-rw-r--r--kernel/irq/msi.c10
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/sched/clock.c128
-rw-r--r--kernel/sched/core.c29
-rw-r--r--kernel/sched/deadline.c2
-rw-r--r--kernel/sched/fair.c153
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/sched/rt.c13
-rw-r--r--kernel/sched/sched.h19
-rw-r--r--kernel/sched/topology.c430
-rw-r--r--kernel/smp.c16
-rw-r--r--kernel/time/clocksource.c3
-rw-r--r--kernel/time/tick-sched.c40
-rw-r--r--kernel/time/tick-sched.h2
-rw-r--r--lib/cpumask.c32
-rw-r--r--lib/smp_processor_id.c2
-rw-r--r--mm/rmap.c16
-rw-r--r--mm/vmscan.c2
61 files changed, 1355 insertions, 751 deletions
diff --git a/Documentation/IRQ-domain.txt b/Documentation/IRQ-domain.txt
index 82001a25a14b..1f246eb25ca5 100644
--- a/Documentation/IRQ-domain.txt
+++ b/Documentation/IRQ-domain.txt
@@ -231,5 +231,42 @@ needs to:
4) No need to implement irq_domain_ops.map and irq_domain_ops.unmap,
they are unused with hierarchy irq_domain.
-Hierarchy irq_domain may also be used to support other architectures,
-such as ARM, ARM64 etc.
+Hierarchy irq_domain is in no way x86 specific, and is heavily used to
+support other architectures, such as ARM, ARM64 etc.
+
+=== Debugging ===
+
+If you switch on CONFIG_IRQ_DOMAIN_DEBUG (which depends on
+CONFIG_IRQ_DOMAIN and CONFIG_DEBUG_FS), you will find a new file in
+your debugfs mount point, called irq_domain_mapping. This file
+contains a live snapshot of all the IRQ domains in the system:
+
+ name mapped linear-max direct-max devtree-node
+ pl061 8 8 0 /smb/gpio@e0080000
+ pl061 8 8 0 /smb/gpio@e1050000
+ pMSI 0 0 0 /interrupt-controller@e1101000/v2m@e0080000
+ MSI 37 0 0 /interrupt-controller@e1101000/v2m@e0080000
+ GICv2m 37 0 0 /interrupt-controller@e1101000/v2m@e0080000
+ GICv2 448 448 0 /interrupt-controller@e1101000
+
+it also iterates over the interrupts to display their mapping in the
+domains, and makes the domain stacking visible:
+
+
+irq hwirq chip name chip data active type domain
+ 1 0x00019 GICv2 0xffff00000916bfd8 * LINEAR GICv2
+ 2 0x0001d GICv2 0xffff00000916bfd8 LINEAR GICv2
+ 3 0x0001e GICv2 0xffff00000916bfd8 * LINEAR GICv2
+ 4 0x0001b GICv2 0xffff00000916bfd8 * LINEAR GICv2
+ 5 0x0001a GICv2 0xffff00000916bfd8 LINEAR GICv2
+[...]
+ 96 0x81808 MSI 0x (null) RADIX MSI
+ 96+ 0x00063 GICv2m 0xffff8003ee116980 RADIX GICv2m
+ 96+ 0x00063 GICv2 0xffff00000916bfd8 LINEAR GICv2
+ 97 0x08800 MSI 0x (null) * RADIX MSI
+ 97+ 0x00064 GICv2m 0xffff8003ee116980 * RADIX GICv2m
+ 97+ 0x00064 GICv2 0xffff00000916bfd8 * LINEAR GICv2
+
+Here, interrupts 1-5 are only using a single domain, while 96 and 97
+are build out of a stack of three domain, each level performing a
+particular function.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 15f79c27748d..4e4c3402412e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2127,6 +2127,12 @@
memmap=nn[KMG]@ss[KMG]
[KNL] Force usage of a specific region of memory.
Region of memory to be used is from ss to ss+nn.
+ If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG],
+ which limits max address to nn[KMG].
+ Multiple different regions can be specified,
+ comma delimited.
+ Example:
+ memmap=100M@2G,100M#3G,1G!1024G
memmap=nn[KMG]#ss[KMG]
[KNL,ACPI] Mark specific memory as ACPI data.
@@ -2139,6 +2145,9 @@
memmap=64K$0x18690000
or
memmap=0x10000$0x18690000
+ Some bootloaders may need an escape character before '$',
+ like Grub2, otherwise '$' and the following number
+ will be eaten.
memmap=nn[KMG]!ss[KMG]
[KNL,X86] Mark specific memory as protected.
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 572a8df1b766..c9a0a5299827 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -555,8 +555,7 @@ static DEFINE_RAW_SPINLOCK(stop_lock);
*/
static void ipi_cpu_stop(unsigned int cpu)
{
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING) {
+ if (system_state <= SYSTEM_RUNNING) {
raw_spin_lock(&stop_lock);
pr_crit("CPU%u: stopping\n", cpu);
dump_stack();
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 6e0e16a3a7d4..321119881abf 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -961,8 +961,7 @@ void smp_send_stop(void)
cpumask_copy(&mask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &mask);
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING)
+ if (system_state <= SYSTEM_RUNNING)
pr_crit("SMP: stopping secondary CPUs\n");
smp_cross_call(&mask, IPI_CPU_STOP);
}
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index 232a12bf3f99..2dbbb7c66043 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -567,8 +567,7 @@ static void stop_this_cpu(void *data)
{
unsigned int cpu = smp_processor_id();
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING) {
+ if (system_state <= SYSTEM_RUNNING) {
spin_lock(&stop_lock);
pr_crit("CPU%u: stopping\n", cpu);
dump_stack();
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index df2a41647d8e..1069f74fca47 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -97,7 +97,7 @@ int smp_generic_cpu_bootable(unsigned int nr)
/* Special case - we inhibit secondary thread startup
* during boot if the user requests it.
*/
- if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
+ if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
return 0;
if (smt_enabled_at_boot
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 73ccf63b0f48..9dc1ce6ba3c0 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -13,7 +13,7 @@ static inline char rdfs8(addr_t addr)
return *((char *)(fs + addr));
}
#include "../cmdline.c"
-static unsigned long get_cmd_line_ptr(void)
+unsigned long get_cmd_line_ptr(void)
{
unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr;
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 54c24f0a43d3..e0eba12bffe7 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -9,16 +9,41 @@
* contain the entire properly aligned running kernel image.
*
*/
+
+/*
+ * isspace() in linux/ctype.h is expected by next_args() to filter
+ * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h,
+ * since isdigit() is implemented in both of them. Hence disable it
+ * here.
+ */
+#define BOOT_CTYPE_H
+
+/*
+ * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h.
+ * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL
+ * which is meaningless and will cause compiling error in some cases.
+ * So do not include linux/export.h and define EXPORT_SYMBOL(sym)
+ * as empty.
+ */
+#define _LINUX_EXPORT_H
+#define EXPORT_SYMBOL(sym)
+
#include "misc.h"
#include "error.h"
-#include "../boot.h"
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
+#include <linux/ctype.h>
#include <generated/utsrelease.h>
+/* Macros used by the included decompressor code below. */
+#define STATIC
+#include <linux/decompress/mm.h>
+
+extern unsigned long get_cmd_line_ptr(void);
+
/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
@@ -62,6 +87,11 @@ struct mem_vector {
static bool memmap_too_large;
+
+/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
+unsigned long long mem_limit = ULLONG_MAX;
+
+
enum mem_avoid_index {
MEM_AVOID_ZO_RANGE = 0,
MEM_AVOID_INITRD,
@@ -85,49 +115,14 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
return true;
}
-/**
- * _memparse - Parse a string with mem suffixes into a number
- * @ptr: Where parse begins
- * @retptr: (output) Optional pointer to next char after parse completes
- *
- * Parses a string into a number. The number stored at @ptr is
- * potentially suffixed with K, M, G, T, P, E.
- */
-static unsigned long long _memparse(const char *ptr, char **retptr)
+char *skip_spaces(const char *str)
{
- char *endptr; /* Local pointer to end of parsed string */
-
- unsigned long long ret = simple_strtoull(ptr, &endptr, 0);
-
- switch (*endptr) {
- case 'E':
- case 'e':
- ret <<= 10;
- case 'P':
- case 'p':
- ret <<= 10;
- case 'T':
- case 't':
- ret <<= 10;
- case 'G':
- case 'g':
- ret <<= 10;
- case 'M':
- case 'm':
- ret <<= 10;
- case 'K':
- case 'k':
- ret <<= 10;
- endptr++;
- default:
- break;
- }
-
- if (retptr)
- *retptr = endptr;
-
- return ret;
+ while (isspace(*str))
+ ++str;
+ return (char *)str;
}
+#include "../../../../lib/ctype.c"
+#include "../../../../lib/cmdline.c"
static int
parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
@@ -142,40 +137,41 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
return -EINVAL;
oldp = p;
- *size = _memparse(p, &p);
+ *size = memparse(p, &p);
if (p == oldp)
return -EINVAL;
switch (*p) {
- case '@':
- /* Skip this region, usable */
- *start = 0;
- *size = 0;
- return 0;
case '#':
case '$':
case '!':
- *start = _memparse(p + 1, &p);
+ *start = memparse(p + 1, &p);
+ return 0;
+ case '@':
+ /* memmap=nn@ss specifies usable region, should be skipped */
+ *size = 0;
+ /* Fall through */
+ default:
+ /*
+ * If w/o offset, only size specified, memmap=nn[KMG] has the
+ * same behaviour as mem=nn[KMG]. It limits the max address
+ * system can use. Region above the limit should be avoided.
+ */
+ *start = 0;
return 0;
}
return -EINVAL;
}
-static void mem_avoid_memmap(void)
+static void mem_avoid_memmap(char *str)
{
- char arg[128];
+ static int i;
int rc;
- int i;
- char *str;
- /* See if we have any memmap areas */
- rc = cmdline_find_option("memmap", arg, sizeof(arg));
- if (rc <= 0)
+ if (i >= MAX_MEMMAP_REGIONS)
return;
- i = 0;
- str = arg;
while (str && (i < MAX_MEMMAP_REGIONS)) {
int rc;
unsigned long long start, size;
@@ -188,9 +184,14 @@ static void mem_avoid_memmap(void)
if (rc < 0)
break;
str = k;
- /* A usable region that should not be skipped */
- if (size == 0)
+
+ if (start == 0) {
+ /* Store the specified memory limit if size > 0 */
+ if (size > 0)
+ mem_limit = size;
+
continue;
+ }
mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
@@ -202,6 +203,57 @@ static void mem_avoid_memmap(void)
memmap_too_large = true;
}
+static int handle_mem_memmap(void)
+{
+ char *args = (char *)get_cmd_line_ptr();
+ size_t len = strlen((char *)args);
+ char *tmp_cmdline;
+ char *param, *val;
+ u64 mem_size;
+
+ if (!strstr(args, "memmap=") && !strstr(args, "mem="))
+ return 0;
+
+ tmp_cmdline = malloc(len + 1);
+ if (!tmp_cmdline )
+ error("Failed to allocate space for tmp_cmdline");
+
+ memcpy(tmp_cmdline, args, len);
+ tmp_cmdline[len] = 0;
+ args = tmp_cmdline;
+
+ /* Chew leading spaces */
+ args = skip_spaces(args);
+
+ while (*args) {
+ args = next_arg(args, &param, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0) {
+ warn("Only '--' specified in cmdline");
+ free(tmp_cmdline);
+ return -1;
+ }
+
+ if (!strcmp(param, "memmap")) {
+ mem_avoid_memmap(val);
+ } else if (!strcmp(param, "mem")) {
+ char *p = val;
+
+ if (!strcmp(p, "nopentium"))
+ continue;
+ mem_size = memparse(p, &p);
+ if (mem_size == 0) {
+ free(tmp_cmdline);
+ return -EINVAL;
+ }
+ mem_limit = mem_size;
+ }
+ }
+
+ free(tmp_cmdline);
+ return 0;
+}
+
/*
* In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
* The mem_avoid array is used to store the ranges that need to be avoided
@@ -323,7 +375,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
/* We don't need to set a mapping for setup_data. */
/* Mark the memmap regions we need to avoid */
- mem_avoid_memmap();
+ handle_mem_memmap();
#ifdef CONFIG_X86_VERBOSE_BOOTUP
/* Make sure video RAM can be used. */
@@ -432,7 +484,8 @@ static void process_e820_entry(struct boot_e820_entry *entry,
{
struct mem_vector region, overlap;
struct slot_area slot_area;
- unsigned long start_orig;
+ unsigned long start_orig, end;
+ struct boot_e820_entry cur_entry;
/* Skip non-RAM entries. */
if (entry->type != E820_TYPE_RAM)
@@ -446,8 +499,15 @@ static void process_e820_entry(struct boot_e820_entry *entry,
if (entry->addr + entry->size < minimum)
return;
- region.start = entry->addr;
- region.size = entry->size;
+ /* Ignore entries above memory limit */
+ end = min(entry->size + entry->addr, mem_limit);
+ if (entry->addr >= end)
+ return;
+ cur_entry.addr = entry->addr;
+ cur_entry.size = end - entry->addr;
+
+ region.start = cur_entry.addr;
+ region.size = cur_entry.size;
/* Give up if slot area array is full. */
while (slot_area_index < MAX_SLOT_AREA) {
@@ -461,7 +521,7 @@ static void process_e820_entry(struct boot_e820_entry *entry,
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
/* Did we raise the address above this e820 region? */
- if (region.start > entry->addr + entry->size)
+ if (region.start > cur_entry.addr + cur_entry.size)
return;
/* Reduce size by any delta from the original address. */
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 5457b02fc050..630e3664906b 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -122,6 +122,14 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas
return result;
}
+long simple_strtol(const char *cp, char **endp, unsigned int base)
+{
+ if (*cp == '-')
+ return -simple_strtoull(cp + 1, endp, base);
+
+ return simple_strtoull(cp, endp, base);
+}
+
/**
* strlen - Find the length of a string
* @s: The string to be sized
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 580b60f5ac83..628b8c556aab 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1750,6 +1750,8 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
return ret;
}
+static struct attribute_group x86_pmu_attr_group;
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -1813,6 +1815,14 @@ static int __init init_hw_perf_events(void)
x86_pmu_events_group.attrs = tmp;
}
+ if (x86_pmu.attrs) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs);
+ if (!WARN_ON(!tmp))
+ x86_pmu_attr_group.attrs = tmp;
+ }
+
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
@@ -2255,7 +2265,7 @@ static struct pmu pmu = {
void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg, u64 now)
{
- struct cyc2ns_data *data;
+ struct cyc2ns_data data;
u64 offset;
userpg->cap_user_time = 0;
@@ -2267,17 +2277,17 @@ void arch_perf_update_userpage(struct perf_event *event,
if (!using_native_sched_clock() || !sched_clock_stable())
return;
- data = cyc2ns_read_begin();
+ cyc2ns_read_begin(&data);
- offset = data->cyc2ns_offset + __sched_clock_offset;
+ offset = data.cyc2ns_offset + __sched_clock_offset;
/*
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
*/
userpg->cap_user_time = 1;
- userpg->time_mult = data->cyc2ns_mul;
- userpg->time_shift = data->cyc2ns_shift;
+ userpg->time_mult = data.cyc2ns_mul;
+ userpg->time_shift = data.cyc2ns_shift;
userpg->time_offset = offset - now;
/*
@@ -2289,7 +2299,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->time_zero = offset;
}
- cyc2ns_read_end(data);
+ cyc2ns_read_end();
}
void
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a6d91d4e37a1..da9047eec7ba 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3160,6 +3160,19 @@ err:
return -ENOMEM;
}
+static void flip_smm_bit(void *data)
+{
+ unsigned long set = *(unsigned long *)data;
+
+ if (set > 0) {
+ msr_set_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ } else {
+ msr_clear_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ }
+}
+
static void intel_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -3174,6 +3187,8 @@ static void intel_pmu_cpu_starting(int cpu)
cpuc->lbr_sel = NULL;
+ flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
+
if (!cpuc->shared_regs)
return;
@@ -3595,6 +3610,52 @@ static struct attribute *hsw_events_attrs[] = {
NULL
};
+static ssize_t freeze_on_smi_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi);
+}
+
+static DEFINE_MUTEX(freeze_on_smi_mutex);
+
+static ssize_t freeze_on_smi_store(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 1)
+ return -EINVAL;
+
+ mutex_lock(&freeze_on_smi_mutex);
+
+ if (x86_pmu.attr_freeze_on_smi == val)
+ goto done;
+
+ x86_pmu.attr_freeze_on_smi = val;
+
+ get_online_cpus();
+ on_each_cpu(flip_smm_bit, &val, 1);
+ put_online_cpus();
+done:
+ mutex_unlock(&freeze_on_smi_mutex);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(freeze_on_smi);
+
+static struct attribute *intel_pmu_attrs[] = {
+ &dev_attr_freeze_on_smi.attr,
+ NULL,
+};
+
__init int intel_pmu_init(void)
{
union cpuid10_edx edx;
@@ -3641,6 +3702,8 @@ __init int intel_pmu_init(void)
x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+
+ x86_pmu.attrs = intel_pmu_attrs;
/*
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events, when not running in a hypervisor:
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index be3d36254040..53728eea1bed 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -562,6 +562,9 @@ struct x86_pmu {
ssize_t (*events_sysfs_show)(char *page, u64 config);
struct attribute **cpu_events;
+ unsigned long attr_freeze_on_smi;
+ struct attribute **attrs;
+
/*
* CPU Hotplug hooks
*/
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 00c88a01301d..da181ad1d5f8 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,6 +3,7 @@
#include <linux/ioport.h>
#include <linux/pci.h>
+#include <linux/refcount.h>
struct amd_nb_bus_dev_range {
u8 bus;
@@ -55,7 +56,7 @@ struct threshold_bank {
struct threshold_block *blocks;
/* initialized to the number of CPUs on the node sharing this bank */
- atomic_t cpus;
+ refcount_t cpus;
};
struct amd_northbridge {
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d8638e2419cc..d406894cd9a2 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -137,6 +137,8 @@
#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
#define MSR_PEBS_FRONTEND 0x000003f7
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 27e9f9d769b8..2016962103df 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -29,11 +29,9 @@ struct cyc2ns_data {
u32 cyc2ns_mul;
u32 cyc2ns_shift;
u64 cyc2ns_offset;
- u32 __count;
- /* u32 hole */
-}; /* 24 bytes -- do not grow */
+}; /* 16 bytes */
-extern struct cyc2ns_data *cyc2ns_read_begin(void);
-extern void cyc2ns_read_end(struct cyc2ns_data *);
+extern void cyc2ns_read_begin(struct cyc2ns_data *);
+extern void cyc2ns_read_end(void);
#endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..01a6de16fb96
--- /dev/null
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -0,0 +1,16 @@
+#ifndef _ARCH_X86_TLBBATCH_H
+#define _ARCH_X86_TLBBATCH_H
+
+#include <linux/cpumask.h>
+
+#ifdef CONFIG_SMP
+struct arch_tlbflush_unmap_batch {
+ /*
+ * Each bit set is a CPU that potentially has a TLB entry for one of
+ * the PFNs being flushed..
+ */
+ struct cpumask cpumask;
+};
+#endif
+
+#endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6ed9ea469b48..8f6e2f87511b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -307,11 +307,15 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
extern void flush_tlb_all(void);
-extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
+{
+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+}
+
void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start, unsigned long end);
@@ -325,6 +329,14 @@ static inline void reset_lazy_tlbstate(void)
this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
}
+static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm)
+{
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+}
+
+extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+
#endif /* SMP */
#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 347bb9f65737..247880fc29f9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1200,28 +1200,6 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
static struct irq_chip ioapic_chip, ioapic_ir_chip;
-#ifdef CONFIG_X86_32
-static inline int IO_APIC_irq_trigger(int irq)
-{
- int apic, idx, pin;
-
- for_each_ioapic_pin(apic, pin) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0)))
- return irq_trigger(idx);
- }
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-#else
-static inline int IO_APIC_irq_trigger(int irq)
-{
- return 1;
-}
-#endif
-
static void __init setup_IO_APIC_irqs(void)
{
unsigned int ioapic, pin;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 6e4a047e4b68..d00f299f2ada 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -164,17 +164,48 @@ static void default_deferred_error_interrupt(void)
}
void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
-static void get_smca_bank_info(unsigned int bank)
+static void smca_configure(unsigned int bank, unsigned int cpu)
{
- unsigned int i, hwid_mcatype, cpu = smp_processor_id();
+ unsigned int i, hwid_mcatype;
struct smca_hwid *s_hwid;
- u32 high, instance_id;
+ u32 high, low;
+ u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+ /* Set appropriate bits in MCA_CONFIG */
+ if (!rdmsr_safe(smca_config, &low, &high)) {
+ /*
+ * OS is required to set the MCAX bit to acknowledge that it is
+ * now using the new MSR ranges and new registers under each
+ * bank. It also means that the OS will configure deferred
+ * errors in the new MCx_CONFIG register. If the bit is not set,
+ * uncorrectable errors will cause a system panic.
+ *
+ * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
+ */
+ high |= BIT(0);
+
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((low & BIT(5)) && !((high >> 5) & 0x3))
+ high |= BIT(5);
+
+ wrmsr(smca_config, low, high);
+ }
/* Collect bank_info using CPU 0 for now. */
if (cpu)
return;
- if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) {
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
}
@@ -191,7 +222,7 @@ static void get_smca_bank_info(unsigned int bank)
smca_get_name(s_hwid->bank_type));
smca_banks[bank].hwid = s_hwid;
- smca_banks[bank].id = instance_id;
+ smca_banks[bank].id = low;
smca_banks[bank].sysfs_id = s_hwid->count++;
break;
}
@@ -433,7 +464,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int offset, u32 misc_high)
{
unsigned int cpu = smp_processor_id();
- u32 smca_low, smca_high, smca_addr;
+ u32 smca_low, smca_high;
struct threshold_block b;
int new;
@@ -457,51 +488,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
goto set_offset;
}
- smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
-
- if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
- /*
- * OS is required to set the MCAX bit to acknowledge that it is
- * now using the new MSR ranges and new registers under each
- * bank. It also means that the OS will configure deferred
- * errors in the new MCx_CONFIG register. If the bit is not set,
- * uncorrectable errors will cause a system panic.
- *
- * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
- */
- smca_high |= BIT(0);
-
- /*
- * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR}
- * registers with the option of additionally logging to
- * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set.
- *
- * This bit is usually set by BIOS to retain the old behavior
- * for OSes that don't use the new registers. Linux supports the
- * new registers so let's disable that additional logging here.
- *
- * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high
- * portion of the MSR).
- */
- smca_high &= ~BIT(2);
-
- /*
- * SMCA sets the Deferred Error Interrupt type per bank.
- *
- * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
- * if the DeferredIntType bit field is available.
- *
- * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
- * high portion of the MSR). OS should set this to 0x1 to enable
- * APIC based interrupt. First, check that no interrupt has been
- * set.
- */
- if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
- smca_high |= BIT(5);
-
- wrmsr(smca_addr, smca_low, smca_high);
- }
-
/* Gather LVT offset for thresholding: */
if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
goto out;
@@ -530,7 +516,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca)
- get_smca_bank_info(bank);
+ smca_configure(bank, cpu);
for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(cpu, address, low, high, bank, block);
@@ -755,37 +741,19 @@ out_err:
}
EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
-static void
-__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
+static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
- u32 msr_status = msr_ops.status(bank);
- u32 msr_addr = msr_ops.addr(bank);
struct mce m;
- u64 status;
-
- WARN_ON_ONCE(deferred_err && threshold_err);
-
- if (deferred_err && mce_flags.smca) {
- msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank);
- msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank);
- }
-
- rdmsrl(msr_status, status);
-
- if (!(status & MCI_STATUS_VAL))
- return;
mce_setup(&m);
m.status = status;
+ m.misc = misc;
m.bank = bank;
m.tsc = rdtsc();
- if (threshold_err)
- m.misc = misc;
-
if (m.status & MCI_STATUS_ADDRV) {
- rdmsrl(msr_addr, m.addr);
+ m.addr = addr;
/*
* Extract [55:<lsb>] where lsb is the least significant
@@ -806,8 +774,6 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
}
mce_log(&m);
-
- wrmsrl(msr_status, 0);
}
static inline void __smp_deferred_error_interrupt(void)
@@ -832,45 +798,85 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
exiting_ack_irq();
}
-/* APIC interrupt handler for deferred errors */
-static void amd_deferred_error_interrupt(void)
+/*
+ * Returns true if the logged error is deferred. False, otherwise.
+ */
+static inline bool
+_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
{
- unsigned int bank;
- u32 msr_status;
- u64 status;
+ u64 status, addr = 0;
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank)
- : msr_ops.status(bank);
+ rdmsrl(msr_stat, status);
+ if (!(status & MCI_STATUS_VAL))
+ return false;
- rdmsrl(msr_status, status);
+ if (status & MCI_STATUS_ADDRV)
+ rdmsrl(msr_addr, addr);
- if (!(status & MCI_STATUS_VAL) ||
- !(status & MCI_STATUS_DEFERRED))
- continue;
+ __log_error(bank, status, addr, misc);
- __log_error(bank, true, false, 0);
- break;
- }
+ wrmsrl(status, 0);
+
+ return status & MCI_STATUS_DEFERRED;
}
/*
- * APIC Interrupt Handler
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ * clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ * log it.
*/
+static void log_error_deferred(unsigned int bank)
+{
+ bool defrd;
+
+ defrd = _log_error_bank(bank, msr_ops.status(bank),
+ msr_ops.addr(bank), 0);
+
+ if (!mce_flags.smca)
+ return;
+
+ /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
+ if (defrd) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+ return;
+ }
+
+ /*
+ * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
+ * for a valid error.
+ */
+ _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
+ MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+ unsigned int bank;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank)
+ log_error_deferred(bank);
+}
+
+static void log_error_thresholding(unsigned int bank, u64 misc)
+{
+ _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc);
+}
/*
- * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
- * the interrupt goes off when error_count reaches threshold_limit.
- * the handler will simply log mcelog w/ software defined bank number.
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
*/
-
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
unsigned int bank, block, cpu = smp_processor_id();
struct thresh_restart tr;
- /* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
@@ -893,23 +899,18 @@ static void amd_threshold_interrupt(void)
(high & MASK_LOCKED_HI))
continue;
- /*
- * Log the machine check that caused the threshold
- * event.
- */
- if (high & MASK_OVERFLOW_HI)
- goto log;
- }
- }
- return;
+ if (!(high & MASK_OVERFLOW_HI))
+ continue;
-log:
- __log_error(bank, false, true, ((u64)high << 32) | low);
+ /* Log the MCE which caused the threshold event. */
+ log_error_thresholding(bank, ((u64)high << 32) | low);
- /* Reset threshold block after logging error. */
- memset(&tr, 0, sizeof(tr));
- tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
- threshold_restart_bank(&tr);
+ /* Reset threshold block after logging error. */
+ memset(&tr, 0, sizeof(tr));
+ tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
+ threshold_restart_bank(&tr);
+ }
+ }
}
/*
@@ -1202,7 +1203,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
per_cpu(threshold_banks, cpu)[bank] = b;
- atomic_inc(&b->cpus);
+ refcount_inc(&b->cpus);
err = __threshold_add_blocks(b);
@@ -1225,7 +1226,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
per_cpu(threshold_banks, cpu)[bank] = b;
if (is_shared_bank(bank)) {
- atomic_set(&b->cpus, 1);
+ refcount_set(&b->cpus, 1);
/* nb is already initialized, see above */
if (nb) {
@@ -1289,7 +1290,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
goto free_out;
if (is_shared_bank(bank)) {
- if (!atomic_dec_and_test(&b->cpus)) {
+ if (!refcount_dec_and_test(&b->cpus)) {
__threshold_remove_blocks(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f04479a8f74f..045e4f993bd2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid)
if (cpu == 1)
printk(KERN_INFO "x86: Booting SMP configuration:\n");
- if (system_state == SYSTEM_BOOTING) {
+ if (system_state < SYSTEM_RUNNING) {
if (node != current_node) {
if (current_node > (-1))
pr_cont("\n");
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 714dfba6a1e7..5270fc0c2df6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;
-/*
- * Use a ring-buffer like data structure, where a writer advances the head by
- * writing a new data entry and a reader advances the tail when it observes a
- * new entry.
- *
- * Writers are made to wait on readers until there's space to write a new
- * entry.
- *
- * This means that we can always use an {offset, mul} pair to compute a ns
- * value that is 'roughly' in the right direction, even if we're writing a new
- * {offset, mul} pair during the clock read.
- *
- * The down-side is that we can no longer guarantee strict monotonicity anymore
- * (assuming the TSC was that to begin with), because while we compute the
- * intersection point of the two clock slopes and make sure the time is
- * continuous at the point of switching; we can no longer guarantee a reader is
- * strictly before or after the switch point.
- *
- * It does mean a reader no longer needs to disable IRQs in order to avoid
- * CPU-Freq updates messing with his times, and similarly an NMI reader will
- * no longer run the risk of hitting half-written state.
- */
-
struct cyc2ns {
- struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
- struct cyc2ns_data *head; /* 48 + 8 = 56 */
- struct cyc2ns_data *tail; /* 56 + 8 = 64 */
-}; /* exactly fits one cacheline */
-
-static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-
-struct cyc2ns_data *cyc2ns_read_begin(void)
-{
- struct cyc2ns_data *head;
-
- preempt_disable();
-
- head = this_cpu_read(cyc2ns.head);
- /*
- * Ensure we observe the entry when we observe the pointer to it.
- * matches the wmb from cyc2ns_write_end().
- */
- smp_read_barrier_depends();
- head->__count++;
- barrier();
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_t seq; /* 32 + 4 = 36 */
- return head;
-}
+}; /* fits one cacheline */
-void cyc2ns_read_end(struct cyc2ns_data *head)
-{
- barrier();
- /*
- * If we're the outer most nested read; update the tail pointer
- * when we're done. This notifies possible pending writers
- * that we've observed the head pointer and that the other
- * entry is now free.
- */
- if (!--head->__count) {
- /*
- * x86-TSO does not reorder writes with older reads;
- * therefore once this write becomes visible to another
- * cpu, we must be finished reading the cyc2ns_data.
- *
- * matches with cyc2ns_write_begin().
- */
- this_cpu_write(cyc2ns.tail, head);
- }
- preempt_enable();
-}
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-/*
- * Begin writing a new @data entry for @cpu.
- *
- * Assumes some sort of write side lock; currently 'provided' by the assumption
- * that cpufreq will call its notifiers sequentially.
- */
-static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+void cyc2ns_read_begin(struct cyc2ns_data *data)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
- struct cyc2ns_data *data = c2n->data;
+ int seq, idx;
- if (data == c2n->head)
- data++;
+ preempt_disable_notrace();
- /* XXX send an IPI to @cpu in order to guarantee a read? */
+ do {
+ seq = this_cpu_read(cyc2ns.seq.sequence);
+ idx = seq & 1;
- /*
- * When we observe the tail write from cyc2ns_read_end(),
- * the cpu must be done with that entry and its safe
- * to start writing to it.
- */
- while (c2n->tail == data)
- cpu_relax();
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
- return data;
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
-static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+void cyc2ns_read_end(void)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-
- /*
- * Ensure the @data writes are visible before we publish the
- * entry. Matches the data-depencency in cyc2ns_read_begin().
- */
- smp_wmb();
-
- ACCESS_ONCE(c2n->head) = data;
+ preempt_enable_notrace();
}
/*
@@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_mul = 0;
data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
- data->__count = 0;
}
static void cyc2ns_init(int cpu)
@@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu)
cyc2ns_data_init(&c2n->data[0]);
cyc2ns_data_init(&c2n->data[1]);
- c2n->head = c2n->data;
- c2n->tail = c2n->data;
+ seqcount_init(&c2n->seq);
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data, *tail;
+ struct cyc2ns_data data;
unsigned long long ns;
- /*
- * See cyc2ns_read_*() for details; replicated in order to avoid
- * an extra few instructions that came with the abstraction.
- * Notable, it allows us to only do the __count and tail update
- * dance when its actually needed.
- */
-
- preempt_disable_notrace();
- data = this_cpu_read(cyc2ns.head);
- tail = this_cpu_read(cyc2ns.tail);
-
- if (likely(data == tail)) {
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
- } else {
- data->__count++;
-
- barrier();
-
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
- barrier();
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
- if (!--data->__count)
- this_cpu_write(cyc2ns.tail, data);
- }
- preempt_enable_notrace();
+ cyc2ns_read_end();
return ns;
}
-static void set_cyc2ns_scale(unsigned long khz, int cpu)
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
- unsigned long long tsc_now, ns_now;
- struct cyc2ns_data *data;
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
unsigned long flags;
local_irq_save(flags);
@@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
if (!khz)
goto done;
- data = cyc2ns_write_begin(cpu);
-
- tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
/*
@@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
NSEC_PER_MSEC, 0);
/*
@@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* conversion algorithm shifting a 32-bit value (now specifies a 64-bit
* value) - refer perf_event_mmap_page documentation in perf_event.h.
*/
- if (data->cyc2ns_shift == 32) {
- data->cyc2ns_shift = 31;
- data->cyc2ns_mul >>= 1;
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
}
- data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
- cyc2ns_write_end(cpu, data);
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[0] = data;
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
done:
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
local_irq_restore(flags);
}
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -374,6 +273,8 @@ static int __init tsc_setup(char *str)
tsc_clocksource_reliable = 1;
if (!strncmp(str, "noirqtime", 9))
no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
return 1;
}
@@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void)
}
#ifdef CONFIG_CPU_FREQ
-
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
* changes.
*
@@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
}
return 0;
@@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
+}
+
/*
* .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
*/
@@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = {
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
};
void mark_tsc_unstable(char *reason)
@@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
static int hpet;
u64 tsc_stop, ref_stop, delta;
unsigned long freq;
+ int cpu;
/* Don't bother refining TSC on unstable systems */
if (check_tsc_unstable())
@@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work)
/* Inform the TSC deadline clockevent devices about the recalibration */
lapic_update_tsc_freq();
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
out:
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
@@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
- u64 lpj;
+ u64 lpj, cyc;
int cpu;
if (!boot_cpu_has(X86_FEATURE_TSC)) {
@@ -1390,9 +1305,10 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
+ cyc = rdtsc();
for_each_possible_cpu(cpu) {
cyc2ns_init(cpu);
- set_cyc2ns_scale(tsc_khz, cpu);
+ set_cyc2ns_scale(tsc_khz, cpu, cyc);
}
if (tsc_disabled > 0)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6e7bedf69af7..743e4c6b4529 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -237,24 +237,26 @@ static void flush_tlb_func(void *info)
return;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
- if (f->flush_end == TLB_FLUSH_ALL) {
- local_flush_tlb();
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
- } else {
- unsigned long addr;
- unsigned long nr_pages =
- (f->flush_end - f->flush_start) / PAGE_SIZE;
- addr = f->flush_start;
- while (addr < f->flush_end) {
- __flush_tlb_single(addr);
- addr += PAGE_SIZE;
- }
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
- }
- } else
+
+ if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
leave_mm(smp_processor_id());
+ return;
+ }
+ if (f->flush_end == TLB_FLUSH_ALL) {
+ local_flush_tlb();
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
+ } else {
+ unsigned long addr;
+ unsigned long nr_pages =
+ (f->flush_end - f->flush_start) / PAGE_SIZE;
+ addr = f->flush_start;
+ while (addr < f->flush_end) {
+ __flush_tlb_single(addr);
+ addr += PAGE_SIZE;
+ }
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
+ }
}
void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -354,33 +356,6 @@ out:
preempt_enable();
}
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- preempt_disable();
-
- if (current->active_mm == mm) {
- if (current->mm) {
- /*
- * Implicit full barrier (INVLPG) that synchronizes
- * with switch_mm.
- */
- __flush_tlb_one(start);
- } else {
- leave_mm(smp_processor_id());
-
- /* Synchronize with switch_mm. */
- smp_mb();
- }
- }
-
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
-
- preempt_enable();
-}
-
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -420,6 +395,23 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
}
}
+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+ int cpu = get_cpu();
+
+ if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ local_flush_tlb();
+ trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
+ }
+
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
+ flush_tlb_others(&batch->cpumask, NULL, 0, TLB_FLUSH_ALL);
+ cpumask_clear(&batch->cpumask);
+
+ put_cpu();
+}
+
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 42e65fee5673..795671593528 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -456,12 +456,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
*/
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long ns;
- ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
+ ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return ns;
}
@@ -470,12 +471,13 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
*/
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long cyc;
- cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
+ cyc2ns_read_begin(&data);
+ cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul;
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return cyc;
}
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index d0855c09f32f..d2c8a9286fa8 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -89,14 +89,14 @@ bool ghes_disable;
module_param_named(disable, ghes_disable, bool, 0);
/*
- * All error sources notified with SCI shares one notifier function,
- * so they need to be linked and checked one by one. This is applied
- * to NMI too.
+ * All error sources notified with HED (Hardware Error Device) share a
+ * single notifier callback, so they need to be linked and checked one
+ * by one. This holds true for NMI too.
*
* RCU is used for these lists, so ghes_list_mutex is only used for
* list changing, not for traversing.
*/
-static LIST_HEAD(ghes_sci);
+static LIST_HEAD(ghes_hed);
static DEFINE_MUTEX(ghes_list_mutex);
/*
@@ -702,14 +702,14 @@ static irqreturn_t ghes_irq_func(int irq, void *data)
return IRQ_HANDLED;
}
-static int ghes_notify_sci(struct notifier_block *this,
- unsigned long event, void *data)
+static int ghes_notify_hed(struct notifier_block *this, unsigned long event,
+ void *data)
{
struct ghes *ghes;
int ret = NOTIFY_DONE;
rcu_read_lock();
- list_for_each_entry_rcu(ghes, &ghes_sci, list) {
+ list_for_each_entry_rcu(ghes, &ghes_hed, list) {
if (!ghes_proc(ghes))
ret = NOTIFY_OK;
}
@@ -718,8 +718,8 @@ static int ghes_notify_sci(struct notifier_block *this,
return ret;
}
-static struct notifier_block ghes_notifier_sci = {
- .notifier_call = ghes_notify_sci,
+static struct notifier_block ghes_notifier_hed = {
+ .notifier_call = ghes_notify_hed,
};
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
@@ -966,7 +966,10 @@ static int ghes_probe(struct platform_device *ghes_dev)
case ACPI_HEST_NOTIFY_POLLED:
case ACPI_HEST_NOTIFY_EXTERNAL:
case ACPI_HEST_NOTIFY_SCI:
+ case ACPI_HEST_NOTIFY_GSIV:
+ case ACPI_HEST_NOTIFY_GPIO:
break;
+
case ACPI_HEST_NOTIFY_NMI:
if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) {
pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n",
@@ -1024,13 +1027,17 @@ static int ghes_probe(struct platform_device *ghes_dev)
goto err_edac_unreg;
}
break;
+
case ACPI_HEST_NOTIFY_SCI:
+ case ACPI_HEST_NOTIFY_GSIV:
+ case ACPI_HEST_NOTIFY_GPIO:
mutex_lock(&ghes_list_mutex);
- if (list_empty(&ghes_sci))
- register_acpi_hed_notifier(&ghes_notifier_sci);
- list_add_rcu(&ghes->list, &ghes_sci);
+ if (list_empty(&ghes_hed))
+ register_acpi_hed_notifier(&ghes_notifier_hed);
+ list_add_rcu(&ghes->list, &ghes_hed);
mutex_unlock(&ghes_list_mutex);
break;
+
case ACPI_HEST_NOTIFY_NMI:
ghes_nmi_add(ghes);
break;
@@ -1066,14 +1073,18 @@ static int ghes_remove(struct platform_device *ghes_dev)
case ACPI_HEST_NOTIFY_EXTERNAL:
free_irq(ghes->irq, ghes);
break;
+
case ACPI_HEST_NOTIFY_SCI:
+ case ACPI_HEST_NOTIFY_GSIV:
+ case ACPI_HEST_NOTIFY_GPIO:
mutex_lock(&ghes_list_mutex);
list_del_rcu(&ghes->list);
- if (list_empty(&ghes_sci))
- unregister_acpi_hed_notifier(&ghes_notifier_sci);
+ if (list_empty(&ghes_hed))
+ unregister_acpi_hed_notifier(&ghes_notifier_hed);
mutex_unlock(&ghes_list_mutex);
synchronize_rcu();
break;
+
case ACPI_HEST_NOTIFY_NMI:
ghes_nmi_remove(ghes);
break;
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 919be0aa2578..240544253ccd 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -523,7 +523,7 @@ static int acpi_pci_root_add(struct acpi_device *device,
struct acpi_pci_root *root;
acpi_handle handle = device->handle;
int no_aspm = 0;
- bool hotadd = system_state != SYSTEM_BOOTING;
+ bool hotadd = system_state == SYSTEM_RUNNING;
root = kzalloc(sizeof(struct acpi_pci_root), GFP_KERNEL);
if (!root)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5548f9686016..0440d95c9b5b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -377,7 +377,7 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
if (!pfn_valid_within(pfn))
return -1;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- if (system_state == SYSTEM_BOOTING)
+ if (system_state < SYSTEM_RUNNING)
return early_pfn_to_nid(pfn);
#endif
page = pfn_to_page(pfn);
diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c
index 35dd4d7ffee0..b257fc7d5204 100644
--- a/drivers/cpufreq/pasemi-cpufreq.c
+++ b/drivers/cpufreq/pasemi-cpufreq.c
@@ -226,7 +226,7 @@ static int pas_cpufreq_cpu_exit(struct cpufreq_policy *policy)
* We don't support CPU hotplug. Don't unmap after the system
* has already made it to a running state.
*/
- if (system_state != SYSTEM_BOOTING)
+ if (system_state >= SYSTEM_RUNNING)
return 0;
if (sdcasr_mapbase)
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 2706be7ed334..60bb64f4329d 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -220,6 +220,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
entered_state = target_state->enter(dev, drv, index);
start_critical_timings();
+ sched_clock_idle_wakeup_event();
time_end = ns_to_ktime(local_clock());
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index fc2765ccdb57..8500deda9175 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4315,7 +4315,7 @@ int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
struct acpi_dmar_atsr *atsr;
struct dmar_atsr_unit *atsru;
- if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
+ if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
return 0;
atsr = container_of(hdr, struct acpi_dmar_atsr, header);
@@ -4565,7 +4565,7 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
struct acpi_dmar_atsr *atsr;
struct acpi_dmar_reserved_memory *rmrr;
- if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
+ if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
return 0;
list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 9f44ee8ea1bc..b8dcf44df441 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -103,7 +103,7 @@ static bool of_iommu_driver_present(struct device_node *np)
* it never will be. We don't want to defer indefinitely, nor attempt
* to dereference __iommu_of_table after it's been freed.
*/
- if (system_state > SYSTEM_BOOTING)
+ if (system_state >= SYSTEM_RUNNING)
return false;
return of_match_node(&__iommu_of_table, np);
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 94f8038864b4..ed4c343d08c4 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -29,7 +29,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
-int __init parse_ras_param(char *str)
+static int __init parse_ras_param(char *str)
{
#ifdef CONFIG_RAS_CEC
parse_cec_param(str);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index c1ec8ee80924..9e35032351a0 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -190,6 +190,7 @@ static void do_poweroff(void)
{
switch (system_state) {
case SYSTEM_BOOTING:
+ case SYSTEM_SCHEDULING:
orderly_poweroff(true);
break;
case SYSTEM_RUNNING:
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f2b10d9ebd04..81490456c242 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -96,6 +96,7 @@ struct clocksource {
void (*suspend)(struct clocksource *cs);
void (*resume)(struct clocksource *cs);
void (*mark_unstable)(struct clocksource *cs);
+ void (*tick_stable)(struct clocksource *cs);
/* private: */
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 2404ad238c0b..4bf4479a3a80 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsigned int i, int node);
(cpu) = cpumask_next_zero((cpu), (mask)), \
(cpu) < nr_cpu_ids;)
+extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+
+/**
+ * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask: the cpumask poiter
+ * @start: the start location
+ *
+ * The implementation does not assume any bit in @mask is set (including @start).
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_wrap(cpu, mask, start) \
+ for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \
+ (cpu) < nr_cpumask_bits; \
+ (cpu) = cpumask_next_wrap((cpu), (mask), (start), true))
+
/**
* for_each_cpu_and - iterate over every cpu in both masks
* @cpu: the (optionally unsigned) integer iterator
@@ -276,6 +293,12 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
set_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
+static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
+{
+ __set_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
+
/**
* cpumask_clear_cpu - clear a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
@@ -286,6 +309,11 @@ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
+static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
+{
+ __clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
/**
* cpumask_test_cpu - test for a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 13bc08aba704..1c91f26e2996 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -490,9 +490,13 @@ extern int root_mountflags;
extern bool early_boot_irqs_disabled;
-/* Values used for system_state */
+/*
+ * Values used for system_state. Ordering of the states must not be changed
+ * as code checks for <, <=, >, >= STATE.
+ */
extern enum system_states {
SYSTEM_BOOTING,
+ SYSTEM_SCHEDULING,
SYSTEM_RUNNING,
SYSTEM_HALT,
SYSTEM_POWER_OFF,
diff --git a/include/linux/llist.h b/include/linux/llist.h
index 171baa90f6f6..d11738110a7a 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -110,6 +110,25 @@ static inline void init_llist_head(struct llist_head *list)
for ((pos) = (node); pos; (pos) = (pos)->next)
/**
+ * llist_for_each_safe - iterate over some deleted entries of a lock-less list
+ * safe against removal of list entry
+ * @pos: the &struct llist_node to use as a loop cursor
+ * @n: another &struct llist_node to use as temporary storage
+ * @node: the first entry of deleted list entries
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being deleted from list, so start with an entry
+ * instead of list head.
+ *
+ * If being used on entries deleted from lock-less list directly, the
+ * traverse order is from the newest to the oldest added entry. If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each_safe(pos, n, node) \
+ for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n))
+
+/**
* llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
* @pos: the type * to use as a loop cursor.
* @node: the fist entry of deleted list entries.
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 136dfdf63ba1..fc412fbd80bd 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -14,6 +14,10 @@
#include <asm/page.h>
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+#include <asm/tlbbatch.h>
+#endif
+
#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
@@ -67,12 +71,15 @@ struct page_frag {
struct tlbflush_unmap_batch {
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
- * Each bit set is a CPU that potentially has a TLB entry for one of
- * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+ * The arch code makes the following promise: generic code can modify a
+ * PTE, then call arch_tlbbatch_add_mm() (which internally provides all
+ * needed barriers), then call arch_tlbbatch_flush(), and the entries
+ * will be flushed on all CPUs by the time that arch_tlbbatch_flush()
+ * returns.
*/
- struct cpumask cpumask;
+ struct arch_tlbflush_unmap_batch arch;
- /* True if any bit in cpumask is set */
+ /* True if a flush is needed. */
bool flush_required;
/*
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 34fe92ce1ebd..a55600ffdf4b 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -23,10 +23,6 @@ extern u64 sched_clock_cpu(int cpu);
extern void sched_clock_init(void);
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline void sched_clock_init_late(void)
-{
-}
-
static inline void sched_clock_tick(void)
{
}
@@ -39,7 +35,7 @@ static inline void sched_clock_idle_sleep_event(void)
{
}
-static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
+static inline void sched_clock_idle_wakeup_event(void)
{
}
@@ -53,7 +49,6 @@ static inline u64 local_clock(void)
return sched_clock();
}
#else
-extern void sched_clock_init_late(void);
extern int sched_clock_stable(void);
extern void clear_sched_clock_stable(void);
@@ -63,10 +58,10 @@ extern void clear_sched_clock_stable(void);
*/
extern u64 __sched_clock_offset;
-
extern void sched_clock_tick(void);
+extern void sched_clock_tick_stable(void);
extern void sched_clock_idle_sleep_event(void);
-extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+extern void sched_clock_idle_wakeup_event(void);
/*
* As outlined in clock.c, provides a fast, high resolution, nanosecond
diff --git a/init/main.c b/init/main.c
index f866510472d7..df58a416dd1d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -389,6 +389,7 @@ static __initdata DECLARE_COMPLETION(kthreadd_done);
static noinline void __ref rest_init(void)
{
+ struct task_struct *tsk;
int pid;
rcu_scheduler_starting();
@@ -397,12 +398,32 @@ static noinline void __ref rest_init(void)
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
- kernel_thread(kernel_init, NULL, CLONE_FS);
+ pid = kernel_thread(kernel_init, NULL, CLONE_FS);
+ /*
+ * Pin init on the boot CPU. Task migration is not properly working
+ * until sched_init_smp() has been run. It will set the allowed
+ * CPUs for init to the non isolated CPUs.
+ */
+ rcu_read_lock();
+ tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
+ rcu_read_unlock();
+
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
+
+ /*
+ * Enable might_sleep() and smp_processor_id() checks.
+ * They cannot be enabled earlier because with CONFIG_PRREMPT=y
+ * kernel_thread() would trigger might_sleep() splats. With
+ * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
+ * already, but it's stuck on the kthreadd_done completion.
+ */
+ system_state = SYSTEM_SCHEDULING;
+
complete(&kthreadd_done);
/*
@@ -1015,10 +1036,6 @@ static noinline void __init kernel_init_freeable(void)
* init can allocate pages on any node
*/
set_mems_allowed(node_states[N_MEMORY]);
- /*
- * init can run on any cpu.
- */
- set_cpus_allowed_ptr(current, cpu_all_mask);
cad_pid = task_pid(current);
diff --git a/kernel/async.c b/kernel/async.c
index d2edd6efec56..2cbd3dd5940d 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work)
ktime_t uninitialized_var(calltime), delta, rettime;
/* 1) run (and print duration) */
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("calling %lli_%pF @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
}
entry->func(entry->data, entry->cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
@@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
{
ktime_t uninitialized_var(starttime), delta, endtime;
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
wait_event(async_done, lowest_in_progress(domain) >= cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6e75a5c9412d..f8c27d3ef3a1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9172,7 +9172,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
static struct pmu *perf_init_event(struct perf_event *event)
{
- struct pmu *pmu = NULL;
+ struct pmu *pmu;
int idx;
int ret;
@@ -9456,9 +9456,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
pmu = perf_init_event(event);
- if (!pmu)
- goto err_ns;
- else if (IS_ERR(pmu)) {
+ if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
goto err_ns;
}
@@ -9471,8 +9469,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
sizeof(unsigned long),
GFP_KERNEL);
- if (!event->addr_filters_offs)
+ if (!event->addr_filters_offs) {
+ err = -ENOMEM;
goto err_per_task;
+ }
/* force hw sync on the address filters */
event->addr_filters_gen = 1;
diff --git a/kernel/extable.c b/kernel/extable.c
index 2676d7f8baf6..0fbdd8582f08 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr)
addr < (unsigned long)_etext)
return 1;
- if (system_state == SYSTEM_BOOTING &&
+ if (system_state < SYSTEM_RUNNING &&
init_kernel_text(addr))
return 1;
return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 31805f237396..70b9da72018b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -746,13 +746,54 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
EXPORT_SYMBOL_GPL(irq_find_mapping);
#ifdef CONFIG_IRQ_DOMAIN_DEBUG
+static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc)
+{
+ struct irq_domain *domain;
+ struct irq_data *data;
+
+ domain = desc->irq_data.domain;
+ data = &desc->irq_data;
+
+ while (domain) {
+ unsigned int irq = data->irq;
+ unsigned long hwirq = data->hwirq;
+ struct irq_chip *chip;
+ bool direct;
+
+ if (data == &desc->irq_data)
+ seq_printf(m, "%5d ", irq);
+ else
+ seq_printf(m, "%5d+ ", irq);
+ seq_printf(m, "0x%05lx ", hwirq);
+
+ chip = irq_data_get_irq_chip(data);
+ seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
+
+ seq_printf(m, data ? "0x%p " : " %p ",
+ irq_data_get_irq_chip_data(data));
+
+ seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
+ direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq);
+ seq_printf(m, "%6s%-8s ",
+ (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
+ direct ? "(DIRECT)" : "");
+ seq_printf(m, "%s\n", domain->name);
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ domain = domain->parent;
+ data = data->parent_data;
+#else
+ domain = NULL;
+#endif
+ }
+}
+
static int virq_debug_show(struct seq_file *m, void *private)
{
unsigned long flags;
struct irq_desc *desc;
struct irq_domain *domain;
struct radix_tree_iter iter;
- void *data, **slot;
+ void **slot;
int i;
seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
@@ -760,15 +801,26 @@ static int virq_debug_show(struct seq_file *m, void *private)
mutex_lock(&irq_domain_mutex);
list_for_each_entry(domain, &irq_domain_list, link) {
struct device_node *of_node;
+ const char *name;
+
int count = 0;
+
of_node = irq_domain_get_of_node(domain);
+ if (of_node)
+ name = of_node_full_name(of_node);
+ else if (is_fwnode_irqchip(domain->fwnode))
+ name = container_of(domain->fwnode, struct irqchip_fwid,
+ fwnode)->name;
+ else
+ name = "";
+
radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
count++;
seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
domain == irq_default_domain ? '*' : ' ', domain->name,
domain->revmap_size + count, domain->revmap_size,
domain->revmap_direct_max_irq,
- of_node ? of_node_full_name(of_node) : "");
+ name);
}
mutex_unlock(&irq_domain_mutex);
@@ -782,30 +834,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- domain = desc->irq_data.domain;
-
- if (domain) {
- struct irq_chip *chip;
- int hwirq = desc->irq_data.hwirq;
- bool direct;
-
- seq_printf(m, "%5d ", i);
- seq_printf(m, "0x%05x ", hwirq);
-
- chip = irq_desc_get_chip(desc);
- seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
-
- data = irq_desc_get_chip_data(desc);
- seq_printf(m, data ? "0x%p " : " %p ", data);
-
- seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
- direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
- seq_printf(m, "%6s%-8s ",
- (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
- direct ? "(DIRECT)" : "");
- seq_printf(m, "%s\n", desc->irq_data.domain->name);
- }
-
+ virq_debug_show_one(m, desc);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index ddc2f5427f75..fe4d48ec5bc4 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -265,13 +265,19 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent)
{
+ struct irq_domain *domain;
+
if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
msi_domain_update_dom_ops(info);
if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
msi_domain_update_chip_ops(info);
- return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
- fwnode, &msi_domain_ops, info);
+ domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
+ fwnode, &msi_domain_ops, info);
+ if (domain && info->chip && info->chip->name)
+ domain->name = info->chip->name;
+
+ return domain;
}
int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9995b2dfaaff..e782e9049e73 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1176,7 +1176,7 @@ static void boot_delay_msec(int level)
unsigned long long k;
unsigned long timeout;
- if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
+ if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING)
|| suppress_message_printing(level)) {
return;
}
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 00a45c45beca..ca0f8fc945c6 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -64,6 +64,7 @@
#include <linux/workqueue.h>
#include <linux/compiler.h>
#include <linux/tick.h>
+#include <linux/init.h>
/*
* Scheduler clock - returns current time in nanosec units.
@@ -124,14 +125,27 @@ int sched_clock_stable(void)
return static_branch_likely(&__sched_clock_stable);
}
+static void __scd_stamp(struct sched_clock_data *scd)
+{
+ scd->tick_gtod = ktime_get_ns();
+ scd->tick_raw = sched_clock();
+}
+
static void __set_sched_clock_stable(void)
{
- struct sched_clock_data *scd = this_scd();
+ struct sched_clock_data *scd;
/*
+ * Since we're still unstable and the tick is already running, we have
+ * to disable IRQs in order to get a consistent scd->tick* reading.
+ */
+ local_irq_disable();
+ scd = this_scd();
+ /*
* Attempt to make the (initial) unstable->stable transition continuous.
*/
__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
+ local_irq_enable();
printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
scd->tick_gtod, __gtod_offset,
@@ -141,8 +155,38 @@ static void __set_sched_clock_stable(void)
tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
+/*
+ * If we ever get here, we're screwed, because we found out -- typically after
+ * the fact -- that TSC wasn't good. This means all our clocksources (including
+ * ktime) could have reported wrong values.
+ *
+ * What we do here is an attempt to fix up and continue sort of where we left
+ * off in a coherent manner.
+ *
+ * The only way to fully avoid random clock jumps is to boot with:
+ * "tsc=unstable".
+ */
static void __sched_clock_work(struct work_struct *work)
{
+ struct sched_clock_data *scd;
+ int cpu;
+
+ /* take a current timestamp and set 'now' */
+ preempt_disable();
+ scd = this_scd();
+ __scd_stamp(scd);
+ scd->clock = scd->tick_gtod + __gtod_offset;
+ preempt_enable();
+
+ /* clone to all CPUs */
+ for_each_possible_cpu(cpu)
+ per_cpu(sched_clock_data, cpu) = *scd;
+
+ printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
+ printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
+
static_branch_disable(&__sched_clock_stable);
}
@@ -150,27 +194,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work);
static void __clear_sched_clock_stable(void)
{
- struct sched_clock_data *scd = this_scd();
-
- /*
- * Attempt to make the stable->unstable transition continuous.
- *
- * Trouble is, this is typically called from the TSC watchdog
- * timer, which is late per definition. This means the tick
- * values can already be screwy.
- *
- * Still do what we can.
- */
- __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);
-
- printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
- scd->tick_gtod, __gtod_offset,
- scd->tick_raw, __sched_clock_offset);
+ if (!sched_clock_stable())
+ return;
tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
-
- if (sched_clock_stable())
- schedule_work(&sched_clock_work);
+ schedule_work(&sched_clock_work);
}
void clear_sched_clock_stable(void)
@@ -183,7 +211,11 @@ void clear_sched_clock_stable(void)
__clear_sched_clock_stable();
}
-void sched_clock_init_late(void)
+/*
+ * We run this as late_initcall() such that it runs after all built-in drivers,
+ * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
+ */
+static int __init sched_clock_init_late(void)
{
sched_clock_running = 2;
/*
@@ -197,7 +229,10 @@ void sched_clock_init_late(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
+
+ return 0;
}
+late_initcall(sched_clock_init_late);
/*
* min, max except they take wrapping into account
@@ -347,21 +382,38 @@ void sched_clock_tick(void)
{
struct sched_clock_data *scd;
+ if (sched_clock_stable())
+ return;
+
+ if (unlikely(!sched_clock_running))
+ return;
+
WARN_ON_ONCE(!irqs_disabled());
+ scd = this_scd();
+ __scd_stamp(scd);
+ sched_clock_local(scd);
+}
+
+void sched_clock_tick_stable(void)
+{
+ u64 gtod, clock;
+
+ if (!sched_clock_stable())
+ return;
+
/*
- * Update these values even if sched_clock_stable(), because it can
- * become unstable at any point in time at which point we need some
- * values to fall back on.
+ * Called under watchdog_lock.
*
- * XXX arguably we can skip this if we expose tsc_clocksource_reliable
+ * The watchdog just found this TSC to (still) be stable, so now is a
+ * good moment to update our __gtod_offset. Because once we find the
+ * TSC to be unstable, any computation will be computing crap.
*/
- scd = this_scd();
- scd->tick_raw = sched_clock();
- scd->tick_gtod = ktime_get_ns();
-
- if (!sched_clock_stable() && likely(sched_clock_running))
- sched_clock_local(scd);
+ local_irq_disable();
+ gtod = ktime_get_ns();
+ clock = sched_clock();
+ __gtod_offset = (clock + __sched_clock_offset) - gtod;
+ local_irq_enable();
}
/*
@@ -374,15 +426,21 @@ void sched_clock_idle_sleep_event(void)
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
- * We just idled delta nanoseconds (called with irqs disabled):
+ * We just idled; resync with ktime.
*/
-void sched_clock_idle_wakeup_event(u64 delta_ns)
+void sched_clock_idle_wakeup_event(void)
{
- if (timekeeping_suspended)
+ unsigned long flags;
+
+ if (sched_clock_stable())
+ return;
+
+ if (unlikely(timekeeping_suspended))
return;
+ local_irq_save(flags);
sched_clock_tick();
- touch_softlockup_watchdog_sched();
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 803c3bc274c4..c3e50cada84d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1731,7 +1731,7 @@ void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
- struct task_struct *p;
+ struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
@@ -1740,17 +1740,8 @@ void sched_ttwu_pending(void)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- while (llist) {
- int wake_flags = 0;
-
- p = llist_entry(llist, struct task_struct, wake_entry);
- llist = llist_next(llist);
-
- if (p->sched_remote_wakeup)
- wake_flags = WF_MIGRATED;
-
- ttwu_do_activate(rq, p, wake_flags, &rf);
- }
+ llist_for_each_entry_safe(p, t, llist, wake_entry)
+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
rq_unlock_irqrestore(rq, &rf);
}
@@ -4197,8 +4188,8 @@ static int __sched_setscheduler(struct task_struct *p,
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
- /* May grab non-irq protected spin_locks: */
- BUG_ON(in_interrupt());
+ /* The pi code expects interrupts enabled */
+ BUG_ON(pi && in_interrupt());
recheck:
/* Double check policy once rq lock held: */
if (policy < 0) {
@@ -5958,7 +5949,6 @@ void __init sched_init_smp(void)
cpumask_var_t non_isolated_cpus;
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
- alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
sched_init_numa();
@@ -5968,7 +5958,7 @@ void __init sched_init_smp(void)
* happen.
*/
mutex_lock(&sched_domains_mutex);
- init_sched_domains(cpu_active_mask);
+ sched_init_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -5984,7 +5974,6 @@ void __init sched_init_smp(void)
init_sched_dl_class();
sched_init_smt();
- sched_clock_init_late();
sched_smp_initialized = true;
}
@@ -6000,7 +5989,6 @@ early_initcall(migration_init);
void __init sched_init_smp(void)
{
sched_init_granularity();
- sched_clock_init_late();
}
#endif /* CONFIG_SMP */
@@ -6199,7 +6187,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
- zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
@@ -6251,8 +6238,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
!is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
return;
+
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a2ce59015642..df6c2912bd60 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1533,7 +1533,7 @@ retry:
* then possible that next_task has migrated.
*/
task = pick_next_pushable_dl_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task is still there. We don't try
* again, some other cpu will pull it when ready.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d71109321841..47a0c552c77b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+ leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
@@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
@@ -2469,7 +2470,8 @@ void task_numa_work(struct callback_head *work)
return;
- down_read(&mm->mmap_sem);
+ if (!down_read_trylock(&mm->mmap_sem))
+ return;
vma = find_vma(mm, start);
if (!vma) {
reset_ptenuma_scan(p);
@@ -2916,12 +2918,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
/*
* Step 2: update *_avg.
*/
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
if (cfs_rq) {
cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
}
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
return 1;
}
@@ -4642,24 +4644,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
+/*
+ * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ *
+ * The race is harmless, since modifying bandwidth settings of unhooked group
+ * bits doesn't do much.
+ */
+
+/* cpu online calback */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
- struct cfs_rq *cfs_rq;
+ struct task_group *tg;
- for_each_leaf_cfs_rq(rq, cfs_rq) {
- struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+ lockdep_assert_held(&rq->lock);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
raw_spin_lock(&cfs_b->lock);
cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
raw_spin_unlock(&cfs_b->lock);
}
+ rcu_read_unlock();
}
+/* cpu offline callback */
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
- struct cfs_rq *cfs_rq;
+ struct task_group *tg;
+
+ lockdep_assert_held(&rq->lock);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- for_each_leaf_cfs_rq(rq, cfs_rq) {
if (!cfs_rq->runtime_enabled)
continue;
@@ -4677,6 +4698,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
+ rcu_read_unlock();
}
#else /* CONFIG_CFS_BANDWIDTH */
@@ -5484,12 +5506,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int i;
/* Skip over this group if it has no CPUs allowed */
- if (!cpumask_intersects(sched_group_cpus(group),
+ if (!cpumask_intersects(sched_group_span(group),
&p->cpus_allowed))
continue;
local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
+ sched_group_span(group));
/*
* Tally up the load of all CPUs in the group and find
@@ -5499,7 +5521,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
runnable_load = 0;
max_spare_cap = 0;
- for_each_cpu(i, sched_group_cpus(group)) {
+ for_each_cpu(i, sched_group_span(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
@@ -5602,10 +5624,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
/* Check if we have any choice: */
if (group->group_weight == 1)
- return cpumask_first(sched_group_cpus(group));
+ return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+ for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5640,43 +5662,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
-/*
- * Implement a for_each_cpu() variant that starts the scan at a given cpu
- * (@start), and wraps around.
- *
- * This is used to scan for idle CPUs; such that not all CPUs looking for an
- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
- * through the LLC domain.
- *
- * Especially tbench is found sensitive to this.
- */
-
-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
-{
- int next;
-
-again:
- next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
-
- if (*wrapped) {
- if (next >= start)
- return nr_cpumask_bits;
- } else {
- if (next >= nr_cpumask_bits) {
- *wrapped = 1;
- n = -1;
- goto again;
- }
- }
-
- return next;
-}
-
-#define for_each_cpu_wrap(cpu, mask, start, wrap) \
- for ((wrap) = 0, (cpu) = (start)-1; \
- (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
- (cpu) < nr_cpumask_bits; )
-
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -5736,7 +5721,7 @@ unlock:
static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu, wrap;
+ int core, cpu;
if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5746,7 +5731,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
- for_each_cpu_wrap(core, cpus, target, wrap) {
+ for_each_cpu_wrap(core, cpus, target) {
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -5812,7 +5797,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 avg_cost, avg_idle = this_rq()->avg_idle;
u64 time, cost;
s64 delta;
- int cpu, wrap;
+ int cpu;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5829,7 +5814,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
@@ -6970,10 +6955,28 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load.weight)
+ return false;
+
+ if (cfs_rq->avg.load_sum)
+ return false;
+
+ if (cfs_rq->avg.util_sum)
+ return false;
+
+ if (cfs_rq->runnable_load_sum)
+ return false;
+
+ return true;
+}
+
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
@@ -6983,7 +6986,7 @@ static void update_blocked_averages(int cpu)
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
- for_each_leaf_cfs_rq(rq, cfs_rq) {
+ for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
/* throttled entities do not contribute to load */
@@ -6997,6 +7000,13 @@ static void update_blocked_averages(int cpu)
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
update_load_avg(se, 0);
+
+ /*
+ * There can be a lot of idle CPU cgroups. Don't let fully
+ * decayed cfs_rqs linger on the list.
+ */
+ if (cfs_rq_is_decayed(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
}
rq_unlock_irqrestore(rq, &rf);
}
@@ -7229,7 +7239,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
* span the current group.
*/
- for_each_cpu(cpu, sched_group_cpus(sdg)) {
+ for_each_cpu(cpu, sched_group_span(sdg)) {
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
@@ -7408,7 +7418,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
memset(sgs, 0, sizeof(*sgs));
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
/* Bias balancing toward cpus of our domain */
@@ -7572,7 +7582,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
- local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
if (local_group) {
sds->local = sg;
sgs = local;
@@ -7927,7 +7937,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
@@ -8033,7 +8043,6 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- struct cpumask *sg_cpus, *sg_mask;
int cpu, balance_cpu = -1;
/*
@@ -8043,11 +8052,9 @@ static int should_we_balance(struct lb_env *env)
if (env->idle == CPU_NEWLY_IDLE)
return 1;
- sg_cpus = sched_group_cpus(sg);
- sg_mask = sched_group_mask(sg);
/* Try to find first idle cpu */
- for_each_cpu_and(cpu, sg_cpus, env->cpus) {
- if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+ if (!idle_cpu(cpu))
continue;
balance_cpu = cpu;
@@ -8083,7 +8090,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_cpus(sd->groups),
+ .dst_grpmask = sched_group_span(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
@@ -9523,10 +9530,10 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
rcu_read_lock();
- for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 11192e0cb122..dc4d1483b038 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -76,7 +76,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
SCHED_FEAT(RT_PUSH_IPI, true)
#endif
-SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 979b7341008a..581d5c7a5264 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ int skip;
+
+ /*
+ * When span == cpu_online_mask, taking each rq->lock
+ * can be time-consuming. Try to avoid it when possible.
+ */
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (skip)
+ continue;
raw_spin_lock(&rq->lock);
if (rt_rq->rt_time) {
@@ -1819,7 +1830,7 @@ retry:
* pushing.
*/
task = pick_next_pushable_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task hasn't migrated, and is still the next
* eligible task, but we failed to find a run-queue
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6dda2aab731e..f8cf1d87f065 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -606,11 +606,9 @@ struct root_domain {
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
-extern cpumask_var_t fallback_doms;
-extern cpumask_var_t sched_domains_tmpmask;
extern void init_defrootdomain(void);
-extern int init_sched_domains(const struct cpumask *cpu_map);
+extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
#endif /* CONFIG_SMP */
@@ -1025,7 +1023,11 @@ struct sched_group_capacity {
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
- unsigned long cpumask[0]; /* iteration mask */
+#ifdef CONFIG_SCHED_DEBUG
+ int id;
+#endif
+
+ unsigned long cpumask[0]; /* balance mask */
};
struct sched_group {
@@ -1046,16 +1048,15 @@ struct sched_group {
unsigned long cpumask[0];
};
-static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+static inline struct cpumask *sched_group_span(struct sched_group *sg)
{
return to_cpumask(sg->cpumask);
}
/*
- * cpumask masking which cpus in the group are allowed to iterate up the domain
- * tree.
+ * See build_balance_mask().
*/
-static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+static inline struct cpumask *group_balance_mask(struct sched_group *sg)
{
return to_cpumask(sg->sgc->cpumask);
}
@@ -1066,7 +1067,7 @@ static inline struct cpumask *sched_group_mask(struct sched_group *sg)
*/
static inline unsigned int group_first_cpu(struct sched_group *group)
{
- return cpumask_first(sched_group_cpus(group));
+ return cpumask_first(sched_group_span(group));
}
extern int group_balance_cpu(struct sched_group *sg);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1b0b4fb12837..79895aec281e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex);
/* Protected by sched_domains_mutex: */
cpumask_var_t sched_domains_tmpmask;
+cpumask_var_t sched_domains_tmpmask2;
#ifdef CONFIG_SCHED_DEBUG
@@ -35,7 +36,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_clear(groupmask);
- printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+ printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
@@ -45,14 +46,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %*pbl level %s\n",
+ printk(KERN_CONT "span=%*pbl level=%s\n",
cpumask_pr_args(sched_domain_span(sd)), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain "
"CPU%d\n", cpu);
}
- if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
+ if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
printk(KERN_ERR "ERROR: domain->groups does not contain"
" CPU%d\n", cpu);
}
@@ -65,29 +66,47 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!cpumask_weight(sched_group_cpus(group))) {
+ if (!cpumask_weight(sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
break;
}
if (!(sd->flags & SD_OVERLAP) &&
- cpumask_intersects(groupmask, sched_group_cpus(group))) {
+ cpumask_intersects(groupmask, sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
break;
}
- cpumask_or(groupmask, groupmask, sched_group_cpus(group));
+ cpumask_or(groupmask, groupmask, sched_group_span(group));
- printk(KERN_CONT " %*pbl",
- cpumask_pr_args(sched_group_cpus(group)));
- if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %lu)",
- group->sgc->capacity);
+ printk(KERN_CONT " %d:{ span=%*pbl",
+ group->sgc->id,
+ cpumask_pr_args(sched_group_span(group)));
+
+ if ((sd->flags & SD_OVERLAP) &&
+ !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
+ printk(KERN_CONT " mask=%*pbl",
+ cpumask_pr_args(group_balance_mask(group)));
+ }
+
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
+ printk(KERN_CONT " cap=%lu", group->sgc->capacity);
+
+ if (group == sd->groups && sd->child &&
+ !cpumask_equal(sched_domain_span(sd->child),
+ sched_group_span(group))) {
+ printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
}
+ printk(KERN_CONT " }");
+
group = group->next;
+
+ if (group != sd->groups)
+ printk(KERN_CONT ",");
+
} while (group != sd->groups);
printk(KERN_CONT "\n");
@@ -113,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
return;
}
- printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+ printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
for (;;) {
if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
@@ -477,46 +496,214 @@ enum s_alloc {
};
/*
- * Build an iteration mask that can exclude certain CPUs from the upwards
- * domain traversal.
+ * Return the canonical balance CPU for this group, this is the first CPU
+ * of this group that's also in the balance mask.
*
- * Asymmetric node setups can result in situations where the domain tree is of
- * unequal depth, make sure to skip domains that already cover the entire
- * range.
+ * The balance mask are all those CPUs that could actually end up at this
+ * group. See build_balance_mask().
*
- * In that case build_sched_domains() will have terminated the iteration early
- * and our sibling sd spans will be empty. Domains should always include the
- * CPU they're built on, so check that.
+ * Also see should_we_balance().
*/
-static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+int group_balance_cpu(struct sched_group *sg)
{
- const struct cpumask *span = sched_domain_span(sd);
+ return cpumask_first(group_balance_mask(sg));
+}
+
+
+/*
+ * NUMA topology (first read the regular topology blurb below)
+ *
+ * Given a node-distance table, for example:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 30 20
+ * 1: 20 10 20 30
+ * 2: 30 20 10 20
+ * 3: 20 30 20 10
+ *
+ * which represents a 4 node ring topology like:
+ *
+ * 0 ----- 1
+ * | |
+ * | |
+ * | |
+ * 3 ----- 2
+ *
+ * We want to construct domains and groups to represent this. The way we go
+ * about doing this is to build the domains on 'hops'. For each NUMA level we
+ * construct the mask of all nodes reachable in @level hops.
+ *
+ * For the above NUMA topology that gives 3 levels:
+ *
+ * NUMA-2 0-3 0-3 0-3 0-3
+ * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
+ *
+ * NUMA-1 0-1,3 0-2 1-3 0,2-3
+ * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
+ *
+ * NUMA-0 0 1 2 3
+ *
+ *
+ * As can be seen; things don't nicely line up as with the regular topology.
+ * When we iterate a domain in child domain chunks some nodes can be
+ * represented multiple times -- hence the "overlap" naming for this part of
+ * the topology.
+ *
+ * In order to minimize this overlap, we only build enough groups to cover the
+ * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
+ *
+ * Because:
+ *
+ * - the first group of each domain is its child domain; this
+ * gets us the first 0-1,3
+ * - the only uncovered node is 2, who's child domain is 1-3.
+ *
+ * However, because of the overlap, computing a unique CPU for each group is
+ * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
+ * groups include the CPUs of Node-0, while those CPUs would not in fact ever
+ * end up at those groups (they would end up in group: 0-1,3).
+ *
+ * To correct this we have to introduce the group balance mask. This mask
+ * will contain those CPUs in the group that can reach this group given the
+ * (child) domain tree.
+ *
+ * With this we can once again compute balance_cpu and sched_group_capacity
+ * relations.
+ *
+ * XXX include words on how balance_cpu is unique and therefore can be
+ * used for sched_group_capacity links.
+ *
+ *
+ * Another 'interesting' topology is:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 20 30
+ * 1: 20 10 20 20
+ * 2: 20 20 10 20
+ * 3: 30 20 20 10
+ *
+ * Which looks a little like:
+ *
+ * 0 ----- 1
+ * | / |
+ * | / |
+ * | / |
+ * 2 ----- 3
+ *
+ * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
+ * are not.
+ *
+ * This leads to a few particularly weird cases where the sched_domain's are
+ * not of the same number for each cpu. Consider:
+ *
+ * NUMA-2 0-3 0-3
+ * groups: {0-2},{1-3} {1-3},{0-2}
+ *
+ * NUMA-1 0-2 0-3 0-3 1-3
+ *
+ * NUMA-0 0 1 2 3
+ *
+ */
+
+
+/*
+ * Build the balance mask; it contains only those CPUs that can arrive at this
+ * group and should be considered to continue balancing.
+ *
+ * We do this during the group creation pass, therefore the group information
+ * isn't complete yet, however since each group represents a (child) domain we
+ * can fully construct this using the sched_domain bits (which are already
+ * complete).
+ */
+static void
+build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
+{
+ const struct cpumask *sg_span = sched_group_span(sg);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
- for_each_cpu(i, span) {
+ cpumask_clear(mask);
+
+ for_each_cpu(i, sg_span) {
sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+
+ /*
+ * Can happen in the asymmetric case, where these siblings are
+ * unused. The mask will not be empty because those CPUs that
+ * do have the top domain _should_ span the domain.
+ */
+ if (!sibling->child)
continue;
- cpumask_set_cpu(i, sched_group_mask(sg));
+ /* If we would not end up here, we can't continue from here */
+ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
+ continue;
+
+ cpumask_set_cpu(i, mask);
}
+
+ /* We must not have empty masks here */
+ WARN_ON_ONCE(cpumask_empty(mask));
}
/*
- * Return the canonical balance CPU for this group, this is the first CPU
- * of this group that's also in the iteration mask.
+ * XXX: This creates per-node group entries; since the load-balancer will
+ * immediately access remote memory to construct this group's load-balance
+ * statistics having the groups node local is of dubious benefit.
*/
-int group_balance_cpu(struct sched_group *sg)
+static struct sched_group *
+build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
{
- return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+ struct sched_group *sg;
+ struct cpumask *sg_span;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(cpu));
+
+ if (!sg)
+ return NULL;
+
+ sg_span = sched_group_span(sg);
+ if (sd->child)
+ cpumask_copy(sg_span, sched_domain_span(sd->child));
+ else
+ cpumask_copy(sg_span, sched_domain_span(sd));
+
+ return sg;
+}
+
+static void init_overlap_sched_group(struct sched_domain *sd,
+ struct sched_group *sg)
+{
+ struct cpumask *mask = sched_domains_tmpmask2;
+ struct sd_data *sdd = sd->private;
+ struct cpumask *sg_span;
+ int cpu;
+
+ build_balance_mask(sd, sg, mask);
+ cpu = cpumask_first_and(sched_group_span(sg), mask);
+
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
+ cpumask_copy(group_balance_mask(sg), mask);
+ else
+ WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
+
+ /*
+ * Initialize sgc->capacity such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg_span = sched_group_span(sg);
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
}
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
- struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ struct sched_group *first = NULL, *last = NULL, *sg;
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered = sched_domains_tmpmask;
struct sd_data *sdd = sd->private;
@@ -525,7 +712,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_clear(covered);
- for_each_cpu(i, span) {
+ for_each_cpu_wrap(i, span, cpu) {
struct cpumask *sg_span;
if (cpumask_test_cpu(i, covered))
@@ -533,44 +720,27 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
sibling = *per_cpu_ptr(sdd->sd, i);
- /* See the comment near build_group_mask(). */
+ /*
+ * Asymmetric node setups can result in situations where the
+ * domain tree is of unequal depth, make sure to skip domains
+ * that already cover the entire range.
+ *
+ * In that case build_sched_domains() will have terminated the
+ * iteration early and our sibling sd spans will be empty.
+ * Domains should always include the CPU they're built on, so
+ * check that.
+ */
if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(cpu));
-
+ sg = build_group_from_child_sched_domain(sibling, cpu);
if (!sg)
goto fail;
- sg_span = sched_group_cpus(sg);
- if (sibling->child)
- cpumask_copy(sg_span, sched_domain_span(sibling->child));
- else
- cpumask_set_cpu(i, sg_span);
-
+ sg_span = sched_group_span(sg);
cpumask_or(covered, covered, sg_span);
- sg->sgc = *per_cpu_ptr(sdd->sgc, i);
- if (atomic_inc_return(&sg->sgc->ref) == 1)
- build_group_mask(sd, sg);
-
- /*
- * Initialize sgc->capacity such that even if we mess up the
- * domains and no possible iteration will get us here, we won't
- * die on a /0 trap.
- */
- sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
-
- /*
- * Make sure the first group of this domain contains the
- * canonical balance CPU. Otherwise the sched_domain iteration
- * breaks. See update_sg_lb_stats().
- */
- if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
- group_balance_cpu(sg) == cpu)
- groups = sg;
+ init_overlap_sched_group(sd, sg);
if (!first)
first = sg;
@@ -579,7 +749,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
last = sg;
last->next = first;
}
- sd->groups = groups;
+ sd->groups = first;
return 0;
@@ -589,23 +759,106 @@ fail:
return -ENOMEM;
}
-static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+
+/*
+ * Package topology (also see the load-balance blurb in fair.c)
+ *
+ * The scheduler builds a tree structure to represent a number of important
+ * topology features. By default (default_topology[]) these include:
+ *
+ * - Simultaneous multithreading (SMT)
+ * - Multi-Core Cache (MC)
+ * - Package (DIE)
+ *
+ * Where the last one more or less denotes everything up to a NUMA node.
+ *
+ * The tree consists of 3 primary data structures:
+ *
+ * sched_domain -> sched_group -> sched_group_capacity
+ * ^ ^ ^ ^
+ * `-' `-'
+ *
+ * The sched_domains are per-cpu and have a two way link (parent & child) and
+ * denote the ever growing mask of CPUs belonging to that level of topology.
+ *
+ * Each sched_domain has a circular (double) linked list of sched_group's, each
+ * denoting the domains of the level below (or individual CPUs in case of the
+ * first domain level). The sched_group linked by a sched_domain includes the
+ * CPU of that sched_domain [*].
+ *
+ * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * DIE [ ]
+ * MC [ ] [ ]
+ * SMT [ ] [ ] [ ] [ ]
+ *
+ * - or -
+ *
+ * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
+ * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * One way to think about it is: sched_domain moves you up and down among these
+ * topology levels, while sched_group moves you sideways through it, at child
+ * domain granularity.
+ *
+ * sched_group_capacity ensures each unique sched_group has shared storage.
+ *
+ * There are two related construction problems, both require a CPU that
+ * uniquely identify each group (for a given domain):
+ *
+ * - The first is the balance_cpu (see should_we_balance() and the
+ * load-balance blub in fair.c); for each group we only want 1 CPU to
+ * continue balancing at a higher domain.
+ *
+ * - The second is the sched_group_capacity; we want all identical groups
+ * to share a single sched_group_capacity.
+ *
+ * Since these topologies are exclusive by construction. That is, its
+ * impossible for an SMT thread to belong to multiple cores, and cores to
+ * be part of multiple caches. There is a very clear and unique location
+ * for each CPU in the hierarchy.
+ *
+ * Therefore computing a unique CPU for each group is trivial (the iteration
+ * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
+ * group), we can simply pick the first CPU in each group.
+ *
+ *
+ * [*] in other words, the first group of each domain is its child domain.
+ */
+
+static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
struct sched_domain *child = sd->child;
+ struct sched_group *sg;
if (child)
cpu = cpumask_first(sched_domain_span(child));
- if (sg) {
- *sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ sg = *per_cpu_ptr(sdd->sg, cpu);
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+
+ /* For claim_allocations: */
+ atomic_inc(&sg->ref);
+ atomic_inc(&sg->sgc->ref);
- /* For claim_allocations: */
- atomic_set(&(*sg)->sgc->ref, 1);
+ if (child) {
+ cpumask_copy(sched_group_span(sg), sched_domain_span(child));
+ cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+ } else {
+ cpumask_set_cpu(cpu, sched_group_span(sg));
+ cpumask_set_cpu(cpu, group_balance_mask(sg));
}
- return cpu;
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+
+ return sg;
}
/*
@@ -624,34 +877,20 @@ build_sched_groups(struct sched_domain *sd, int cpu)
struct cpumask *covered;
int i;
- get_group(cpu, sdd, &sd->groups);
- atomic_inc(&sd->groups->ref);
-
- if (cpu != cpumask_first(span))
- return 0;
-
lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
cpumask_clear(covered);
- for_each_cpu(i, span) {
+ for_each_cpu_wrap(i, span, cpu) {
struct sched_group *sg;
- int group, j;
if (cpumask_test_cpu(i, covered))
continue;
- group = get_group(i, sdd, &sg);
- cpumask_setall(sched_group_mask(sg));
+ sg = get_group(i, sdd);
- for_each_cpu(j, span) {
- if (get_group(j, sdd, NULL) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
+ cpumask_or(covered, covered, sched_group_span(sg));
if (!first)
first = sg;
@@ -660,6 +899,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
last = sg;
}
last->next = first;
+ sd->groups = first;
return 0;
}
@@ -683,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
do {
int cpu, max_cpu = -1;
- sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ sg->group_weight = cpumask_weight(sched_group_span(sg));
if (!(sd->flags & SD_ASYM_PACKING))
goto next;
- for_each_cpu(cpu, sched_group_cpus(sg)) {
+ for_each_cpu(cpu, sched_group_span(sg)) {
if (max_cpu < 0)
max_cpu = cpu;
else if (sched_asym_prefer(cpu, max_cpu))
@@ -1308,6 +1548,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sgc)
return -ENOMEM;
+#ifdef CONFIG_SCHED_DEBUG
+ sgc->id = j;
+#endif
+
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
@@ -1407,7 +1651,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ if (tl->flags & SDTL_OVERLAP)
sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
@@ -1478,7 +1722,7 @@ static struct sched_domain_attr *dattr_cur;
* cpumask) fails, then fallback to a single sched domain,
* as determined by the single cpumask fallback_doms.
*/
-cpumask_var_t fallback_doms;
+static cpumask_var_t fallback_doms;
/*
* arch_update_cpu_topology lets virtualized architectures update the
@@ -1520,10 +1764,14 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* For now this just excludes isolated CPUs, but could be used to
* exclude other special cases in the future.
*/
-int init_sched_domains(const struct cpumask *cpu_map)
+int sched_init_domains(const struct cpumask *cpu_map)
{
int err;
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
+ zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
+ zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
arch_update_cpu_topology();
ndoms_cur = 1;
doms_cur = alloc_sched_domains(ndoms_cur);
diff --git a/kernel/smp.c b/kernel/smp.c
index a817769b53c0..3061483cb3ad 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -30,6 +30,7 @@ enum {
struct call_function_data {
struct call_single_data __percpu *csd;
cpumask_var_t cpumask;
+ cpumask_var_t cpumask_ipi;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu)
if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
return -ENOMEM;
+ if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+ cpu_to_node(cpu))) {
+ free_cpumask_var(cfd->cpumask);
+ return -ENOMEM;
+ }
cfd->csd = alloc_percpu(struct call_single_data);
if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
+ free_cpumask_var(cfd->cpumask_ipi);
return -ENOMEM;
}
@@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
free_cpumask_var(cfd->cpumask);
+ free_cpumask_var(cfd->cpumask_ipi);
free_percpu(cfd->csd);
return 0;
}
@@ -428,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask,
cfd = this_cpu_ptr(&cfd_data);
cpumask_and(cfd->cpumask, mask, cpu_online_mask);
- cpumask_clear_cpu(this_cpu, cfd->cpumask);
+ __cpumask_clear_cpu(this_cpu, cfd->cpumask);
/* Some callers race with other cpus changing the passed mask */
if (unlikely(!cpumask_weight(cfd->cpumask)))
return;
+ cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) {
struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
@@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask,
csd->flags |= CSD_FLAG_SYNCHRONOUS;
csd->func = func;
csd->info = info;
- llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+ if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
+ __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
}
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(cfd->cpumask);
+ arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
if (wait) {
for_each_cpu(cpu, cfd->cpumask) {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 93621ae718d3..03918a19cf2d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data)
continue;
}
+ if (cs == curr_clocksource && cs->tick_stable)
+ cs->tick_stable(cs);
+
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 64c97fc130c4..2de9c553682f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,6 +150,12 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
touch_softlockup_watchdog_sched();
if (is_idle_task(current))
ts->idle_jiffies++;
+ /*
+ * In case the current tick fired too early past its expected
+ * expiration, make sure we don't bypass the next clock reprogramming
+ * to the same deadline.
+ */
+ ts->next_tick = 0;
}
#endif
update_process_times(user_mode(regs));
@@ -554,7 +560,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
update_ts_time_stats(smp_processor_id(), ts, now, NULL);
ts->idle_active = 0;
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
}
static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -660,6 +666,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
else
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+
+ /*
+ * Reset to make sure next tick stop doesn't get fooled by past
+ * cached clock deadline.
+ */
+ ts->next_tick = 0;
}
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
@@ -771,8 +783,16 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
tick = expires;
/* Skip reprogram of event if its not changed */
- if (ts->tick_stopped && (expires == dev->next_event))
- goto out;
+ if (ts->tick_stopped && (expires == ts->next_tick)) {
+ /* Sanity check: make sure clockevent is actually programmed */
+ if (likely(dev->next_event <= ts->next_tick))
+ goto out;
+
+ WARN_ON_ONCE(1);
+ printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
+ basemono, ts->next_tick, dev->next_event,
+ hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
+ }
/*
* nohz_stop_sched_tick can be called several times before
@@ -791,6 +811,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
+ ts->next_tick = tick;
+
/*
* If the expiration time == KTIME_MAX, then we simply stop
* the tick timer.
@@ -806,7 +828,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
else
tick_program_event(tick, 1);
out:
- /* Update the estimated sleep length */
+ /*
+ * Update the estimated sleep length until the next timer
+ * (not only the tick).
+ */
ts->sleep_length = ktime_sub(dev->next_event, now);
return tick;
}
@@ -864,6 +889,11 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
if (unlikely(!cpu_online(cpu))) {
if (cpu == tick_do_timer_cpu)
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ /*
+ * Make sure the CPU doesn't get fooled by obsolete tick
+ * deadline if it comes back online later.
+ */
+ ts->next_tick = 0;
return false;
}
@@ -1172,6 +1202,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
*/
if (regs)
tick_sched_handle(ts, regs);
+ else
+ ts->next_tick = 0;
/* No need to reprogram if we are in idle or full dynticks mode */
if (unlikely(ts->tick_stopped))
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index bf38226e5c17..075444e3d48e 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -27,6 +27,7 @@ enum tick_nohz_mode {
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
+ * @next_tick: Next tick to be fired when in dynticks mode.
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_calls: Total number of idle calls
@@ -44,6 +45,7 @@ struct tick_sched {
unsigned long check_clocks;
enum tick_nohz_mode nohz_mode;
ktime_t last_tick;
+ ktime_t next_tick;
int inidle;
int tick_stopped;
unsigned long idle_jiffies;
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 81dedaab36cc..4731a0895760 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
}
EXPORT_SYMBOL(cpumask_any_but);
+/**
+ * cpumask_next_wrap - helper to implement for_each_cpu_wrap
+ * @n: the cpu prior to the place to search
+ * @mask: the cpumask pointer
+ * @start: the start point of the iteration
+ * @wrap: assume @n crossing @start terminates the iteration
+ *
+ * Returns >= nr_cpu_ids on completion
+ *
+ * Note: the @wrap argument is required for the start condition when
+ * we cannot assume @start is set in @mask.
+ */
+int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+{
+ int next;
+
+again:
+ next = cpumask_next(n, mask);
+
+ if (wrap && n < start && next >= start) {
+ return nr_cpumask_bits;
+
+ } else if (next >= nr_cpumask_bits) {
+ wrap = true;
+ n = -1;
+ goto again;
+ }
+
+ return next;
+}
+EXPORT_SYMBOL(cpumask_next_wrap);
+
/* These are not inline because of header tangles. */
#ifdef CONFIG_CPUMASK_OFFSTACK
/**
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 690d75b132fa..2fb007be0212 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -28,7 +28,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
/*
* It is valid to assume CPU-locality during early bootup:
*/
- if (system_state != SYSTEM_RUNNING)
+ if (system_state < SYSTEM_SCHEDULING)
goto out;
/*
diff --git a/mm/rmap.c b/mm/rmap.c
index d405f0e0ee96..130c238fe384 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
void try_to_unmap_flush(void)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- int cpu;
if (!tlb_ubc->flush_required)
return;
- cpu = get_cpu();
-
- if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- local_flush_tlb();
- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
- }
-
- if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
- cpumask_clear(&tlb_ubc->cpumask);
+ arch_tlbbatch_flush(&tlb_ubc->arch);
tlb_ubc->flush_required = false;
tlb_ubc->writable = false;
- put_cpu();
}
/* Flush iff there are potentially writable TLB entries that can race with IO */
@@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+ arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ad39bbc79e6..c3c1c6ac62da 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3652,7 +3652,7 @@ int kswapd_run(int nid)
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
- BUG_ON(system_state == SYSTEM_BOOTING);
+ BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;