diff options
Diffstat (limited to 'kernel')
34 files changed, 1722 insertions, 368 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 8c201f414226..b2301bdc9773 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1851,7 +1851,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_clear_dummy(ab->ctx); audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); diff --git a/kernel/audit.h b/kernel/audit.h index f0233dc40b17..ddc22878433d 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -290,13 +290,6 @@ extern int audit_signal_info_syscall(struct task_struct *t); extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); - -static inline void audit_clear_dummy(struct audit_context *ctx) -{ - if (ctx) - ctx->dummy = 0; -} - #else /* CONFIG_AUDITSYSCALL */ #define auditsc_get_stamp(c, t, s) 0 #define audit_put_watch(w) {} @@ -330,7 +323,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t) } #define audit_filter_inodes(t, c) AUDIT_DISABLED -#define audit_clear_dummy(c) {} #endif /* CONFIG_AUDITSYSCALL */ extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 468a23390457..fd840c40abf7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1417,6 +1417,9 @@ static void audit_log_proctitle(void) struct audit_context *context = audit_context(); struct audit_buffer *ab; + if (!context || context->dummy) + return; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 58c9af1d4808..9a1a98dd9e97 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3746,7 +3746,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; t = btf_type_skip_modifiers(btf, t->type, NULL); - if (!btf_type_is_int(t)) { + if 
(!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -3768,7 +3768,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_type_is_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 78cf061f8179..310241ca7991 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -19,18 +19,21 @@ struct bpf_netns_link { * with netns_bpf_mutex held. */ struct net *net; + struct list_head node; /* node in list of links attached to net */ }; /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); /* Must be called with netns_bpf_mutex held. */ -static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) +static void netns_bpf_run_array_detach(struct net *net, + enum netns_bpf_attach_type type) { - struct bpf_netns_link *net_link = - container_of(link, struct bpf_netns_link, link); + struct bpf_prog_array *run_array; - net_link->net = NULL; + run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL, + lockdep_is_held(&netns_bpf_mutex)); + bpf_prog_array_free(run_array); } static void bpf_netns_link_release(struct bpf_link *link) @@ -40,22 +43,18 @@ static void bpf_netns_link_release(struct bpf_link *link) enum netns_bpf_attach_type type = net_link->netns_type; struct net *net; - /* Link auto-detached by dying netns. */ - if (!net_link->net) - return; - mutex_lock(&netns_bpf_mutex); - /* Recheck after potential sleep. We can race with cleanup_net - * here, but if we see a non-NULL struct net pointer pre_exit - * has not happened yet and will block on netns_bpf_mutex. 
+ /* We can race with cleanup_net, but if we see a non-NULL + * struct net pointer, pre_exit has not run yet and wait for + * netns_bpf_mutex. */ net = net_link->net; if (!net) goto out_unlock; - net->bpf.links[type] = NULL; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); + netns_bpf_run_array_detach(net, type); + list_del(&net_link->node); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -76,6 +75,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; + struct bpf_prog_array *run_array; struct net *net; int ret = 0; @@ -93,8 +93,11 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, goto out_unlock; } + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + WRITE_ONCE(run_array->items[0].prog, new_prog); + old_prog = xchg(&link->prog, new_prog); - rcu_assign_pointer(net->bpf.progs[type], new_prog); bpf_prog_put(old_prog); out_unlock: @@ -142,14 +145,38 @@ static const struct bpf_link_ops bpf_netns_link_ops = { .show_fdinfo = bpf_netns_link_show_fdinfo, }; +/* Must be called with netns_bpf_mutex held. 
*/ +static int __netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct net *net, + enum netns_bpf_attach_type type) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + struct bpf_prog_array *run_array; + u32 prog_cnt = 0, flags = 0; + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) + prog_cnt = bpf_prog_array_length(run_array); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + return bpf_prog_array_copy_to_user(run_array, prog_ids, + attr->query.prog_cnt); +} + int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_id, prog_cnt = 0, flags = 0; enum netns_bpf_attach_type type; - struct bpf_prog *attached; struct net *net; + int ret; if (attr->query.query_flags) return -EINVAL; @@ -162,36 +189,25 @@ int netns_bpf_prog_query(const union bpf_attr *attr, if (IS_ERR(net)) return PTR_ERR(net); - rcu_read_lock(); - attached = rcu_dereference(net->bpf.progs[type]); - if (attached) { - prog_cnt = 1; - prog_id = attached->aux->id; - } - rcu_read_unlock(); + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_query(attr, uattr, net, type); + mutex_unlock(&netns_bpf_mutex); put_net(net); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) - return -EFAULT; - - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) - return 0; - - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) - return -EFAULT; - - return 0; + return ret; } int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_prog_array 
*run_array; enum netns_bpf_attach_type type; + struct bpf_prog *attached; struct net *net; int ret; + if (attr->target_fd || attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; @@ -200,19 +216,47 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) mutex_lock(&netns_bpf_mutex); /* Attaching prog directly is not compatible with links */ - if (net->bpf.links[type]) { + if (!list_empty(&net->bpf.links[type])) { ret = -EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: - ret = flow_dissector_bpf_prog_attach(net, prog); + ret = flow_dissector_bpf_prog_attach_check(net, prog); break; default: ret = -EINVAL; break; } + if (ret) + goto out_unlock; + + attached = net->bpf.progs[type]; + if (attached == prog) { + /* The same program cannot be attached twice */ + ret = -EINVAL; + goto out_unlock; + } + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) { + WRITE_ONCE(run_array->items[0].prog, prog); + } else { + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!run_array) { + ret = -ENOMEM; + goto out_unlock; + } + run_array->items[0].prog = prog; + rcu_assign_pointer(net->bpf.run_array[type], run_array); + } + + net->bpf.progs[type] = prog; + if (attached) + bpf_prog_put(attached); + out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -221,63 +265,74 @@ out_unlock: /* Must be called with netns_bpf_mutex held. 
*/ static int __netns_bpf_prog_detach(struct net *net, - enum netns_bpf_attach_type type) + enum netns_bpf_attach_type type, + struct bpf_prog *old) { struct bpf_prog *attached; /* Progs attached via links cannot be detached */ - if (net->bpf.links[type]) + if (!list_empty(&net->bpf.links[type])) return -EINVAL; - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (!attached) + attached = net->bpf.progs[type]; + if (!attached || attached != old) return -ENOENT; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); + netns_bpf_run_array_detach(net, type); + net->bpf.progs[type] = NULL; bpf_prog_put(attached); return 0; } -int netns_bpf_prog_detach(const union bpf_attr *attr) +int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { enum netns_bpf_attach_type type; + struct bpf_prog *prog; int ret; + if (attr->target_fd) + return -EINVAL; + type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + mutex_lock(&netns_bpf_mutex); - ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog); mutex_unlock(&netns_bpf_mutex); + bpf_prog_put(prog); + return ret; } static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { - struct bpf_prog *prog; + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + struct bpf_prog_array *run_array; int err; mutex_lock(&netns_bpf_mutex); /* Allow attaching only one prog or link for now */ - if (net->bpf.links[type]) { + if (!list_empty(&net->bpf.links[type])) { err = -E2BIG; goto out_unlock; } /* Links are not compatible with attaching prog directly */ - prog = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (prog) { + if (net->bpf.progs[type]) { err = 
-EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: - err = flow_dissector_bpf_prog_attach(net, link->prog); + err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; default: err = -EINVAL; @@ -286,7 +341,15 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, if (err) goto out_unlock; - net->bpf.links[type] = link; + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!run_array) { + err = -ENOMEM; + goto out_unlock; + } + run_array->items[0].prog = link->prog; + rcu_assign_pointer(net->bpf.run_array[type], run_array); + + list_add_tail(&net_link->node, &net->bpf.links[type]); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -345,23 +408,34 @@ out_put_net: return err; } +static int __net_init netns_bpf_pernet_init(struct net *net) +{ + int type; + + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + INIT_LIST_HEAD(&net->bpf.links[type]); + + return 0; +} + static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; - struct bpf_link *link; + struct bpf_netns_link *net_link; mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { - link = net->bpf.links[type]; - if (link) - bpf_netns_link_auto_detach(link); - else - __netns_bpf_prog_detach(net, type); + netns_bpf_run_array_detach(net, type); + list_for_each_entry(net_link, &net->bpf.links[type], node) + net_link->net = NULL; /* auto-detach link */ + if (net->bpf.progs[type]) + bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .init = netns_bpf_pernet_init, .pre_exit = netns_bpf_pernet_pre_exit, }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 21cde24386db..cae9d505e04a 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -20,11 +20,14 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map) /* The caller must 
hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { - struct sock __rcu **socks; + uintptr_t sk_user_data; write_lock_bh(&sk->sk_callback_lock); - socks = sk->sk_user_data; - if (socks) { + sk_user_data = (uintptr_t)sk->sk_user_data; + if (sk_user_data & SK_USER_DATA_BPF) { + struct sock __rcu **socks; + + socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of @@ -252,6 +255,7 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, struct sock *free_osk = NULL, *osk, *nsk; struct sock_reuseport *reuse; u32 index = *(u32 *)key; + uintptr_t sk_user_data; struct socket *socket; int err, fd; @@ -305,7 +309,9 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, if (err) goto put_file_unlock; - WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | + SK_USER_DATA_BPF; + WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; err = 0; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 180414bb0d3e..0af88bbc1c15 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -132,15 +132,6 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) { struct bpf_ringbuf *rb; - if (!data_sz || !PAGE_ALIGNED(data_sz)) - return ERR_PTR(-EINVAL); - -#ifdef CONFIG_64BIT - /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ - if (data_sz > RINGBUF_MAX_DATA_SZ) - return ERR_PTR(-E2BIG); -#endif - rb = bpf_ringbuf_area_alloc(data_sz, numa_node); if (!rb) return ERR_PTR(-ENOMEM); @@ -166,9 +157,16 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) return ERR_PTR(-EINVAL); if (attr->key_size || attr->value_size || - attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + !is_power_of_2(attr->max_entries) || + !PAGE_ALIGNED(attr->max_entries)) 
return ERR_PTR(-EINVAL); +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (attr->max_entries > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); if (!rb_map) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..0fd80ac81f70 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2121,7 +2121,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) !bpf_capable()) return -EPERM; - if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; @@ -2893,13 +2893,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: - return sock_map_get_from_fd(attr, NULL); + return sock_map_prog_detach(attr, ptype); case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - return netns_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr, ptype); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -3139,7 +3137,8 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, return NULL; } -static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, + const struct cred *f_cred) { const struct bpf_map *map; struct bpf_insn *insns; @@ -3165,7 +3164,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) code == (BPF_JMP | BPF_CALL_ARGS)) { if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; - if (!bpf_dump_raw_ok()) + if (!bpf_dump_raw_ok(f_cred)) insns[i].imm = 0; 
continue; } @@ -3221,7 +3220,8 @@ static int set_info_rec_size(struct bpf_prog_info *info) return 0; } -static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, +static int bpf_prog_get_info_by_fd(struct file *file, + struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3290,11 +3290,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok()) { + if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { info.xlated_prog_insns = 0; goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog); + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); if (!insns_sanitized) return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); @@ -3328,7 +3328,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); @@ -3363,7 +3363,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; u32 i; @@ -3394,7 +3394,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? 
: 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; @@ -3451,7 +3451,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, else info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { __u64 __user *user_linfo; u32 i; @@ -3497,7 +3497,8 @@ done: return 0; } -static int bpf_map_get_info_by_fd(struct bpf_map *map, +static int bpf_map_get_info_by_fd(struct file *file, + struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3540,7 +3541,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return 0; } -static int bpf_btf_get_info_by_fd(struct btf *btf, +static int bpf_btf_get_info_by_fd(struct file *file, + struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3555,7 +3557,8 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } -static int bpf_link_get_info_by_fd(struct bpf_link *link, +static int bpf_link_get_info_by_fd(struct file *file, + struct bpf_link *link, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3608,15 +3611,15 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return -EBADFD; if (f.file->f_op == &bpf_prog_fops) - err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_map_fops) - err = bpf_map_get_info_by_fd(f.file->private_data, attr, + err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_link_fops) - err = bpf_link_get_info_by_fd(f.file->private_data, + err = bpf_link_get_info_by_fd(f.file, f.file->private_data, attr, uattr); 
else err = -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 34cde841ab68..94cead5a43e5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -399,8 +399,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || - type == PTR_TO_SOCK_COMMON || - type == PTR_TO_BTF_ID; + type == PTR_TO_SOCK_COMMON; } static bool reg_type_may_be_null(enum bpf_reg_type type) @@ -9801,7 +9800,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) int i, j, subprog_start, subprog_end = 0, len, subprog; struct bpf_insn *insn; void *old_bpf_func; - int err; + int err, num_exentries; if (env->subprog_cnt <= 1) return 0; @@ -9876,6 +9875,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; + num_exentries = 0; + insn = func[i]->insnsi; + for (j = 0; j < func[i]->len; j++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + BPF_MODE(insn->code) == BPF_PROBE_MEM) + num_exentries++; + } + func[i]->aux->num_exentries = num_exentries; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1ea181a58465..dd247747ec14 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6439,18 +6439,8 @@ void cgroup_sk_alloc_disable(void) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) - return; - - /* Socket clone path */ - if (skcd->val) { - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). 
- */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); + if (cgroup_sk_alloc_disabled) { + skcd->no_refcnt = 1; return; } @@ -6475,10 +6465,27 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) rcu_read_unlock(); } +void cgroup_sk_clone(struct sock_cgroup_data *skcd) +{ + if (skcd->val) { + if (skcd->no_refcnt) + return; + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(sock_cgroup_ptr(skcd)); + cgroup_bpf_get(sock_cgroup_ptr(skcd)); + } +} + void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); + if (skcd->no_refcnt) + return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 61774aec46b4..a790026e42d0 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -792,6 +792,19 @@ static void gdb_cmd_query(struct kgdb_state *ks) } break; #endif +#ifdef CONFIG_HAVE_ARCH_KGDB_QXFER_PKT + case 'S': + if (!strncmp(remcom_in_buffer, "qSupported:", 11)) + strcpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature); + break; + case 'X': + if (!strncmp(remcom_in_buffer, "qXfer:", 6)) + kgdb_arch_handle_qxfer_pkt(remcom_in_buffer, + remcom_out_buffer); + break; +#endif + default: + break; } } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 93f578a8e613..67f060b86a73 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -70,7 +70,7 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, return 0; } -static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); @@ -539,3 +539,9 @@ size_t dma_direct_max_mapping_size(struct device *dev) return swiotlb_max_mapping_size(dev); return SIZE_MAX; 
} + +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return !dev_is_dma_coherent(dev) || + is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 98e3d873792e..a8c18c9a796f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -397,6 +397,16 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_is_direct(ops)) + return dma_direct_need_sync(dev, dma_addr); + return ops->sync_single_for_cpu || ops->sync_single_for_device; +} +EXPORT_SYMBOL_GPL(dma_need_sync); + unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 8cfa01243ed2..6bc74a2d5127 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -6,7 +6,6 @@ #include <linux/debugfs.h> #include <linux/dma-direct.h> #include <linux/dma-noncoherent.h> -#include <linux/dma-contiguous.h> #include <linux/init.h> #include <linux/genalloc.h> #include <linux/set_memory.h> @@ -69,12 +68,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, do { pool_size = 1 << (PAGE_SHIFT + order); - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, 1 << order, - order, false); - else - page = alloc_pages(gfp, order); + page = alloc_pages(gfp, order); } while (!page && order-- > 0); if (!page) goto out; @@ -118,8 +112,7 @@ remove_mapping: dma_common_free_remap(addr, pool_size); #endif free_page: __maybe_unused - if (!dma_release_from_contiguous(NULL, page, 1 << order)) - __free_pages(page, order); + __free_pages(page, order); out: return ret; } @@ -203,7 +196,7 @@ static int __init dma_atomic_pool_init(void) } postcore_initcall(dma_atomic_pool_init); -static inline struct gen_pool *dev_to_pool(struct device *dev) +static inline 
struct gen_pool *dma_guess_pool_from_device(struct device *dev) { u64 phys_mask; gfp_t gfp; @@ -217,47 +210,79 @@ static inline struct gen_pool *dev_to_pool(struct device *dev) return atomic_pool_kernel; } -static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size) +static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool) +{ + if (bad_pool == atomic_pool_kernel) + return atomic_pool_dma32 ? : atomic_pool_dma; + + if (bad_pool == atomic_pool_dma32) + return atomic_pool_dma; + + return NULL; +} + +static inline struct gen_pool *dma_guess_pool(struct device *dev, + struct gen_pool *bad_pool) { - struct gen_pool *pool = dev_to_pool(dev); + if (bad_pool) + return dma_get_safer_pool(bad_pool); - if (unlikely(!pool)) - return false; - return gen_pool_has_addr(pool, (unsigned long)start, size); + return dma_guess_pool_from_device(dev); } void *dma_alloc_from_pool(struct device *dev, size_t size, struct page **ret_page, gfp_t flags) { - struct gen_pool *pool = dev_to_pool(dev); - unsigned long val; + struct gen_pool *pool = NULL; + unsigned long val = 0; void *ptr = NULL; - - if (!pool) { - WARN(1, "%pGg atomic pool not initialised!\n", &flags); - return NULL; + phys_addr_t phys; + + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) { + WARN(1, "Failed to get suitable pool for %s\n", + dev_name(dev)); + break; + } + + val = gen_pool_alloc(pool, size); + if (!val) + continue; + + phys = gen_pool_virt_to_phys(pool, val); + if (dma_coherent_ok(dev, phys, size)) + break; + + gen_pool_free(pool, val, size); + val = 0; } - val = gen_pool_alloc(pool, size); - if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(pool, val); + if (val) { *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; memset(ptr, 0, size); + + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); } - if (gen_pool_avail(pool) < atomic_pool_size) - schedule_work(&atomic_pool_work); return ptr; } bool dma_free_from_pool(struct 
device *dev, void *start, size_t size) { - struct gen_pool *pool = dev_to_pool(dev); + struct gen_pool *pool = NULL; + + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) + return false; - if (!dma_in_atomic_pool(dev, start, size)) - return false; - gen_pool_free(pool, (unsigned long)start, size); - return true; + if (gen_pool_has_addr(pool, (unsigned long)start, size)) { + gen_pool_free(pool, (unsigned long)start, size); + return true; + } + } } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bb0862873dba..5f8b0c52fd2e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2199,7 +2199,7 @@ static void handle_swbp(struct pt_regs *regs) if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ - send_sig(SIGTRAP, current, 0); + force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't diff --git a/kernel/fork.c b/kernel/fork.c index efc5493203ae..0cc3d9cd6cc2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -359,7 +359,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new) { - *new = *orig; + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + /* + * orig->shared.rb may be modified concurrently, but the clone + * will be reinitialized. 
+ */ + *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; } @@ -1954,8 +1960,8 @@ static __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); + lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; @@ -2035,19 +2041,11 @@ static __latent_entropy struct task_struct *copy_process( seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS - p->irq_events = 0; - p->hardirqs_enabled = 0; - p->hardirq_enable_ip = 0; - p->hardirq_enable_event = 0; - p->hardirq_disable_ip = _THIS_IP_; - p->hardirq_disable_event = 0; - p->softirqs_enabled = 1; - p->softirq_enable_ip = _THIS_IP_; - p->softirq_enable_event = 0; - p->softirq_disable_ip = 0; - p->softirq_disable_event = 0; - p->hardirq_context = 0; - p->softirq_context = 0; + memset(&p->irqtrace, 0, sizeof(p->irqtrace)); + p->irqtrace.hardirq_disable_ip = _THIS_IP_; + p->irqtrace.softirq_enable_ip = _THIS_IP_; + p->softirqs_enabled = 1; + p->softirq_context = 0; #endif p->pagefault_disabled = 0; diff --git a/kernel/futex.c b/kernel/futex.c index e646661f6282..4616d4ad609d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -32,30 +32,13 @@ * "But they come in a choice of three flavours!" 
*/ #include <linux/compat.h> -#include <linux/slab.h> -#include <linux/poll.h> -#include <linux/fs.h> -#include <linux/file.h> #include <linux/jhash.h> -#include <linux/init.h> -#include <linux/futex.h> -#include <linux/mount.h> #include <linux/pagemap.h> #include <linux/syscalls.h> -#include <linux/signal.h> -#include <linux/export.h> -#include <linux/magic.h> -#include <linux/pid.h> -#include <linux/nsproxy.h> -#include <linux/ptrace.h> -#include <linux/sched/rt.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/mm.h> #include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/memblock.h> #include <linux/fault-inject.h> -#include <linux/refcount.h> #include <asm/futex.h> @@ -476,7 +459,7 @@ static u64 get_inode_sequence_number(struct inode *inode) /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) @@ -500,8 +483,8 @@ static u64 get_inode_sequence_number(struct inode *inode) * * lock_page() might sleep, the caller should not hold a spinlock. 
*/ -static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw) +static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -538,7 +521,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a again: /* Ignore any VERIFY_READ mapping (futex common case) */ - if (unlikely(should_fail_futex(fshared))) + if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); @@ -626,7 +609,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ - if (unlikely(should_fail_futex(fshared)) || ro) { + if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } @@ -677,10 +660,6 @@ out: return err; } -static inline void put_futex_key(union futex_key *key) -{ -} - /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address @@ -1611,13 +1590,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; hb = hash_futex(&key); /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out_put_key; + return ret; spin_lock(&hb->lock); @@ -1640,9 +1619,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); -out_put_key: - put_futex_key(&key); -out: return ret; } @@ -1709,10 +1685,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto 
out_put_key1; + return ret; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1730,13 +1706,13 @@ retry_private: * an MMU, but we might get them from range checking */ ret = op_ret; - goto out_put_keys; + return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out_put_keys; + return ret; } if (!(flags & FLAGS_SHARED)) { @@ -1744,8 +1720,6 @@ retry_private: goto retry_private; } - put_futex_key(&key2); - put_futex_key(&key1); cond_resched(); goto retry; } @@ -1781,11 +1755,6 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: return ret; } @@ -1992,20 +1961,18 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out_put_key1; + return ret; /* * The check above which compares uaddrs is not sufficient for * shared futexes. 
We need to compare the keys: */ - if (requeue_pi && match_futex(&key1, &key2)) { - ret = -EINVAL; - goto out_put_keys; - } + if (requeue_pi && match_futex(&key1, &key2)) + return -EINVAL; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -2025,13 +1992,11 @@ retry_private: ret = get_user(curval, uaddr1); if (ret) - goto out_put_keys; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&key2); - put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -2090,12 +2055,10 @@ retry_private: case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; - goto out; + return ret; case -EBUSY: case -EAGAIN: /* @@ -2106,8 +2069,6 @@ retry_private: */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2216,12 +2177,6 @@ out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); - -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: return ret ? ret : task_count; } @@ -2567,7 +2522,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); - goto out; + return ret ? ret : locked; } /* @@ -2580,7 +2535,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner == current) { ret = fixup_pi_state_owner(uaddr, q, NULL); - goto out; + return ret; } /* @@ -2594,8 +2549,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) q->pi_state->owner); } -out: - return ret ? 
ret : locked; + return ret; } /** @@ -2692,12 +2646,11 @@ retry_private: ret = get_user(uval, uaddr); if (ret) - goto out; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q->key); goto retry; } @@ -2706,9 +2659,6 @@ retry_private: ret = -EWOULDBLOCK; } -out: - if (ret) - put_futex_key(&q->key); return ret; } @@ -2853,7 +2803,6 @@ retry_private: * - EAGAIN: The user space value changed. */ queue_unlock(hb); - put_futex_key(&q.key); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2961,13 +2910,11 @@ no_block: put_pi_state(pi_state); } - goto out_put_key; + goto out; out_unlock_put_key: queue_unlock(hb); -out_put_key: - put_futex_key(&q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -2980,12 +2927,11 @@ uaddr_faulted: ret = fault_in_user_writeable(uaddr); if (ret) - goto out_put_key; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q.key); goto retry; } @@ -3114,16 +3060,13 @@ retry: out_unlock: spin_unlock(&hb->lock); out_putkey: - put_futex_key(&key); return ret; pi_retry: - put_futex_key(&key); cond_resched(); goto retry; pi_faulted: - put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -3265,7 +3208,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out_key2; + goto out; /* * The check above which compares uaddrs is not sufficient for @@ -3274,7 +3217,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (match_futex(&q.key, &key2)) { queue_unlock(hb); ret = -EINVAL; - goto out_put_keys; + goto out; } /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ @@ -3284,7 +3227,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); spin_unlock(&hb->lock); if (ret) - goto out_put_keys; + goto out; /* * In order for us to be here, we know our q.key == key2, and since @@ -3374,11 +3317,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = -EWOULDBLOCK; } -out_put_keys: - put_futex_key(&q.key); -out_key2: - put_futex_key(&key2); - out: if (to) { hrtimer_cancel(&to->timer); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 761911168438..2a9fec53e159 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -195,9 +195,9 @@ void irq_set_thread_affinity(struct irq_desc *desc) set_bit(IRQTF_AFFINITY, &action->thread_flags); } +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -205,9 +205,19 @@ static void irq_validate_effective_affinity(struct irq_data *data) return; pr_warn_once("irq_chip %s did not update eff. 
affinity mask of irq %u\n", chip->name, data->irq); -#endif } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) +{ + cpumask_copy(irq_data_get_effective_affinity_mask(data), mask); +} +#else +static inline void irq_validate_effective_affinity(struct irq_data *data) { } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) { } +#endif + int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -304,6 +314,26 @@ static int irq_try_set_affinity(struct irq_data *data, return ret; } +static bool irq_set_affinity_deactivated(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + /* + * If the interrupt is not yet activated, just store the affinity + * mask and do not call the chip driver at all. On activation the + * driver has to make sure anyway that the interrupt is in a + * useable state so startup works. + */ + if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data)) + return false; + + cpumask_copy(desc->irq_common_data.affinity, mask); + irq_init_effective_affinity(data, mask); + irqd_set(data, IRQD_AFFINITY_SET); + return true; +} + int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -314,6 +344,9 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; + if (irq_set_affinity_deactivated(data, mask, force)) + return 0; + if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..bb14e64f62a4 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -644,19 +644,20 @@ static inline int kallsyms_for_perf(void) * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). 
*/ -int kallsyms_show_value(void) +bool kallsyms_show_value(const struct cred *cred) { switch (kptr_restrict) { case 0: if (kallsyms_for_perf()) - return 1; + return true; /* fallthrough */ case 1: - if (has_capability_noaudit(current, CAP_SYSLOG)) - return 1; + if (security_capable(cred, &init_user_ns, CAP_SYSLOG, + CAP_OPT_NOAUDIT) == 0) + return true; /* fallthrough */ default: - return 0; + return false; } } @@ -673,7 +674,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return -ENOMEM; reset_iter(iter, 0); - iter->show_value = kallsyms_show_value(); + /* + * Instead of checking this on every s_show() call, cache + * the result here at open time. + */ + iter->show_value = kallsyms_show_value(file->f_cred); return 0; } diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index d4999b38d1be..65ca5539c470 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -7,8 +7,11 @@ CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) -CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ - $(call cc-option,-fno-stack-protector,) +CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ + -fno-stack-protector -DDISABLE_BRANCH_PROFILING obj-y := core.o debugfs.o report.o -obj-$(CONFIG_KCSAN_SELFTEST) += test.o +obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o + +CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer +obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index be9e625227f3..75fe701f4127 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -3,8 +3,7 @@ #ifndef _KERNEL_KCSAN_ATOMIC_H #define _KERNEL_KCSAN_ATOMIC_H -#include <linux/jiffies.h> -#include <linux/sched.h> +#include <linux/types.h> /* * Special rules for certain memory where concurrent conflicting accesses are @@ -13,8 +12,7 @@ */ static bool kcsan_is_atomic_special(const volatile void *ptr) { - /* volatile globals 
that have been observed in data races. */ - return ptr == &jiffies || ptr == &current->state; + return false; } #endif /* _KERNEL_KCSAN_ATOMIC_H */ diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 732623c30359..9147ff6a12e5 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -291,6 +291,20 @@ static inline unsigned int get_delay(void) 0); } +void kcsan_save_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->kcsan_save_irqtrace = task->irqtrace; +#endif +} + +void kcsan_restore_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->irqtrace = task->kcsan_save_irqtrace; +#endif +} + /* * Pull everything together: check_access() below contains the performance * critical operations; the fast-path (including check_access) functions should @@ -336,9 +350,11 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, flags = user_access_save(); if (consumed) { + kcsan_save_irqtrace(current); kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE, KCSAN_REPORT_CONSUMED_WATCHPOINT, watchpoint - watchpoints); + kcsan_restore_irqtrace(current); } else { /* * The other thread may not print any diagnostics, as it has @@ -396,6 +412,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; } + /* + * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's + * runtime is entered for every memory access, and potentially useful + * information is lost if dirtied by KCSAN. 
+ */ + kcsan_save_irqtrace(current); if (!kcsan_interrupt_watcher) local_irq_save(irq_flags); @@ -539,6 +561,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) out_unlock: if (!kcsan_interrupt_watcher) local_irq_restore(irq_flags); + kcsan_restore_irqtrace(current); out: user_access_restore(ua_flags); } @@ -753,6 +776,7 @@ EXPORT_SYMBOL(__kcsan_check_access); */ #define DEFINE_TSAN_READ_WRITE(size) \ + void __tsan_read##size(void *ptr); \ void __tsan_read##size(void *ptr) \ { \ check_access(ptr, size, 0); \ @@ -761,6 +785,7 @@ EXPORT_SYMBOL(__kcsan_check_access); void __tsan_unaligned_read##size(void *ptr) \ __alias(__tsan_read##size); \ EXPORT_SYMBOL(__tsan_unaligned_read##size); \ + void __tsan_write##size(void *ptr); \ void __tsan_write##size(void *ptr) \ { \ check_access(ptr, size, KCSAN_ACCESS_WRITE); \ @@ -776,12 +801,14 @@ DEFINE_TSAN_READ_WRITE(4); DEFINE_TSAN_READ_WRITE(8); DEFINE_TSAN_READ_WRITE(16); +void __tsan_read_range(void *ptr, size_t size); void __tsan_read_range(void *ptr, size_t size) { check_access(ptr, size, 0); } EXPORT_SYMBOL(__tsan_read_range); +void __tsan_write_range(void *ptr, size_t size); void __tsan_write_range(void *ptr, size_t size) { check_access(ptr, size, KCSAN_ACCESS_WRITE); @@ -798,6 +825,7 @@ EXPORT_SYMBOL(__tsan_write_range); * the size-check of compiletime_assert_rwonce_type(). 
 */ #define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \ + void __tsan_volatile_read##size(void *ptr); \ void __tsan_volatile_read##size(void *ptr) \ { \ const bool is_atomic = size <= sizeof(long long) && \ @@ -810,6 +838,7 @@ EXPORT_SYMBOL(__tsan_write_range); void __tsan_unaligned_volatile_read##size(void *ptr) \ __alias(__tsan_volatile_read##size); \ EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \ + void __tsan_volatile_write##size(void *ptr); \ void __tsan_volatile_write##size(void *ptr) \ { \ const bool is_atomic = size <= sizeof(long long) && \ @@ -835,14 +864,17 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(16); * The below are not required by KCSAN, but can still be emitted by the * compiler. */ +void __tsan_func_entry(void *call_pc); void __tsan_func_entry(void *call_pc) { } EXPORT_SYMBOL(__tsan_func_entry); +void __tsan_func_exit(void); void __tsan_func_exit(void) { } EXPORT_SYMBOL(__tsan_func_exit); +void __tsan_init(void); void __tsan_init(void) { } diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c new file mode 100644 index 000000000000..fed6fcb5768c --- /dev/null +++ b/kernel/kcsan/kcsan-test.c @@ -0,0 +1,1107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KCSAN test with various race scenarios to test runtime behaviour. Since the + * interface with which KCSAN's reports are obtained is via the console, this is + * the output we should verify. For each test case checks the presence (or + * absence) of generated reports. Relies on 'console' tracepoint to capture + * reports as they appear in the kernel log. + * + * Makes use of KUnit for test organization, and the Torture framework for test + * thread control. + * + * Copyright (C) 2020, Google LLC. 
+ * Author: Marco Elver <elver@google.com> + */ + +#include <kunit/test.h> +#include <linux/jiffies.h> +#include <linux/kcsan-checks.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/seqlock.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/timer.h> +#include <linux/torture.h> +#include <linux/tracepoint.h> +#include <linux/types.h> +#include <trace/events/printk.h> + +/* Points to current test-case memory access "kernels". */ +static void (*access_kernels[2])(void); + +static struct task_struct **threads; /* Lists of threads. */ +static unsigned long end_time; /* End time of test. */ + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + int nlines; + char lines[3][512]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Setup test checking loop. */ +static __no_kcsan inline void +begin_test_checks(void (*func1)(void), void (*func2)(void)) +{ + kcsan_disable_current(); + + /* + * Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at + * least one race is reported. + */ + end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 500); + + /* Signal start; release potential initialization of shared data. */ + smp_store_release(&access_kernels[0], func1); + smp_store_release(&access_kernels[1], func2); +} + +/* End test checking loop. */ +static __no_kcsan inline bool +end_test_checks(bool stop) +{ + if (!stop && time_before(jiffies, end_time)) { + /* Continue checking */ + might_sleep(); + return false; + } + + kcsan_enable_current(); + return true; +} + +/* + * Probe for console output: checks if a race was reported, and obtains observed + * lines of interest. + */ +__no_kcsan +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + int nlines; + + /* + * Note that KCSAN reports under a global lock, so we do not risk the + * possibility of having multiple reports interleaved. 
If that were the + * case, we'd expect tests to fail. + */ + + spin_lock_irqsave(&observed.lock, flags); + nlines = observed.nlines; + + if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) { + /* + * KCSAN report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0]))); + nlines = 1; + } else if ((nlines == 1 || nlines == 2) && strnstr(buf, "bytes by", len)) { + strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0]))); + + if (strnstr(buf, "race at unknown origin", len)) { + if (WARN_ON(nlines != 2)) + goto out; + + /* No second line of interest. */ + strcpy(observed.lines[nlines++], "<none>"); + } + } + +out: + WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */ + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +__no_kcsan +static bool report_available(void) +{ + return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines); +} + +/* Report information we expect in a report. */ +struct expect_report { + /* Access information of both accesses. */ + struct { + void *fn; /* Function pointer to expected function of top frame. */ + void *addr; /* Address of access; unchecked if NULL. */ + size_t size; /* Size of access; unchecked if @addr is NULL. */ + int type; /* Access type, see KCSAN_ACCESS definitions. */ + } access[2]; +}; + +/* Check observed report matches information in @r. */ +__no_kcsan +static bool report_matches(const struct expect_report *r) +{ + const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT; + bool ret = false; + unsigned long flags; + typeof(observed.lines) expect; + const char *end; + char *cur; + int i; + + /* Double-checked locking. */ + if (!report_available()) + return false; + + /* Generate expected report contents. 
 */ + + /* Title */ + cur = expect[0]; + end = &expect[0][sizeof(expect[0]) - 1]; + cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ", + is_assert ? "assert: race" : "data-race"); + if (r->access[1].fn) { + char tmp[2][64]; + int cmp; + + /* Expect lexicographically sorted function names in title. */ + scnprintf(tmp[0], sizeof(tmp[0]), "%pS", r->access[0].fn); + scnprintf(tmp[1], sizeof(tmp[1]), "%pS", r->access[1].fn); + cmp = strcmp(tmp[0], tmp[1]); + cur += scnprintf(cur, end - cur, "%ps / %ps", + cmp < 0 ? r->access[0].fn : r->access[1].fn, + cmp < 0 ? r->access[1].fn : r->access[0].fn); + } else { + scnprintf(cur, end - cur, "%pS", r->access[0].fn); + /* The exact offset won't match, remove it. */ + cur = strchr(expect[0], '+'); + if (cur) + *cur = '\0'; + } + + /* Access 1 */ + cur = expect[1]; + end = &expect[1][sizeof(expect[1]) - 1]; + if (!r->access[1].fn) + cur += scnprintf(cur, end - cur, "race at unknown origin, with "); + + /* Access 1 & 2 */ + for (i = 0; i < 2; ++i) { + const char *const access_type = + (r->access[i].type & KCSAN_ACCESS_ASSERT) ? + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "assert no accesses" : + "assert no writes") : + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "write" : + "read"); + const char *const access_type_aux = + (r->access[i].type & KCSAN_ACCESS_ATOMIC) ? + " (marked)" : + ((r->access[i].type & KCSAN_ACCESS_SCOPED) ? + " (scoped)" : + ""); + + if (i == 1) { + /* Access 2 */ + cur = expect[2]; + end = &expect[2][sizeof(expect[2]) - 1]; + + if (!r->access[1].fn) { + /* Dummy string if no second access is available. */ + strcpy(cur, "<none>"); + break; + } + } + + cur += scnprintf(cur, end - cur, "%s%s to ", access_type, + access_type_aux); + + if (r->access[i].addr) /* Address is optional. */ + cur += scnprintf(cur, end - cur, "0x%px of %zu bytes", + r->access[i].addr, r->access[i].size); + } + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. 
*/ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.lines[0], expect[0]) && + /* Access info may appear in any order. */ + ((strstr(observed.lines[1], expect[1]) && + strstr(observed.lines[2], expect[2])) || + (strstr(observed.lines[1], expect[2]) && + strstr(observed.lines[2], expect[1]))); +out: + spin_unlock_irqrestore(&observed.lock, flags); + return ret; +} + +/* ===== Test kernels ===== */ + +static long test_sink; +static long test_var; +/* @test_array should be large enough to fall into multiple watchpoint slots. */ +static long test_array[3 * PAGE_SIZE / sizeof(long)]; +static struct { + long val[8]; +} test_struct; +static DEFINE_SEQLOCK(test_seqlock); + +/* + * Helper to avoid compiler optimizing out reads, and to generate source values + * for writes. + */ +__no_kcsan +static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); } + +static noinline void test_kernel_read(void) { sink_value(test_var); } + +static noinline void test_kernel_write(void) +{ + test_var = READ_ONCE_NOCHECK(test_sink) + 1; +} + +static noinline void test_kernel_write_nochange(void) { test_var = 42; } + +/* Suffixed by value-change exception filter. 
*/ +static noinline void test_kernel_write_nochange_rcu(void) { test_var = 42; } + +static noinline void test_kernel_read_atomic(void) +{ + sink_value(READ_ONCE(test_var)); +} + +static noinline void test_kernel_write_atomic(void) +{ + WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1); +} + +__no_kcsan +static noinline void test_kernel_write_uninstrumented(void) { test_var++; } + +static noinline void test_kernel_data_race(void) { data_race(test_var++); } + +static noinline void test_kernel_assert_writer(void) +{ + ASSERT_EXCLUSIVE_WRITER(test_var); +} + +static noinline void test_kernel_assert_access(void) +{ + ASSERT_EXCLUSIVE_ACCESS(test_var); +} + +#define TEST_CHANGE_BITS 0xff00ff00 + +static noinline void test_kernel_change_bits(void) +{ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { + /* + * Avoid race of unknown origin for this test, just pretend they + * are atomic. + */ + kcsan_nestable_atomic_begin(); + test_var ^= TEST_CHANGE_BITS; + kcsan_nestable_atomic_end(); + } else + WRITE_ONCE(test_var, READ_ONCE(test_var) ^ TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_change(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_nochange(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS); +} + +/* To check that scoped assertions do trigger anywhere in scope. */ +static noinline void test_enter_scope(void) +{ + int x = 0; + + /* Unrelated accesses to scoped assert. 
*/ + READ_ONCE(test_sink); + kcsan_check_read(&x, sizeof(x)); +} + +static noinline void test_kernel_assert_writer_scoped(void) +{ + ASSERT_EXCLUSIVE_WRITER_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_assert_access_scoped(void) +{ + ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_rmw_array(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(test_array); ++i) + test_array[i]++; +} + +static noinline void test_kernel_write_struct(void) +{ + kcsan_check_write(&test_struct, sizeof(test_struct)); + kcsan_disable_current(); + test_struct.val[3]++; /* induce value change */ + kcsan_enable_current(); +} + +static noinline void test_kernel_write_struct_part(void) +{ + test_struct.val[3] = 42; +} + +static noinline void test_kernel_read_struct_zero_size(void) +{ + kcsan_check_read(&test_struct.val[3], 0); +} + +static noinline void test_kernel_jiffies_reader(void) +{ + sink_value((long)jiffies); +} + +static noinline void test_kernel_seqlock_reader(void) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&test_seqlock); + sink_value(test_var); + } while (read_seqretry(&test_seqlock, seq)); +} + +static noinline void test_kernel_seqlock_writer(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&test_seqlock, flags); + test_var++; + write_sequnlock_irqrestore(&test_seqlock, flags); +} + +/* ===== Test cases ===== */ + +/* Simple test with normal data race. 
*/ +__no_kcsan +static void test_basic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write, test_kernel_read); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Stress KCSAN with lots of concurrent races on different addresses until + * timeout. + */ +__no_kcsan +static void test_concurrent_races(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + /* NULL will match any address. */ + { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_rmw_array, NULL, 0, 0 }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_rmw_array, test_kernel_rmw_array); + do { + match_expect |= report_matches(&expect); + match_never |= report_matches(&never); + } while (!end_test_checks(false)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check matches exist. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the KCSAN_REPORT_VALUE_CHANGE_ONLY option. 
*/ +__no_kcsan +static void test_novalue_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that the rules where the KCSAN_REPORT_VALUE_CHANGE_ONLY option should + * never apply work. + */ +__no_kcsan +static void test_novalue_change_exception(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that data races of unknown origin are reported. */ +__no_kcsan +static void test_unknown_origin(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { NULL }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_uninstrumented, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + KUNIT_EXPECT_TRUE(test, match_expect); + else + KUNIT_EXPECT_FALSE(test, match_expect); +} + +/* Test KCSAN_ASSUME_PLAIN_WRITES_ATOMIC if it is selected. 
*/ +__no_kcsan +static void test_write_write_assume_atomic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write, test_kernel_write); + do { + sink_value(READ_ONCE(test_var)); /* induce value-change */ + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races with writes larger than word-size are always reported, + * even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. + */ +__no_kcsan +static void test_write_write_struct(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races where only one write is larger than word-size are always + * reported, even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. 
+ */ +__no_kcsan +static void test_write_write_struct_part(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct_part); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that races with atomic accesses never result in reports. */ +__no_kcsan +static void test_read_atomic_write_atomic(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_read_atomic, test_kernel_write_atomic); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test that a race with an atomic and plain access result in reports. */ +__no_kcsan +static void test_read_plain_atomic_write(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, + }, + }; + bool match_expect = false; + + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) + return; + + begin_test_checks(test_kernel_read, test_kernel_write_atomic); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Zero-sized accesses should never cause data race reports. 
*/ +__no_kcsan +static void test_zero_size_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_read_struct_zero_size); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the data_race() macro. */ +__no_kcsan +static void test_data_race(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_data_race, test_kernel_data_race); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_writer, test_kernel_write_nochange); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), 
KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access_writer(struct kunit *test) +{ + const struct expect_report expect_access_writer = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + const struct expect_report expect_access_access = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + bool match_expect_access_writer = false; + bool match_expect_access_access = false; + bool match_never = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_assert_writer); + do { + match_expect_access_writer |= report_matches(&expect_access_writer); + match_expect_access_access |= report_matches(&expect_access_access); + match_never |= report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect_access_writer); + KUNIT_EXPECT_TRUE(test, match_expect_access_access); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_bits_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_bits_change, &test_var, sizeof(test_var), 
KCSAN_ACCESS_ASSERT }, + { test_kernel_change_bits, &test_var, sizeof(test_var), + KCSAN_ACCESS_WRITE | (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) ? 0 : KCSAN_ACCESS_ATOMIC) }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_bits_change, test_kernel_change_bits); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_bits_nochange(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_assert_bits_nochange, test_kernel_change_bits); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer_scoped(struct kunit *test) +{ + const struct expect_report expect_start = { + .access = { + { test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report expect_anywhere = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect_start = false; + bool match_expect_anywhere = false; + + begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange); + do { + match_expect_start |= report_matches(&expect_start); + match_expect_anywhere |= report_matches(&expect_anywhere); + } while (!end_test_checks(match_expect_start && match_expect_anywhere)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_anywhere); +} + +__no_kcsan +static void test_assert_exclusive_access_scoped(struct kunit *test) +{ + const struct expect_report expect_start1 = { + .access = { + { 
test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + const struct expect_report expect_start2 = { + .access = { expect_start1.access[0], expect_start1.access[0] }, + }; + const struct expect_report expect_inscope = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect_start = false; + bool match_expect_inscope = false; + + begin_test_checks(test_kernel_assert_access_scoped, test_kernel_read); + end_time += msecs_to_jiffies(1000); /* This test requires a bit more time. */ + do { + match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2); + match_expect_inscope |= report_matches(&expect_inscope); + } while (!end_test_checks(match_expect_start && match_expect_inscope)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_inscope); +} + +/* + * jiffies is special (declared to be volatile) and its accesses are typically + * not marked; this test ensures that neither the compiler nor KCSAN gets confused about + * jiffies's declaration on different architectures. + */ +__no_kcsan +static void test_jiffies_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_jiffies_reader, test_kernel_jiffies_reader); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test that racing accesses in seqlock critical sections are not reported. 
*/ +__no_kcsan +static void test_seqlock_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_seqlock_reader, test_kernel_seqlock_writer); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Each test case is run with different numbers of threads. Until KUnit supports + * passing arguments for each test case, we encode #threads in the test case + * name (read by get_num_threads()). [The '-' was chosen as a stylistic + * preference to separate test name and #threads.] + * + * The thread counts are chosen to cover potentially interesting boundaries and + * corner cases (range 2-5), and then stress the system with larger counts. + */ +#define KCSAN_KUNIT_CASE(test_name) \ + { .run_case = test_name, .name = #test_name "-02" }, \ + { .run_case = test_name, .name = #test_name "-03" }, \ + { .run_case = test_name, .name = #test_name "-04" }, \ + { .run_case = test_name, .name = #test_name "-05" }, \ + { .run_case = test_name, .name = #test_name "-08" }, \ + { .run_case = test_name, .name = #test_name "-16" } + +static struct kunit_case kcsan_test_cases[] = { + KCSAN_KUNIT_CASE(test_basic), + KCSAN_KUNIT_CASE(test_concurrent_races), + KCSAN_KUNIT_CASE(test_novalue_change), + KCSAN_KUNIT_CASE(test_novalue_change_exception), + KCSAN_KUNIT_CASE(test_unknown_origin), + KCSAN_KUNIT_CASE(test_write_write_assume_atomic), + KCSAN_KUNIT_CASE(test_write_write_struct), + KCSAN_KUNIT_CASE(test_write_write_struct_part), + KCSAN_KUNIT_CASE(test_read_atomic_write_atomic), + KCSAN_KUNIT_CASE(test_read_plain_atomic_write), + KCSAN_KUNIT_CASE(test_zero_size_access), + KCSAN_KUNIT_CASE(test_data_race), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_access), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_bits_change), + 
KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), + KCSAN_KUNIT_CASE(test_jiffies_noreport), + KCSAN_KUNIT_CASE(test_seqlock_noreport), + {}, +}; + +/* ===== End test cases ===== */ + +/* Get number of threads encoded in test name. */ +static bool __no_kcsan +get_num_threads(const char *test, int *nthreads) +{ + int len = strlen(test); + + if (WARN_ON(len < 3)) + return false; + + *nthreads = test[len - 1] - '0'; + *nthreads += (test[len - 2] - '0') * 10; + + if (WARN_ON(*nthreads < 0)) + return false; + + return true; +} + +/* Concurrent accesses from interrupts. */ +__no_kcsan +static void access_thread_timer(struct timer_list *timer) +{ + static atomic_t cnt = ATOMIC_INIT(0); + unsigned int idx; + void (*func)(void); + + idx = (unsigned int)atomic_inc_return(&cnt) % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. */ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); +} + +/* The main loop for each thread. */ +__no_kcsan +static int access_thread(void *arg) +{ + struct timer_list timer; + unsigned int cnt = 0; + unsigned int idx; + void (*func)(void); + + timer_setup_on_stack(&timer, access_thread_timer, 0); + do { + might_sleep(); + + if (!timer_pending(&timer)) + mod_timer(&timer, jiffies + 1); + else { + /* Iterate through all kernels. */ + idx = cnt++ % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. 
*/ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); + } + } while (!torture_must_stop()); + del_timer_sync(&timer); + destroy_timer_on_stack(&timer); + + torture_kthread_stopping("access_thread"); + return 0; +} + +__no_kcsan +static int test_init(struct kunit *test) +{ + unsigned long flags; + int nthreads; + int i; + + spin_lock_irqsave(&observed.lock, flags); + for (i = 0; i < ARRAY_SIZE(observed.lines); ++i) + observed.lines[i][0] = '\0'; + observed.nlines = 0; + spin_unlock_irqrestore(&observed.lock, flags); + + if (!torture_init_begin((char *)test->name, 1)) + return -EBUSY; + + if (!get_num_threads(test->name, &nthreads)) + goto err; + + if (WARN_ON(threads)) + goto err; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) { + if (WARN_ON(access_kernels[i])) + goto err; + } + + if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { + /* + * Without any preemption, keep 2 CPUs free for other tasks, one + * of which is the main test case function checking for + * completion or failure. + */ + const int min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 
2 : 0; + const int min_required_cpus = 2 + min_unused_cpus; + + if (num_online_cpus() < min_required_cpus) { + pr_err("%s: too few online CPUs (%u < %d) for test", + test->name, num_online_cpus(), min_required_cpus); + goto err; + } else if (nthreads > num_online_cpus() - min_unused_cpus) { + nthreads = num_online_cpus() - min_unused_cpus; + pr_warn("%s: limiting number of threads to %d\n", + test->name, nthreads); + } + } + + if (nthreads) { + threads = kcalloc(nthreads + 1, sizeof(struct task_struct *), + GFP_KERNEL); + if (WARN_ON(!threads)) + goto err; + + threads[nthreads] = NULL; + for (i = 0; i < nthreads; ++i) { + if (torture_create_kthread(access_thread, NULL, + threads[i])) + goto err; + } + } + + torture_init_end(); + + return 0; + +err: + kfree(threads); + threads = NULL; + torture_init_end(); + return -EINVAL; +} + +__no_kcsan +static void test_exit(struct kunit *test) +{ + struct task_struct **stop_thread; + int i; + + if (torture_cleanup_begin()) + return; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) + WRITE_ONCE(access_kernels[i], NULL); + + if (threads) { + for (stop_thread = threads; *stop_thread; stop_thread++) + torture_stop_kthread(reader_thread, *stop_thread); + + kfree(threads); + threads = NULL; + } + + torture_cleanup_end(); +} + +static struct kunit_suite kcsan_test_suite = { + .name = "kcsan-test", + .test_cases = kcsan_test_cases, + .init = test_init, + .exit = test_exit, +}; +static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL }; + +__no_kcsan +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +__no_kcsan +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +/* + * We only want to do tracepoints setup and 
teardown once, therefore we have to + * customize the init and exit functions and cannot rely on kunit_test_suite(). + */ +static int __init kcsan_test_init(void) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return __kunit_test_suites_init(kcsan_test_suites); +} + +static void kcsan_test_exit(void) +{ + __kunit_test_suites_exit(kcsan_test_suites); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +late_initcall(kcsan_test_init); +module_exit(kcsan_test_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 763d6d08d94b..29480010dc30 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -9,6 +9,7 @@ #define _KERNEL_KCSAN_KCSAN_H #include <linux/kcsan.h> +#include <linux/sched.h> /* The number of adjacent watchpoints to check. */ #define KCSAN_CHECK_ADJACENT 1 @@ -23,6 +24,12 @@ extern unsigned int kcsan_udelay_interrupt; extern bool kcsan_enabled; /* + * Save/restore IRQ flags state trace dirtied by KCSAN. + */ +void kcsan_save_irqtrace(struct task_struct *task); +void kcsan_restore_irqtrace(struct task_struct *task); + +/* * Initialize debugfs file. */ void kcsan_debugfs_init(void); diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 6b2fb1a6d8cd..9d07e175de0f 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -308,6 +308,9 @@ static void print_verbose_info(struct task_struct *task) if (!task) return; + /* Restore IRQ state trace for printing. 
*/ + kcsan_restore_irqtrace(task); + pr_err("\n"); debug_show_held_locks(task); print_irqtrace_events(task); diff --git a/kernel/kcsan/test.c b/kernel/kcsan/selftest.c index d26a052d3383..d26a052d3383 100644 --- a/kernel/kcsan/test.c +++ b/kernel/kcsan/selftest.c diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4a904cc56d68..2e97febeef77 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2448,7 +2448,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; - if (!kallsyms_show_value()) + if (!kallsyms_show_value(pi->file->f_cred)) addr = NULL; if (sym) @@ -2540,7 +2540,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) * If /proc/kallsyms is not showing kernel address, we won't * show them here either. */ - if (!kallsyms_show_value()) + if (!kallsyms_show_value(m->file->f_cred)) seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, (void *)ent->start_addr); else diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8b0b28b4546b..b4d34c9030df 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, + lockdep_hardirqs_enabled(), curr->softirqs_enabled); print_lock(next); @@ -3331,9 +3331,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), - lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, 
- lockdep_hardirqs_enabled(curr), + lockdep_hardirqs_enabled(), lockdep_softirqs_enabled(curr)); print_lock(this); @@ -3484,19 +3484,21 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { - printk("irq event stamp: %u\n", curr->irq_events); + const struct irqtrace_events *trace = &curr->irqtrace; + + printk("irq event stamp: %u\n", trace->irq_events); printk("hardirqs last enabled at (%u): [<%px>] %pS\n", - curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, - (void *)curr->hardirq_enable_ip); + trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, + (void *)trace->hardirq_enable_ip); printk("hardirqs last disabled at (%u): [<%px>] %pS\n", - curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, - (void *)curr->hardirq_disable_ip); + trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip, + (void *)trace->hardirq_disable_ip); printk("softirqs last enabled at (%u): [<%px>] %pS\n", - curr->softirq_enable_event, (void *)curr->softirq_enable_ip, - (void *)curr->softirq_enable_ip); + trace->softirq_enable_event, (void *)trace->softirq_enable_ip, + (void *)trace->softirq_enable_ip); printk("softirqs last disabled at (%u): [<%px>] %pS\n", - curr->softirq_disable_event, (void *)curr->softirq_disable_ip, - (void *)curr->softirq_disable_ip); + trace->softirq_disable_event, (void *)trace->softirq_disable_ip, + (void *)trace->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) @@ -3658,7 +3660,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (unlikely(current->hardirqs_enabled)) { + if (unlikely(lockdep_hardirqs_enabled())) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3686,7 +3688,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) * Can't allow enabling interrupts while in an interrupt handler, 
* that's general bad form and such. Recursion, limited stack etc.. */ - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())) return; current->hardirq_chain_key = current->curr_chain_key; @@ -3699,7 +3701,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); void noinstr lockdep_hardirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = &current->irqtrace; if (unlikely(!debug_locks)) return; @@ -3727,7 +3729,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled()) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3754,9 +3756,9 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) skip_checks: /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; + this_cpu_write(hardirqs_enabled, 1); + trace->hardirq_enable_ip = ip; + trace->hardirq_enable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_on_events); } EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); @@ -3766,8 +3768,6 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); */ void noinstr lockdep_hardirqs_off(unsigned long ip) { - struct task_struct *curr = current; - if (unlikely(!debug_locks)) return; @@ -3789,13 +3789,15 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled()) { + struct irqtrace_events *trace = &current->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = ip; - curr->hardirq_disable_event = ++curr->irq_events; + this_cpu_write(hardirqs_enabled, 0); + trace->hardirq_disable_ip = ip; + trace->hardirq_disable_event = ++trace->irq_events; 
debug_atomic_inc(hardirqs_off_events); } else { debug_atomic_inc(redundant_hardirqs_off); @@ -3808,7 +3810,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); */ void lockdep_softirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = &current->irqtrace; if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3820,7 +3822,7 @@ void lockdep_softirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { debug_atomic_inc(redundant_softirqs_on); return; } @@ -3829,17 +3831,17 @@ void lockdep_softirqs_on(unsigned long ip) /* * We'll do an OFF -> ON transition: */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; + current->softirqs_enabled = 1; + trace->softirq_enable_ip = ip; + trace->softirq_enable_event = ++trace->irq_events; debug_atomic_inc(softirqs_on_events); /* * We are going to turn softirqs on, so set the * usage bit for all held locks, if hardirqs are * enabled too: */ - if (curr->hardirqs_enabled) - mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); + if (lockdep_hardirqs_enabled()) + mark_held_locks(current, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3848,8 +3850,6 @@ void lockdep_softirqs_on(unsigned long ip) */ void lockdep_softirqs_off(unsigned long ip) { - struct task_struct *curr = current; - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3859,13 +3859,15 @@ void lockdep_softirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { + struct irqtrace_events *trace = &current->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; + current->softirqs_enabled = 0; + trace->softirq_disable_ip = ip; + trace->softirq_disable_event = 
++trace->irq_events; debug_atomic_inc(softirqs_off_events); /* * Whoops, we wanted softirqs off, so why aren't they? @@ -3887,7 +3889,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) */ if (!hlock->trylock) { if (hlock->read) { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -3896,7 +3898,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -3934,7 +3936,7 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } @@ -4027,7 +4029,7 @@ static inline short task_wait_context(struct task_struct *curr) * Set appropriate wait type for the context; for IRQs we have to take * into account force_irqthread as that is implied by PREEMPT_RT. */ - if (curr->hardirq_context) { + if (lockdep_hardirq_context()) { /* * Check if force_irqthreads will run us threaded. 
*/ @@ -4870,11 +4872,11 @@ static void check_flags(unsigned long flags) return; if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-on.\n"); } } diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 1f7734949ac8..1de006ed3aa8 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -154,7 +154,11 @@ bool osq_lock(struct optimistic_spin_queue *lock) */ for (;;) { - if (prev->next == node && + /* + * cpu_relax() below implies a compiler barrier which would + * prevent this comparison being optimized away. + */ + if (data_race(prev->next) == node && cmpxchg(&prev->next, node, NULL) == node) break; diff --git a/kernel/module.c b/kernel/module.c index bee1c25ca5c5..aa183c9ac0a2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1510,8 +1510,7 @@ static inline bool sect_empty(const Elf_Shdr *sect) } struct module_sect_attr { - struct module_attribute mattr; - char *name; + struct bin_attribute battr; unsigned long address; }; @@ -1521,13 +1520,18 @@ struct module_sect_attrs { struct module_sect_attr attrs[]; }; -static ssize_t module_sect_show(struct module_attribute *mattr, - struct module_kobject *mk, char *buf) +static ssize_t module_sect_read(struct file *file, struct kobject *kobj, + struct bin_attribute *battr, + char *buf, loff_t pos, size_t count) { struct module_sect_attr *sattr = - container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%px\n", kptr_restrict < 2 ? - (void *)sattr->address : NULL); + container_of(battr, struct module_sect_attr, battr); + + if (pos != 0) + return -EINVAL; + + return sprintf(buf, "0x%px\n", + kallsyms_show_value(file->f_cred) ? 
(void *)sattr->address : NULL); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -1535,7 +1539,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) unsigned int section; for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].name); + kfree(sect_attrs->attrs[section].battr.attr.name); kfree(sect_attrs); } @@ -1544,42 +1548,41 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; struct module_sect_attr *sattr; - struct attribute **gattr; + struct bin_attribute **gattr; /* Count loaded sections and allocate structures */ for (i = 0; i < info->hdr->e_shnum; i++) if (!sect_empty(&info->sechdrs[i])) nloaded++; size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), - sizeof(sect_attrs->grp.attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); + sizeof(sect_attrs->grp.bin_attrs[0])); + size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); if (sect_attrs == NULL) return; /* Setup section attributes. 
*/ sect_attrs->grp.name = "sections"; - sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; + sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; sect_attrs->nsections = 0; sattr = &sect_attrs->attrs[0]; - gattr = &sect_attrs->grp.attrs[0]; + gattr = &sect_attrs->grp.bin_attrs[0]; for (i = 0; i < info->hdr->e_shnum; i++) { Elf_Shdr *sec = &info->sechdrs[i]; if (sect_empty(sec)) continue; + sysfs_bin_attr_init(&sattr->battr); sattr->address = sec->sh_addr; - sattr->name = kstrdup(info->secstrings + sec->sh_name, - GFP_KERNEL); - if (sattr->name == NULL) + sattr->battr.attr.name = + kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); + if (sattr->battr.attr.name == NULL) goto out; sect_attrs->nsections++; - sysfs_attr_init(&sattr->mattr.attr); - sattr->mattr.show = module_sect_show; - sattr->mattr.store = NULL; - sattr->mattr.attr.name = sattr->name; - sattr->mattr.attr.mode = S_IRUSR; - *(gattr++) = &(sattr++)->mattr.attr; + sattr->battr.read = module_sect_read; + sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4); + sattr->battr.attr.mode = 0400; + *(gattr++) = &(sattr++)->battr; } *gattr = NULL; @@ -1669,7 +1672,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) continue; if (info->sechdrs[i].sh_type == SHT_NOTE) { sysfs_bin_attr_init(nattr); - nattr->attr.name = mod->sect_attrs->attrs[loaded].name; + nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; nattr->attr.mode = S_IRUGO; nattr->size = info->sechdrs[i].sh_size; nattr->private = (void *) info->sechdrs[i].sh_addr; @@ -4379,7 +4382,7 @@ static int modules_open(struct inode *inode, struct file *file) if (!err) { struct seq_file *m = file->private_data; - m->private = kallsyms_show_value() ? NULL : (void *)8ul; + m->private = kallsyms_show_value(file->f_cred) ? 
NULL : (void *)8ul; } return err; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca5db40392d4..2142c6767682 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1311,9 +1311,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags) { - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - enqueue_task(rq, p, flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1323,9 +1320,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - dequeue_task(rq, p, flags); } @@ -2236,10 +2230,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, lockdep_assert_held(&rq->lock); -#ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; #endif @@ -2583,7 +2577,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (p->on_rq && ttwu_remote(p, wake_flags)) + if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags)) goto unlock; if (p->in_iowait) { @@ -2592,9 +2586,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #ifdef CONFIG_SMP - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -2613,8 +2604,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). 
+ * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about its own p->state. See the comment in __schedule(). */ - smp_rmb(); + smp_acquire__after_ctrl_dep(); + + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + p->state = TASK_WAKING; /* * If the owning (remote) CPU is still in the middle of schedule() with @@ -2962,6 +2965,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); + rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -3026,6 +3030,7 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); + rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); @@ -4097,6 +4102,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; @@ -4116,9 +4122,16 @@ static void __sched notrace __schedule(bool preempt) /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). 
+ * done by the caller to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_lock + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) * - * The membarrier system call requires a full memory barrier + * Also, the membarrier system call requires a full memory barrier * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); @@ -4129,10 +4142,38 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (signal_pending_state(prev->state, prev)) { + + /* + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; + if (!preempt && prev_state) { + if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. 
+ */ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); if (prev->in_iowait) { @@ -4444,6 +4485,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 658aa7a2ae6f..04fa8dbcfa4d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4039,7 +4039,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - rq->misfit_task_load = task_h_load(p); + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. + */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } #else /* CONFIG_SMP */ @@ -7638,7 +7642,14 @@ static int detach_tasks(struct lb_env *env) switch (env->migration_type) { case migrate_load: - load = task_h_load(p); + /* + * Depending on the number of CPUs and tasks and the + * cgroup hierarchy, task_h_load() can return a null + * value. Make sure that env->imbalance decreases + * otherwise detach_tasks() will stop only after + * detaching up to loop_max tasks. 
+ */ + load = max_t(unsigned long, task_h_load(p), 1); if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) diff --git a/kernel/softirq.c b/kernel/softirq.c index c4201b7f42b1..5e9aaa648a74 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -107,6 +107,12 @@ static bool ksoftirqd_running(unsigned long pending) * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS + +DEFINE_PER_CPU(int, hardirqs_enabled); +DEFINE_PER_CPU(int, hardirq_context); +EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -224,7 +230,7 @@ static inline bool lockdep_softirq_start(void) { bool in_hardirq = false; - if (lockdep_hardirq_context(current)) { + if (lockdep_hardirq_context()) { in_hardirq = true; lockdep_hardirq_exit(); } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 398e6eadb861..026ac01af9da 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -43,6 +43,7 @@ #include <linux/sched/debug.h> #include <linux/slab.h> #include <linux/compat.h> +#include <linux/random.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -521,8 +522,8 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk) * Force expire obscene large timeouts to expire at the * capacity limit of the wheel. 
*/ - if (expires >= WHEEL_TIMEOUT_CUTOFF) - expires = WHEEL_TIMEOUT_MAX; + if (delta >= WHEEL_TIMEOUT_CUTOFF) + expires = clk + WHEEL_TIMEOUT_MAX; idx = calc_index(expires, LVL_DEPTH - 1); } @@ -584,7 +585,15 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) * Set the next expiry time and kick the CPU so it can reevaluate the * wheel: */ - base->next_expiry = timer->expires; + if (time_before(timer->expires, base->clk)) { + /* + * Prevent forward_timer_base() from moving base->clk + * backward + */ + base->next_expiry = base->clk; + } else { + base->next_expiry = timer->expires; + } wake_up_nohz_cpu(base->cpu); } @@ -896,10 +905,13 @@ static inline void forward_timer_base(struct timer_base *base) * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jnow)) + if (time_after(base->next_expiry, jnow)) { base->clk = jnow; - else + } else { + if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) + return; base->clk = base->next_expiry; + } #endif } @@ -1731,6 +1743,13 @@ void update_process_times(int user_tick) scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); + + /* The current CPU might make use of net randoms without receiving IRQs + * to renew them often enough. Let's update the net_rand_state from a + * non-constant value that's not affine to the number of calls to make + * sure it's updated when there's some activity (we don't care in idle). + */ + this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick); } /** |