diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/cma.c | 11 | ||||
-rw-r--r-- | mm/compaction.c | 5 | ||||
-rw-r--r-- | mm/debug.c | 28 | ||||
-rw-r--r-- | mm/dmapool.c | 4 | ||||
-rw-r--r-- | mm/filemap.c | 305 | ||||
-rw-r--r-- | mm/frame_vector.c | 2 | ||||
-rw-r--r-- | mm/gup.c | 4 | ||||
-rw-r--r-- | mm/huge_memory.c | 1 | ||||
-rw-r--r-- | mm/kasan/Makefile | 21 | ||||
-rw-r--r-- | mm/kasan/common.c (renamed from mm/kasan/kasan.c) | 694 | ||||
-rw-r--r-- | mm/kasan/generic.c | 325 | ||||
-rw-r--r-- | mm/kasan/generic_report.c | 150 | ||||
-rw-r--r-- | mm/kasan/init.c | 500 | ||||
-rw-r--r-- | mm/kasan/kasan.h | 116 | ||||
-rw-r--r-- | mm/kasan/kasan_init.c | 199 | ||||
-rw-r--r-- | mm/kasan/quarantine.c | 19 | ||||
-rw-r--r-- | mm/kasan/report.c | 286 | ||||
-rw-r--r-- | mm/kasan/tags.c | 161 | ||||
-rw-r--r-- | mm/kasan/tags_report.c | 58 | ||||
-rw-r--r-- | mm/madvise.c | 4 | ||||
-rw-r--r-- | mm/memblock.c | 19 | ||||
-rw-r--r-- | mm/memory.c | 4 | ||||
-rw-r--r-- | mm/mempolicy.c | 6 | ||||
-rw-r--r-- | mm/mempool.c | 18 | ||||
-rw-r--r-- | mm/migrate.c | 4 | ||||
-rw-r--r-- | mm/mincore.c | 2 | ||||
-rw-r--r-- | mm/mlock.c | 6 | ||||
-rw-r--r-- | mm/mmap.c | 49 | ||||
-rw-r--r-- | mm/mprotect.c | 4 | ||||
-rw-r--r-- | mm/mremap.c | 2 | ||||
-rw-r--r-- | mm/msync.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 20 | ||||
-rw-r--r-- | mm/page_alloc.c | 140 | ||||
-rw-r--r-- | mm/page_io.c | 15 | ||||
-rw-r--r-- | mm/readahead.c | 2 | ||||
-rw-r--r-- | mm/shmem.c | 42 | ||||
-rw-r--r-- | mm/slab.c | 99 | ||||
-rw-r--r-- | mm/slab.h | 27 | ||||
-rw-r--r-- | mm/slab_common.c | 109 | ||||
-rw-r--r-- | mm/slub.c | 231 | ||||
-rw-r--r-- | mm/swap.c | 20 | ||||
-rw-r--r-- | mm/swap_state.c | 1 | ||||
-rw-r--r-- | mm/swapfile.c | 42 | ||||
-rw-r--r-- | mm/usercopy.c | 9 | ||||
-rw-r--r-- | mm/util.c | 9 | ||||
-rw-r--r-- | mm/vmalloc.c | 10 | ||||
-rw-r--r-- | mm/vmscan.c | 10 | ||||
-rw-r--r-- | mm/vmstat.c | 10 | ||||
-rw-r--r-- | mm/workingset.c | 95 | ||||
-rw-r--r-- | mm/zsmalloc.c | 41 |
50 files changed, 2659 insertions, 1282 deletions
@@ -421,6 +421,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; + size_t i; struct page *page = NULL; int ret = -ENOMEM; @@ -480,6 +481,16 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); + /* + * CMA can allocate multiple page blocks, which results in different + * blocks being marked with different tags. Reset the tags to ignore + * those page blocks. + */ + if (page) { + for (i = 0; i < count; i++) + page_kasan_tag_reset(page + i); + } + if (ret && !(gfp_mask & __GFP_NOWARN)) { pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); diff --git a/mm/compaction.c b/mm/compaction.c index eb8e7f5d3a08..a9d94eef4bf6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -22,6 +22,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/page_owner.h> +#include <linux/psi.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -2029,11 +2030,15 @@ static int kcompactd(void *p) pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { + unsigned long pflags; + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); wait_event_freezable(pgdat->kcompactd_wait, kcompactd_work_requested(pgdat)); + psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); } return 0; diff --git a/mm/debug.c b/mm/debug.c index c55abc893fdc..97609290dd51 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -50,7 +50,7 @@ void __dump_page(struct page *page, const char *reason) */ int mapcount = PageSlab(page) ? 0 : page_mapcount(page); - pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", + pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx", page, page_ref_count(page), mapcount, page->mapping, page_to_pgoff(page)); if (PageCompound(page)) @@ -69,7 +69,7 @@ void __dump_page(struct page *page, const char *reason) #ifdef CONFIG_MEMCG if (page->mem_cgroup) - pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); + pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup); #endif } @@ -84,10 +84,10 @@ EXPORT_SYMBOL(dump_page); void dump_vma(const struct vm_area_struct *vma) { - pr_emerg("vma %p start %p end %p\n" - "next %p prev %p mm %p\n" - "prot %lx anon_vma %p vm_ops %p\n" - "pgoff %lx file %p private_data %p\n" + pr_emerg("vma %px start %px end %px\n" + "next %px prev %px mm %px\n" + "prot %lx anon_vma %px vm_ops %px\n" + "pgoff %lx file %px private_data %px\n" "flags: %#lx(%pGv)\n", vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, vma->vm_prev, vma->vm_mm, @@ -100,27 +100,27 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %p mmap %p seqnum %llu task_size %lu\n" + pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n" #ifdef CONFIG_MMU - "get_unmapped_area %p\n" + "get_unmapped_area %px\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" + "pgd %px mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" - "binfmt %p flags %lx core_state %p\n" + "binfmt %px flags %lx core_state %px\n" #ifdef CONFIG_AIO - "ioctx_table %p\n" + "ioctx_table %px\n" #endif #ifdef CONFIG_MEMCG - "owner %p " + "owner %px " #endif - "exe_file %p\n" + "exe_file %px\n" #ifdef CONFIG_MMU_NOTIFIER - "mmu_notifier_mm %p\n" + "mmu_notifier_mm %px\n" #endif #ifdef CONFIG_NUMA_BALANCING "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" diff --git a/mm/dmapool.c b/mm/dmapool.c index 4d90a64b2fdc..207b89212f27 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -379,7 +379,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, #endif spin_unlock_irqrestore(&pool->lock, flags); - if (mem_flags & __GFP_ZERO) + if (want_init_on_alloc(mem_flags)) memset(retval, 0, pool->size); return retval; @@ -429,6 +429,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } offset = vaddr - page->vaddr; + if (want_init_on_free()) + memset(vaddr, 0, pool->size); #ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { spin_unlock_irqrestore(&pool->lock, flags); diff --git a/mm/filemap.c b/mm/filemap.c index a30dbf93de99..4920d32b1edd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -36,6 +36,8 @@ #include <linux/memcontrol.h> #include <linux/cleancache.h> #include <linux/rmap.h> +#include <linux/delayacct.h> +#include <linux/psi.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -421,19 +423,17 @@ static void __filemap_fdatawait_range(struct address_space *mapping, return; pagevec_init(&pvec, 0); - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + while (index <= end) { unsigned i; + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_WRITEBACK); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - wait_on_page_writeback(page); ClearPageError(page); } @@ -840,12 +840,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ - if (!(gfp_mask & __GFP_WRITE) && - shadow && workingset_refault(shadow)) { - SetPageActive(page); - workingset_activation(page); - } else - ClearPageActive(page); + WARN_ON_ONCE(PageActive(page)); + if (!(gfp_mask & __GFP_WRITE) && shadow) + workingset_refault(page, shadow); lru_cache_add(page); } return ret; @@ -1001,8 +998,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; + bool thrashing = false; + unsigned long pflags; int ret = 0; + if (bit_nr == PG_locked && + !PageUptodate(page) && PageWorkingset(page)) { + if (!PageSwapBacked(page)) + delayacct_thrashing_start(); + psi_memstall_enter(&pflags); + thrashing = true; + } + init_wait(wait); wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; @@ -1041,6 +1048,12 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, finish_wait(q, wait); + if (thrashing) { + if (!PageSwapBacked(page)) + delayacct_thrashing_end(); + psi_memstall_leave(&pflags); + } + /* * A signal could leave PageWaiters set. Clearing it here if * !waitqueue_active would be possible (by open-coding finish_wait), @@ -1449,7 +1462,10 @@ EXPORT_SYMBOL(find_lock_entry); * - FGP_CREAT: If page is not present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU * list. The page is returned locked and with an increased - * refcount. Otherwise, NULL is returned. + * refcount. + * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do + * its own locking dance if the page is already in cache, or unlock the page + * before returning if we had to add the page to pagecache. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. @@ -1502,7 +1518,7 @@ no_page: if (!page) return NULL; - if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; /* Init accessed so avoid atomic mark_page_accessed later */ @@ -1516,6 +1532,13 @@ no_page: if (err == -EEXIST) goto repeat; } + + /* + * add_to_page_cache_lru locks the page, and for mmap we expect + * an unlocked page. + */ + if (page && (fgp_flags & FGP_FOR_MMAP)) + unlock_page(page); } return page; @@ -1776,9 +1799,10 @@ repeat: EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_tag - find and return pages that match @tag + * find_get_pages_range_tag - find and return pages in given range matching @tag * @mapping: the address_space to search * @index: the starting page index + * @end: The final page index (inclusive) * @tag: the tag index * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed @@ -1786,8 +1810,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * Like find_get_pages, except we only return pages which are tagged with * @tag. We update @index to index the next page for the traversal. */ -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1800,6 +1825,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *head, *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1841,18 +1869,28 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *index = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when we got at @end. We take care to not overflow the + * index @index as it confuses some of the callers. This breaks the + * iteration when there is page at index -1 but that is already broken + * anyway. + */ + if (end == (pgoff_t)-1) + *index = (pgoff_t)-1; + else + *index = end + 1; +out: rcu_read_unlock(); - if (ret) - *index = pages[ret - 1]->index + 1; - return ret; } -EXPORT_SYMBOL(find_get_pages_tag); +EXPORT_SYMBOL(find_get_pages_range_tag); /** * find_get_entries_tag - find and return entries that match @tag @@ -2277,62 +2315,98 @@ out: EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU -/** - * page_cache_read - adds requested page to the page cache if not already there - * @file: file to read - * @offset: page index - * @gfp_mask: memory allocation flags - * - * This adds the requested page to the page cache if it isn't already there, - * and schedules an I/O to read in its contents from disk. - */ -static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) +#define MMAP_LOTSAMISS (100) +static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, + struct file *fpin) { - struct address_space *mapping = file->f_mapping; - struct page *page; - int ret; + int flags = vmf->flags; - do { - page = __page_cache_alloc(gfp_mask|__GFP_COLD); - if (!page) - return -ENOMEM; + if (fpin) + return fpin; - ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask); - if (ret == 0) - ret = mapping->a_ops->readpage(file, page); - else if (ret == -EEXIST) - ret = 0; /* losing race to add is OK */ + /* + * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or + * anything, so we only pin the file and drop the mmap_sem if only + * FAULT_FLAG_ALLOW_RETRY is set. + */ + if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == + FAULT_FLAG_ALLOW_RETRY) { + fpin = get_file(vmf->vma->vm_file); + up_read(&vmf->vma->vm_mm->mmap_sem); + } + return fpin; +} - put_page(page); +/* + * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem + * @vmf - the vm_fault for this fault. + * @page - the page to lock. + * @fpin - the pointer to the file we may pin (or is already pinned). + * + * This works similar to lock_page_or_retry in that it can drop the mmap_sem. + * It differs in that it actually returns the page locked if it returns 1 and 0 + * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin + * will point to the pinned file and needs to be fput()'ed at a later point. + */ +static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, + struct file **fpin) +{ + if (trylock_page(page)) + return 1; - } while (ret == AOP_TRUNCATED_PAGE); + /* + * NOTE! This will make us return with VM_FAULT_RETRY, but with + * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT + * is supposed to work. We have way too many special cases.. + */ + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) + return 0; - return ret; + *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); + if (vmf->flags & FAULT_FLAG_KILLABLE) { + if (__lock_page_killable(page)) { + /* + * We didn't have the right flags to drop the mmap_sem, + * but all fault_handlers only check for fatal signals + * if we return VM_FAULT_RETRY, so we need to drop the + * mmap_sem here and return 0 if we don't have a fpin. + */ + if (*fpin == NULL) + up_read(&vmf->vma->vm_mm->mmap_sem); + return 0; + } + } else + __lock_page(page); + return 1; } -#define MMAP_LOTSAMISS (100) /* - * Synchronous readahead happens when we don't even find - * a page in the page cache at all. + * Synchronous readahead happens when we don't even find a page in the page + * cache at all. We don't want to perform IO under the mmap sem, so if we have + * to drop the mmap sem we return the file that was pinned in order for us to do + * that. If we didn't pin a file then we return NULL. The file that is + * returned needs to be fput()'ed when we're done with it. */ -static void do_sync_mmap_readahead(struct vm_area_struct *vma, - struct file_ra_state *ra, - struct file *file, - pgoff_t offset) +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) { + struct file *file = vmf->vma->vm_file; + struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + struct file *fpin = NULL; + pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ - if (vma->vm_flags & VM_RAND_READ) - return; + if (vmf->vma->vm_flags & VM_RAND_READ) + return fpin; if (!ra->ra_pages) - return; + return fpin; - if (vma->vm_flags & VM_SEQ_READ) { + if (vmf->vma->vm_flags & VM_SEQ_READ) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); - return; + return fpin; } /* Avoid banging the cache line if not needed */ @@ -2344,37 +2418,44 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, * stop bothering with read-ahead. It will only hurt. */ if (ra->mmap_miss > MMAP_LOTSAMISS) - return; + return fpin; /* * mmap read-around */ + fpin = maybe_unlock_mmap_for_io(vmf, fpin); ra->start = max_t(long, 0, offset - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ra_submit(ra, mapping, file); + return fpin; } /* * Asynchronous readahead happens when we find the page and PG_readahead, - * so we want to possibly extend the readahead further.. + * so we want to possibly extend the readahead further. We return the file that + * was pinned if we have to drop the mmap_sem in order to do IO. */ -static void do_async_mmap_readahead(struct vm_area_struct *vma, - struct file_ra_state *ra, - struct file *file, - struct page *page, - pgoff_t offset) +static struct file *do_async_mmap_readahead(struct vm_fault *vmf, + struct page *page) { + struct file *file = vmf->vma->vm_file; + struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + struct file *fpin = NULL; + pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ - if (vma->vm_flags & VM_RAND_READ) - return; + if (vmf->vma->vm_flags & VM_RAND_READ) + return fpin; if (ra->mmap_miss > 0) ra->mmap_miss--; - if (PageReadahead(page)) + if (PageReadahead(page)) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_readahead(mapping, ra, file, page, offset, ra->ra_pages); + } + return fpin; } /** @@ -2404,6 +2485,7 @@ int filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; + struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; @@ -2425,23 +2507,26 @@ int filemap_fault(struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - do_async_mmap_readahead(vmf->vma, ra, file, page, offset); + fpin = do_async_mmap_readahead(vmf, page); } else if (!page) { /* No page in the page cache at all */ - do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; + fpin = do_sync_mmap_readahead(vmf); retry_find: - page = find_get_page(mapping, offset); - if (!page) - goto no_cached_page; + page = pagecache_get_page(mapping, offset, + FGP_CREAT|FGP_FOR_MMAP, + vmf->gfp_mask); + if (!page) { + if (fpin) + goto out_retry; + return VM_FAULT_OOM; + } } - if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { - put_page(page); - return ret | VM_FAULT_RETRY; - } + if (!lock_page_maybe_drop_mmap(vmf, page, &fpin)) + goto out_retry; /* Did it get truncated? */ if (unlikely(page->mapping != mapping)) { @@ -2459,6 +2544,16 @@ retry_find: goto page_not_uptodate; /* + * We've made it this far and we had to drop our mmap_sem, now is the + * time to return to the upper layer and have it re-find the vma and + * redo the fault. + */ + if (fpin) { + unlock_page(page); + goto out_retry; + } + + /* * Found the page and have a reference on it. * We must recheck i_size under page lock. */ @@ -2472,30 +2567,6 @@ retry_find: vmf->page = page; return ret | VM_FAULT_LOCKED; -no_cached_page: - /* - * We're only likely to ever get here if MADV_RANDOM is in - * effect. - */ - error = page_cache_read(file, offset, vmf->gfp_mask); - - /* - * The page we want has now been added to the page cache. - * In the unlikely event that someone removed it in the - * meantime, we'll just come back here and read it again. - */ - if (error >= 0) - goto retry_find; - - /* - * An error return from page_cache_read can result if the - * system is low on memory, or a problem occurs while trying - * to schedule I/O. - */ - if (error == -ENOMEM) - return VM_FAULT_OOM; - return VM_FAULT_SIGBUS; - page_not_uptodate: /* * Umm, take care of errors if the page isn't up-to-date. @@ -2504,12 +2575,15 @@ page_not_uptodate: * and we need to check for errors. */ ClearPageError(page); + fpin = maybe_unlock_mmap_for_io(vmf, fpin); error = mapping->a_ops->readpage(file, page); if (!error) { wait_on_page_locked(page); if (!PageUptodate(page)) error = -EIO; } + if (fpin) + goto out_retry; put_page(page); if (!error || error == AOP_TRUNCATED_PAGE) @@ -2518,6 +2592,18 @@ page_not_uptodate: /* Things didn't work out. Return zero to tell the mm layer so. */ shrink_readahead_size_eio(file, ra); return VM_FAULT_SIGBUS; + +out_retry: + /* + * We dropped the mmap_sem, we need to return to the fault handler to + * re-find the vma and come back and find our hopefully still populated + * page. + */ + if (page) + put_page(page); + if (fpin) + fput(fpin); + return ret | VM_FAULT_RETRY; } EXPORT_SYMBOL(filemap_fault); @@ -2688,7 +2774,7 @@ static struct page *wait_on_page_read(struct page *page) static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *, struct page *), + int (*filler)(struct file *, struct page *), void *data, gfp_t gfp) { @@ -2795,7 +2881,7 @@ out: */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *, struct page *), + int (*filler)(struct file *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); @@ -2817,7 +2903,7 @@ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { - filler_t *filler = (filler_t *)mapping->a_ops->readpage; + filler_t *filler = mapping->a_ops->readpage; return do_read_cache_page(mapping, index, filler, NULL, gfp); } @@ -2837,6 +2923,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) unsigned long limit = rlimit(RLIMIT_FSIZE); loff_t pos; + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + if (!iov_iter_count(from)) return 0; diff --git a/mm/frame_vector.c b/mm/frame_vector.c index c64dca6e27c2..c431ca81dad5 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -46,6 +46,8 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) nr_frames = vec->nr_allocated; + start = untagged_addr(start); + down_read(&mm->mmap_sem); locked = 1; vma = find_vma_intersection(mm, start, start + 1); @@ -662,6 +662,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (!nr_pages) return 0; + start = untagged_addr(start); + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); /* @@ -816,6 +818,8 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; int ret, major = 0; + address = untagged_addr(address); + if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 283963032eff..5facac263442 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2330,6 +2330,7 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | + (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | (1L << PG_dirty))); diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 3289db38bc87..613dfe681e9f 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,11 +1,24 @@ # SPDX-License-Identifier: GPL-2.0 KASAN_SANITIZE := n -UBSAN_SANITIZE_kasan.o := n +UBSAN_SANITIZE_common.o := n +UBSAN_SANITIZE_generic.o := n +UBSAN_SANITIZE_generic_report.o := n +UBSAN_SANITIZE_tags.o := n KCOV_INSTRUMENT := n -CFLAGS_REMOVE_kasan.o = -pg +CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_generic.o = -pg +CFLAGS_REMOVE_generic_report.o = -pg +CFLAGS_REMOVE_tags.o = -pg + # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 -CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -obj-y := kasan.o report.o kasan_init.o quarantine.o +CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) + +obj-$(CONFIG_KASAN) := common.o init.o report.o +obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o +obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o diff --git a/mm/kasan/kasan.c b/mm/kasan/common.c index 71a4319256b6..b888980f011d 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/common.c @@ -1,11 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * This file contains shadow memory manipulation code. + * This file contains common generic and tag-based KASAN code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * * Some code borrowed from https://github.com/xairy/kasan-prototype by - * Andrey Konovalov <adech.fo@gmail.com> + * Andrey Konovalov <andreyknvl@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -13,8 +14,7 @@ * */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define DISABLE_BRANCH_PROFILING +#define __KASAN_INTERNAL #include <linux/export.h> #include <linux/interrupt.h> @@ -36,10 +36,55 @@ #include <linux/types.h> #include <linux/vmalloc.h> #include <linux/bug.h> +#include <linux/uaccess.h> #include "kasan.h" #include "../slab.h" +static inline int in_irqentry_text(unsigned long ptr) +{ + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); +} + +static inline void filter_irq_stacks(struct stack_trace *trace) +{ + int i; + + if (!trace->nr_entries) + return; + for (i = 0; i < trace->nr_entries; i++) + if (in_irqentry_text(trace->entries[i])) { + /* Include the irqentry function into the stack. */ + trace->nr_entries = i + 1; + break; + } +} + +static inline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[KASAN_STACK_DEPTH]; + struct stack_trace trace = { + .nr_entries = 0, + .entries = entries, + .max_entries = KASAN_STACK_DEPTH, + .skip = 0 + }; + + save_stack_trace(&trace); + filter_irq_stacks(&trace); + + return depot_save_stack(&trace, flags); +} + +static inline void set_track(struct kasan_track *track, gfp_t flags) +{ + track->pid = current->pid; + track->stack = save_stack(flags); +} + void kasan_enable_current(void) { current->kasan_depth++; @@ -50,27 +95,85 @@ void kasan_disable_current(void) current->kasan_depth--; } +void kasan_check_read(const volatile void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, false, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_read); + +void kasan_check_write(const volatile void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, true, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_write); + +#undef memset +void *memset(void *addr, int c, size_t len) +{ + check_memory_region((unsigned long)addr, len, true, _RET_IP_); + + return __memset(addr, c, len); +} + +#undef memmove +void *memmove(void *dest, const void *src, size_t len) +{ + check_memory_region((unsigned long)src, len, false, _RET_IP_); + check_memory_region((unsigned long)dest, len, true, _RET_IP_); + + return __memmove(dest, src, len); +} + +#undef memcpy +void *memcpy(void *dest, const void *src, size_t len) +{ + check_memory_region((unsigned long)src, len, false, _RET_IP_); + check_memory_region((unsigned long)dest, len, true, _RET_IP_); + + return __memcpy(dest, src, len); +} + /* * Poisons the shadow memory for 'size' bytes starting from 'addr'. * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. */ -static void kasan_poison_shadow(const void *address, size_t size, u8 value) +void kasan_poison_shadow(const void *address, size_t size, u8 value) { void *shadow_start, *shadow_end; + /* + * Perform shadow offset calculation based on untagged address, as + * some of the callers (e.g. kasan_poison_object_data) pass tagged + * addresses to this function. + */ + address = reset_tag(address); + shadow_start = kasan_mem_to_shadow(address); shadow_end = kasan_mem_to_shadow(address + size); - memset(shadow_start, value, shadow_end - shadow_start); + __memset(shadow_start, value, shadow_end - shadow_start); } void kasan_unpoison_shadow(const void *address, size_t size) { - kasan_poison_shadow(address, size, 0); + u8 tag = get_tag(address); + + /* + * Perform shadow offset calculation based on untagged address, as + * some of the callers (e.g. kasan_unpoison_object_data) pass tagged + * addresses to this function. + */ + address = reset_tag(address); + + kasan_poison_shadow(address, size, tag); if (size & KASAN_SHADOW_MASK) { u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); - *shadow = size & KASAN_SHADOW_MASK; + + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + *shadow = tag; + else + *shadow = size & KASAN_SHADOW_MASK; } } @@ -116,199 +219,18 @@ void kasan_unpoison_stack_above_sp_to(const void *watermark) kasan_unpoison_shadow(sp, size); } -/* - * All functions below always inlined so compiler could - * perform better optimizations in each of __asan_loadX/__assn_storeX - * depending on memory access size X. - */ - -static __always_inline bool memory_is_poisoned_1(unsigned long addr) -{ - s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); - - if (unlikely(shadow_value)) { - s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; - return unlikely(last_accessible_byte >= shadow_value); - } - - return false; -} - -static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, - unsigned long size) -{ - u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); - - /* - * Access crosses 8(shadow size)-byte boundary. Such access maps - * into 2 shadow bytes, so we need to check them both. - */ - if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1)) - return *shadow_addr || memory_is_poisoned_1(addr + size - 1); - - return memory_is_poisoned_1(addr + size - 1); -} - -static __always_inline bool memory_is_poisoned_16(unsigned long addr) -{ - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); - - /* Unaligned 16-bytes access maps into 3 shadow bytes. */ - if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) - return *shadow_addr || memory_is_poisoned_1(addr + 15); - - return *shadow_addr; -} - -static __always_inline unsigned long bytes_is_nonzero(const u8 *start, - size_t size) -{ - while (size) { - if (unlikely(*start)) - return (unsigned long)start; - start++; - size--; - } - - return 0; -} - -static __always_inline unsigned long memory_is_nonzero(const void *start, - const void *end) -{ - unsigned int words; - unsigned long ret; - unsigned int prefix = (unsigned long)start % 8; - - if (end - start <= 16) - return bytes_is_nonzero(start, end - start); - - if (prefix) { - prefix = 8 - prefix; - ret = bytes_is_nonzero(start, prefix); - if (unlikely(ret)) - return ret; - start += prefix; - } - - words = (end - start) / 8; - while (words) { - if (unlikely(*(u64 *)start)) - return bytes_is_nonzero(start, 8); - start += 8; - words--; - } - - return bytes_is_nonzero(start, (end - start) % 8); -} - -static __always_inline bool memory_is_poisoned_n(unsigned long addr, - size_t size) -{ - unsigned long ret; - - ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), - kasan_mem_to_shadow((void *)addr + size - 1) + 1); - - if (unlikely(ret)) { - unsigned long last_byte = addr + size - 1; - s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); - - if (unlikely(ret != (unsigned long)last_shadow || - ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) - return true; - } - return false; -} - -static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) -{ - if (__builtin_constant_p(size)) { - switch (size) { - case 1: - return memory_is_poisoned_1(addr); - case 2: - case 4: - case 8: - return memory_is_poisoned_2_4_8(addr, size); - case 16: - return memory_is_poisoned_16(addr); - default: - BUILD_BUG(); - } - } - - return memory_is_poisoned_n(addr, size); -} - -static __always_inline void check_memory_region_inline(unsigned long addr, - size_t size, bool write, - unsigned long ret_ip) +void kasan_alloc_pages(struct page *page, unsigned int order) { - if (unlikely(size == 0)) - return; - - if (unlikely((void *)addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { - kasan_report(addr, size, write, ret_ip); - return; - } + u8 tag; + unsigned long i; - if (likely(!memory_is_poisoned(addr, size))) + if (unlikely(PageHighMem(page))) return; - kasan_report(addr, size, write, ret_ip); -} - -static void check_memory_region(unsigned long addr, - size_t size, bool write, - unsigned long ret_ip) -{ - check_memory_region_inline(addr, size, write, ret_ip); -} - -void kasan_check_read(const volatile void *p, unsigned int size) -{ - check_memory_region((unsigned long)p, size, false, _RET_IP_); -} -EXPORT_SYMBOL(kasan_check_read); - -void kasan_check_write(const volatile void *p, unsigned int size) -{ - check_memory_region((unsigned long)p, size, true, _RET_IP_); -} -EXPORT_SYMBOL(kasan_check_write); - -#undef memset -void *memset(void *addr, int c, size_t len) -{ - check_memory_region((unsigned long)addr, len, true, _RET_IP_); - - return __memset(addr, c, len); -} - -#undef memmove -void *memmove(void *dest, const void *src, size_t len) -{ - check_memory_region((unsigned long)src, len, false, _RET_IP_); - check_memory_region((unsigned long)dest, len, true, _RET_IP_); - - return __memmove(dest, src, len); -} - -#undef memcpy -void *memcpy(void *dest, const void *src, size_t len) -{ - check_memory_region((unsigned long)src, len, false, _RET_IP_); - check_memory_region((unsigned long)dest, len, true, _RET_IP_); - - return __memcpy(dest, src, len); -} - -void kasan_alloc_pages(struct page *page, unsigned int order) -{ - if (likely(!PageHighMem(page))) - kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); + tag = random_tag(); + for (i = 0; i < (1 << order); i++) + page_kasan_tag_set(page + i, tag); + kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); } void kasan_free_pages(struct page *page, unsigned int order) @@ -323,9 +245,12 @@ void kasan_free_pages(struct page *page, unsigned int order) * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. */ -static size_t optimal_redzone(size_t object_size) +static inline unsigned int optimal_redzone(unsigned int object_size) { - int rz = + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + return 0; + + return object_size <= 64 - 16 ? 16 : object_size <= 128 - 32 ? 32 : object_size <= 512 - 64 ? 64 : @@ -333,33 +258,34 @@ static size_t optimal_redzone(size_t object_size) object_size <= (1 << 14) - 256 ? 256 : object_size <= (1 << 15) - 512 ? 512 : object_size <= (1 << 16) - 1024 ? 1024 : 2048; - return rz; } -void kasan_cache_create(struct kmem_cache *cache, size_t *size, +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, unsigned long *flags) { + unsigned int orig_size = *size; + unsigned int redzone_size; int redzone_adjust; - int orig_size = *size; /* Add alloc meta. */ cache->kasan_info.alloc_meta_offset = *size; *size += sizeof(struct kasan_alloc_meta); /* Add free meta. */ - if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || - cache->object_size < sizeof(struct kasan_free_meta)) { + if (IS_ENABLED(CONFIG_KASAN_GENERIC) && + (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta))) { cache->kasan_info.free_meta_offset = *size; *size += sizeof(struct kasan_free_meta); } - redzone_adjust = optimal_redzone(cache->object_size) - - (*size - cache->object_size); + redzone_size = optimal_redzone(cache->object_size); + redzone_adjust = redzone_size - (*size - cache->object_size); if (redzone_adjust > 0) *size += redzone_adjust; - *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size + - optimal_redzone(cache->object_size))); + *size = min_t(unsigned int, KMALLOC_MAX_SIZE, + max(*size, cache->object_size + redzone_size)); /* * If the metadata doesn't fit, don't enable KASAN at all. @@ -375,26 +301,34 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, *flags |= SLAB_KASAN; } -void kasan_cache_shrink(struct kmem_cache *cache) +size_t kasan_metadata_size(struct kmem_cache *cache) { - quarantine_remove_cache(cache); + return (cache->kasan_info.alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + (cache->kasan_info.free_meta_offset ? + sizeof(struct kasan_free_meta) : 0); } -void kasan_cache_shutdown(struct kmem_cache *cache) +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object) { - quarantine_remove_cache(cache); + BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); + return (void *)object + cache->kasan_info.alloc_meta_offset; } -size_t kasan_metadata_size(struct kmem_cache *cache) +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object) { - return (cache->kasan_info.alloc_meta_offset ? - sizeof(struct kasan_alloc_meta) : 0) + - (cache->kasan_info.free_meta_offset ? - sizeof(struct kasan_free_meta) : 0); + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); + return (void *)object + cache->kasan_info.free_meta_offset; } void kasan_poison_slab(struct page *page) { + unsigned long i; + + for (i = 0; i < (1 << compound_order(page)); i++) + page_kasan_tag_reset(page + i); kasan_poison_shadow(page_address(page), PAGE_SIZE << compound_order(page), KASAN_KMALLOC_REDZONE); @@ -412,147 +346,178 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) KASAN_KMALLOC_REDZONE); } -static inline int in_irqentry_text(unsigned long ptr) -{ - return (ptr >= (unsigned long)&__irqentry_text_start && - ptr < (unsigned long)&__irqentry_text_end) || - (ptr >= (unsigned long)&__softirqentry_text_start && - ptr < (unsigned long)&__softirqentry_text_end); -} - -static inline void filter_irq_stacks(struct stack_trace *trace) -{ - int i; - - if (!trace->nr_entries) - return; - for (i = 0; i < trace->nr_entries; i++) - if (in_irqentry_text(trace->entries[i])) { - /* Include the irqentry function into the stack. */ - trace->nr_entries = i + 1; - break; - } -} - -static inline depot_stack_handle_t save_stack(gfp_t flags) -{ - unsigned long entries[KASAN_STACK_DEPTH]; - struct stack_trace trace = { - .nr_entries = 0, - .entries = entries, - .max_entries = KASAN_STACK_DEPTH, - .skip = 0 - }; - - save_stack_trace(&trace); - filter_irq_stacks(&trace); - if (trace.nr_entries != 0 && - trace.entries[trace.nr_entries-1] == ULONG_MAX) - trace.nr_entries--; - - return depot_save_stack(&trace, flags); -} - -static inline void set_track(struct kasan_track *track, gfp_t flags) -{ - track->pid = current->pid; - track->stack = save_stack(flags); -} - -struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, - const void *object) +/* + * This function assigns a tag to an object considering the following: + * 1. A cache might have a constructor, which might save a pointer to a slab + * object somewhere (e.g. in the object itself). We preassign a tag for + * each object in caches with constructors during slab creation and reuse + * the same tag each time a particular object is allocated. + * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be + * accessed after being freed. We preassign tags for objects in these + * caches as well. + * 3. For SLAB allocator we can't preassign tags randomly since the freelist + * is stored as an array of indexes instead of a linked list. Assign tags + * based on objects indexes, so that objects that are next to each other + * get different tags. + */ +static u8 assign_tag(struct kmem_cache *cache, const void *object, + bool init, bool keep_tag) { - BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); - return (void *)object + cache->kasan_info.alloc_meta_offset; -} + /* + * 1. When an object is kmalloc()'ed, two hooks are called: + * kasan_slab_alloc() and kasan_kmalloc(). We assign the + * tag only in the first one. + * 2. We reuse the same tag for krealloc'ed objects. + */ + if (keep_tag) + return get_tag(object); -struct kasan_free_meta *get_free_info(struct kmem_cache *cache, - const void *object) -{ - BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - return (void *)object + cache->kasan_info.free_meta_offset; + /* + * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU + * set, assign a tag when the object is being allocated (init == false). + */ + if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) + return init ? KASAN_TAG_KERNEL : random_tag(); + + /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ +#ifdef CONFIG_SLAB + /* For SLAB assign tags based on the object index in the freelist. */ + return (u8)obj_to_index(cache, virt_to_page(object), (void *)object); +#else + /* + * For SLUB assign a random tag during slab creation, otherwise reuse + * the already assigned tag. + */ + return init ? random_tag() : get_tag(object); +#endif } -void kasan_init_slab_obj(struct kmem_cache *cache, const void *object) +void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, + const void *object) { struct kasan_alloc_meta *alloc_info; if (!(cache->flags & SLAB_KASAN)) - return; + return (void *)object; alloc_info = get_alloc_info(cache, object); __memset(alloc_info, 0, sizeof(*alloc_info)); -} -void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) -{ - kasan_kmalloc(cache, object, cache->object_size, flags); + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + object = set_tag(object, + assign_tag(cache, object, true, false)); + + return (void *)object; } -static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) +static inline bool shadow_invalid(u8 tag, s8 shadow_byte) { - unsigned long size = cache->object_size; - unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + return shadow_byte < 0 || + shadow_byte >= KASAN_SHADOW_SCALE_SIZE; - /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) - return; + /* else CONFIG_KASAN_SW_TAGS: */ + if ((u8)shadow_byte == KASAN_TAG_INVALID) + return true; + if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte)) + return true; - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); + return false; } -bool kasan_slab_free(struct kmem_cache *cache, void *object) +static bool __kasan_slab_free(struct kmem_cache *cache, void *object, + unsigned long ip, bool quarantine) { s8 shadow_byte; + u8 tag; + void *tagged_object; + unsigned long rounded_up_size; + + tag = get_tag(object); + tagged_object = object; + object = reset_tag(object); + + if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != + object)) { + kasan_report_invalid_free(tagged_object, ip); + return true; + } /* RCU slabs could be legally used after free within the RCU period */ if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); - if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { - kasan_report_double_free(cache, object, - __builtin_return_address(1)); + if (shadow_invalid(tag, shadow_byte)) { + kasan_report_invalid_free(tagged_object, ip); return true; } - kasan_poison_slab_free(cache, object); + rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); - if (unlikely(!(cache->flags & SLAB_KASAN))) + if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) || + unlikely(!(cache->flags & SLAB_KASAN))) return false; set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); quarantine_put(get_free_info(cache, object), cache); - return true; + + return IS_ENABLED(CONFIG_KASAN_GENERIC); +} + +bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) +{ + return __kasan_slab_free(cache, object, ip, true); } -void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, - gfp_t flags) +static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags, bool keep_tag) { unsigned long redzone_start; unsigned long redzone_end; + u8 tag = 0xff; if (gfpflags_allow_blocking(flags)) quarantine_reduce(); if (unlikely(object == NULL)) - return; + return NULL; redzone_start = round_up((unsigned long)(object + size), KASAN_SHADOW_SCALE_SIZE); redzone_end = round_up((unsigned long)object + cache->object_size, KASAN_SHADOW_SCALE_SIZE); - kasan_unpoison_shadow(object, size); + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + tag = assign_tag(cache, object, false, keep_tag); + + /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ + kasan_unpoison_shadow(set_tag(object, tag), size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); if (cache->flags & SLAB_KASAN) set_track(&get_alloc_info(cache, object)->alloc_track, flags); + + return set_tag(object, tag); +} + +void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object, + gfp_t flags) +{ + return __kasan_kmalloc(cache, object, cache->object_size, flags, false); +} + +void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags) +{ + return __kasan_kmalloc(cache, object, size, flags, true); } EXPORT_SYMBOL(kasan_kmalloc); -void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) +void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, + gfp_t flags) { struct page *page; unsigned long redzone_start; @@ -562,7 +527,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) quarantine_reduce(); if (unlikely(ptr == NULL)) - return; + return NULL; page = virt_to_page(ptr); redzone_start = round_up((unsigned long)(ptr + size), @@ -572,42 +537,49 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) kasan_unpoison_shadow(ptr, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_PAGE_REDZONE); + + return (void *)ptr; } -void kasan_krealloc(const void *object, size_t size, gfp_t flags) +void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) { struct page *page; if (unlikely(object == ZERO_SIZE_PTR)) - return; + return (void *)object; page = virt_to_head_page(object); if (unlikely(!PageSlab(page))) - kasan_kmalloc_large(object, size, flags); + return kasan_kmalloc_large(object, size, flags); else - kasan_kmalloc(page->slab_cache, object, size, flags); + return __kasan_kmalloc(page->slab_cache, object, size, + flags, true); } -void kasan_poison_kfree(void *ptr) +void kasan_poison_kfree(void *ptr, unsigned long ip) { struct page *page; page = virt_to_head_page(ptr); - if (unlikely(!PageSlab(page))) + if (unlikely(!PageSlab(page))) { + if (ptr != page_address(page)) { + kasan_report_invalid_free(ptr, ip); + return; + } kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), KASAN_FREE_PAGE); - else - kasan_poison_slab_free(page->slab_cache, ptr); + } else { + __kasan_slab_free(page->slab_cache, ptr, ip, false); + } } -void kasan_kfree_large(const void *ptr) +void kasan_kfree_large(void *ptr, unsigned long ip) { - struct page *page = virt_to_page(ptr); - - kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), - KASAN_FREE_PAGE); + if (ptr != page_address(virt_to_head_page(ptr))) + kasan_report_invalid_free(ptr, ip); + /* The object will be poisoned by page_alloc. */ } int kasan_module_alloc(void *addr, size_t size) @@ -626,11 +598,12 @@ int kasan_module_alloc(void *addr, size_t size) ret = __vmalloc_node_range(shadow_size, 1, shadow_start, shadow_start + shadow_size, - GFP_KERNEL | __GFP_ZERO, + GFP_KERNEL, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); if (ret) { + __memset(ret, KASAN_SHADOW_INIT, shadow_size); find_vm_area(addr)->flags |= VM_KASAN; kmemleak_ignore(ret); return 0; @@ -645,97 +618,14 @@ void kasan_free_shadow(const struct vm_struct *vm) vfree(kasan_mem_to_shadow(vm->addr)); } -static void register_global(struct kasan_global *global) -{ - size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); - - kasan_unpoison_shadow(global->beg, global->size); - - kasan_poison_shadow(global->beg + aligned_size, - global->size_with_redzone - aligned_size, - KASAN_GLOBAL_REDZONE); -} - -void __asan_register_globals(struct kasan_global *globals, size_t size) -{ - int i; - - for (i = 0; i < size; i++) - register_global(&globals[i]); -} -EXPORT_SYMBOL(__asan_register_globals); - -void __asan_unregister_globals(struct kasan_global *globals, size_t size) -{ -} -EXPORT_SYMBOL(__asan_unregister_globals); - -#define DEFINE_ASAN_LOAD_STORE(size) \ - void __asan_load##size(unsigned long addr) \ - { \ - check_memory_region_inline(addr, size, false, _RET_IP_);\ - } \ - EXPORT_SYMBOL(__asan_load##size); \ - __alias(__asan_load##size) \ - void __asan_load##size##_noabort(unsigned long); \ - EXPORT_SYMBOL(__asan_load##size##_noabort); \ - void __asan_store##size(unsigned long addr) \ - { \ - check_memory_region_inline(addr, size, true, _RET_IP_); \ - } \ - EXPORT_SYMBOL(__asan_store##size); \ - __alias(__asan_store##size) \ - void __asan_store##size##_noabort(unsigned long); \ - EXPORT_SYMBOL(__asan_store##size##_noabort) - -DEFINE_ASAN_LOAD_STORE(1); -DEFINE_ASAN_LOAD_STORE(2); -DEFINE_ASAN_LOAD_STORE(4); -DEFINE_ASAN_LOAD_STORE(8); -DEFINE_ASAN_LOAD_STORE(16); - -void __asan_loadN(unsigned long addr, size_t size) -{ - check_memory_region(addr, size, false, _RET_IP_); -} -EXPORT_SYMBOL(__asan_loadN); - -__alias(__asan_loadN) -void __asan_loadN_noabort(unsigned long, size_t); -EXPORT_SYMBOL(__asan_loadN_noabort); - -void __asan_storeN(unsigned long addr, size_t size) -{ - check_memory_region(addr, size, true, _RET_IP_); -} -EXPORT_SYMBOL(__asan_storeN); - -__alias(__asan_storeN) -void __asan_storeN_noabort(unsigned long, size_t); -EXPORT_SYMBOL(__asan_storeN_noabort); - -/* to shut up compiler complaints */ -void __asan_handle_no_return(void) {} -EXPORT_SYMBOL(__asan_handle_no_return); - -/* Emitted by compiler to poison large objects when they go out of scope. */ -void __asan_poison_stack_memory(const void *addr, size_t size) -{ - /* - * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded - * by redzones, so we simply round up size to simplify logic. - */ - kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), - KASAN_USE_AFTER_SCOPE); -} -EXPORT_SYMBOL(__asan_poison_stack_memory); +extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -/* Emitted by compiler to unpoison large objects when they go into scope. */ -void __asan_unpoison_stack_memory(const void *addr, size_t size) +void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { - kasan_unpoison_shadow(addr, size); + unsigned long flags = user_access_save(); + __kasan_report(addr, size, is_write, ip); + user_access_restore(flags); } -EXPORT_SYMBOL(__asan_unpoison_stack_memory); #ifdef CONFIG_MEMORY_HOTPLUG static bool shadow_mapped(unsigned long addr) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c new file mode 100644 index 000000000000..504c79363a34 --- /dev/null +++ b/mm/kasan/generic.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains core generic KASAN code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov <andreyknvl@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define DISABLE_BRANCH_PROFILING + +#include <linux/export.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/kmemleak.h> +#include <linux/linkage.h> +#include <linux/memblock.h> +#include <linux/memory.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/vmalloc.h> +#include <linux/bug.h> + +#include "kasan.h" +#include "../slab.h" + +/* + * All functions below always inlined so compiler could + * perform better optimizations in each of __asan_loadX/__assn_storeX + * depending on memory access size X. + */ + +static __always_inline bool memory_is_poisoned_1(unsigned long addr) +{ + s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(shadow_value)) { + s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; + return unlikely(last_accessible_byte >= shadow_value); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, + unsigned long size) +{ + u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); + + /* + * Access crosses 8(shadow size)-byte boundary. Such access maps + * into 2 shadow bytes, so we need to check them both. + */ + if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1)) + return *shadow_addr || memory_is_poisoned_1(addr + size - 1); + + return memory_is_poisoned_1(addr + size - 1); +} + +static __always_inline bool memory_is_poisoned_16(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + /* Unaligned 16-bytes access maps into 3 shadow bytes. */ + if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) + return *shadow_addr || memory_is_poisoned_1(addr + 15); + + return *shadow_addr; +} + +static __always_inline unsigned long bytes_is_nonzero(const u8 *start, + size_t size) +{ + while (size) { + if (unlikely(*start)) + return (unsigned long)start; + start++; + size--; + } + + return 0; +} + +static __always_inline unsigned long memory_is_nonzero(const void *start, + const void *end) +{ + unsigned int words; + unsigned long ret; + unsigned int prefix = (unsigned long)start % 8; + + if (end - start <= 16) + return bytes_is_nonzero(start, end - start); + + if (prefix) { + prefix = 8 - prefix; + ret = bytes_is_nonzero(start, prefix); + if (unlikely(ret)) + return ret; + start += prefix; + } + + words = (end - start) / 8; + while (words) { + if (unlikely(*(u64 *)start)) + return bytes_is_nonzero(start, 8); + start += 8; + words--; + } + + return bytes_is_nonzero(start, (end - start) % 8); +} + +static __always_inline bool memory_is_poisoned_n(unsigned long addr, + size_t size) +{ + unsigned long ret; + + ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), + kasan_mem_to_shadow((void *)addr + size - 1) + 1); + + if (unlikely(ret)) { + unsigned long last_byte = addr + size - 1; + s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); + + if (unlikely(ret != (unsigned long)last_shadow || + ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) + return true; + } + return false; +} + +static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) +{ + if (__builtin_constant_p(size)) { + switch (size) { + case 1: + return memory_is_poisoned_1(addr); + case 2: + case 4: + case 8: + return memory_is_poisoned_2_4_8(addr, size); + case 16: + return memory_is_poisoned_16(addr); + default: + BUILD_BUG(); + } + } + + return memory_is_poisoned_n(addr, size); +} + +static __always_inline void check_memory_region_inline(unsigned long addr, + size_t size, bool write, + unsigned long ret_ip) +{ + if (unlikely(size == 0)) + return; + + if (unlikely((void *)addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + kasan_report(addr, size, write, ret_ip); + return; + } + + if (likely(!memory_is_poisoned(addr, size))) + return; + + kasan_report(addr, size, write, ret_ip); +} + +void check_memory_region(unsigned long addr, size_t size, bool write, + unsigned long ret_ip) +{ + check_memory_region_inline(addr, size, write, ret_ip); +} + +void kasan_cache_shrink(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + +void kasan_cache_shutdown(struct kmem_cache *cache) +{ + if (!__kmem_cache_empty(cache)) + quarantine_remove_cache(cache); +} + +static void register_global(struct kasan_global *global) +{ + size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); + + kasan_unpoison_shadow(global->beg, global->size); + + kasan_poison_shadow(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE); +} + +void __asan_register_globals(struct kasan_global *globals, size_t size) +{ + int i; + + for (i = 0; i < size; i++) + register_global(&globals[i]); +} +EXPORT_SYMBOL(__asan_register_globals); + +void __asan_unregister_globals(struct kasan_global *globals, size_t size) +{ +} +EXPORT_SYMBOL(__asan_unregister_globals); + +#define DEFINE_ASAN_LOAD_STORE(size) \ + void __asan_load##size(unsigned long addr) \ + { \ + check_memory_region_inline(addr, size, false, _RET_IP_);\ + } \ + EXPORT_SYMBOL(__asan_load##size); \ + __alias(__asan_load##size) \ + void __asan_load##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_load##size##_noabort); \ + void __asan_store##size(unsigned long addr) \ + { \ + check_memory_region_inline(addr, size, true, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__asan_store##size); \ + __alias(__asan_store##size) \ + void __asan_store##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_store##size##_noabort) + +DEFINE_ASAN_LOAD_STORE(1); +DEFINE_ASAN_LOAD_STORE(2); +DEFINE_ASAN_LOAD_STORE(4); +DEFINE_ASAN_LOAD_STORE(8); +DEFINE_ASAN_LOAD_STORE(16); + +void __asan_loadN(unsigned long addr, size_t size) +{ + check_memory_region(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__asan_loadN); + +__alias(__asan_loadN) +void __asan_loadN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_loadN_noabort); + +void __asan_storeN(unsigned long addr, size_t size) +{ + check_memory_region(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__asan_storeN); + +__alias(__asan_storeN) +void __asan_storeN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_storeN_noabort); + +/* to shut up compiler complaints */ +void __asan_handle_no_return(void) {} +EXPORT_SYMBOL(__asan_handle_no_return); + +/* Emitted by compiler to poison alloca()ed objects. */ +void __asan_alloca_poison(unsigned long addr, size_t size) +{ + size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); + size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) - + rounded_up_size; + size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE); + + const void *left_redzone = (const void *)(addr - + KASAN_ALLOCA_REDZONE_SIZE); + const void *right_redzone = (const void *)(addr + rounded_up_size); + + WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); + + kasan_unpoison_shadow((const void *)(addr + rounded_down_size), + size - rounded_down_size); + kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_LEFT); + kasan_poison_shadow(right_redzone, + padding_size + KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_RIGHT); +} +EXPORT_SYMBOL(__asan_alloca_poison); + +/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */ +void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) +{ + if (unlikely(!stack_top || stack_top > stack_bottom)) + return; + + kasan_unpoison_shadow(stack_top, stack_bottom - stack_top); +} +EXPORT_SYMBOL(__asan_allocas_unpoison); + +/* Emitted by the compiler to [un]poison local variables. */ +#define DEFINE_ASAN_SET_SHADOW(byte) \ + void __asan_set_shadow_##byte(const void *addr, size_t size) \ + { \ + __memset((void *)addr, 0x##byte, size); \ + } \ + EXPORT_SYMBOL(__asan_set_shadow_##byte) + +DEFINE_ASAN_SET_SHADOW(00); +DEFINE_ASAN_SET_SHADOW(f1); +DEFINE_ASAN_SET_SHADOW(f2); +DEFINE_ASAN_SET_SHADOW(f3); +DEFINE_ASAN_SET_SHADOW(f5); +DEFINE_ASAN_SET_SHADOW(f8); diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c new file mode 100644 index 000000000000..36c645939bc9 --- /dev/null +++ b/mm/kasan/generic_report.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains generic KASAN specific error reporting code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov <andreyknvl@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/bitops.h> +#include <linux/ftrace.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/stackdepot.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/kasan.h> +#include <linux/module.h> + +#include <asm/sections.h> + +#include "kasan.h" +#include "../slab.h" + +void *find_first_bad_addr(void *addr, size_t size) +{ + void *p = addr; + + while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p))) + p += KASAN_SHADOW_SCALE_SIZE; + return p; +} + +static const char *get_shadow_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + u8 *shadow_addr; + + shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); + + /* + * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look + * at the next shadow byte to determine the type of the bad access. + */ + if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) + shadow_addr++; + + switch (*shadow_addr) { + case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + /* + * In theory it's still possible to see these shadow values + * due to a data race in the kernel code. + */ + bug_type = "out-of-bounds"; + break; + case KASAN_PAGE_REDZONE: + case KASAN_KMALLOC_REDZONE: + bug_type = "slab-out-of-bounds"; + break; + case KASAN_GLOBAL_REDZONE: + bug_type = "global-out-of-bounds"; + break; + case KASAN_STACK_LEFT: + case KASAN_STACK_MID: + case KASAN_STACK_RIGHT: + case KASAN_STACK_PARTIAL: + bug_type = "stack-out-of-bounds"; + break; + case KASAN_FREE_PAGE: + case KASAN_KMALLOC_FREE: + bug_type = "use-after-free"; + break; + case KASAN_ALLOCA_LEFT: + case KASAN_ALLOCA_RIGHT: + bug_type = "alloca-out-of-bounds"; + break; + } + + return bug_type; +} + +static const char *get_wild_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + + return bug_type; +} + +const char *get_bug_type(struct kasan_access_info *info) +{ + if (addr_has_shadow(info->access_addr)) + return get_shadow_bug_type(info); + return get_wild_bug_type(info); +} + +#define DEFINE_ASAN_REPORT_LOAD(size) \ +void __asan_report_load##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, false, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_load##size##_noabort) + +#define DEFINE_ASAN_REPORT_STORE(size) \ +void __asan_report_store##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, true, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_store##size##_noabort) + +DEFINE_ASAN_REPORT_LOAD(1); +DEFINE_ASAN_REPORT_LOAD(2); +DEFINE_ASAN_REPORT_LOAD(4); +DEFINE_ASAN_REPORT_LOAD(8); +DEFINE_ASAN_REPORT_LOAD(16); +DEFINE_ASAN_REPORT_STORE(1); +DEFINE_ASAN_REPORT_STORE(2); +DEFINE_ASAN_REPORT_STORE(4); +DEFINE_ASAN_REPORT_STORE(8); +DEFINE_ASAN_REPORT_STORE(16); + +void __asan_report_load_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_load_n_noabort); + +void __asan_report_store_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_store_n_noabort); diff --git a/mm/kasan/init.c b/mm/kasan/init.c new file mode 100644 index 000000000000..2b54351a6a3a --- /dev/null +++ b/mm/kasan/init.c @@ -0,0 +1,500 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains some kasan initialization code. + * + * Copyright (c) 2015 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/mm.h> +#include <linux/pfn.h> +#include <linux/slab.h> + +#include <asm/page.h> +#include <asm/pgalloc.h> + +#include "kasan.h" + +/* + * This page serves two purposes: + * - It used as early shadow memory. The entire shadow region populated + * with this page, before we will be able to setup normal shadow memory. + * - Latter it reused it as zero shadow to cover large ranges of memory + * that allowed to access, but not handled by kasan (vmalloc/vmemmap ...). + */ +unsigned char kasan_early_shadow_page[PAGE_SIZE] __page_aligned_bss; + +#if CONFIG_PGTABLE_LEVELS > 4 +p4d_t kasan_early_shadow_p4d[PTRS_PER_P4D] __page_aligned_bss; +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return pgd_page(pgd) == virt_to_page(lm_alias(kasan_early_shadow_p4d)); +} +#else +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return false; +} +#endif +#if CONFIG_PGTABLE_LEVELS > 3 +pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss; +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud)); +} +#else +static inline bool kasan_pud_table(p4d_t p4d) +{ + return false; +} +#endif +#if CONFIG_PGTABLE_LEVELS > 2 +pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss; +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd)); +} +#else +static inline bool kasan_pmd_table(pud_t pud) +{ + return false; +} +#endif +pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; + +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte)); +} + +static inline bool kasan_early_shadow_page_entry(pte_t pte) +{ + return pte_page(pte) == virt_to_page(lm_alias(kasan_early_shadow_page)); +} + +static __init void *early_alloc(size_t size, int node) +{ + return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, node); +} + +static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + pte_t zero_pte; + + zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_early_shadow_page)), + PAGE_KERNEL); + zero_pte = pte_wrprotect(zero_pte); + + while (addr + PAGE_SIZE <= end) { + set_pte_at(&init_mm, addr, pte, zero_pte); + addr += PAGE_SIZE; + pte = pte_offset_kernel(pmd, addr); + } +} + +static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, + unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, addr); + unsigned long next; + + do { + next = pmd_addr_end(addr, end); + + if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_early_shadow_pte)); + continue; + } + + if (pmd_none(*pmd)) { + pte_t *p; + + if (slab_is_available()) + p = pte_alloc_one_kernel(&init_mm, addr); + else + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, pmd, p); + } + zero_pte_populate(pmd, addr, next); + } while (pmd++, addr = next, addr != end); + + return 0; +} + +static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, + unsigned long end) +{ + pud_t *pud = pud_offset(p4d, addr); + unsigned long next; + + do { + next = pud_addr_end(addr, end); + if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { + pmd_t *pmd; + + pud_populate(&init_mm, pud, + lm_alias(kasan_early_shadow_pmd)); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_early_shadow_pte)); + continue; + } + + if (pud_none(*pud)) { + pmd_t *p; + + if (slab_is_available()) { + p = pmd_alloc(&init_mm, pud, addr); + if (!p) + return -ENOMEM; + } else { + pud_populate(&init_mm, pud, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + } + zero_pmd_populate(pud, addr, next); + } while (pud++, addr = next, addr != end); + + return 0; +} + +static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + unsigned long next; + + do { + next = p4d_addr_end(addr, end); + if (IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) { + pud_t *pud; + pmd_t *pmd; + + p4d_populate(&init_mm, p4d, + lm_alias(kasan_early_shadow_pud)); + pud = pud_offset(p4d, addr); + pud_populate(&init_mm, pud, + lm_alias(kasan_early_shadow_pmd)); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_early_shadow_pte)); + continue; + } + + if (p4d_none(*p4d)) { + pud_t *p; + + if (slab_is_available()) { + p = pud_alloc(&init_mm, p4d, addr); + if (!p) + return -ENOMEM; + } else { + p4d_populate(&init_mm, p4d, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + } + zero_pud_populate(p4d, addr, next); + } while (p4d++, addr = next, addr != end); + + return 0; +} + +/** + * kasan_populate_early_shadow - populate shadow memory region with + * kasan_early_shadow_page + * @shadow_start - start of the memory range to populate + * @shadow_end - end of the memory range to populate + */ +int __ref kasan_populate_early_shadow(const void *shadow_start, + const void *shadow_end) +{ + unsigned long addr = (unsigned long)shadow_start; + unsigned long end = (unsigned long)shadow_end; + pgd_t *pgd = pgd_offset_k(addr); + unsigned long next; + + do { + next = pgd_addr_end(addr, end); + + if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + /* + * kasan_early_shadow_pud should be populated with pmds + * at this moment. + * [pud,pmd]_populate*() below needed only for + * 3,2 - level page tables where we don't have + * puds,pmds, so pgd_populate(), pud_populate() + * is noops. + * + * The ifndef is required to avoid build breakage. + * + * With 5level-fixup.h, pgd_populate() is not nop and + * we reference kasan_early_shadow_p4d. It's not defined + * unless 5-level paging enabled. + * + * The ifndef can be dropped once all KASAN-enabled + * architectures will switch to pgtable-nop4d.h. + */ +#ifndef __ARCH_HAS_5LEVEL_HACK + pgd_populate(&init_mm, pgd, + lm_alias(kasan_early_shadow_p4d)); +#endif + p4d = p4d_offset(pgd, addr); + p4d_populate(&init_mm, p4d, + lm_alias(kasan_early_shadow_pud)); + pud = pud_offset(p4d, addr); + pud_populate(&init_mm, pud, + lm_alias(kasan_early_shadow_pmd)); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_early_shadow_pte)); + continue; + } + + if (pgd_none(*pgd)) { + p4d_t *p; + + if (slab_is_available()) { + p = p4d_alloc(&init_mm, pgd, addr); + if (!p) + return -ENOMEM; + } else { + pgd_populate(&init_mm, pgd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + } + zero_p4d_populate(pgd, addr, next); + } while (pgd++, addr = next, addr != end); + + return 0; +} + +static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd))); + pmd_clear(pmd); +} + +static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud))); + pud_clear(pud); +} + +static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d))); + p4d_clear(p4d); +} + +static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd) +{ + p4d_t *p4d; + int i; + + for (i = 0; i < PTRS_PER_P4D; i++) { + p4d = p4d_start + i; + if (!p4d_none(*p4d)) + return; + } + + p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd))); + pgd_clear(pgd); +} + +static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + if (WARN_ON(!kasan_early_shadow_page_entry(*pte))) + continue; + pte_clear(&init_mm, addr, pte); + } +} + +static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pmd++) { + pte_t *pte; + + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (kasan_pte_table(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) + pmd_clear(pmd); + continue; + } + pte = pte_offset_kernel(pmd, addr); + kasan_remove_pte_table(pte, addr, next); + kasan_free_pte(pte_offset_kernel(pmd, 0), pmd); + } +} + +static void kasan_remove_pud_table(pud_t *pud, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pud++) { + pmd_t *pmd, *pmd_base; + + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (kasan_pmd_table(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) + pud_clear(pud); + continue; + } + pmd = pmd_offset(pud, addr); + pmd_base = pmd_offset(pud, 0); + kasan_remove_pmd_table(pmd, addr, next); + kasan_free_pmd(pmd_base, pud); + } +} + +static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, p4d++) { + pud_t *pud; + + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + if (kasan_pud_table(*p4d)) { + if (IS_ALIGNED(addr, P4D_SIZE) && + IS_ALIGNED(next, P4D_SIZE)) + p4d_clear(p4d); + continue; + } + pud = pud_offset(p4d, addr); + kasan_remove_pud_table(pud, addr, next); + kasan_free_pud(pud_offset(p4d, 0), p4d); + } +} + +void kasan_remove_zero_shadow(void *start, unsigned long size) +{ + unsigned long addr, end, next; + pgd_t *pgd; + + addr = (unsigned long)kasan_mem_to_shadow(start); + end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return; + + for (; addr < end; addr = next) { + p4d_t *p4d; + + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + if (kasan_p4d_table(*pgd)) { + if (IS_ALIGNED(addr, PGDIR_SIZE) && + IS_ALIGNED(next, PGDIR_SIZE)) + pgd_clear(pgd); + continue; + } + + p4d = p4d_offset(pgd, addr); + kasan_remove_p4d_table(p4d, addr, next); + kasan_free_p4d(p4d_offset(pgd, 0), pgd); + } +} + +int kasan_add_zero_shadow(void *start, unsigned long size) +{ + int ret; + void *shadow_start, *shadow_end; + + shadow_start = kasan_mem_to_shadow(start); + shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return -EINVAL; + + ret = kasan_populate_early_shadow(shadow_start, shadow_end); + if (ret) + kasan_remove_zero_shadow(shadow_start, + size >> KASAN_SHADOW_SCALE_SHIFT); + return ret; +} diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c70851a9a6a4..3ce956efa0cb 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -8,10 +8,22 @@ #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) +#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */ +#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */ +#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */ + +#ifdef CONFIG_KASAN_GENERIC #define KASAN_FREE_PAGE 0xFF /* page was freed */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ +#else +#define KASAN_FREE_PAGE KASAN_TAG_INVALID +#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID +#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID +#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID +#endif + #define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ /* @@ -22,7 +34,14 @@ #define KASAN_STACK_MID 0xF2 #define KASAN_STACK_RIGHT 0xF3 #define KASAN_STACK_PARTIAL 0xF4 -#define KASAN_USE_AFTER_SCOPE 0xF8 + +/* + * alloca redzone shadow values + */ +#define KASAN_ALLOCA_LEFT 0xCA +#define KASAN_ALLOCA_RIGHT 0xCB + +#define KASAN_ALLOCA_REDZONE_SIZE 32 /* Don't break randconfig/all*config builds */ #ifndef KASAN_ABI_VERSION @@ -97,12 +116,25 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } +static inline bool addr_has_shadow(const void *addr) +{ + return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +} + +void kasan_poison_shadow(const void *address, size_t size, u8 value); + +void check_memory_region(unsigned long addr, size_t size, bool write, + unsigned long ret_ip); + +void *find_first_bad_addr(void *addr, size_t size); +const char *get_bug_type(struct kasan_access_info *info); + void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -void kasan_report_double_free(struct kmem_cache *cache, void *object, - void *ip); +void kasan_report_invalid_free(void *object, unsigned long ip); -#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) +#if defined(CONFIG_KASAN_GENERIC) && \ + (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); void quarantine_reduce(void); void quarantine_remove_cache(struct kmem_cache *cache); @@ -113,4 +145,80 @@ static inline void quarantine_reduce(void) { } static inline void quarantine_remove_cache(struct kmem_cache *cache) { } #endif +#ifdef CONFIG_KASAN_SW_TAGS + +void print_tags(u8 addr_tag, const void *addr); + +u8 random_tag(void); + +#else + +static inline void print_tags(u8 addr_tag, const void *addr) { } + +static inline u8 random_tag(void) +{ + return 0; +} + +#endif + +#ifndef arch_kasan_set_tag +static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) +{ + return addr; +} +#endif +#ifndef arch_kasan_reset_tag +#define arch_kasan_reset_tag(addr) ((void *)(addr)) +#endif +#ifndef arch_kasan_get_tag +#define arch_kasan_get_tag(addr) 0 +#endif + +#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag))) +#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr)) +#define get_tag(addr) arch_kasan_get_tag(addr) + +/* + * Exported functions for interfaces called from assembly or from generated + * code. Declarations here to avoid warning about missing declarations. + */ +asmlinkage void kasan_unpoison_task_stack_below(const void *watermark); +void __asan_register_globals(struct kasan_global *globals, size_t size); +void __asan_unregister_globals(struct kasan_global *globals, size_t size); +void __asan_loadN(unsigned long addr, size_t size); +void __asan_storeN(unsigned long addr, size_t size); +void __asan_handle_no_return(void); +void __asan_alloca_poison(unsigned long addr, size_t size); +void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); + +void __asan_load1(unsigned long addr); +void __asan_store1(unsigned long addr); +void __asan_load2(unsigned long addr); +void __asan_store2(unsigned long addr); +void __asan_load4(unsigned long addr); +void __asan_store4(unsigned long addr); +void __asan_load8(unsigned long addr); +void __asan_store8(unsigned long addr); +void __asan_load16(unsigned long addr); +void __asan_store16(unsigned long addr); + +void __asan_load1_noabort(unsigned long addr); +void __asan_store1_noabort(unsigned long addr); +void __asan_load2_noabort(unsigned long addr); +void __asan_store2_noabort(unsigned long addr); +void __asan_load4_noabort(unsigned long addr); +void __asan_store4_noabort(unsigned long addr); +void __asan_load8_noabort(unsigned long addr); +void __asan_store8_noabort(unsigned long addr); +void __asan_load16_noabort(unsigned long addr); +void __asan_store16_noabort(unsigned long addr); + +void __asan_set_shadow_00(const void *addr, size_t size); +void __asan_set_shadow_f1(const void *addr, size_t size); +void __asan_set_shadow_f2(const void *addr, size_t size); +void __asan_set_shadow_f3(const void *addr, size_t size); +void __asan_set_shadow_f5(const void *addr, size_t size); +void __asan_set_shadow_f8(const void *addr, size_t size); + #endif diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c deleted file mode 100644 index 554e4c0f23a2..000000000000 --- a/mm/kasan/kasan_init.c +++ /dev/null @@ -1,199 +0,0 @@ -/* - * This file contains some kasan initialization code. - * - * Copyright (c) 2015 Samsung Electronics Co., Ltd. - * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ - -#include <linux/bootmem.h> -#include <linux/init.h> -#include <linux/kasan.h> -#include <linux/kernel.h> -#include <linux/memblock.h> -#include <linux/mm.h> -#include <linux/pfn.h> - -#include <asm/page.h> -#include <asm/pgalloc.h> - -/* - * This page serves two purposes: - * - It used as early shadow memory. The entire shadow region populated - * with this page, before we will be able to setup normal shadow memory. - * - Latter it reused it as zero shadow to cover large ranges of memory - * that allowed to access, but not handled by kasan (vmalloc/vmemmap ...). - */ -unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; - -#if CONFIG_PGTABLE_LEVELS > 4 -p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss; -#endif -#if CONFIG_PGTABLE_LEVELS > 3 -pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; -#endif -#if CONFIG_PGTABLE_LEVELS > 2 -pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; -#endif -pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; - -static __init void *early_alloc(size_t size, int node) -{ - return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, node); -} - -static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, - unsigned long end) -{ - pte_t *pte = pte_offset_kernel(pmd, addr); - pte_t zero_pte; - - zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_zero_page)), PAGE_KERNEL); - zero_pte = pte_wrprotect(zero_pte); - - while (addr + PAGE_SIZE <= end) { - set_pte_at(&init_mm, addr, pte, zero_pte); - addr += PAGE_SIZE; - pte = pte_offset_kernel(pmd, addr); - } -} - -static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, - unsigned long end) -{ - pmd_t *pmd = pmd_offset(pud, addr); - unsigned long next; - - do { - next = pmd_addr_end(addr, end); - - if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { - pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); - continue; - } - - if (pmd_none(*pmd)) { - pmd_populate_kernel(&init_mm, pmd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); - } - zero_pte_populate(pmd, addr, next); - } while (pmd++, addr = next, addr != end); -} - -static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, - unsigned long end) -{ - pud_t *pud = pud_offset(p4d, addr); - unsigned long next; - - do { - next = pud_addr_end(addr, end); - if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { - pmd_t *pmd; - - pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); - pmd = pmd_offset(pud, addr); - pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); - continue; - } - - if (pud_none(*pud)) { - pud_populate(&init_mm, pud, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); - } - zero_pmd_populate(pud, addr, next); - } while (pud++, addr = next, addr != end); -} - -static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, - unsigned long end) -{ - p4d_t *p4d = p4d_offset(pgd, addr); - unsigned long next; - - do { - next = p4d_addr_end(addr, end); - if (IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) { - pud_t *pud; - pmd_t *pmd; - - p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); - pud = pud_offset(p4d, addr); - pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); - pmd = pmd_offset(pud, addr); - pmd_populate_kernel(&init_mm, pmd, - lm_alias(kasan_zero_pte)); - continue; - } - - if (p4d_none(*p4d)) { - p4d_populate(&init_mm, p4d, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); - } - zero_pud_populate(p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - -/** - * kasan_populate_zero_shadow - populate shadow memory region with - * kasan_zero_page - * @shadow_start - start of the memory range to populate - * @shadow_end - end of the memory range to populate - */ -void __init kasan_populate_zero_shadow(const void *shadow_start, - const void *shadow_end) -{ - unsigned long addr = (unsigned long)shadow_start; - unsigned long end = (unsigned long)shadow_end; - pgd_t *pgd = pgd_offset_k(addr); - unsigned long next; - - do { - next = pgd_addr_end(addr, end); - - if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - /* - * kasan_zero_pud should be populated with pmds - * at this moment. - * [pud,pmd]_populate*() below needed only for - * 3,2 - level page tables where we don't have - * puds,pmds, so pgd_populate(), pud_populate() - * is noops. - * - * The ifndef is required to avoid build breakage. - * - * With 5level-fixup.h, pgd_populate() is not nop and - * we reference kasan_zero_p4d. It's not defined - * unless 5-level paging enabled. - * - * The ifndef can be dropped once all KASAN-enabled - * architectures will switch to pgtable-nop4d.h. - */ -#ifndef __ARCH_HAS_5LEVEL_HACK - pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_p4d)); -#endif - p4d = p4d_offset(pgd, addr); - p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); - pud = pud_offset(p4d, addr); - pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); - pmd = pmd_offset(pud, addr); - pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); - continue; - } - - if (pgd_none(*pgd)) { - pgd_populate(&init_mm, pgd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); - } - zero_p4d_populate(pgd, addr, next); - } while (pgd++, addr = next, addr != end); -} diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 3a8ddf8baf7d..57334ef2d7ef 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * KASAN quarantine. * @@ -103,7 +104,7 @@ static int quarantine_head; static int quarantine_tail; /* Total size of all objects in global_quarantine across all batches. */ static unsigned long quarantine_size; -static DEFINE_SPINLOCK(quarantine_lock); +static DEFINE_RAW_SPINLOCK(quarantine_lock); DEFINE_STATIC_SRCU(remove_cache_srcu); /* Maximum size of the global queue. */ @@ -190,7 +191,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { qlist_move_all(q, &temp); - spin_lock(&quarantine_lock); + raw_spin_lock(&quarantine_lock); WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); qlist_move_all(&temp, &global_quarantine[quarantine_tail]); if (global_quarantine[quarantine_tail].bytes >= @@ -203,7 +204,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (new_tail != quarantine_head) quarantine_tail = new_tail; } - spin_unlock(&quarantine_lock); + raw_spin_unlock(&quarantine_lock); } local_irq_restore(flags); @@ -230,7 +231,7 @@ void quarantine_reduce(void) * expected case). */ srcu_idx = srcu_read_lock(&remove_cache_srcu); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); /* * Update quarantine size in case of hotplug. Allocate a fraction of @@ -254,7 +255,7 @@ void quarantine_reduce(void) quarantine_head = 0; } - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, NULL); srcu_read_unlock(&remove_cache_srcu, srcu_idx); @@ -310,17 +311,17 @@ void quarantine_remove_cache(struct kmem_cache *cache) */ on_each_cpu(per_cpu_remove_cache, cache, 1); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); for (i = 0; i < QUARANTINE_BATCHES; i++) { if (qlist_empty(&global_quarantine[i])) continue; qlist_move_cache(&global_quarantine[i], &to_free, cache); /* Scanning whole quarantine can take a while. */ - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); cond_resched(); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); } - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 6bcfb01ba038..0772820ad098 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -1,11 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * This file contains error reporting code. + * This file contains common generic and tag-based KASAN error reporting code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * * Some code borrowed from https://github.com/xairy/kasan-prototype by - * Andrey Konovalov <adech.fo@gmail.com> + * Andrey Konovalov <andreyknvl@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -39,125 +40,43 @@ #define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) #define SHADOW_ROWS_AROUND_ADDR 2 -static const void *find_first_bad_addr(const void *addr, size_t size) -{ - u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr); - const void *first_bad_addr = addr; - - while (!shadow_val && first_bad_addr < addr + size) { - first_bad_addr += KASAN_SHADOW_SCALE_SIZE; - shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr); - } - return first_bad_addr; -} +static unsigned long kasan_flags; -static bool addr_has_shadow(struct kasan_access_info *info) -{ - return (info->access_addr >= - kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); -} +#define KASAN_BIT_REPORTED 0 +#define KASAN_BIT_MULTI_SHOT 1 -static const char *get_shadow_bug_type(struct kasan_access_info *info) +bool kasan_save_enable_multi_shot(void) { - const char *bug_type = "unknown-crash"; - u8 *shadow_addr; - - info->first_bad_addr = find_first_bad_addr(info->access_addr, - info->access_size); - - shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); - - /* - * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look - * at the next shadow byte to determine the type of the bad access. - */ - if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) - shadow_addr++; - - switch (*shadow_addr) { - case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: - /* - * In theory it's still possible to see these shadow values - * due to a data race in the kernel code. - */ - bug_type = "out-of-bounds"; - break; - case KASAN_PAGE_REDZONE: - case KASAN_KMALLOC_REDZONE: - bug_type = "slab-out-of-bounds"; - break; - case KASAN_GLOBAL_REDZONE: - bug_type = "global-out-of-bounds"; - break; - case KASAN_STACK_LEFT: - case KASAN_STACK_MID: - case KASAN_STACK_RIGHT: - case KASAN_STACK_PARTIAL: - bug_type = "stack-out-of-bounds"; - break; - case KASAN_FREE_PAGE: - case KASAN_KMALLOC_FREE: - bug_type = "use-after-free"; - break; - case KASAN_USE_AFTER_SCOPE: - bug_type = "use-after-scope"; - break; - } - - return bug_type; + return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); } +EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); -static const char *get_wild_bug_type(struct kasan_access_info *info) +void kasan_restore_multi_shot(bool enabled) { - const char *bug_type = "unknown-crash"; - - if ((unsigned long)info->access_addr < PAGE_SIZE) - bug_type = "null-ptr-deref"; - else if ((unsigned long)info->access_addr < TASK_SIZE) - bug_type = "user-memory-access"; - else - bug_type = "wild-memory-access"; - - return bug_type; + if (!enabled) + clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); } +EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); -static const char *get_bug_type(struct kasan_access_info *info) +static int __init kasan_set_multi_shot(char *str) { - if (addr_has_shadow(info)) - return get_shadow_bug_type(info); - return get_wild_bug_type(info); + set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); + return 1; } +__setup("kasan_multi_shot", kasan_set_multi_shot); static void print_error_description(struct kasan_access_info *info) { - const char *bug_type = get_bug_type(info); - pr_err("BUG: KASAN: %s in %pS\n", - bug_type, (void *)info->ip); - pr_err("%s of size %zu at addr %p by task %s/%d\n", + get_bug_type(info), (void *)info->ip); + pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, info->access_addr, current->comm, task_pid_nr(current)); } -static inline bool kernel_or_module_addr(const void *addr) -{ - if (addr >= (void *)_stext && addr < (void *)_end) - return true; - if (is_module_address((unsigned long)addr)) - return true; - return false; -} - -static inline bool init_task_stack_addr(const void *addr) -{ - return addr >= (void *)&init_thread_union.stack && - (addr <= (void *)&init_thread_union.stack + - sizeof(init_thread_union.stack)); -} - static DEFINE_SPINLOCK(report_lock); -static void kasan_start_report(unsigned long *flags) +static void start_report(unsigned long *flags) { /* * Make sure we don't end up in loop. @@ -167,7 +86,7 @@ static void kasan_start_report(unsigned long *flags) pr_err("==================================================================\n"); } -static void kasan_end_report(unsigned long *flags) +static void end_report(unsigned long *flags) { pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -206,7 +125,7 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, const char *rel_type; int rel_bytes; - pr_err("The buggy address belongs to the object at %p\n" + pr_err("The buggy address belongs to the object at %px\n" " which belongs to the cache %s of size %d\n", object, cache->name, cache->object_size); @@ -225,7 +144,7 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, } pr_err("The buggy address is located %d bytes %s of\n" - " %d-byte region [%p, %p)\n", + " %d-byte region [%px, %px)\n", rel_bytes, rel_type, cache->object_size, (void *)object_addr, (void *)(object_addr + cache->object_size)); } @@ -245,6 +164,22 @@ static void describe_object(struct kmem_cache *cache, void *object, describe_object_addr(cache, object, addr); } +static inline bool kernel_or_module_addr(const void *addr) +{ + if (addr >= (void *)_stext && addr < (void *)_end) + return true; + if (is_module_address((unsigned long)addr)) + return true; + return false; +} + +static inline bool init_task_stack_addr(const void *addr) +{ + return addr >= (void *)&init_thread_union.stack && + (addr <= (void *)&init_thread_union.stack + + sizeof(init_thread_union.stack)); +} + static void print_address_description(void *addr) { struct page *page = addr_to_page(addr); @@ -302,7 +237,7 @@ static void print_shadow_for_address(const void *addr) char shadow_buf[SHADOW_BYTES_PER_ROW]; snprintf(buffer, sizeof(buffer), - (i == 0) ? ">%p: " : " %p: ", kaddr); + (i == 0) ? ">%px: " : " %px: ", kaddr); /* * We should not pass a shadow pointer to generic * function, because generic functions may try to @@ -322,127 +257,68 @@ static void print_shadow_for_address(const void *addr) } } -void kasan_report_double_free(struct kmem_cache *cache, void *object, - void *ip) +static bool report_enabled(void) { - unsigned long flags; - - kasan_start_report(&flags); - pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip); - pr_err("\n"); - print_address_description(object); - pr_err("\n"); - print_shadow_for_address(object); - kasan_end_report(&flags); + if (current->kasan_depth) + return false; + if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + return true; + return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } -static void kasan_report_error(struct kasan_access_info *info) +void kasan_report_invalid_free(void *object, unsigned long ip) { unsigned long flags; - kasan_start_report(&flags); - - print_error_description(info); + start_report(&flags); + pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); + print_tags(get_tag(object), reset_tag(object)); + object = reset_tag(object); pr_err("\n"); - - if (!addr_has_shadow(info)) { - dump_stack(); - } else { - print_address_description((void *)info->access_addr); - pr_err("\n"); - print_shadow_for_address(info->first_bad_addr); - } - - kasan_end_report(&flags); -} - -static unsigned long kasan_flags; - -#define KASAN_BIT_REPORTED 0 -#define KASAN_BIT_MULTI_SHOT 1 - -bool kasan_save_enable_multi_shot(void) -{ - return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); -} -EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); - -void kasan_restore_multi_shot(bool enabled) -{ - if (!enabled) - clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); -} -EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); - -static int __init kasan_set_multi_shot(char *str) -{ - set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); - return 1; -} -__setup("kasan_multi_shot", kasan_set_multi_shot); - -static inline bool kasan_report_enabled(void) -{ - if (current->kasan_depth) - return false; - if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) - return true; - return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); + print_address_description(object); + pr_err("\n"); + print_shadow_for_address(object); + end_report(&flags); } -void kasan_report(unsigned long addr, size_t size, - bool is_write, unsigned long ip) +void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { struct kasan_access_info info; + void *tagged_addr; + void *untagged_addr; + unsigned long flags; - if (likely(!kasan_report_enabled())) + if (likely(!report_enabled())) return; disable_trace_on_warning(); - info.access_addr = (void *)addr; - info.first_bad_addr = (void *)addr; + tagged_addr = (void *)addr; + untagged_addr = reset_tag(tagged_addr); + + info.access_addr = tagged_addr; + if (addr_has_shadow(untagged_addr)) + info.first_bad_addr = find_first_bad_addr(tagged_addr, size); + else + info.first_bad_addr = untagged_addr; info.access_size = size; info.is_write = is_write; info.ip = ip; - kasan_report_error(&info); -} + start_report(&flags); + print_error_description(&info); + if (addr_has_shadow(untagged_addr)) + print_tags(get_tag(tagged_addr), info.first_bad_addr); + pr_err("\n"); -#define DEFINE_ASAN_REPORT_LOAD(size) \ -void __asan_report_load##size##_noabort(unsigned long addr) \ -{ \ - kasan_report(addr, size, false, _RET_IP_); \ -} \ -EXPORT_SYMBOL(__asan_report_load##size##_noabort) - -#define DEFINE_ASAN_REPORT_STORE(size) \ -void __asan_report_store##size##_noabort(unsigned long addr) \ -{ \ - kasan_report(addr, size, true, _RET_IP_); \ -} \ -EXPORT_SYMBOL(__asan_report_store##size##_noabort) - -DEFINE_ASAN_REPORT_LOAD(1); -DEFINE_ASAN_REPORT_LOAD(2); -DEFINE_ASAN_REPORT_LOAD(4); -DEFINE_ASAN_REPORT_LOAD(8); -DEFINE_ASAN_REPORT_LOAD(16); -DEFINE_ASAN_REPORT_STORE(1); -DEFINE_ASAN_REPORT_STORE(2); -DEFINE_ASAN_REPORT_STORE(4); -DEFINE_ASAN_REPORT_STORE(8); -DEFINE_ASAN_REPORT_STORE(16); - -void __asan_report_load_n_noabort(unsigned long addr, size_t size) -{ - kasan_report(addr, size, false, _RET_IP_); -} -EXPORT_SYMBOL(__asan_report_load_n_noabort); + if (addr_has_shadow(untagged_addr)) { + print_address_description(untagged_addr); + pr_err("\n"); + print_shadow_for_address(info.first_bad_addr); + } else { + dump_stack(); + } -void __asan_report_store_n_noabort(unsigned long addr, size_t size) -{ - kasan_report(addr, size, true, _RET_IP_); + end_report(&flags); } -EXPORT_SYMBOL(__asan_report_store_n_noabort); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c new file mode 100644 index 000000000000..63fca3172659 --- /dev/null +++ b/mm/kasan/tags.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains core tag-based KASAN code. + * + * Copyright (c) 2018 Google, Inc. + * Author: Andrey Konovalov <andreyknvl@google.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define DISABLE_BRANCH_PROFILING + +#include <linux/export.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/kmemleak.h> +#include <linux/linkage.h> +#include <linux/memblock.h> +#include <linux/memory.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/vmalloc.h> +#include <linux/bug.h> + +#include "kasan.h" +#include "../slab.h" + +static DEFINE_PER_CPU(u32, prng_state); + +void kasan_init_tags(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(prng_state, cpu) = (u32)get_cycles(); +} + +/* + * If a preemption happens between this_cpu_read and this_cpu_write, the only + * side effect is that we'll give a few allocated in different contexts objects + * the same tag. Since tag-based KASAN is meant to be used a probabilistic + * bug-detection debug feature, this doesn't have significant negative impact. + * + * Ideally the tags use strong randomness to prevent any attempts to predict + * them during explicit exploit attempts. But strong randomness is expensive, + * and we did an intentional trade-off to use a PRNG. This non-atomic RMW + * sequence has in fact positive effect, since interrupts that randomly skew + * PRNG at unpredictable points do only good. + */ +u8 random_tag(void) +{ + u32 state = this_cpu_read(prng_state); + + state = 1664525 * state + 1013904223; + this_cpu_write(prng_state, state); + + return (u8)(state % (KASAN_TAG_MAX + 1)); +} + +void *kasan_reset_tag(const void *addr) +{ + return reset_tag(addr); +} + +void check_memory_region(unsigned long addr, size_t size, bool write, + unsigned long ret_ip) +{ + u8 tag; + u8 *shadow_first, *shadow_last, *shadow; + void *untagged_addr; + + if (unlikely(size == 0)) + return; + + tag = get_tag((const void *)addr); + + /* + * Ignore accesses for pointers tagged with 0xff (native kernel + * pointer tag) to suppress false positives caused by kmap. + * + * Some kernel code was written to account for archs that don't keep + * high memory mapped all the time, but rather map and unmap particular + * pages when needed. Instead of storing a pointer to the kernel memory, + * this code saves the address of the page structure and offset within + * that page for later use. Those pages are then mapped and unmapped + * with kmap/kunmap when necessary and virt_to_page is used to get the + * virtual address of the page. For arm64 (that keeps the high memory + * mapped all the time), kmap is turned into a page_address call. + + * The issue is that with use of the page_address + virt_to_page + * sequence the top byte value of the original pointer gets lost (gets + * set to KASAN_TAG_KERNEL (0xFF)). + */ + if (tag == KASAN_TAG_KERNEL) + return; + + untagged_addr = reset_tag((const void *)addr); + if (unlikely(untagged_addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + kasan_report(addr, size, write, ret_ip); + return; + } + shadow_first = kasan_mem_to_shadow(untagged_addr); + shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1); + for (shadow = shadow_first; shadow <= shadow_last; shadow++) { + if (*shadow != tag) { + kasan_report(addr, size, write, ret_ip); + return; + } + } +} + +#define DEFINE_HWASAN_LOAD_STORE(size) \ + void __hwasan_load##size##_noabort(unsigned long addr) \ + { \ + check_memory_region(addr, size, false, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__hwasan_load##size##_noabort); \ + void __hwasan_store##size##_noabort(unsigned long addr) \ + { \ + check_memory_region(addr, size, true, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__hwasan_store##size##_noabort) + +DEFINE_HWASAN_LOAD_STORE(1); +DEFINE_HWASAN_LOAD_STORE(2); +DEFINE_HWASAN_LOAD_STORE(4); +DEFINE_HWASAN_LOAD_STORE(8); +DEFINE_HWASAN_LOAD_STORE(16); + +void __hwasan_loadN_noabort(unsigned long addr, unsigned long size) +{ + check_memory_region(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__hwasan_loadN_noabort); + +void __hwasan_storeN_noabort(unsigned long addr, unsigned long size) +{ + check_memory_region(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__hwasan_storeN_noabort); + +void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) +{ + kasan_poison_shadow((void *)addr, size, tag); +} +EXPORT_SYMBOL(__hwasan_tag_memory); diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c new file mode 100644 index 000000000000..8eaf5f722271 --- /dev/null +++ b/mm/kasan/tags_report.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains tag-based KASAN specific error reporting code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov <andreyknvl@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/bitops.h> +#include <linux/ftrace.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/stackdepot.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/kasan.h> +#include <linux/module.h> + +#include <asm/sections.h> + +#include "kasan.h" +#include "../slab.h" + +const char *get_bug_type(struct kasan_access_info *info) +{ + return "invalid-access"; +} + +void *find_first_bad_addr(void *addr, size_t size) +{ + u8 tag = get_tag(addr); + void *p = reset_tag(addr); + void *end = p + size; + + while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p)) + p += KASAN_SHADOW_SCALE_SIZE; + return p; +} + +void print_tags(u8 addr_tag, const void *addr) +{ + u8 *shadow = (u8 *)kasan_mem_to_shadow(addr); + + pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow); +} diff --git a/mm/madvise.c b/mm/madvise.c index 576b753be428..da6c00db41d0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -138,7 +138,7 @@ static long madvise_behavior(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_get_anon_name(vma)); if (*prev) { vma = *prev; goto success; @@ -798,6 +798,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) size_t len; struct blk_plug plug; + start = untagged_addr(start); + if (!madvise_behavior_valid(behavior)) return error; diff --git a/mm/memblock.c b/mm/memblock.c index e81d12c544e9..e371f7c39b13 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -192,7 +192,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t kernel_end, ret; /* pump up @end */ - if (end == MEMBLOCK_ALLOC_ACCESSIBLE) + if (end == MEMBLOCK_ALLOC_ACCESSIBLE || + end == MEMBLOCK_ALLOC_KASAN) end = memblock.current_limit; /* avoid allocating the first page */ @@ -1301,13 +1302,15 @@ done: ptr = phys_to_virt(alloc); memset(ptr, 0, size); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. This is because many of these blocks - * are only referred via the physical address which is not - * looked up by kmemleak. - */ - kmemleak_alloc(ptr, size, 0, 0); + /* Skip kmemleak for kasan_init() due to high volume. */ + if (max_addr != MEMBLOCK_ALLOC_KASAN) + /* + * The min_count is set to 0 so that bootmem allocated + * blocks are never reported as leaks. This is because many + * of these blocks are only referred via the physical + * address which is not looked up by kmemleak. + */ + kmemleak_alloc(ptr, size, 0, 0); return ptr; } diff --git a/mm/memory.c b/mm/memory.c index e9bce27bc18c..8677c25af727 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2380,6 +2380,10 @@ static int do_page_mkwrite(struct vm_fault *vmf) vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + if (vmf->vma->vm_file && + IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host)) + return VM_FAULT_SIGBUS; + ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1b34f2e35951..dd30f50301e7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -755,7 +755,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx); + new_pol, vma->vm_userfaultfd_ctx, + vma_get_anon_name(vma)); if (prev) { vma = prev; next = vma->vm_next; @@ -1371,6 +1372,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, int err; unsigned short mode_flags; + start = untagged_addr(start); mode_flags = mode & MPOL_MODE_FLAGS; mode &= ~MPOL_MODE_FLAGS; if (mode >= MPOL_MAX) @@ -1512,6 +1514,8 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, int uninitialized_var(pval); nodemask_t nodes; + addr = untagged_addr(addr); + if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; diff --git a/mm/mempool.c b/mm/mempool.c index c4a23cdae3f0..6fb1c0bd2870 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -103,15 +103,15 @@ static inline void poison_element(mempool_t *pool, void *element) } #endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ -static void kasan_poison_element(mempool_t *pool, void *element) +static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_poison_kfree(element); + kasan_poison_kfree(element, _RET_IP_); if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } -static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) +static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_unpoison_slab(element); @@ -119,7 +119,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } -static void add_element(mempool_t *pool, void *element) +static __always_inline void add_element(mempool_t *pool, void *element) { BUG_ON(pool->curr_nr >= pool->min_nr); poison_element(pool, element); @@ -127,12 +127,12 @@ static void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool, gfp_t flags) +static void *remove_element(mempool_t *pool) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - kasan_unpoison_element(pool, element, flags); + kasan_unpoison_element(pool, element); check_element(pool, element); return element; } @@ -151,7 +151,7 @@ void mempool_destroy(mempool_t *pool) return; while (pool->curr_nr) { - void *element = remove_element(pool, GFP_KERNEL); + void *element = remove_element(pool); pool->free(element, pool->pool_data); } kfree(pool->elements); @@ -247,7 +247,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr) spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { - element = remove_element(pool, GFP_KERNEL); + element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); @@ -333,7 +333,7 @@ repeat_alloc: spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { - element = remove_element(pool, gfp_temp); + element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); diff --git a/mm/migrate.c b/mm/migrate.c index 9a3ce8847308..99db54bfd9ed 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -671,6 +671,8 @@ void migrate_page_states(struct page *newpage, struct page *page) SetPageActive(newpage); } else if (TestClearPageUnevictable(page)) SetPageUnevictable(newpage); + if (PageWorkingset(page)) + SetPageWorkingset(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -1649,7 +1651,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, err = -EFAULT; if (get_user(p, pages + j + chunk_start)) goto out_pm; - pm[j].addr = (unsigned long) p; + pm[j].addr = (unsigned long)untagged_addr(p); if (get_user(node, nodes + j + chunk_start)) goto out_pm; diff --git a/mm/mincore.c b/mm/mincore.c index 2732c8c0764c..7eb149191550 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -249,6 +249,8 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, unsigned long pages; unsigned char *tmp; + start = untagged_addr(start); + /* Check the start address: needs to be page-aligned.. */ if (start & ~PAGE_MASK) return -EINVAL; diff --git a/mm/mlock.c b/mm/mlock.c index 1f9ee86672e8..14b3f85e374f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -528,7 +528,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_get_anon_name(vma)); if (*prev) { vma = *prev; goto success; @@ -667,6 +667,8 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla unsigned long lock_limit; int error = -ENOMEM; + start = untagged_addr(start); + if (!can_do_mlock()) return -EPERM; @@ -730,6 +732,8 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; + start = untagged_addr(start); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; diff --git a/mm/mmap.c b/mm/mmap.c index 8c6ed06983f9..6afba0872180 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -968,7 +968,8 @@ again: */ static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char __user *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -986,6 +987,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; + if (vma_get_anon_name(vma) != anon_name) + return 0; return 1; } @@ -1018,9 +1021,10 @@ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char __user *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -1039,9 +1043,10 @@ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char __user *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1052,9 +1057,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, } /* - * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out - * whether that can be merged with its predecessor or its successor. - * Or both (it neatly fills a hole). + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), + * figure out whether that can be merged with its predecessor or its + * successor. Or both (it neatly fills a hole). * * In most cases - when called for mmap, brk or mremap - [addr,end) is * certain not to be mapped by the time vma_merge is called; but when @@ -1096,7 +1101,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char __user *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1129,7 +1135,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, + anon_name)) { /* * OK, it can. Can we now merge in the successor as well? */ @@ -1138,7 +1145,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx) && + vm_userfaultfd_ctx, + anon_name) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1161,7 +1169,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, + anon_name)) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); @@ -1417,8 +1426,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr, switch (flags & MAP_TYPE) { case MAP_SHARED: - if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) - return -EACCES; + if (prot & PROT_WRITE) { + if (!(file->f_mode & FMODE_WRITE)) + return -EACCES; + if (IS_SWAPFILE(file->f_mapping->host)) + return -ETXTBSY; + } /* * Make sure we don't allow writing to an append-only @@ -1673,7 +1686,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Can we just expand an old mapping? */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -2747,6 +2760,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return 0; } +EXPORT_SYMBOL(do_munmap); int vm_munmap(unsigned long start, size_t len) { @@ -2766,6 +2780,7 @@ EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { + addr = untagged_addr(addr); profile_munmap(addr); return vm_munmap(addr, len); } @@ -2933,7 +2948,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -3134,7 +3149,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_get_anon_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index 18ecbd744978..92891393f279 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -418,7 +418,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_get_anon_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); @@ -483,6 +483,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); + start = untagged_addr(start); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ return -EINVAL; diff --git a/mm/mremap.c b/mm/mremap.c index 88ceeb4ef817..e9990c1afd60 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -529,6 +529,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); + addr = untagged_addr(addr); + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; diff --git a/mm/msync.c b/mm/msync.c index ef30a429623a..c3bd3e75f687 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -37,6 +37,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) int unmapped_error = 0; int error = -EINVAL; + start = untagged_addr(start); + if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; if (offset_in_page(start)) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 29f9980c13ac..5b405e9a2b89 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2195,30 +2195,14 @@ int write_cache_pages(struct address_space *mapping, while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - done = 1; - break; - } - done_index = page->index; lock_page(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6f71518a4558..4bb46b766afd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -67,6 +67,7 @@ #include <linux/ftrace.h> #include <linux/lockdep.h> #include <linux/nmi.h> +#include <linux/psi.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -129,6 +130,55 @@ unsigned long totalcma_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON +DEFINE_STATIC_KEY_TRUE(init_on_alloc); +#else +DEFINE_STATIC_KEY_FALSE(init_on_alloc); +#endif +EXPORT_SYMBOL(init_on_alloc); + +#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON +DEFINE_STATIC_KEY_TRUE(init_on_free); +#else +DEFINE_STATIC_KEY_FALSE(init_on_free); +#endif +EXPORT_SYMBOL(init_on_free); + +static int __init early_init_on_alloc(char *buf) +{ + int ret; + bool bool_result; + + if (!buf) + return -EINVAL; + ret = kstrtobool(buf, &bool_result); + if (bool_result && page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); + if (bool_result) + static_branch_enable(&init_on_alloc); + else + static_branch_disable(&init_on_alloc); + return ret; +} +early_param("init_on_alloc", early_init_on_alloc); + +static int __init early_init_on_free(char *buf) +{ + int ret; + bool bool_result; + + if (!buf) + return -EINVAL; + ret = kstrtobool(buf, &bool_result); + if (bool_result && page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); + if (bool_result) + static_branch_enable(&init_on_free); + else + static_branch_disable(&init_on_free); + return ret; +} +early_param("init_on_free", early_init_on_free); /* * A cached value of the page's pageblock's migratetype, used when the page is @@ -258,10 +308,22 @@ compound_page_dtor * const compound_page_dtors[] = { #endif }; +/* + * Try to keep at least this much lowmem free. Do not allow normal + * allocations below this point, only high priority ones. Automatically + * tuned according to the amount of memory in the system. + */ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; int watermark_scale_factor = 10; +/* + * Extra memory for the system to try freeing. Used to temporarily + * free memory, to make space for new workloads. Anyone can allocate + * down to the min watermarks controlled by min_free_kbytes above. + */ +int extra_free_kbytes = 0; + static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; @@ -1013,6 +1075,14 @@ out: return ret; } +static void kernel_init_free_pages(struct page *page, int numpages) +{ + int i; + + for (i = 0; i < numpages; i++) + clear_highpage(page + i); +} + static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free) { @@ -1064,6 +1134,9 @@ static __always_inline bool free_pages_prepare(struct page *page, PAGE_SIZE << order); } arch_free_page(page, order); + if (want_init_on_free()) + kernel_init_free_pages(page, 1 << order); + kernel_poison_pages(page, 1 << order, 0); kernel_map_pages(page, 1 << order, 0); kasan_free_pages(page, order); @@ -1181,6 +1254,7 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, init_page_count(page); page_mapcount_reset(page); page_cpupid_reset_last(page); + page_kasan_tag_reset(page); INIT_LIST_HEAD(&page->lru); #ifdef WANT_PAGE_VIRTUAL @@ -1718,8 +1792,8 @@ static inline int check_new_page(struct page *page) static inline bool free_pages_prezeroed(void) { - return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled(); + return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && + page_poisoning_enabled()) || want_init_on_free(); } #ifdef CONFIG_DEBUG_VM @@ -1772,13 +1846,10 @@ inline void post_alloc_hook(struct page *page, unsigned int order, static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags) { - int i; - post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) - for (i = 0; i < (1 << order); i++) - clear_highpage(page + i); + if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) + kernel_init_free_pages(page, 1 << order); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -3371,15 +3442,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; + unsigned long pflags; unsigned int noreclaim_flag; if (!order) return NULL; + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3568,11 +3644,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct reclaim_state reclaim_state; int progress; unsigned int noreclaim_flag; + unsigned long pflags; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); fs_reclaim_acquire(gfp_mask); reclaim_state.reclaimed_slab = 0; @@ -3584,6 +3662,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); cond_resched(); @@ -4513,6 +4592,7 @@ long si_mem_available(void) unsigned long pagecache; unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + unsigned long reclaimable; struct zone *zone; int lru; @@ -4538,19 +4618,13 @@ long si_mem_available(void) available += pagecache; /* - * Part of the reclaimable slab consists of items that are in use, - * and cannot be freed. Cap this estimate at the low watermark. + * Part of the reclaimable slab and other kernel memory consists of + * items that are in use, and cannot be freed. Cap this estimate at the + * low watermark. */ - available += global_node_page_state(NR_SLAB_RECLAIMABLE) - - min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, - wmark_low); - - /* - * Part of the kernel memory, which can be released under memory - * pressure. - */ - available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> - PAGE_SHIFT; + reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + available += reclaimable - min(reclaimable / 2, wmark_low); if (available < 0) available = 0; @@ -4779,6 +4853,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " managed:%lukB" " mlocked:%lukB" " kernel_stack:%lukB" +#ifdef CONFIG_SHADOW_CALL_STACK + " shadow_call_stack:%lukB" +#endif " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" @@ -4800,6 +4877,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), zone_page_state(zone, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + zone_page_state(zone, NR_KERNEL_SCS_BYTES) / 1024, +#endif K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), @@ -6890,6 +6970,7 @@ static void setup_per_zone_lowmem_reserve(void) static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; struct zone *zone; unsigned long flags; @@ -6901,11 +6982,14 @@ static void __setup_per_zone_wmarks(void) } for_each_zone(zone) { - u64 tmp; + u64 min, low; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->managed_pages; - do_div(tmp, lowmem_pages); + min = (u64)pages_min * zone->managed_pages; + do_div(min, lowmem_pages); + low = (u64)pages_low * zone->managed_pages; + do_div(low, vm_total_pages); + if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -6926,7 +7010,7 @@ static void __setup_per_zone_wmarks(void) * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->watermark[WMARK_MIN] = tmp; + zone->watermark[WMARK_MIN] = min; } /* @@ -6934,12 +7018,14 @@ static void __setup_per_zone_wmarks(void) * scale factor in proportion to available memory, but * ensure a minimum size on small systems. */ - tmp = max_t(u64, tmp >> 2, + min = max_t(u64, min >> 2, mult_frac(zone->managed_pages, watermark_scale_factor, 10000)); - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + + low + min; + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + + low + min * 2; spin_unlock_irqrestore(&zone->lock, flags); } @@ -7022,7 +7108,7 @@ core_initcall(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so * that we can call two helper functions whenever min_free_kbytes - * changes. + * or extra_free_kbytes changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) diff --git a/mm/page_io.c b/mm/page_io.c index 5d882de3fbfd..9f8fd8f42b0d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -22,6 +22,7 @@ #include <linux/writeback.h> #include <linux/frontswap.h> #include <linux/blkdev.h> +#include <linux/psi.h> #include <linux/uio.h> #include <linux/sched/task.h> #include <asm/pgtable.h> @@ -354,10 +355,19 @@ int swap_readpage(struct page *page, bool do_poll) struct swap_info_struct *sis = page_swap_info(page); blk_qc_t qc; struct gendisk *disk; + unsigned long pflags; VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); + + /* + * Count submission time as memory stall. When the device is congested, + * or the submitting cgroup IO-throttled, submission can be a + * significant part of overall IO time. + */ + psi_memstall_enter(&pflags); + if (frontswap_load(page) == 0) { SetPageUptodate(page); unlock_page(page); @@ -371,7 +381,7 @@ int swap_readpage(struct page *page, bool do_poll) ret = mapping->a_ops->readpage(swap_file, page); if (!ret) count_vm_event(PSWPIN); - return ret; + goto out; } ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); @@ -382,7 +392,7 @@ int swap_readpage(struct page *page, bool do_poll) } count_vm_event(PSWPIN); - return 0; + goto out; } ret = 0; @@ -415,6 +425,7 @@ int swap_readpage(struct page *page, bool do_poll) bio_put(bio); out: + psi_memstall_leave(&pflags); return ret; } diff --git a/mm/readahead.c b/mm/readahead.c index 59aa0d06f254..bf9db0fa4506 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -81,7 +81,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping, * Hides the details of the LRU cache etc from the filesystems. */ int read_cache_pages(struct address_space *mapping, struct list_head *pages, - int (*filler)(void *, struct page *), void *data) + int (*filler)(struct file *, struct page *), void *data) { struct page *page; int ret = 0; diff --git a/mm/shmem.c b/mm/shmem.c index 0b6db162083c..a28131f99545 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2150,6 +2150,25 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { + struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + + + if (info->seals & F_SEAL_FUTURE_WRITE) { + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * "future write" seal active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; + + /* + * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED + * read-only mapping, take care to not allow mprotect to revert + * protections. + */ + vma->vm_flags &= ~(VM_MAYWRITE); + } + file_accessed(file); vma->vm_ops = &shmem_vm_ops; if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && @@ -2403,8 +2422,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; /* i_mutex is held by caller */ - if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { - if (info->seals & F_SEAL_WRITE) + if (unlikely(info->seals & (F_SEAL_GROW | + F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; @@ -2763,7 +2783,8 @@ continue_resched: #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ - F_SEAL_WRITE) + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE) int shmem_add_seals(struct file *file, unsigned int seals) { @@ -2890,7 +2911,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } @@ -4313,6 +4334,14 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags } EXPORT_SYMBOL_GPL(shmem_file_setup); +void shmem_set_file(struct vm_area_struct *vma, struct file *file) +{ + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = file; + vma->vm_ops = &shmem_vm_ops; +} + /** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap_pgoff @@ -4332,10 +4361,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (IS_ERR(file)) return PTR_ERR(file); - if (vma->vm_file) - fput(vma->vm_file); - vma->vm_file = file; - vma->vm_ops = &shmem_vm_ops; + shmem_set_file(vma, file); if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < diff --git a/mm/slab.c b/mm/slab.c index a04aeae42306..5dd23f6fd451 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -406,19 +406,6 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, return page->s_mem + cache->size * idx; } -/* - * We want to avoid an expensive divide : (offset / cache->size) - * Using the fact that size is a constant for a particular cache, - * we can replace (offset / cache->size) by - * reciprocal_divide(offset, cache->reciprocal_buffer_size) - */ -static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct page *page, void *obj) -{ - u32 offset = (obj - page->s_mem); - return reciprocal_divide(offset, cache->reciprocal_buffer_size); -} - #define BOOT_CPUCACHE_ENTRIES 1 /* internal cache of cache description objs */ static struct kmem_cache kmem_cache_boot = { @@ -1293,7 +1280,7 @@ void __init kmem_cache_init(void) * Initialize the caches that provide memory for the kmem_cache_node * structures first. Without this, further allocations will bug. */ - kmalloc_caches[INDEX_NODE] = create_kmalloc_cache( + kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( kmalloc_info[INDEX_NODE].name, kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); slab_state = PARTIAL_NODE; @@ -1308,7 +1295,7 @@ void __init kmem_cache_init(void) for_each_online_node(nid) { init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); - init_list(kmalloc_caches[INDEX_NODE], + init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE], &init_kmem_cache_node[SIZE_NODE + nid], nid); } } @@ -1590,11 +1577,8 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) *dbg_redzone2(cachep, objp)); } - if (cachep->flags & SLAB_STORE_USER) { - pr_err("Last user: [<%p>](%pSR)\n", - *dbg_userword(cachep, objp), - *dbg_userword(cachep, objp)); - } + if (cachep->flags & SLAB_STORE_USER) + pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp)); realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; for (i = 0; i < size && lines; i += 16, lines--) { @@ -1627,7 +1611,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Mismatch ! */ /* Print header */ if (lines == 0) { - pr_err("Slab corruption (%s): %s start=%p, len=%d\n", + pr_err("Slab corruption (%s): %s start=%px, len=%d\n", print_tainted(), cachep->name, realobj, size); print_objinfo(cachep, objp, 0); @@ -1656,13 +1640,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) if (objnr) { objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); - pr_err("Prev obj: start=%p, len=%d\n", realobj, size); + pr_err("Prev obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); - pr_err("Next obj: start=%p, len=%d\n", realobj, size); + pr_err("Next obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } } @@ -1913,6 +1897,14 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, cachep->num = 0; + /* + * If slab auto-initialization on free is enabled, store the freelist + * off-slab, so that its contents don't end up in one of the allocated + * objects. + */ + if (unlikely(slab_want_init_on_free(cachep))) + return false; + if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) return false; @@ -2006,7 +1998,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; - size_t size = cachep->size; + unsigned int size = cachep->size; #if DEBUG #if FORCED_DEBUG @@ -2301,6 +2293,18 @@ out: return nr_freed; } +bool __kmem_cache_empty(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (!list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial)) + return false; + return true; +} + int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0; @@ -2379,6 +2383,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); + freelist = kasan_reset_tag(freelist); if (!freelist) return NULL; } else { @@ -2574,7 +2579,7 @@ static void cache_init_objs(struct kmem_cache *cachep, for (i = 0; i < cachep->num; i++) { objp = index_to_obj(cachep, page, i); - kasan_init_slab_obj(cachep, objp); + objp = kasan_init_slab_obj(cachep, objp); /* constructor could break poison info */ if (DEBUG == 0 && cachep->ctor) { @@ -2613,7 +2618,7 @@ static void slab_put_obj(struct kmem_cache *cachep, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - pr_err("slab: double free detected in cache '%s', objp %p\n", + pr_err("slab: double free detected in cache '%s', objp %px\n", cachep->name, objp); BUG(); } @@ -2691,6 +2696,13 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, offset *= cachep->colour_off; + /* + * Call kasan_poison_slab() before calling alloc_slabmgmt(), so + * page_address() in the latter returns a non-tagged pointer, + * as it should be for slab pages. + */ + kasan_poison_slab(page); + /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, page_node); @@ -2699,7 +2711,6 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, slab_map_pages(cachep, page, freelist); - kasan_poison_slab(page); cache_init_objs(cachep, page); if (gfpflags_allow_blocking(local_flags)) @@ -2777,7 +2788,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", obj, redzone1, redzone2); } @@ -3083,7 +3094,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { slab_error(cachep, "double free, or memory outside object was overwritten"); - pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -3096,7 +3107,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, cachep->ctor(objp); if (ARCH_SLAB_MINALIGN && ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { - pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n", objp, (int)ARCH_SLAB_MINALIGN); } return objp; @@ -3327,7 +3338,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - if (unlikely(flags & __GFP_ZERO) && ptr) + if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr) memset(ptr, 0, cachep->object_size); slab_post_alloc_hook(cachep, flags, 1, &ptr); @@ -3384,7 +3395,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); - if (unlikely(flags & __GFP_ZERO) && objp) + if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp) memset(objp, 0, cachep->object_size); slab_post_alloc_hook(cachep, flags, 1, &objp); @@ -3489,11 +3500,11 @@ free_done: * Release an obj back to its cache. If the obj has a constructed state, it must * be in this state _before_ it is released. Called with disabled ints. */ -static inline void __cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) +static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) { /* Put the object into the quarantine, don't touch it for now. */ - if (kasan_slab_free(cachep, objp)) + if (kasan_slab_free(cachep, objp, _RET_IP_)) return; ___cache_free(cachep, objp, caller); @@ -3505,6 +3516,8 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); + if (unlikely(slab_want_init_on_free(cachep))) + memset(objp, 0, cachep->object_size); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); @@ -3549,7 +3562,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); - kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3591,7 +3603,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); /* Clear memory outside IRQ disabled section */ - if (unlikely(flags & __GFP_ZERO)) + if (unlikely(slab_want_init_on_alloc(flags, s))) for (i = 0; i < size; i++) memset(p[i], 0, s->object_size); @@ -3615,7 +3627,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) ret = slab_alloc(cachep, flags, _RET_IP_); - kasan_kmalloc(cachep, ret, size, flags); + ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); return ret; @@ -3639,7 +3651,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); @@ -3658,7 +3669,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - kasan_kmalloc(cachep, ret, size, flags); + ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, flags, nodeid); @@ -3679,7 +3690,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); - kasan_kmalloc(cachep, ret, size, flags); + ret = kasan_kmalloc(cachep, ret, size, flags); return ret; } @@ -3717,7 +3728,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = slab_alloc(cachep, flags, caller); - kasan_kmalloc(cachep, ret, size, flags); + ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, size, cachep->size, flags); @@ -4294,7 +4305,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) return; } #endif - seq_printf(m, "%p", (void *)address); + seq_printf(m, "%px", (void *)address); } static int leaks_show(struct seq_file *m, void *p) @@ -4421,6 +4432,8 @@ const char *__check_heap_object(const void *ptr, unsigned long n, unsigned int objnr; unsigned long offset; + ptr = kasan_reset_tag(ptr); + /* Find and validate object. */ cachep = page->slab_cache; objnr = obj_to_index(cachep, page, (void *)ptr); diff --git a/mm/slab.h b/mm/slab.h index 485d9fbb8802..20ee67d11bb4 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -165,6 +165,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, SLAB_TEMPORARY | \ SLAB_ACCOUNT) +bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); @@ -435,11 +436,9 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, flags &= gfp_allowed_mask; for (i = 0; i < size; i++) { - void *object = p[i]; - - kmemleak_alloc_recursive(object, s->object_size, 1, + p[i] = kasan_slab_alloc(s, p[i], flags); + kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); - kasan_slab_alloc(s, object, flags); } if (memcg_kmem_enabled()) @@ -518,4 +517,24 @@ static inline int cache_random_seq_create(struct kmem_cache *cachep, static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ +static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_alloc)) { + if (c->ctor) + return false; + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return flags & __GFP_ZERO; + return true; + } + return flags & __GFP_ZERO; +} + +static inline bool slab_want_init_on_free(struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_free)) + return !(c->ctor || + (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); + return false; +} + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index f6764cf162b8..20da89561fd2 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -917,14 +917,10 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, return s; } -struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +struct kmem_cache * +kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_caches); -#ifdef CONFIG_ZONE_DMA -struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; -EXPORT_SYMBOL(kmalloc_dma_caches); -#endif - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. This is necessary for slabs < 192 since we have non power @@ -984,12 +980,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) index = fls(size - 1); } -#ifdef CONFIG_ZONE_DMA - if (unlikely((flags & GFP_DMA))) - return kmalloc_dma_caches[index]; - -#endif - return kmalloc_caches[index]; + return kmalloc_caches[kmalloc_type(flags)][index]; } /* @@ -1003,15 +994,15 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { {"kmalloc-16", 16}, {"kmalloc-32", 32}, {"kmalloc-64", 64}, {"kmalloc-128", 128}, {"kmalloc-256", 256}, {"kmalloc-512", 512}, - {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, - {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, - {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, - {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, - {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, - {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, - {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, - {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, - {"kmalloc-67108864", 67108864} + {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048}, + {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192}, + {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768}, + {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072}, + {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288}, + {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152}, + {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608}, + {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432}, + {"kmalloc-64M", 67108864} }; /* @@ -1061,9 +1052,36 @@ void __init setup_kmalloc_cache_index_table(void) } } -static void __init new_kmalloc_cache(int idx, unsigned long flags) +static const char * +kmalloc_cache_name(const char *prefix, unsigned int size) +{ + + static const char units[3] = "\0kM"; + int idx = 0; + + while (size >= 1024 && (size % 1024 == 0)) { + size /= 1024; + idx++; + } + + return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]); +} + +static void __init +new_kmalloc_cache(int idx, int type, unsigned long flags) { - kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, + const char *name; + + if (type == KMALLOC_RECLAIM) { + flags |= SLAB_RECLAIM_ACCOUNT; + name = kmalloc_cache_name("kmalloc-rcl", + kmalloc_info[idx].size); + BUG_ON(!name); + } else { + name = kmalloc_info[idx].name; + } + + kmalloc_caches[type][idx] = create_kmalloc_cache(name, kmalloc_info[idx].size, flags); } @@ -1074,21 +1092,25 @@ static void __init new_kmalloc_cache(int idx, unsigned long flags) */ void __init create_kmalloc_caches(unsigned long flags) { - int i; + int i, type; - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[i]) - new_kmalloc_cache(i, flags); + for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + if (!kmalloc_caches[type][i]) + new_kmalloc_cache(i, type, flags); - /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) - new_kmalloc_cache(1, flags); - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) - new_kmalloc_cache(2, flags); + /* + * Caches that are not of the two-to-the-power-of size. + * These have to be created immediately after the + * earlier power of two caches + */ + if (KMALLOC_MIN_SIZE <= 32 && i == 6 && + !kmalloc_caches[type][1]) + new_kmalloc_cache(1, type, flags); + if (KMALLOC_MIN_SIZE <= 64 && i == 7 && + !kmalloc_caches[type][2]) + new_kmalloc_cache(2, type, flags); + } } /* Kmalloc array is now usable */ @@ -1096,16 +1118,15 @@ void __init create_kmalloc_caches(unsigned long flags) #ifdef CONFIG_ZONE_DMA for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[i]; + struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i]; if (s) { int size = kmalloc_size(i); - char *n = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%d", size); + const char *n = kmalloc_cache_name("dma-kmalloc", size); BUG_ON(!n); - kmalloc_dma_caches[i] = create_kmalloc_cache(n, - size, SLAB_CACHE_DMA | flags); + kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( + n, size, SLAB_CACHE_DMA | flags); } } #endif @@ -1125,8 +1146,8 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) flags |= __GFP_COMP; page = alloc_pages(flags, order); ret = page ? page_address(page) : NULL; + ret = kasan_kmalloc_large(ret, size, flags); kmemleak_alloc(ret, size, 1, flags); - kasan_kmalloc_large(ret, size, flags); return ret; } EXPORT_SYMBOL(kmalloc_order); @@ -1371,7 +1392,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, ks = ksize(p); if (ks >= new_size) { - kasan_krealloc((void *)p, new_size, flags); + p = kasan_krealloc((void *)p, new_size, flags); return (void *)p; } @@ -1423,7 +1444,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) } ret = __do_krealloc(p, new_size, flags); - if (ret && p != ret) + if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) kfree(p); return ret; diff --git a/mm/slub.c b/mm/slub.c index 958a8f7a3c25..766e3bd0483f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -248,7 +248,18 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, unsigned long ptr_addr) { #ifdef CONFIG_SLAB_FREELIST_HARDENED - return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr); + /* + * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged. + * Normally, this doesn't cause any issues, as both set_freepointer() + * and get_freepointer() are called with a pointer with the same tag. + * However, there are some issues with CONFIG_SLUB_DEBUG code. For + * example, when __free_slub() iterates over objects in a cache, it + * passes untagged pointers to check_object(). check_object() in turns + * calls get_freepointer() with an untagged pointer, which causes the + * freepointer to be restored incorrectly. + */ + return (void *)((unsigned long)ptr ^ s->random ^ + (unsigned long)kasan_reset_tag((void *)ptr_addr)); #else return ptr; #endif @@ -303,15 +314,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) __p < (__addr) + (__objects) * (__s)->size; \ __p += (__s)->size) -#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = fixup_red_left(__s, __addr), __idx = 1; \ - __idx <= __objects; \ - __p += (__s)->size, __idx++) - /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { - return (p - addr) / s->size; + return (kasan_reset_tag(p) - addr) / s->size; } static inline int order_objects(int order, unsigned long size, int reserved) @@ -522,6 +528,7 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; base = page_address(page); + object = kasan_reset_tag(object); object = restore_red_left(s, object); if (object < base || object >= base + page->objects * s->size || (object - base) % s->size) { @@ -1087,6 +1094,16 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } +static void setup_page_debug(struct kmem_cache *s, void *addr, int order) +{ + if (!(s->flags & SLAB_POISON)) + return; + + metadata_access_enable(); + memset(addr, POISON_INUSE, PAGE_SIZE << order); + metadata_access_disable(); +} + static inline int alloc_consistency_checks(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) @@ -1283,6 +1300,10 @@ check_slabs: if (*str == ',') slub_debug_slabs = str + 1; out: + if ((static_branch_unlikely(&init_on_alloc) || + static_branch_unlikely(&init_on_free)) && + (slub_debug & SLAB_POISON)) + pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); return 1; } @@ -1304,6 +1325,8 @@ unsigned long kmem_cache_flags(unsigned long object_size, #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} +static inline void setup_page_debug(struct kmem_cache *s, + void *addr, int order) {} static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } @@ -1346,22 +1369,21 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ -static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { + ptr = kasan_kmalloc_large(ptr, size, flags); kmemleak_alloc(ptr, size, 1, flags); - kasan_kmalloc_large(ptr, size, flags); + return ptr; } -static inline void kfree_hook(const void *x) +static __always_inline void kfree_hook(void *x) { kmemleak_free(x); - kasan_kfree_large(x); + kasan_kfree_large(x, _RET_IP_); } -static inline void *slab_free_hook(struct kmem_cache *s, void *x) +static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) { - void *freeptr; - kmemleak_free_recursive(x, s->flags); /* @@ -1381,47 +1403,66 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x) if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); - freeptr = get_freepointer(s, x); - /* - * kasan_slab_free() may put x into memory quarantine, delaying its - * reuse. In this case the object's freelist pointer is changed. - */ - kasan_slab_free(s, x); - return freeptr; + /* KASAN might put x into memory quarantine, delaying its reuse */ + return kasan_slab_free(s, x, _RET_IP_); } -static inline void slab_free_freelist_hook(struct kmem_cache *s, - void *head, void *tail) +static inline bool slab_free_freelist_hook(struct kmem_cache *s, + void **head, void **tail) { -/* - * Compiler cannot detect this function can be removed if slab_free_hook() - * evaluates to nothing. Thus, catch all relevant config debug options here. - */ -#if defined(CONFIG_LOCKDEP) || \ - defined(CONFIG_DEBUG_KMEMLEAK) || \ - defined(CONFIG_DEBUG_OBJECTS_FREE) || \ - defined(CONFIG_KASAN) - void *object = head; - void *tail_obj = tail ? : head; - void *freeptr; + void *object; + void *next = *head; + void *old_tail = *tail ? *tail : *head; + int rsize; + + /* Head and tail of the reconstructed freelist */ + *head = NULL; + *tail = NULL; do { - freeptr = slab_free_hook(s, object); - } while ((object != tail_obj) && (object = freeptr)); -#endif + object = next; + next = get_freepointer(s, object); + + if (slab_want_init_on_free(s)) { + /* + * Clear the object and the metadata, but don't touch + * the redzone. + */ + memset(object, 0, s->object_size); + rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad + : 0; + memset((char *)object + s->inuse, 0, + s->size - s->inuse - rsize); + + } + /* If object's reuse doesn't have to be delayed */ + if (!slab_free_hook(s, object)) { + /* Move object to the new freelist */ + set_freepointer(s, object, *head); + *head = object; + if (!*tail) + *tail = object; + } + } while (object != old_tail); + + if (*head == *tail) + *tail = NULL; + + return *head != NULL; } -static void setup_object(struct kmem_cache *s, struct page *page, +static void *setup_object(struct kmem_cache *s, struct page *page, void *object) { setup_object_debug(s, page, object); - kasan_init_slab_obj(s, object); + object = kasan_init_slab_obj(s, object); if (unlikely(s->ctor)) { kasan_unpoison_object_data(s, object); s->ctor(object); kasan_poison_object_data(s, object); } + return object; } /* @@ -1527,16 +1568,16 @@ static bool shuffle_freelist(struct kmem_cache *s, struct page *page) /* First entry is used as the base of the freelist */ cur = next_freelist_entry(s, page, &pos, start, page_limit, freelist_count); + cur = setup_object(s, page, cur); page->freelist = cur; for (idx = 1; idx < page->objects; idx++) { - setup_object(s, page, cur); next = next_freelist_entry(s, page, &pos, start, page_limit, freelist_count); + next = setup_object(s, page, next); set_freepointer(s, cur, next); cur = next; } - setup_object(s, page, cur); set_freepointer(s, cur, NULL); return true; @@ -1558,7 +1599,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; - void *start, *p; + void *start, *p, *next; int idx, order; bool shuffle; @@ -1599,24 +1640,25 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); - start = page_address(page); + kasan_poison_slab(page); - if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << order); + start = page_address(page); - kasan_poison_slab(page); + setup_page_debug(s, start, order); shuffle = shuffle_freelist(s, page); if (!shuffle) { - for_each_object_idx(p, idx, s, start, page->objects) { - setup_object(s, page, p); - if (likely(idx < page->objects)) - set_freepointer(s, p, p + s->size); - else - set_freepointer(s, p, NULL); + start = fixup_red_left(s, start); + start = setup_object(s, page, start); + page->freelist = start; + for (idx = 0, p = start; idx < page->objects - 1; idx++) { + next = p + s->size; + next = setup_object(s, page, next); + set_freepointer(s, p, next); + p = next; } - page->freelist = fixup_red_left(s, start); + set_freepointer(s, p, NULL); } page->inuse = page->objects; @@ -2637,6 +2679,17 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, } /* + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out freelist pointer. + */ +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) +{ + if (unlikely(slab_want_init_on_free(s)) && obj) + memset((void *)((char *)obj + s->offset), 0, sizeof(void *)); +} + +/* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call * overhead for requests that can be satisfied on the fastpath. @@ -2725,7 +2778,9 @@ redo: stat(s, ALLOC_FASTPATH); } - if (unlikely(gfpflags & __GFP_ZERO) && object) + maybe_wipe_obj_freeptr(s, object); + + if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) memset(object, 0, s->object_size); slab_post_alloc_hook(s, gfpflags, 1, &object); @@ -2755,7 +2810,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); - kasan_kmalloc(s, ret, size, gfpflags); + ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -2783,7 +2838,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); - kasan_kmalloc(s, ret, size, gfpflags); + ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); @@ -2973,17 +3028,15 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, void *head, void *tail, int cnt, unsigned long addr) { - slab_free_freelist_hook(s, head, tail); /* - * slab_free_freelist_hook() could have put the items into quarantine. - * If so, no need to free them. + * With KASAN enabled slab_free_freelist_hook modifies the freelist + * to remove objects, whose reuse must be delayed. */ - if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) - return; - do_slab_free(s, page, head, tail, cnt, addr); + if (slab_free_freelist_hook(s, &head, &tail)) + do_slab_free(s, page, head, tail, cnt, addr); } -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); @@ -3151,16 +3204,19 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, goto error; c = this_cpu_ptr(s->cpu_slab); + maybe_wipe_obj_freeptr(s, p[i]); + continue; /* goto for-loop */ } c->freelist = get_freepointer(s, object); p[i] = object; + maybe_wipe_obj_freeptr(s, p[i]); } c->tid = next_tid(c->tid); local_irq_enable(); /* Clear memory outside IRQ disabled fastpath loop */ - if (unlikely(flags & __GFP_ZERO)) { + if (unlikely(slab_want_init_on_alloc(flags, s))) { int j; for (j = 0; j < i; j++) @@ -3362,16 +3418,16 @@ static void early_kmem_cache_node_alloc(int node) n = page->freelist; BUG_ON(!n); - page->freelist = get_freepointer(kmem_cache_node, n); - page->inuse = 1; - page->frozen = 0; - kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), GFP_KERNEL); + page->freelist = get_freepointer(kmem_cache_node, n); + page->inuse = 1; + page->frozen = 0; + kmem_cache_node->node[node] = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); @@ -3474,7 +3530,7 @@ static void set_cpu_partial(struct kmem_cache *s) static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; - size_t size = s->object_size; + unsigned int size = s->object_size; int order; /* @@ -3708,6 +3764,17 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) discard_slab(s, page); } +bool __kmem_cache_empty(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (n->nr_partial || slabs_node(s, node)) + return false; + return true; +} + /* * Release all resources used by a slab cache. */ @@ -3776,7 +3843,7 @@ void *__kmalloc(size_t size, gfp_t flags) trace_kmalloc(_RET_IP_, ret, size, s->size, flags); - kasan_kmalloc(s, ret, size, flags); + ret = kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3793,8 +3860,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) if (page) ptr = page_address(page); - kmalloc_large_node_hook(ptr, size, flags); - return ptr; + return kmalloc_large_node_hook(ptr, size, flags); } void *__kmalloc_node(size_t size, gfp_t flags, int node) @@ -3821,7 +3887,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); - kasan_kmalloc(s, ret, size, flags); + ret = kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3842,6 +3908,8 @@ const char *__check_heap_object(const void *ptr, unsigned long n, unsigned long offset; size_t object_size; + ptr = kasan_reset_tag(ptr); + /* Find object and usable object size. */ s = page->slab_cache; object_size = slab_ksize(s); @@ -3909,7 +3977,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); - kfree_hook(x); + kfree_hook(object); __free_pages(page, compound_order(page)); return; } @@ -4672,6 +4740,7 @@ static int list_locations(struct kmem_cache *s, char *buf, static void __init resiliency_test(void) { u8 *p; + int type = KMALLOC_NORMAL; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); @@ -4684,7 +4753,7 @@ static void __init resiliency_test(void) pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", p + 16); - validate_slab_cache(kmalloc_caches[4]); + validate_slab_cache(kmalloc_caches[type][4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); @@ -4693,33 +4762,33 @@ static void __init resiliency_test(void) p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[5]); + validate_slab_cache(kmalloc_caches[type][5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[6]); + validate_slab_cache(kmalloc_caches[type][6]); pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[7]); + validate_slab_cache(kmalloc_caches[type][7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[8]); + validate_slab_cache(kmalloc_caches[type][8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[9]); + validate_slab_cache(kmalloc_caches[type][9]); } #else #ifdef CONFIG_SYSFS diff --git a/mm/swap.c b/mm/swap.c index a77d68f2c1b6..4edac536fe24 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -986,15 +986,25 @@ unsigned pagevec_lookup_range(struct pagevec *pvec, } EXPORT_SYMBOL(pagevec_lookup_range); -unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *index, int tag, unsigned nr_pages) +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag) { - pvec->nr = find_get_pages_tag(mapping, index, tag, - nr_pages, pvec->pages); + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, + PAGEVEC_SIZE, pvec->pages); return pagevec_count(pvec); } -EXPORT_SYMBOL(pagevec_lookup_tag); +EXPORT_SYMBOL(pagevec_lookup_range_tag); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages) +{ + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, + min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); + return pagevec_count(pvec); +} +EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); /* * Perform any setup for the swap system */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 326439428daf..3931379fac4d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -435,6 +435,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Initiate read into locked page and return. */ + SetPageWorkingset(new_page); lru_cache_add_anon(new_page); *new_page_allocated = true; return new_page; diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f9e522643a2..4cc380d5712b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2375,6 +2375,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, list_add_tail(&new_se->list, &sis->first_swap_extent.list); return 1; } +EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages @@ -2396,9 +2397,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, * requirements, they are simply tossed out - we will never use those blocks * for swapping. * - * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This - * prevents root from shooting her foot off by ftruncating an in-use swapfile, - * which will scribble on the fs. + * For all swap devices we set S_SWAPFILE across the life of the swapon. This + * prevents users from writing to the swap device, which will corrupt memory. * * The amount of disk space which a single swap extent represents varies. * Typically it is in the 1-4 megabyte range. So we can have hundreds of @@ -2661,13 +2661,14 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); + set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - } else { - inode_lock(inode); - inode->i_flags &= ~S_SWAPFILE; - inode_unlock(inode); } + + inode_lock(inode); + inode->i_flags &= ~S_SWAPFILE; + inode_unlock(inode); filp_close(swap_file, NULL); /* @@ -2895,11 +2896,11 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; - inode_lock(inode); - if (IS_SWAPFILE(inode)) - return -EBUSY; - } else - return -EINVAL; + } + + inode_lock(inode); + if (IS_SWAPFILE(inode)) + return -EBUSY; return 0; } @@ -3273,6 +3274,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap; + /* + * Flush any pending IO and dirty mappings before we start using this + * swap device. + */ + inode->i_flags |= S_SWAPFILE; + error = inode_drain_writes(inode); + if (error) { + inode->i_flags &= ~S_SWAPFILE; + goto bad_swap; + } + mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) @@ -3293,8 +3305,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); - if (S_ISREG(inode->i_mode)) - inode->i_flags |= S_SWAPFILE; error = 0; goto out; bad_swap: @@ -3314,7 +3324,7 @@ bad_swap: kvfree(cluster_info); kvfree(frontswap_map); if (swap_file) { - if (inode && S_ISREG(inode->i_mode)) { + if (inode) { inode_unlock(inode); inode = NULL; } @@ -3327,7 +3337,7 @@ out: } if (name) putname(name); - if (inode && S_ISREG(inode->i_mode)) + if (inode) inode_unlock(inode); if (!error) enable_swap_slots_cache(); diff --git a/mm/usercopy.c b/mm/usercopy.c index f8d74e09f8e4..dd5f42afc3c4 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -59,12 +59,11 @@ static noinline int check_stack_object(const void *obj, unsigned long len) return GOOD_STACK; } -static void report_usercopy(const void *ptr, unsigned long len, - bool to_user, const char *type) +static void report_usercopy(unsigned long len, bool to_user, const char *type) { - pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n", + pr_emerg("kernel memory %s attempt detected %s '%s' (%lu bytes)\n", to_user ? "exposure" : "overwrite", - to_user ? "from" : "to", ptr, type ? : "unknown", len); + to_user ? "from" : "to", type ? : "unknown", len); /* * For greater effect, it would be nice to do do_group_exit(), * but BUG() actually hooks all the lock-breaking and per-arch @@ -267,6 +266,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) return; report: - report_usercopy(ptr, n, to_user, err); + report_usercopy(n, to_user, err); } EXPORT_SYMBOL(__check_object_size); diff --git a/mm/util.c b/mm/util.c index 842ba5fb662e..b8e849125ecc 100644 --- a/mm/util.c +++ b/mm/util.c @@ -367,7 +367,8 @@ EXPORT_SYMBOL(vm_mmap); * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * - * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. + * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not + * fall back to vmalloc. */ void *kvmalloc_node(size_t size, gfp_t flags, int node) { @@ -378,7 +379,8 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) * so the given set of flags has to be compatible. */ - WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); + if ((flags & GFP_KERNEL) != GFP_KERNEL) + return kmalloc_node(size, flags, node); /* * We want to attempt a large physically contiguous block first because @@ -639,8 +641,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * Part of the kernel memory, which can be released * under memory pressure. */ - free += global_node_page_state( - NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; + free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); /* * Leave reserved pages. The pages are not for anonymous pages. diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d00961ba0c42..5fff24ddc17b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -340,6 +340,13 @@ static unsigned long cached_align; static unsigned long vmap_area_pcpu_hole; +static atomic_long_t nr_vmalloc_pages; + +unsigned long vmalloc_nr_pages(void) +{ + return atomic_long_read(&nr_vmalloc_pages); +} + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -1543,6 +1550,7 @@ static void __vunmap(const void *addr, int deallocate_pages) BUG_ON(!page); __free_pages(page, 0); } + atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); kvfree(area->pages); } @@ -1708,12 +1716,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; + atomic_long_add(area->nr_pages, &nr_vmalloc_pages); goto fail; } area->pages[i] = page; if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); } + atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (map_vm_area(area, prot, pages)) goto fail; diff --git a/mm/vmscan.c b/mm/vmscan.c index c6962aa5ddb4..46c0231b5745 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -49,6 +49,7 @@ #include <linux/prefetch.h> #include <linux/printk.h> #include <linux/dax.h> +#include <linux/psi.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -2065,6 +2066,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } ClearPageActive(page); /* we are de-activating */ + SetPageWorkingset(page); list_add(&page->lru, &l_inactive); } @@ -3134,6 +3136,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, { struct zonelist *zonelist; unsigned long nr_reclaimed; + unsigned long pflags; int nid; unsigned int noreclaim_flag; struct scan_control sc = { @@ -3162,9 +3165,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3329,6 +3336,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long pflags; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -3338,6 +3346,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_unmap = 1, .may_swap = 1, }; + psi_memstall_enter(&pflags); count_vm_event(PAGEOUTRUN); do { @@ -3432,6 +3441,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) out: snapshot_refaults(NULL, pgdat); + psi_memstall_leave(&pflags); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller diff --git a/mm/vmstat.c b/mm/vmstat.c index e2197b03da57..2c8e482d3f22 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1046,6 +1046,9 @@ const char * const vmstat_text[] = { "nr_mlock", "nr_page_table_pages", "nr_kernel_stack", +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + "nr_shadow_call_stack_bytes", +#endif "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", @@ -1074,6 +1077,7 @@ const char * const vmstat_text[] = { "nr_isolated_file", "workingset_refault", "workingset_activate", + "workingset_restore", "workingset_nodereclaim", "nr_anon_pages", "nr_mapped", @@ -1090,7 +1094,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", - "", /* nr_indirectly_reclaimable */ + "nr_kernel_misc_reclaimable", /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1672,10 +1676,6 @@ static int vmstat_show(struct seq_file *m, void *arg) unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - /* Skip hidden vmstat items. */ - if (*vmstat_text[off] == '\0') - return 0; - seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); diff --git a/mm/workingset.c b/mm/workingset.c index b997c9de28f6..808a69a4a9ef 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -121,7 +121,7 @@ * the only thing eating into inactive list space is active pages. * * - * Activating refaulting pages + * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given @@ -134,6 +134,10 @@ * used less frequently than the refaulting page - or even not used at * all anymore. * + * That means if inactive cache is refaulting with a suitable refault + * distance, we assume the cache workingset is transitioning and put + * pressure on the current active list. + * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. @@ -141,6 +145,14 @@ * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * + * Refaulting active pages + * + * If on the other hand the refaulting pages have recently been + * deactivated, it means that the active list is no longer protecting + * actively used cache from reclaim. The cache is NOT transitioning to + * a different workingset; the existing workingset is thrashing in the + * space allocated to the page cache. + * * * Implementation * @@ -156,8 +168,7 @@ */ #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - NODES_SHIFT + \ - MEM_CGROUP_ID_SHIFT) + 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* @@ -170,23 +181,28 @@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) { eviction >>= bucket_order; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; + eviction = (eviction << 1) | workingset; eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - unsigned long *evictionp) + unsigned long *evictionp, bool *workingsetp) { unsigned long entry = (unsigned long)shadow; int memcgid, nid; + bool workingset; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + workingset = entry & 1; + entry >>= 1; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); @@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; + *workingsetp = workingset; } /** @@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, */ void *workingset_eviction(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg = page_memcg(page); struct pglist_data *pgdat = page_pgdat(page); + struct mem_cgroup *memcg = page_memcg(page); int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; @@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) lruvec = mem_cgroup_lruvec(pgdat, memcg); eviction = atomic_long_inc_return(&lruvec->inactive_age); - return pack_shadow(memcgid, pgdat, eviction); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } /** * workingset_refault - evaluate the refault of a previously evicted page + * @page: the freshly allocated replacement page * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously * evicted page in the context of the node it was allocated in. - * - * Returns %true if the page should be activated, %false otherwise. */ -bool workingset_refault(void *shadow) +void workingset_refault(struct page *page, void *shadow) { unsigned long refault_distance; + struct pglist_data *pgdat; unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; unsigned long refault; - struct pglist_data *pgdat; + bool workingset; int memcgid; - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); rcu_read_lock(); /* @@ -263,41 +280,51 @@ bool workingset_refault(void *shadow) * configurations instead. */ memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !memcg) { - rcu_read_unlock(); - return false; - } + if (!mem_cgroup_disabled() && !memcg) + goto out; lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. + * Calculate the refault distance * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. There is a + * special case: usually, shadow entries have a short lifetime + * and are either refaulted or reclaimed along with the inode + * before they get too old. But it is not impossible for the + * inactive_age to lap a shadow entry in the field, which can + * then result in a false small refault distance, leading to a + * false activation should this old entry actually refault + * again. However, earlier kernels used to deactivate + * unconditionally with *every* reclaim invocation for the + * longest time, so the occasional inappropriate activation + * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; inc_lruvec_state(lruvec, WORKINGSET_REFAULT); - if (refault_distance <= active_file) { - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); - rcu_read_unlock(); - return true; + /* + * Compare the distance to the existing workingset size. We + * don't act on pages that couldn't stay resident even if all + * the memory was available to the page cache. + */ + if (refault_distance > active_file) + goto out; + + SetPageActive(page); + atomic_long_inc(&lruvec->inactive_age); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); + + /* Page was active prior to eviction */ + if (workingset) { + SetPageWorkingset(page); + inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } +out: rcu_read_unlock(); - return false; } /** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c6df483b3751..801afc0b8c36 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -191,6 +191,7 @@ static struct vfsmount *zsmalloc_mnt; * (see: fix_fullness_group()) */ static const int fullness_threshold_frac = 4; +static size_t huge_class_size; struct size_class { spinlock_t lock; @@ -1431,6 +1432,25 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); +/** + * zs_huge_class_size() - Returns the size (in bytes) of the first huge + * zsmalloc &size_class. + * @pool: zsmalloc pool to use + * + * The function returns the size of the first huge class - any object of equal + * or bigger size will be stored in zspage consisting of a single physical + * page. + * + * Context: Any context. + * + * Return: the size (in bytes) of the first huge zsmalloc &size_class. + */ +size_t zs_huge_class_size(struct zs_pool *pool) +{ + return huge_class_size; +} +EXPORT_SYMBOL_GPL(zs_huge_class_size); + static unsigned long obj_malloc(struct size_class *class, struct zspage *zspage, unsigned long handle) { @@ -2461,6 +2481,27 @@ struct zs_pool *zs_create_pool(const char *name) objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* + * We iterate from biggest down to smallest classes, + * so huge_class_size holds the size of the first huge + * class. Any object bigger than or equal to that will + * endup in the huge class. + */ + if (pages_per_zspage != 1 && objs_per_zspage != 1 && + !huge_class_size) { + huge_class_size = size; + /* + * The object uses ZS_HANDLE_SIZE bytes to store the + * handle. We need to subtract it, because zs_malloc() + * unconditionally adds handle size before it performs + * size class search - so object may be smaller than + * huge class size, yet it still can end up in the huge + * class because it grows by ZS_HANDLE_SIZE extra bytes + * right before class lookup. + */ + huge_class_size -= (ZS_HANDLE_SIZE - 1); + } + + /* * size_class is used for normal zsmalloc operation such * as alloc/free for that size. Although it is natural that we * have one size_class for each size, there is a chance that we |