From e83acd7d37d83035f2fe078f656f87418ea2a687 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Feb 2021 22:35:09 +0000 Subject: io_uring: avoid taking ctx refs for task-cancel Don't bother to take ctx->refs for io_req_task_cancel() because it takes uring_lock before putting a request, and the context is guaranteed to stay alive until unlock happens. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index bd14327c8e7e..db0c4b2dd141 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1985,10 +1985,10 @@ static void io_req_task_cancel(struct callback_head *cb) struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_ring_ctx *ctx = req->ctx; + /* ctx is guaranteed to stay alive while we hold uring_lock */ mutex_lock(&ctx->uring_lock); __io_req_task_cancel(req, req->result); mutex_unlock(&ctx->uring_lock); - percpu_ref_put(&ctx->refs); } static void __io_req_task_submit(struct io_kiocb *req) @@ -2019,14 +2019,12 @@ static void io_req_task_queue(struct io_kiocb *req) ret = io_req_task_work_add(req); if (unlikely(ret)) { req->result = -ECANCELED; - percpu_ref_get(&req->ctx->refs); io_req_task_work_add_fallback(req, io_req_task_cancel); } } static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { - percpu_ref_get(&req->ctx->refs); req->result = ret; req->task_work.func = io_req_task_cancel; -- cgit v1.2.3 From 2c4b8eb6435e615544b92acdcd4b25a85e83f300 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Feb 2021 22:35:10 +0000 Subject: io_uring: reuse io_req_task_queue_fail() Use io_req_task_queue_fail() on the fail path of io_req_task_queue(). It's unlikely to happen, so we don't care about the additional overhead, but it allows us to keep all the req->result invariants in a single function. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index db0c4b2dd141..49cdeaf710ee 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2011,25 +2011,21 @@ static void io_req_task_submit(struct callback_head *cb) __io_req_task_submit(req); } -static void io_req_task_queue(struct io_kiocb *req) +static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { - int ret; + req->result = ret; + req->task_work.func = io_req_task_cancel; - req->task_work.func = io_req_task_submit; - ret = io_req_task_work_add(req); - if (unlikely(ret)) { - req->result = -ECANCELED; + if (unlikely(io_req_task_work_add(req))) io_req_task_work_add_fallback(req, io_req_task_cancel); - } } -static void io_req_task_queue_fail(struct io_kiocb *req, int ret) +static void io_req_task_queue(struct io_kiocb *req) { - req->result = ret; - req->task_work.func = io_req_task_cancel; + req->task_work.func = io_req_task_submit; if (unlikely(io_req_task_work_add(req))) - io_req_task_work_add_fallback(req, io_req_task_cancel); + io_req_task_queue_fail(req, -ECANCELED); } static inline void io_queue_next(struct io_kiocb *req) -- cgit v1.2.3 From dafecf19e25f9b864ce0f3b8bb12de2e3d5f6da6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Feb 2021 22:35:11 +0000 Subject: io_uring: further deduplicate file slot selection io_fixed_file_slot() and io_file_from_index() behave pretty similarly; DRY them up and call one from the other.
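For illustration, a minimal userspace sketch of the same shift/mask lookup after the deduplication, with one helper owning the index math and the other reduced to a dereference (the table size and all names below are illustrative stand-ins, not the kernel's):

    #define TABLE_SHIFT 9                        /* 512 slots per sub-table */
    #define TABLE_MASK  ((1U << TABLE_SHIFT) - 1)

    struct sub_table { void **files; };
    struct file_data { struct sub_table *table; };

    /* the only place that knows the index math; hands back the slot */
    static inline void **fixed_file_slot(struct file_data *d, unsigned int i)
    {
            struct sub_table *t = &d->table[i >> TABLE_SHIFT];
            return &t->files[i & TABLE_MASK];
    }

    /* the by-value lookup becomes a plain dereference of the slot helper */
    static inline void *file_from_index(struct file_data *d, unsigned int i)
    {
            return *fixed_file_slot(d, i);
    }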
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 49cdeaf710ee..a5eee7514a03 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6204,13 +6204,19 @@ static void io_wq_submit_work(struct io_wq_work *work) } } -static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, - int index) +static inline struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data, + unsigned i) { struct fixed_rsrc_table *table; - table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK]; + table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT]; + return &table->files[i & IORING_FILE_TABLE_MASK]; +} + +static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, + int index) +{ + return *io_fixed_file_slot(ctx->file_data, index); } static struct file *io_file_get(struct io_submit_state *state, @@ -7478,15 +7484,6 @@ static void io_rsrc_put_work(struct work_struct *work) } } -static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data, - unsigned i) -{ - struct fixed_rsrc_table *table; - - table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT]; - return &table->files[i & IORING_FILE_TABLE_MASK]; -} - static void io_rsrc_node_ref_zero(struct percpu_ref *ref) { struct fixed_rsrc_ref_node *ref_node; -- cgit v1.2.3 From f41db2732d4835799af64159c61e522063786e5c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Feb 2021 22:35:12 +0000 Subject: io_uring: add a helper failing not issued requests Add a simple helper doing CQE posting, marking the request for link-failure, and putting both submission and completion references.
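The shape of the change, as a reduced userspace sketch: the three statements every failure site used to repeat collapse into one helper (types and names below are stand-ins, not the real io_uring ones):

    struct req { int flags; int refs; };

    static void set_fail_links(struct req *r) { r->flags |= 1; }
    static void put_ref(struct req *r)        { r->refs--; }
    static void post_cqe(struct req *r, long res) { (void)r; (void)res; }

    /* one helper owns the whole failure epilogue */
    static void req_complete_failed(struct req *r, long res)
    {
            set_fail_links(r);   /* mark the link chain as failed */
            put_ref(r);          /* drop the submission reference */
            post_cqe(r, res);    /* post the completion event */
    }

A call site then shrinks to a single req_complete_failed(req, -ECANCELED).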
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a5eee7514a03..11a02ec86e54 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1584,6 +1584,13 @@ static inline void io_req_complete(struct io_kiocb *req, long res) __io_req_complete(req, 0, res, 0); } +static void io_req_complete_failed(struct io_kiocb *req, long res) +{ + req_set_fail_links(req); + io_put_req(req); + io_req_complete_post(req, res, 0); +} + static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; @@ -6346,9 +6353,7 @@ static void __io_queue_sqe(struct io_kiocb *req) io_put_req(req); } } else { - req_set_fail_links(req); - io_put_req(req); - io_req_complete(req, ret); + io_req_complete_failed(req, ret); } if (linked_timeout) io_queue_linked_timeout(linked_timeout); @@ -6362,9 +6367,7 @@ static void io_queue_sqe(struct io_kiocb *req) if (ret) { if (ret != -EIOCBQUEUED) { fail_req: - req_set_fail_links(req); - io_put_req(req); - io_req_complete(req, ret); + io_req_complete_failed(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { ret = io_req_defer_prep(req); @@ -6485,12 +6488,10 @@ fail_req: if (link->head) { /* fail even hard links since we don't submit */ link->head->flags |= REQ_F_FAIL_LINK; - io_put_req(link->head); - io_req_complete(link->head, -ECANCELED); + io_req_complete_failed(link->head, -ECANCELED); link->head = NULL; } - io_put_req(req); - io_req_complete(req, ret); + io_req_complete_failed(req, ret); return ret; } ret = io_req_prep(req, sqe); @@ -8716,9 +8717,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx, while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); - req_set_fail_links(de->req); - io_put_req(de->req); - io_req_complete(de->req, -ECANCELED); + io_req_complete_failed(de->req, -ECANCELED); kfree(de); } return true; -- cgit v1.2.3 From 9fb8cb49c7b634982ac2a4302b5158d7120f0186 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Feb 2021 22:35:13 +0000 Subject: io_uring: refactor provide/remove buffer locking Always complete the request while holding the mutex, instead of doing that strange dance with conditional ordering.
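A userspace sketch of the simplified ordering, with a pthread mutex standing in for the ring lock (an assumed reduced model, not the kernel code):

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static void complete_req(int ret) { (void)ret; /* post the completion */ }

    static void finish(int ret)
    {
            pthread_mutex_lock(&lock);
            /* ... buffer list manipulation, done under the lock ... */
            complete_req(ret);   /* completing while still locked is safe in
                                  * both modes, so the two-way conditional
                                  * ordering goes away */
            pthread_mutex_unlock(&lock);
    }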
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 11a02ec86e54..2642369d1332 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3966,14 +3966,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail_links(req); - /* need to hold the lock to complete IOPOLL requests */ - if (ctx->flags & IORING_SETUP_IOPOLL) { - __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, !force_nonblock); - } else { - io_ring_submit_unlock(ctx, !force_nonblock); - __io_req_complete(req, issue_flags, ret, 0); - } + /* complete before unlock, IOPOLL may need the lock */ + __io_req_complete(req, issue_flags, ret, 0); + io_ring_submit_unlock(ctx, !force_nonblock); return 0; } @@ -4055,15 +4050,9 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) } if (ret < 0) req_set_fail_links(req); - - /* need to hold the lock to complete IOPOLL requests */ - if (ctx->flags & IORING_SETUP_IOPOLL) { - __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, !force_nonblock); - } else { - io_ring_submit_unlock(ctx, !force_nonblock); - __io_req_complete(req, issue_flags, ret, 0); - } + /* complete before unlock, IOPOLL may need the lock */ + __io_req_complete(req, issue_flags, ret, 0); + io_ring_submit_unlock(ctx, !force_nonblock); return 0; } -- cgit v1.2.3 From 8c3f9cd1603d0e4af6c50ebc6d974ab7bdd03cf4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Feb 2021 22:35:15 +0000 Subject: io_uring: use better types for cflags __io_cqring_fill_event() takes cflags as a long to squeeze it into a u32 in a CQE, while all users pass an int or unsigned. Replace it with unsigned int and store it as u32 in struct io_completion to match the CQE. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2642369d1332..bb497511fadb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -653,7 +653,7 @@ struct io_unlink { struct io_completion { struct file *file; struct list_head list; - int cflags; + u32 cflags; }; struct io_async_connect { @@ -1476,7 +1476,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, return ret; } -static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) +static void __io_cqring_fill_event(struct io_kiocb *req, long res, + unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; -- cgit v1.2.3 From 2e052d443df15d71277f6b8509badae4310ebd92 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Feb 2021 22:35:16 +0000 Subject: io_uring: refactor out send/recv async setup IORING_OP_[SEND,RECV] don't need async setup, nor will they get into io_req_prep_async(). Remove them from io_req_prep_async() and remove the needs_async_data checks from the related setup functions.
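After this, the async-prep dispatch only lists opcodes that genuinely stage state up front; roughly (a trimmed sketch with made-up names, not the full opcode table):

    enum op { OP_SENDMSG, OP_SEND, OP_RECVMSG, OP_RECV };

    static int prep_sendmsg_async(void) { return 0; }   /* copies the msghdr */
    static int prep_recvmsg_async(void) { return 0; }

    static int req_prep_async(enum op opcode)
    {
            switch (opcode) {
            case OP_SENDMSG:
                    return prep_sendmsg_async();
            case OP_RECVMSG:
                    return prep_recvmsg_async();
            default:
                    return 0;   /* OP_SEND/OP_RECV keep no async state */
            }
    }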
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index bb497511fadb..7833ccd085ba 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4349,8 +4349,6 @@ static int io_sendmsg_prep_async(struct io_kiocb *req) { int ret; - if (!io_op_defs[req->opcode].needs_async_data) - return 0; ret = io_sendmsg_copy_hdr(req, req->async_data); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; @@ -4578,8 +4576,6 @@ static int io_recvmsg_prep_async(struct io_kiocb *req) { int ret; - if (!io_op_defs[req->opcode].needs_async_data) - return 0; ret = io_recvmsg_copy_hdr(req, req->async_data); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; @@ -5892,10 +5888,8 @@ static int io_req_prep_async(struct io_kiocb *req) case IORING_OP_WRITE: return io_rw_prep_async(req, WRITE); case IORING_OP_SENDMSG: - case IORING_OP_SEND: return io_sendmsg_prep_async(req); case IORING_OP_RECVMSG: - case IORING_OP_RECV: return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: return io_connect_prep_async(req); -- cgit v1.2.3 From 6cb78689fa94c80784faef76744746aee558c344 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Feb 2021 22:35:17 +0000 Subject: io_uring: untie alloc_async_data and needs_async_data All opcode handlers pretty well know whether they need async data or not, and can skip testing for needs_async_data. The exception is the generic rw path, but that tests the flag by hand anyway. So, check the flag and make io_alloc_async_data() allocate unconditionally. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7833ccd085ba..e45893823652 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3128,21 +3128,13 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, } } -static inline int __io_alloc_async_data(struct io_kiocb *req) +static inline int io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); return req->async_data == NULL; } -static int io_alloc_async_data(struct io_kiocb *req) -{ - if (!io_op_defs[req->opcode].needs_async_data) - return 0; - - return __io_alloc_async_data(req); -} - static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter, bool force) @@ -3150,7 +3142,7 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, if (!force && !io_op_defs[req->opcode].needs_async_data) return 0; if (!req->async_data) { - if (__io_alloc_async_data(req)) { + if (io_alloc_async_data(req)) { kfree(iovec); return -ENOMEM; } @@ -5904,7 +5896,7 @@ static int io_req_defer_prep(struct io_kiocb *req) /* some opcodes init it during the inital prep */ if (req->async_data) return 0; - if (__io_alloc_async_data(req)) + if (io_alloc_async_data(req)) return -EAGAIN; return io_req_prep_async(req); } -- cgit v1.2.3 From 26f0505a9ce571f3b1fcef6e86c5c99c68ca7eca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Feb 2021 22:35:18 +0000 Subject: io_uring: rethink def->needs_async_data needs_async_data controls allocation of async_data, and is used in two cases:
1) when async setup requires it (by io_req_prep_async() or the handlers themselves), and 2) when an op always needs additional space to operate, like timeouts do. Opcode preps already don't bother with the second case and do the allocation unconditionally, so restrict needs_async_data to the first case only and rename it to needs_async_setup. Signed-off-by: Pavel Begunkov [axboe: update for IOPOLL fix] Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e45893823652..4df9961d6245 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -849,8 +849,8 @@ struct io_op_def { unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* must always have async data allocated */ - unsigned needs_async_data : 1; + /* do prep async if is going to be punted */ + unsigned needs_async_setup : 1; /* should block plug */ unsigned plug : 1; /* size of async data needed, if any */ @@ -864,7 +864,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_async_data = 1, + .needs_async_setup = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), }, @@ -873,7 +873,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_data = 1, + .needs_async_setup = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), }, @@ -907,7 +907,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_data = 1, + .needs_async_setup = 1, .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_RECVMSG] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_async_data = 1, + .needs_async_setup = 1, .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_TIMEOUT] = { - .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { - .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_CONNECT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_data = 1, + .needs_async_setup = 1, .async_size = sizeof(struct io_async_connect), }, [IORING_OP_FALLOCATE] = { .needs_file = 1, }, @@ -3139,7 +3137,7 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter, bool force) { - if (!force && !io_op_defs[req->opcode].needs_async_data) + if (!force && !io_op_defs[req->opcode].needs_async_setup) return 0; if (!req->async_data) { if (io_alloc_async_data(req)) { @@ -5872,12 +5870,8 @@ static int io_req_prep_async(struct io_kiocb *req) { switch (req->opcode) { case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: return io_rw_prep_async(req, READ); case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: return io_rw_prep_async(req, WRITE); case IORING_OP_SENDMSG: return io_sendmsg_prep_async(req); case IORING_OP_RECVMSG: return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: return io_connect_prep_async(req); @@ -5891,11 +5885,10 @@ static int io_req_prep_async(struct io_kiocb *req) static int io_req_defer_prep(struct io_kiocb *req) { - if (!io_op_defs[req->opcode].needs_async_data) - return 0; - /* some opcodes init it during the inital prep */ - if (req->async_data) + if
(!io_op_defs[req->opcode].needs_async_setup) return 0; + if (WARN_ON_ONCE(req->async_data)) + return -EFAULT; if (io_alloc_async_data(req)) return -EAGAIN; return io_req_prep_async(req); -- cgit v1.2.3 From b7e298d265f20eafc3615be271a3e5d90e4dc3dd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Feb 2021 22:35:19 +0000 Subject: io_uring: merge defer_prep() and prep_async() Merge the two functions and do the renaming in favour of the second one, as it conveys the meaning better. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4df9961d6245..f0629065e16d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5868,6 +5868,13 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_req_prep_async(struct io_kiocb *req) { + if (!io_op_defs[req->opcode].needs_async_setup) + return 0; + if (WARN_ON_ONCE(req->async_data)) + return -EFAULT; + if (io_alloc_async_data(req)) + return -EAGAIN; + switch (req->opcode) { case IORING_OP_READV: return io_rw_prep_async(req, READ); @@ -5880,18 +5887,9 @@ static int io_req_prep_async(struct io_kiocb *req) case IORING_OP_CONNECT: return io_connect_prep_async(req); } - return 0; -} - -static int io_req_defer_prep(struct io_kiocb *req) -{ - if (!io_op_defs[req->opcode].needs_async_setup) - return 0; - if (WARN_ON_ONCE(req->async_data)) - return -EFAULT; - if (io_alloc_async_data(req)) - return -EAGAIN; - return io_req_prep_async(req); + printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", + req->opcode); + return -EFAULT; } static u32 io_get_sequence(struct io_kiocb *req) @@ -5924,7 +5922,7 @@ static int io_req_defer(struct io_kiocb *req) if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; - ret = io_req_defer_prep(req); + ret = io_req_prep_async(req); if (ret) return ret; io_prep_async_link(req); @@ -6339,7 +6337,7 @@ fail_req: io_req_complete_failed(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { - ret = io_req_defer_prep(req); + ret = io_req_prep_async(req); if (unlikely(ret)) goto fail_req; io_queue_async_work(req); @@ -6492,7 +6490,7 @@ fail_req: head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - ret = io_req_defer_prep(req); + ret = io_req_prep_async(req); if (unlikely(ret)) goto fail_req; trace_io_uring_link(ctx, req, head); -- cgit v1.2.3 From 179ae0d15e8b3a2d9affe680281009f1f10c4a9d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Feb 2021 22:35:20 +0000 Subject: io_uring: simplify io_resubmit_prep() If not for the async_data NULL check, io_resubmit_prep() is already an rw-specific version of io_req_prep_async(), but slower, because 1) it always goes through io_import_iovec() even though the following io_setup_async_rw() copies the result, and 2) instead of initialising the iovec/iter in place it does it on-stack and then copies it with io_setup_async_rw().
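Point 2 is the usual staging cost; a tiny self-contained sketch of the difference (illustrative buffer type, not the real iovec/iter):

    #include <string.h>

    struct iov_state { char bytes[64]; };

    /* slower shape: build the state on the stack, then copy it over */
    static void prep_via_stack(struct iov_state *dst)
    {
            struct iov_state tmp;

            memset(&tmp, 0, sizeof(tmp));      /* stand-in for the import */
            memcpy(dst, &tmp, sizeof(tmp));    /* the extra copy */
    }

    /* faster shape: initialise straight into the final storage */
    static void prep_in_place(struct iov_state *dst)
    {
            memset(dst, 0, sizeof(*dst));
    }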
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 37 +++---------------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index f0629065e16d..3e102acb311a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1032,14 +1032,10 @@ static struct file *io_file_get(struct io_submit_state *state, static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); -static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool needs_lock); -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, - struct iov_iter *iter, bool force); static void io_req_task_queue(struct io_kiocb *req); static void io_submit_flush_completions(struct io_comp_state *cs, struct io_ring_ctx *ctx); +static int io_req_prep_async(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -2429,35 +2425,8 @@ static void kiocb_end_write(struct io_kiocb *req) #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - int rw, ret; - struct iov_iter iter; - - /* already prepared */ - if (req->async_data) - return true; - - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - rw = READ; - break; - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - rw = WRITE; - break; - default: - printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n", - req->opcode); - return false; - } - - ret = io_import_iovec(rw, req, &iovec, &iter, false); - if (ret < 0) - return false; - return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false); + /* either already prepared or successfully done */ + return req->async_data || !io_req_prep_async(req); } static bool io_rw_should_reissue(struct io_kiocb *req) -- cgit v1.2.3 From de9b4ccad750f216616730b74ed2be16c80892a4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Feb 2021 13:28:27 -0700 Subject: io_uring: wrap io_kiocb reference count manipulation in helpers No functional changes in this patch, just in preparation for handling the references a bit more efficiently. 
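The idea in miniature, with C11 atomics standing in for the kernel primitives (a sketch of the wrapper pattern only; the real helpers differ in detail):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct obj { atomic_int refs; };

    /* every refcount touch goes through typed helpers, so the backing
     * implementation can later be swapped without touching call sites */
    static inline void obj_ref_get(struct obj *o)
    {
            atomic_fetch_add_explicit(&o->refs, 1, memory_order_relaxed);
    }

    static inline bool obj_ref_put_and_test(struct obj *o)
    {
            return atomic_fetch_sub(&o->refs, 1) == 1;
    }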
Signed-off-by: Jens Axboe --- fs/io_uring.c | 55 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3e102acb311a..e37d5e769afe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1470,6 +1470,31 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, return ret; } +static inline bool req_ref_inc_not_zero(struct io_kiocb *req) +{ + return refcount_inc_not_zero(&req->refs); +} + +static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) +{ + return refcount_sub_and_test(refs, &req->refs); +} + +static inline bool req_ref_put_and_test(struct io_kiocb *req) +{ + return refcount_dec_and_test(&req->refs); +} + +static inline void req_ref_put(struct io_kiocb *req) +{ + refcount_dec(&req->refs); +} + +static inline void req_ref_get(struct io_kiocb *req) +{ + refcount_inc(&req->refs); +} + static void __io_cqring_fill_event(struct io_kiocb *req, long res, unsigned int cflags) { @@ -1506,7 +1531,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, io_clean_op(req); req->result = res; req->compl.cflags = cflags; - refcount_inc(&req->refs); + req_ref_get(req); list_add_tail(&req->compl.list, &ctx->cq_overflow_list); } } @@ -1528,7 +1553,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, * If we're the last reference to this request, add to our locked * free_list cache. */ - if (refcount_dec_and_test(&req->refs)) { + if (req_ref_put_and_test(req)) { struct io_comp_state *cs = &ctx->submit_state.comp; if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { @@ -2108,7 +2133,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs, req = cs->reqs[i]; /* submission and completion refs */ - if (refcount_sub_and_test(2, &req->refs)) + if (req_ref_sub_and_test(req, 2)) io_req_free_batch(&rb, req, &ctx->submit_state); } @@ -2124,7 +2149,7 @@ static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt = NULL; - if (refcount_dec_and_test(&req->refs)) { + if (req_ref_put_and_test(req)) { nxt = io_req_find_next(req); __io_free_req(req); } @@ -2133,7 +2158,7 @@ static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) static void io_put_req(struct io_kiocb *req) { - if (refcount_dec_and_test(&req->refs)) + if (req_ref_put_and_test(req)) io_free_req(req); } @@ -2156,14 +2181,14 @@ static void io_free_req_deferred(struct io_kiocb *req) static inline void io_put_req_deferred(struct io_kiocb *req, int refs) { - if (refcount_sub_and_test(refs, &req->refs)) + if (req_ref_sub_and_test(req, refs)) io_free_req_deferred(req); } static void io_double_put_req(struct io_kiocb *req) { /* drop both submit and complete references */ - if (refcount_sub_and_test(2, &req->refs)) + if (req_ref_sub_and_test(req, 2)) io_free_req(req); } @@ -2249,7 +2274,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; - if (refcount_dec_and_test(&req->refs)) + if (req_ref_put_and_test(req)) io_req_free_batch(&rb, req, &ctx->submit_state); } @@ -2464,7 +2489,7 @@ static bool io_rw_reissue(struct io_kiocb *req) lockdep_assert_held(&req->ctx->uring_lock); if (io_resubmit_prep(req)) { - refcount_inc(&req->refs); + req_ref_get(req); io_queue_async_work(req); return true; } @@ -3169,7 +3194,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, list_del_init(&wait->entry); /* submit ref 
gets dropped, acquire a new one */ - refcount_inc(&req->refs); + req_ref_get(req); io_req_task_queue(req); return 1; } @@ -4893,7 +4918,7 @@ static void io_poll_remove_double(struct io_kiocb *req) spin_lock(&head->lock); list_del_init(&poll->wait.entry); if (poll->wait.private) - refcount_dec(&req->refs); + req_ref_put(req); poll->head = NULL; spin_unlock(&head->lock); } @@ -4959,7 +4984,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, poll->wait.func(&poll->wait, mode, sync, key); } } - refcount_dec(&req->refs); + req_ref_put(req); return 1; } @@ -5002,7 +5027,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, return; } io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); - refcount_inc(&req->refs); + req_ref_get(req); poll->wait.private = req; *poll_ptr = poll; } @@ -6142,7 +6167,7 @@ static void io_wq_submit_work(struct io_wq_work *work) /* avoid locking problems by failing it from a clean context */ if (ret) { /* io-wq is going to take one down */ - refcount_inc(&req->refs); + req_ref_get(req); io_req_task_queue_fail(req, ret); } } @@ -6200,7 +6225,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (prev && refcount_inc_not_zero(&prev->refs)) + if (prev && req_ref_inc_not_zero(prev)) io_remove_next_linked(prev); else prev = NULL; -- cgit v1.2.3 From abc54d634334f24d9a3253b8207b42eda852f25a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Feb 2021 13:32:30 -0700 Subject: io_uring: switch to atomic_t for io_kiocb reference count io_uring manipulates references twice for each request, and hence is very sensitive to performance of the reference count. This commit borrows a trick from: commit f958d7b528b1b40c44cfda5eabe2d82760d868c3 Author: Linus Torvalds Date: Thu Apr 11 10:06:20 2019 -0700 mm: make page ref count overflow check tighter and more explicit and switches to atomic_t for references, while still retaining overflow and underflow checks. This is good for a 2-3% increase in peak IOPS on a single core. Before: IOPS=2970879, IOS/call=31/31, inflight=128 (128) IOPS=2952597, IOS/call=31/31, inflight=128 (128) IOPS=2943904, IOS/call=31/31, inflight=128 (128) IOPS=2930006, IOS/call=31/31, inflight=96 (96) and after: IOPS=3054354, IOS/call=31/31, inflight=128 (128) IOPS=3059038, IOS/call=31/31, inflight=128 (128) IOPS=3060320, IOS/call=31/31, inflight=128 (128) IOPS=3068256, IOS/call=31/31, inflight=96 (96) Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e37d5e769afe..094f869674b3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -801,7 +801,7 @@ struct io_kiocb { struct io_ring_ctx *ctx; unsigned int flags; - refcount_t refs; + atomic_t refs; struct task_struct *task; u64 user_data; @@ -1470,29 +1470,39 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, return ret; } +/* + * Shamelessly stolen from the mm implementation of page reference checking, + * see commit f958d7b528b1 for details. 
+ */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) + static inline bool req_ref_inc_not_zero(struct io_kiocb *req) { - return refcount_inc_not_zero(&req->refs); + return atomic_inc_not_zero(&req->refs); } static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) { - return refcount_sub_and_test(refs, &req->refs); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_sub_and_test(refs, &req->refs); } static inline bool req_ref_put_and_test(struct io_kiocb *req) { - return refcount_dec_and_test(&req->refs); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); } static inline void req_ref_put(struct io_kiocb *req) { - refcount_dec(&req->refs); + WARN_ON_ONCE(req_ref_put_and_test(req)); } static inline void req_ref_get(struct io_kiocb *req) { - refcount_inc(&req->refs); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_inc(&req->refs); } static void __io_cqring_fill_event(struct io_kiocb *req, long res, @@ -6383,7 +6393,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->link = NULL; req->fixed_rsrc_refs = NULL; /* one is dropped after submission, the other at completion */ - refcount_set(&req->refs, 2); + atomic_set(&req->refs, 2); req->task = current; req->result = 0; req->work.list.next = NULL; -- cgit v1.2.3 From c9dca27dc7f9c5dc4ee4ba5b77f7584387f867fe Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 10 Mar 2021 13:13:55 +0000 Subject: io_uring: simplify io_sqd_update_thread_idle() Use the more comprehensible max() instead of hand-coding it with ifs in io_sqd_update_thread_idle(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 094f869674b3..d5ef9560449b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6699,11 +6699,8 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd) struct io_ring_ctx *ctx; unsigned sq_thread_idle = 0; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - if (sq_thread_idle < ctx->sq_thread_idle) - sq_thread_idle = ctx->sq_thread_idle; - } - + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); sqd->sq_thread_idle = sq_thread_idle; } -- cgit v1.2.3 From d44f554e105b0c20e5b06b9f821bef228e04d573 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Mar 2021 08:27:05 -0700 Subject: io_uring: don't check for io_uring_fops for fixed files We don't allow them at registration time, so limit the check for needing inflight tracking in io_file_get() to the non-fixed path.
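The resulting branch shape, sketched in reduced form (all names below are stand-ins; is_ring_file() models the f_op comparison):

    #include <stddef.h>

    struct file_;
    static struct file_ *lookup_fixed(int fd) { (void)fd; return NULL; }
    static struct file_ *lookup_fd(int fd)    { (void)fd; return NULL; }
    static int  is_ring_file(struct file_ *f) { (void)f; return 0; }
    static void track_inflight(void)          { }

    static struct file_ *file_get(int fd, int fixed)
    {
            struct file_ *file;

            if (fixed) {
                    /* registration already rejected ring files, so no
                     * inflight tracking can be needed on this path */
                    file = lookup_fixed(fd);
            } else {
                    file = lookup_fd(fd);
                    if (file && is_ring_file(file))
                            track_inflight();
            }
            return file;
    }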
Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d5ef9560449b..e7ddfa136860 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6212,10 +6212,12 @@ static struct file *io_file_get(struct io_submit_state *state, } else { trace_io_uring_file_get(ctx, fd); file = __io_file_get(state, fd); + + /* we don't allow fixed io_uring files */ + if (file && unlikely(file->f_op == &io_uring_fops)) + io_req_track_inflight(req); } - if (file && unlikely(file->f_op == &io_uring_fops)) - io_req_track_inflight(req); return file; } -- cgit v1.2.3 From 7b29f92da377c358955b522045d0778aa79a540a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Mar 2021 08:30:14 -0700 Subject: io_uring: cache async and regular file state for fixed files We have to dig quite deep to check in particular whether or not a file supports a fast-path nonblock attempt. For fixed files, we can do this lookup once and cache the state instead. This adds two new bits to track whether we support an async read/write attempt, and lines up the REQ_F_ISREG bit with those two. The file slot re-uses the last 3 bits (or 2, for 32-bit) of the file pointer to cache that state, and then we mask it in when we go and use a fixed file. Signed-off-by: Jens Axboe --- fs/io_uring.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e7ddfa136860..9ee33c32bea4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -690,7 +690,6 @@ enum { REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, - REQ_F_ISREG_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, @@ -698,6 +697,10 @@ enum { REQ_F_LTIMEOUT_ACTIVE_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, + /* keep async read/write and isreg together and in order */ + REQ_F_ASYNC_READ_BIT, + REQ_F_ASYNC_WRITE_BIT, + REQ_F_ISREG_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -727,8 +730,6 @@ enum { REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), /* has or had linked timeout */ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* regular file */ - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* needs cleanup */ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), /* already went through poll handler */ @@ -743,6 +744,12 @@ enum { REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), + /* supports async reads */ + REQ_F_ASYNC_READ = BIT(REQ_F_ASYNC_READ_BIT), + /* supports async writes */ + REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), }; struct async_poll { @@ -2651,7 +2658,7 @@ static bool io_bdev_nowait(struct block_device *bdev) * any file. For now, just ensure that anything potentially problematic is done * inline.
*/ -static bool io_file_supports_async(struct file *file, int rw) +static bool __io_file_supports_async(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; @@ -2684,6 +2691,16 @@ static bool io_file_supports_async(struct file *file, int rw) return file->f_op->write_iter != NULL; } +static bool io_file_supports_async(struct io_kiocb *req, int rw) +{ + if (rw == READ && (req->flags & REQ_F_ASYNC_READ)) + return true; + else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE)) + return true; + + return __io_file_supports_async(req->file, rw); +} + static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -2692,7 +2709,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned ioprio; int ret; - if (S_ISREG(file_inode(file)->i_mode)) + if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode)) req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); @@ -3289,7 +3306,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req->file, READ)) { + if (force_nonblock && !io_file_supports_async(req, READ)) { ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); return ret ?: -EAGAIN; } @@ -3394,7 +3411,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req->file, WRITE)) + if (force_nonblock && !io_file_supports_async(req, WRITE)) goto copy_iov; /* file path doesn't support NOWAIT for non-direct_IO */ @@ -5173,7 +5190,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) else return false; /* if we can't nonblock try, then no point in arming a poll handler */ - if (!io_file_supports_async(req->file, rw)) + if (!io_file_supports_async(req, rw)) return false; apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); @@ -6182,6 +6199,15 @@ static void io_wq_submit_work(struct io_wq_work *work) } } +#define FFS_ASYNC_READ 0x1UL +#define FFS_ASYNC_WRITE 0x2UL +#ifdef CONFIG_64BIT +#define FFS_ISREG 0x4UL +#else +#define FFS_ISREG 0x0UL +#endif +#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) + static inline struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data, unsigned i) { @@ -6194,7 +6220,9 @@ static inline struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, int index) { - return *io_fixed_file_slot(ctx->file_data, index); + struct file **file_slot = io_fixed_file_slot(ctx->file_data, index); + + return (struct file *) ((unsigned long) *file_slot & FFS_MASK); } static struct file *io_file_get(struct io_submit_state *state, @@ -6204,10 +6232,16 @@ static struct file *io_file_get(struct io_submit_state *state, struct file *file; if (fixed) { + unsigned long file_ptr; + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) return NULL; fd = array_index_nospec(fd, ctx->nr_user_files); - file = io_file_from_index(ctx, fd); + file_ptr = (unsigned long) *io_fixed_file_slot(ctx->file_data, fd); + file = (struct file *) (file_ptr & FFS_MASK); + file_ptr &= ~FFS_MASK; + /* mask in overlapping REQ_F and FFS bits */ + req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); io_set_resource_node(req); } else { trace_io_uring_file_get(ctx, fd); @@ -7556,6 +7590,8 @@ static int 
io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, goto out_free; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { + unsigned long file_ptr; + if (copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto out_fput; @@ -7580,7 +7616,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto out_fput; } - *io_fixed_file_slot(file_data, i) = file; + file_ptr = (unsigned long) file; + if (__io_file_supports_async(file, READ)) + file_ptr |= FFS_ASYNC_READ; + if (__io_file_supports_async(file, WRITE)) + file_ptr |= FFS_ASYNC_WRITE; + if (S_ISREG(file_inode(file)->i_mode)) + file_ptr |= FFS_ISREG; + *io_fixed_file_slot(file_data, i) = (struct file *) file_ptr; } ret = io_sqe_files_scm(ctx); @@ -7713,7 +7756,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, file_slot = io_fixed_file_slot(ctx->file_data, i); if (*file_slot) { - err = io_queue_file_removal(data, *file_slot); + file = (struct file *) ((unsigned long) *file_slot & FFS_MASK); + err = io_queue_file_removal(data, file); if (err) break; *file_slot = NULL; @@ -9288,7 +9332,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); for (i = 0; has_lock && i < ctx->nr_user_files; i++) { - struct file *f = *io_fixed_file_slot(ctx->file_data, i); + struct file *f = io_file_from_index(ctx, i); if (f) seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); -- cgit v1.2.3 From b9b0e0d39c7b4be7af7976c52bdb8664dfa389f5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Feb 2021 08:18:36 -0700 Subject: io_uring: correct comment on poll vs iopoll The correct function is io_iopoll_complete(), which deals with completions of IOPOLL requests, not io_poll_complete(). Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9ee33c32bea4..13d087ebe057 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2564,7 +2564,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) req_set_fail_links(req); WRITE_ONCE(req->result, res); - /* order with io_poll_complete() checking ->result */ + /* order with io_iopoll_complete() checking ->result */ smp_wmb(); WRITE_ONCE(req->iopoll_completed, 1); } -- cgit v1.2.3 From 45ab03b19e8bf33af3e5f5a24729e5564d54fae9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Feb 2021 08:19:33 -0700 Subject: io_uring: transform ret == 0 for poll cancelation completions We can set canceled == true and complete out-of-line, ensure that we catch that and correctly return -ECANCELED if the poll operation got canceled. Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 13d087ebe057..962a3580c49f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4955,6 +4955,9 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; + if (!error && req->poll.canceled) + error = -ECANCELED; + io_poll_remove_double(req); req->poll.done = true; io_cqring_fill_event(req, error ? 
error : mangle_poll(mask)); -- cgit v1.2.3 From 493f3b158a1e445e24d567847045baf5a723d206 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:29 +0000 Subject: io_uring: don't take ctx refs in task_work handler __tctx_task_work() guarantees that ctx won't be killed while running task_works, so we can remove the now-unnecessary ctx pinning for internally armed polling. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 962a3580c49f..12e2ec7cfba4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4882,7 +4882,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, req->result = mask; req->task_work.func = func; - percpu_ref_get(&req->ctx->refs); /* * If this fails, then the task is exiting. When a task exits, the @@ -4982,8 +4981,6 @@ static void io_poll_task_func(struct callback_head *cb) if (nxt) __io_req_task_submit(nxt); } - - percpu_ref_put(&ctx->refs); } static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, @@ -5090,7 +5087,6 @@ static void io_async_task_func(struct callback_head *cb) if (io_poll_rewait(req, &apoll->poll)) { spin_unlock_irq(&ctx->completion_lock); - percpu_ref_put(&ctx->refs); return; } @@ -5106,7 +5102,6 @@ static void io_async_task_func(struct callback_head *cb) else __io_req_task_cancel(req, -ECANCELED); - percpu_ref_put(&ctx->refs); kfree(apoll->double_poll); kfree(apoll); } -- cgit v1.2.3 From 33f993da9829738da3e088fb5d3128880a4137ba Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:30 +0000 Subject: io_uring: optimise io_uring_enter() Add unlikely annotations, because my compiler pretty much mispredicts every first check, and apart from jumping around in the fast path, it also generates extra instructions, like setting the ret value in advance. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 12e2ec7cfba4..13a64b1db3be 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9183,31 +9183,31 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, size_t, argsz) { struct io_ring_ctx *ctx; - long ret = -EBADF; int submitted = 0; struct fd f; + long ret; io_run_task_work(); - if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)) + if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) return -EINVAL; f = fdget(fd); - if (!f.file) + if (unlikely(!f.file)) return -EBADF; ret = -EOPNOTSUPP; - if (f.file->f_op != &io_uring_fops) + if (unlikely(f.file->f_op != &io_uring_fops)) goto out_fput; ret = -ENXIO; ctx = f.file->private_data; - if (!percpu_ref_tryget(&ctx->refs)) + if (unlikely(!percpu_ref_tryget(&ctx->refs))) goto out_fput; ret = -EBADFD; - if (ctx->flags & IORING_SETUP_R_DISABLED) + if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) goto out; /* -- cgit v1.2.3 From cf27f3b14961845d816c49abc99aae4863207c77 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:31 +0000 Subject: io_uring: optimise tctx node checks/alloc First of all, we need to set tctx->sqpoll only when we add a new entry into ->xa, so move it from the hot path. Also extract a hot path for io_uring_add_task_file() as an inline helper.
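This is the common inline-fast-path split; a reduced sketch of the structure (made-up names, trimmed types):

    struct ctx;
    struct tctx { struct ctx *last; };

    /* stand-in for current->io_uring */
    static struct tctx *get_tctx(void) { return 0; }

    /* out-of-line slow path: does the real lookup/alloc/insert */
    static int slow_add_task_file(struct ctx *c)
    {
            (void)c;
            return 0;
    }

    static inline int add_task_file(struct ctx *c)
    {
            struct tctx *t = get_tctx();

            if (t && t->last == c)   /* hot path: one-entry cache hit */
                    return 0;
            return slow_add_task_file(c);
    }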
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 13a64b1db3be..1c846d5f39d8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8846,10 +8846,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } -/* - * Note that this task has used io_uring. We use it for cancelation purposes. - */ -static int io_uring_add_task_file(struct io_ring_ctx *ctx) +static int __io_uring_add_task_file(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -8861,32 +8858,40 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx) return ret; tctx = current->io_uring; } - if (tctx->last != ctx) { - void *old = xa_load(&tctx->xa, (unsigned long)ctx); - - if (!old) { - node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - node->ctx = ctx; - node->task = current; - - ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, - node, GFP_KERNEL)); - if (ret) { - kfree(node); - return ret; - } + if (!xa_load(&tctx->xa, (unsigned long)ctx)) { + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + node->ctx = ctx; + node->task = current; - mutex_lock(&ctx->uring_lock); - list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->uring_lock); + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, + node, GFP_KERNEL)); + if (ret) { + kfree(node); + return ret; } - tctx->last = ctx; + + mutex_lock(&ctx->uring_lock); + list_add(&node->ctx_node, &ctx->tctx_list); + mutex_unlock(&ctx->uring_lock); } + tctx->last = ctx; return 0; } +/* + * Note that this task has used io_uring. We use it for cancelation purposes. + */ +static inline int io_uring_add_task_file(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx = current->io_uring; + + if (likely(tctx && tctx->last == ctx)) + return 0; + return __io_uring_add_task_file(ctx); +} + /* * Remove this io_uring_file -> task mapping. */ -- cgit v1.2.3 From 966706579a7124fa6334f10c48474193fd6780c0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:32 +0000 Subject: io_uring: keep io_req_free_batch() call locality Don't do a function call (io_dismantle_req()) in the middle; place it near the other function calls instead, otherwise it may lead to excessive register spilling.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1c846d5f39d8..7bbecf45d91c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2112,6 +2112,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, struct io_submit_state *state) { io_queue_next(req); + io_dismantle_req(req); if (req->task != rb->task) { if (rb->task) @@ -2122,7 +2123,6 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, rb->task_refs++; rb->ctx_refs++; - io_dismantle_req(req); if (state->free_reqs != ARRAY_SIZE(state->reqs)) state->reqs[state->free_reqs++] = req; else -- cgit v1.2.3 From de968c182b4f48a421b0a3862e747c4147a7da22 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:33 +0000 Subject: io_uring: inline __io_queue_linked_timeout() Inline __io_queue_linked_timeout(), we don't need it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7bbecf45d91c..416afa7af5df 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1028,7 +1028,6 @@ static void io_dismantle_req(struct io_kiocb *req); static void io_put_task(struct task_struct *task, int nr); static void io_queue_next(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); -static void __io_queue_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update *ip, @@ -6285,8 +6284,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void __io_queue_linked_timeout(struct io_kiocb *req) +static void io_queue_linked_timeout(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -6298,16 +6300,7 @@ static void __io_queue_linked_timeout(struct io_kiocb *req) hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); } -} - -static void io_queue_linked_timeout(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->completion_lock); - __io_queue_linked_timeout(req); spin_unlock_irq(&ctx->completion_lock); - /* drop submission reference */ io_put_req(req); } -- cgit v1.2.3 From 1840038e119573fc624a2fc586a1c5ced50b59f2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:34 +0000 Subject: io_uring: optimise success case of __io_queue_sqe Move the case of a successfully issued request up by doing that check first. It's not much of a difference, just generates slightly better code for me.
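Schematically, the reordering just tests the expected outcome first (a sketch; likely() spelled out for a userspace build):

    #include <errno.h>

    #define likely(x) __builtin_expect(!!(x), 1)

    static int handle_ok(void)   { return 0; }
    static int punt_async(void)  { return 0; }
    static int fail_req(int ret) { return ret; }

    static int queue_sqe(int ret)
    {
            if (likely(!ret))        /* common case first: issue succeeded */
                    return handle_ok();
            if (ret == -EAGAIN)      /* rare: punt to async context */
                    return punt_async();
            return fail_req(ret);    /* rarest: hard failure */
    }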
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 416afa7af5df..b08813eacd49 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6330,15 +6330,7 @@ static void __io_queue_sqe(struct io_kiocb *req) * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - if (!io_arm_poll_handler(req)) { - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req); - } - } else if (likely(!ret)) { + if (likely(!ret)) { /* drop submission reference */ if (req->flags & REQ_F_COMPLETE_INLINE) { struct io_ring_ctx *ctx = req->ctx; @@ -6350,6 +6342,14 @@ static void __io_queue_sqe(struct io_kiocb *req) } else { io_put_req(req); } + } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + if (!io_arm_poll_handler(req)) { + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req); + } } else { io_req_complete_failed(req, ret); } -- cgit v1.2.3 From dd78f49260dd49f21bbf12080cceb8e13ce53db3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:35 +0000 Subject: io_uring: refactor io_flush_cached_reqs() Emphasize that the return value of io_flush_cached_reqs() depends on the number of requests in the cache. It looks nicer and might save tools from false-negative analyses. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index b08813eacd49..7623188ff95f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1627,11 +1627,12 @@ static void io_req_complete_failed(struct io_kiocb *req, long res) io_req_complete_post(req, res, 0); } +/* Returns true IFF there are requests in the cache */ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; struct io_comp_state *cs = &state->comp; - struct io_kiocb *req = NULL; + int nr; /* * If we have more than a batch's worth of requests in our IRQ side @@ -1645,16 +1646,19 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->completion_lock); } + nr = state->free_reqs; while (!list_empty(&cs->free_list)) { - req = list_first_entry(&cs->free_list, struct io_kiocb, - compl.list); + struct io_kiocb *req = list_first_entry(&cs->free_list, + struct io_kiocb, compl.list); + list_del(&req->compl.list); - state->reqs[state->free_reqs++] = req; - if (state->free_reqs == ARRAY_SIZE(state->reqs)) + state->reqs[nr++] = req; + if (nr == ARRAY_SIZE(state->reqs)) break; } - return req != NULL; + state->free_reqs = nr; + return nr != 0; } static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) -- cgit v1.2.3 From 8dd03afe611d371b8c8a2ebeec2720de662a21dc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:36 +0000 Subject: io_uring: refactor rsrc refnode allocation There are two problems: 1) we always allocate refnodes in advance and free them if they haven't been used. It's expensive, taking two allocations, one of which is percpu. And it may be pretty common to not actually use them.
2) The current API of allocating a refnode and setting some of the fields is error prone; we don't ever want to have a file node running a fixed buffer callback... Solve both with a pre-init/get API. Pre-init just leaves the node for later if not used, and for get (i.e. io_rsrc_refnode_get()), you need to explicitly pass all arguments setting callbacks/etc., so it's more resilient. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 58 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 19 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7623188ff95f..ec2533c399f8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -442,6 +442,7 @@ struct io_ring_ctx { struct llist_head rsrc_put_llist; struct list_head rsrc_ref_list; spinlock_t rsrc_ref_lock; + struct fixed_rsrc_ref_node *rsrc_backup_node; struct io_restriction restrictions; @@ -7041,12 +7042,36 @@ static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_dat percpu_ref_kill(&ref_node->refs); } +static int io_rsrc_refnode_prealloc(struct io_ring_ctx *ctx) +{ + if (ctx->rsrc_backup_node) + return 0; + ctx->rsrc_backup_node = alloc_fixed_rsrc_ref_node(ctx); + return ctx->rsrc_backup_node ? 0 : -ENOMEM; +} + +static struct fixed_rsrc_ref_node * +io_rsrc_refnode_get(struct io_ring_ctx *ctx, + struct fixed_rsrc_data *rsrc_data, + void (*rsrc_put)(struct io_ring_ctx *ctx, + struct io_rsrc_put *prsrc)) +{ + struct fixed_rsrc_ref_node *node = ctx->rsrc_backup_node; + + WARN_ON_ONCE(!node); + + ctx->rsrc_backup_node = NULL; + node->rsrc_data = rsrc_data; + node->rsrc_put = rsrc_put; + return node; +} + static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, struct io_ring_ctx *ctx, void (*rsrc_put)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)) { - struct fixed_rsrc_ref_node *backup_node; + struct fixed_rsrc_ref_node *node; int ret; if (data->quiesce) @@ -7054,13 +7079,9 @@ static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, data->quiesce = true; do { - ret = -ENOMEM; - backup_node = alloc_fixed_rsrc_ref_node(ctx); - if (!backup_node) + ret = io_rsrc_refnode_prealloc(ctx); + if (ret) break; - backup_node->rsrc_data = data; - backup_node->rsrc_put = rsrc_put; - io_sqe_rsrc_kill_node(ctx, data); percpu_ref_kill(&data->refs); flush_delayed_work(&ctx->rsrc_put_work); @@ -7070,17 +7091,16 @@ static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, break; percpu_ref_resurrect(&data->refs); - io_sqe_rsrc_set_node(ctx, data, backup_node); - backup_node = NULL; + node = io_rsrc_refnode_get(ctx, data, rsrc_put); + io_sqe_rsrc_set_node(ctx, data, node); reinit_completion(&data->done); + mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(); mutex_lock(&ctx->uring_lock); } while (ret >= 0); data->quiesce = false; - if (backup_node) - destroy_fixed_rsrc_ref_node(backup_node); return ret; } @@ -7731,11 +7751,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; - - ref_node = alloc_fixed_rsrc_ref_node(ctx); - if (!ref_node) - return -ENOMEM; - init_fixed_file_ref_node(ctx, ref_node); + err = io_rsrc_refnode_prealloc(ctx); + if (err) + return err; fds = u64_to_user_ptr(up->data); for (done = 0; done < nr_args; done++) { @@ -7789,10 +7807,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (needs_switch) { percpu_ref_kill(&data->node->refs); + ref_node = io_rsrc_refnode_get(ctx, data, io_ring_file_put); io_sqe_rsrc_set_node(ctx,
data, ref_node); - } else - destroy_fixed_rsrc_ref_node(ref_node); - + } return done ? done : err; } @@ -8468,6 +8485,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_eventfd_unregister(ctx); io_destroy_buffers(ctx); + if (ctx->rsrc_backup_node) + destroy_fixed_rsrc_ref_node(ctx->rsrc_backup_node); + #if defined(CONFIG_UNIX) if (ctx->ring_sock) { ctx->ring_sock->file = NULL; /* so that iput() is called */ -- cgit v1.2.3 From 0d85035a7368a6c6dc91ddeca6da12a50d24164e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:37 +0000 Subject: io_uring: inline io_put_req and friends One big omission is that io_put_req() hasn't been marked inline, and at least gcc 9 doesn't inline it, not to mention that it's really hot and an extra function call is intolerable, especially when it doesn't put a final ref. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ec2533c399f8..74ba816ba71d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2166,7 +2166,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs, * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. */ -static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) +static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt = NULL; @@ -2177,7 +2177,7 @@ static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) return nxt; } -static void io_put_req(struct io_kiocb *req) +static inline void io_put_req(struct io_kiocb *req) { if (req_ref_put_and_test(req)) io_free_req(req); -- cgit v1.2.3 From a05432fb49b6439d0c5b803053dfdd875940116d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:38 +0000 Subject: io_uring: refactor io_free_req_deferred() We don't care about the return value in io_free_req_deferred(), so make the code a bit more concise. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 74ba816ba71d..08ab7c4830d5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2192,11 +2192,8 @@ static void io_put_req_deferred_cb(struct callback_head *cb) static void io_free_req_deferred(struct io_kiocb *req) { - int ret; - req->task_work.func = io_put_req_deferred_cb; - ret = io_req_task_work_add(req); - if (unlikely(ret)) + if (unlikely(io_req_task_work_add(req))) io_req_task_work_add_fallback(req, io_put_req_deferred_cb); } -- cgit v1.2.3 From dac7a09864938a310eea08f26f5960d369680629 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:39 +0000 Subject: io_uring: add helper flushing locked_free_list Add a new helper io_flush_cached_locked_reqs() that splices locked_free_list to free_list, doing all the required synchronisation and invariant re-initialisation correctly.
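For reference, this is the shape of the helper (copied from the diff below); note the _irq lock, since the locked free list is fed from completion context:

	static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
						struct io_comp_state *cs)
	{
		/* splice the completion-side list into the submission-side cache */
		spin_lock_irq(&ctx->completion_lock);
		list_splice_init(&cs->locked_free_list, &cs->free_list);
		cs->locked_free_nr = 0;
		spin_unlock_irq(&ctx->completion_lock);
	}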
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 08ab7c4830d5..5eb12f45c6bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1628,6 +1628,15 @@ static void io_req_complete_failed(struct io_kiocb *req, long res) io_req_complete_post(req, res, 0); } +static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, + struct io_comp_state *cs) +{ + spin_lock_irq(&ctx->completion_lock); + list_splice_init(&cs->locked_free_list, &cs->free_list); + cs->locked_free_nr = 0; + spin_unlock_irq(&ctx->completion_lock); +} + /* Returns true IFF there are requests in the cache */ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { @@ -1640,12 +1649,8 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) * locked cache, grab the lock and move them over to our submission * side cache. */ - if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) { - spin_lock_irq(&ctx->completion_lock); - list_splice_init(&cs->locked_free_list, &cs->free_list); - cs->locked_free_nr = 0; - spin_unlock_irq(&ctx->completion_lock); - } + if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) + io_flush_cached_locked_reqs(ctx, cs); nr = state->free_reqs; while (!list_empty(&cs->free_list)) { @@ -8446,13 +8451,8 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) submit_state->free_reqs = 0; } - spin_lock_irq(&ctx->completion_lock); - list_splice_init(&cs->locked_free_list, &cs->free_list); - cs->locked_free_nr = 0; - spin_unlock_irq(&ctx->completion_lock); - + io_flush_cached_locked_reqs(ctx, cs); io_req_cache_free(&cs->free_list, NULL); - mutex_unlock(&ctx->uring_lock); } -- cgit v1.2.3 From 2593553a01c803e01e7c5c2131993885879efbec Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:40 +0000 Subject: io_uring: remove __io_req_task_cancel() Both io_req_complete_failed() and __io_req_task_cancel() do the same thing: set the failure flag, put both req refs and emit a CQE. The former is a bit more advanced as it puts the req back into a req cache, so make it take over __io_req_task_cancel() and remove the latter.
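For context, io_req_complete_failed() was added earlier in this series; a rough sketch of it, reconstructed from that patch:

	static void io_req_complete_failed(struct io_kiocb *req, long res)
	{
		req_set_fail_links(req);	/* mark the link chain as failed */
		io_put_req(req);		/* drop the submission reference */
		io_req_complete_post(req, res, 0); /* post CQE, drop completion ref */
	}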
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5eb12f45c6bc..9f9eb853a083 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1024,7 +1024,6 @@ static bool io_rw_reissue(struct io_kiocb *req); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req, int nr); -static void io_double_put_req(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); static void io_put_task(struct task_struct *task, int nr); static void io_queue_next(struct io_kiocb *req); @@ -2019,20 +2018,6 @@ static void io_req_task_work_add_fallback(struct io_kiocb *req, io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work); } -static void __io_req_task_cancel(struct io_kiocb *req, int error) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->completion_lock); - io_cqring_fill_event(req, error); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - req_set_fail_links(req); - io_double_put_req(req); -} - static void io_req_task_cancel(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); @@ -2040,7 +2025,7 @@ static void io_req_task_cancel(struct callback_head *cb) /* ctx is guaranteed to stay alive while we hold uring_lock */ mutex_lock(&ctx->uring_lock); - __io_req_task_cancel(req, req->result); + io_req_complete_failed(req, req->result); mutex_unlock(&ctx->uring_lock); } @@ -2053,7 +2038,7 @@ static void __io_req_task_submit(struct io_kiocb *req) if (!(current->flags & PF_EXITING) && !current->in_execve) __io_queue_sqe(req); else - __io_req_task_cancel(req, -EFAULT); + io_req_complete_failed(req, -EFAULT); mutex_unlock(&ctx->uring_lock); } @@ -2208,13 +2193,6 @@ static inline void io_put_req_deferred(struct io_kiocb *req, int refs) io_free_req_deferred(req); } -static void io_double_put_req(struct io_kiocb *req) -{ - /* drop both submit and complete references */ - if (req_ref_sub_and_test(req, 2)) - io_free_req(req); -} - static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ @@ -5106,7 +5084,7 @@ static void io_async_task_func(struct callback_head *cb) if (!READ_ONCE(apoll->poll.canceled)) __io_req_task_submit(req); else - __io_req_task_cancel(req, -ECANCELED); + io_req_complete_failed(req, -ECANCELED); kfree(apoll->double_poll); kfree(apoll); -- cgit v1.2.3 From 68fb897966febe814f89f9462aa819abae00725f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:41 +0000 Subject: io_uring: inline io_clean_op()'s fast path Inline io_clean_op(), leaving __io_clean_op() but renaming it. This will be used in following patches. 
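In other words, each call site now performs the cheap flags test inline and only makes the out-of-line call in the rare cleanup case, as in this sketch taken from the diff below:

	if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
		io_clean_op(req);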
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9f9eb853a083..00860dc04e82 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1032,7 +1032,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update *ip, unsigned nr_args); -static void __io_clean_op(struct io_kiocb *req); +static void io_clean_op(struct io_kiocb *req); static struct file *io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, bool fixed); static void __io_queue_sqe(struct io_kiocb *req); @@ -1063,12 +1063,6 @@ EXPORT_SYMBOL(io_uring_get_socket); #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -static inline void io_clean_op(struct io_kiocb *req) -{ - if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) - __io_clean_op(req); -} - static inline void io_set_resource_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1544,7 +1538,9 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, set_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } - io_clean_op(req); + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) + io_clean_op(req); + req->result = res; req->compl.cflags = cflags; req_ref_get(req); @@ -1600,7 +1596,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res, static void io_req_complete_state(struct io_kiocb *req, long res, unsigned int cflags) { - io_clean_op(req); + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) + io_clean_op(req); req->result = res; req->compl.cflags = cflags; req->flags |= REQ_F_COMPLETE_INLINE; @@ -1708,8 +1705,8 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, static void io_dismantle_req(struct io_kiocb *req) { - io_clean_op(req); - + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) + io_clean_op(req); if (req->async_data) kfree(req->async_data); if (req->file) @@ -5949,7 +5946,7 @@ static int io_req_defer(struct io_kiocb *req) return -EIOCBQUEUED; } -static void __io_clean_op(struct io_kiocb *req) +static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { switch (req->opcode) { -- cgit v1.2.3 From 094bae49e5ed9c30c1a6e50e121be20469486fab Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:42 +0000 Subject: io_uring: optimise io_dismantle_req() fast path Reshuffle io_dismantle_req() checks to put most of slow path stuff under a single if. 
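Schematically, the function now caches req->flags in a local and funnels all the rare cleanup work through one branch (an abridged sketch of the diff below, with the common-path work elided into comments):

	static void io_dismantle_req(struct io_kiocb *req)
	{
		unsigned int flags = req->flags;	/* read once */

		if (flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
			     REQ_F_INFLIGHT)) {
			/* slow path: op-specific cleanup, inflight-list removal */
		}
		/* common path: file put, rsrc refs, async data, creds */
	}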
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 00860dc04e82..85224753bef7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1705,28 +1705,32 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, static void io_dismantle_req(struct io_kiocb *req) { - if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) - io_clean_op(req); - if (req->async_data) - kfree(req->async_data); + unsigned int flags = req->flags; + if (req->file) - io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); + io_put_file(req, req->file, (flags & REQ_F_FIXED_FILE)); + if (flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | + REQ_F_INFLIGHT)) { + io_clean_op(req); + + if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; + } + } if (req->fixed_rsrc_refs) percpu_ref_put(req->fixed_rsrc_refs); + if (req->async_data) + kfree(req->async_data); if (req->work.creds) { put_cred(req->work.creds); req->work.creds = NULL; } - - if (req->flags & REQ_F_INFLIGHT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - req->flags &= ~REQ_F_INFLIGHT; - } } /* must to be called somewhat shortly after putting a request */ -- cgit v1.2.3 From e1d767f078b88423bb8ed179fbfe3369395e10f8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:43 +0000 Subject: io_uring: abolish old io_put_file() io_put_file() doesn't do a good job of generating good code. Inline it, so we can check REQ_F_FIXED_FILE first, prioritising the FIXED_FILE case over requests without files and saving a memory load in that case.
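Sketch of the before/after (the new helper is in the diff below):

	/* before: every caller passed the request and a "fixed" flag */
	io_put_file(req, req->file, flags & REQ_F_FIXED_FILE);

	/* after: callers test REQ_F_FIXED_FILE themselves, and the
	 * helper is just a NULL-checked fput() */
	static inline void io_put_file(struct file *file)
	{
		if (file)
			fput(file);
	}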
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 85224753bef7..6b805b1c003d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1696,10 +1696,9 @@ got_req: return state->reqs[state->free_reqs]; } -static inline void io_put_file(struct io_kiocb *req, struct file *file, - bool fixed) +static inline void io_put_file(struct file *file) { - if (!fixed) + if (file) fput(file); } @@ -1707,8 +1706,8 @@ static void io_dismantle_req(struct io_kiocb *req) { unsigned int flags = req->flags; - if (req->file) - io_put_file(req, req->file, (flags & REQ_F_FIXED_FILE)); + if (!(flags & REQ_F_FIXED_FILE)) + io_put_file(req->file); if (flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | REQ_F_INFLIGHT)) { io_clean_op(req); @@ -3648,7 +3647,8 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) if (sp->len) ret = do_tee(in, out, sp->len, flags); - io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) + io_put_file(in); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret != sp->len) @@ -3684,7 +3684,8 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) if (sp->len) ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); - io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) + io_put_file(in); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret != sp->len) @@ -5989,8 +5990,8 @@ static void io_clean_op(struct io_kiocb *req) } case IORING_OP_SPLICE: case IORING_OP_TEE: - io_put_file(req, req->splice.file_in, - (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED)) + io_put_file(req->splice.file_in); break; case IORING_OP_OPENAT: case IORING_OP_OPENAT2: -- cgit v1.2.3 From c15b79dee51bd73d56fe526a779e8fbc02b09e6c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 17:22:44 +0000 Subject: io_uring: optimise io_req_task_work_add() Inline io_task_work_add() into io_req_task_work_add(). They both work with a request, so keeping them separate doesn't make things much clearer, while merging them allows optimisation. Apart from small wins like not reading req->ctx or not calculating @notify in the hot path, i.e. with tctx->task_state set, it avoids doing wake_up_process() for every single add, doing it only once task_work_add() has actually been done.
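A sketch of the merged fast path, reconstructed from the diff below plus its unchanged context: once tctx->task_state is already set, we return before computing the notify mode or calling wake_up_process():

	spin_lock_irqsave(&tctx->task_lock, flags);
	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	/* task_work already pending, we're done */
	if (test_bit(0, &tctx->task_state) ||
	    test_and_set_bit(0, &tctx->task_state))
		return 0;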
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 50 ++++++++++++++++++-------------------------------- 1 file changed, 18 insertions(+), 32 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6b805b1c003d..7a6b2b313328 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1911,13 +1911,17 @@ static void tctx_task_work(struct callback_head *cb) cond_resched(); } -static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req, - enum task_work_notify_mode notify) +static int io_req_task_work_add(struct io_kiocb *req) { + struct task_struct *tsk = req->task; struct io_uring_task *tctx = tsk->io_uring; + enum task_work_notify_mode notify; struct io_wq_work_node *node, *prev; unsigned long flags; - int ret; + int ret = 0; + + if (unlikely(tsk->flags & PF_EXITING)) + return -ESRCH; WARN_ON_ONCE(!tctx); @@ -1930,14 +1934,23 @@ static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req, test_and_set_bit(0, &tctx->task_state)) return 0; - if (!task_work_add(tsk, &tctx->task_work, notify)) + /* + * SQPOLL kernel thread doesn't need notification, just a wakeup. For + * all other cases, use TWA_SIGNAL unconditionally to ensure we're + * processing task_work. There's no reliable way to tell if TWA_RESUME + * will do the job. + */ + notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; + + if (!task_work_add(tsk, &tctx->task_work, notify)) { + wake_up_process(tsk); return 0; + } /* * Slow path - we failed, find and delete work. if the work is not * in the list, it got run and we're fine. */ - ret = 0; spin_lock_irqsave(&tctx->task_lock, flags); wq_list_for_each(node, prev, &tctx->task_list) { if (&req->io_task_work.node == node) { @@ -1951,33 +1964,6 @@ static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req, return ret; } -static int io_req_task_work_add(struct io_kiocb *req) -{ - struct task_struct *tsk = req->task; - struct io_ring_ctx *ctx = req->ctx; - enum task_work_notify_mode notify; - int ret; - - if (tsk->flags & PF_EXITING) - return -ESRCH; - - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. For - * all other cases, use TWA_SIGNAL unconditionally to ensure we're - * processing task_work. There's no reliable way to tell if TWA_RESUME - * will do the job. - */ - notify = TWA_NONE; - if (!(ctx->flags & IORING_SETUP_SQPOLL)) - notify = TWA_SIGNAL; - - ret = io_task_work_add(tsk, req, notify); - if (!ret) - wake_up_process(tsk); - - return ret; -} - static bool io_run_task_work_head(struct callback_head **work_head) { struct callback_head *work, *next; -- cgit v1.2.3 From d4729fbde7665e81f4345e04e2ca86c0b52994d3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:58:24 +0000 Subject: io_uring: don't clear REQ_F_LINK_TIMEOUT REQ_F_LINK_TIMEOUT is a hint to look for linked timeouts to cancel, and we already leave it set even after the timeout has fired. Hence there is no need to clear it in io_kill_linked_timeout(); that is safe, and the function is called only once.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7a6b2b313328..a152c0fd24cc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1766,7 +1766,6 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { struct io_kiocb *link = req->link; /* * Can happen if a linked timeout fired and link had been like @@ -1782,11 +1781,10 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) if (ret != -1) { io_cqring_fill_event(link, -ECANCELED); io_put_req_deferred(link, 1); - cancelled = true; + return true; } } - req->flags &= ~REQ_F_LINK_TIMEOUT; - return cancelled; + return false; } static void io_fail_links(struct io_kiocb *req) -- cgit v1.2.3 From 682076801a2f46867743d9520d228e3c7eca751f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:58:25 +0000 Subject: io_uring: don't do extra EXITING cancellations io_match_task() matches all requests with PF_EXITING task, even though those may be valid requests. It was necessary for SQPOLL cancellation, but now it kills all requests before exiting via io_uring_cancel_sqpoll(), so it's not needed. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a152c0fd24cc..b33cc2926ac6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1079,12 +1079,8 @@ static bool io_match_task(struct io_kiocb *head, { struct io_kiocb *req; - if (task && head->task != task) { - /* in terms of cancelation, always match if req task is dead */ - if (head->task->flags & PF_EXITING) - return true; + if (task && head->task != task) return false; - } if (!files) return true; -- cgit v1.2.3 From 05356d86c64271b6f545fc14342526ab33514682 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:58:27 +0000 Subject: io_uring: remove tctx->sqpoll struct io_uring_task::sqpoll is not used anymore, so kill it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index b33cc2926ac6..90b26221ba31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -464,7 +464,6 @@ struct io_uring_task { struct io_wq *io_wq; struct percpu_counter inflight; atomic_t in_idle; - bool sqpoll; spinlock_t task_lock; struct io_wq_work_list task_list; -- cgit v1.2.3 From e1d675df1a36e33e43c614e01d9f714618ac121e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:58:29 +0000 Subject: io_uring: don't init req->work fully in advance req->work is mostly unused unless it's punted, and io_init_req() is too hot for fully initialising it. Fortunately, we can skip initialising work.next as it's controlled by io-wq, and can avoid touching work.flags by moving everything related into io_prep_async_work(). The only field left is req->work.creds; nothing can be done about it, so keep maintaining it.
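Sketch of the resulting split, taken from the diff below: io_init_req() only NULLs the creds pointer, while the rest moves to the punt path:

	/* io_init_req(): the only work field still touched at init time */
	req->work.creds = NULL;

	/* io_prep_async_work(): set up the rest lazily, when punting */
	req->work.list.next = NULL;
	req->work.flags = 0;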
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 90b26221ba31..74e665953ab2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1199,6 +1199,8 @@ static void io_prep_async_work(struct io_kiocb *req) if (!req->work.creds) req->work.creds = get_current_cred(); + req->work.list.next = NULL; + req->work.flags = 0; if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -1209,6 +1211,18 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } + + switch (req->opcode) { + case IORING_OP_SPLICE: + case IORING_OP_TEE: + /* + * Splice operation will be punted aync, and here need to + * modify io_wq_work.flags, so initialize io_wq_work firstly. + */ + if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) + req->work.flags |= IO_WQ_WORK_UNBOUND; + break; + } } static void io_prep_async_link(struct io_kiocb *req) @@ -3593,15 +3607,6 @@ static int __io_splice_prep(struct io_kiocb *req, if (!sp->file_in) return -EBADF; req->flags |= REQ_F_NEED_CLEANUP; - - if (!S_ISREG(file_inode(sp->file_in)->i_mode)) { - /* - * Splice operation will be punted aync, and here need to - * modify io_wq_work.flags, so initialize io_wq_work firstly. - */ - req->work.flags |= IO_WQ_WORK_UNBOUND; - } - return 0; } @@ -6389,9 +6394,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, atomic_set(&req->refs, 2); req->task = current; req->result = 0; - req->work.list.next = NULL; req->work.creds = NULL; - req->work.flags = 0; /* enforce forwards compatibility on users */ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { -- cgit v1.2.3 From 59d7001345a7b9d849e2e768903458883395b00f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:58:30 +0000 Subject: io_uring: kill unused REQ_F_NO_FILE_TABLE current->files are always valid now, even for io-wq threads, so kill the no-longer-used REQ_F_NO_FILE_TABLE. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 74e665953ab2..605f253e8f93 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -693,7 +693,6 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, - REQ_F_NO_FILE_TABLE_BIT, REQ_F_LTIMEOUT_ACTIVE_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, @@ -736,8 +735,6 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* doesn't need file table for this request */ - REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), /* linked timeout is active, i.e.
prepared by link's head */ REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), /* completion is deferred through io_comp_state */ @@ -4180,12 +4177,8 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) struct io_statx *ctx = &req->statx; int ret; - if (issue_flags & IO_URING_F_NONBLOCK) { - /* only need file table for an actual valid fd */ - if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD) - req->flags |= REQ_F_NO_FILE_TABLE; + if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - } ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, ctx->buffer); -- cgit v1.2.3 From 1c98679db94155a145f8389f9aaee30c99dbbd5a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:58:31 +0000 Subject: io_uring: optimise kiocb_end_write for !ISREG file_end_write() is only for regular files, so the function does a couple of dereferences to get the inode and check it. However, we already have REQ_F_ISREG at hand; just use it and inline file_end_write(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 605f253e8f93..a3a0348ab6eb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2431,11 +2431,11 @@ static void kiocb_end_write(struct io_kiocb *req) * thread. */ if (req->flags & REQ_F_ISREG) { - struct inode *inode = file_inode(req->file); + struct super_block *sb = file_inode(req->file)->i_sb; - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + __sb_writers_acquired(sb, SB_FREEZE_WRITE); + sb_end_write(sb); } - file_end_write(req->file); } #ifdef CONFIG_BLOCK -- cgit v1.2.3 From 8c130827f417da791edb919df8cac56af30a1489 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:58:32 +0000 Subject: io_uring: don't alter iopoll reissue fail ret code When reissue_prep fails in io_complete_rw_iopoll(), we change the return code to -EIO to prevent io_iopoll_complete() from doing the resubmission. Mark requests with a new flag (i.e. REQ_F_DONT_REISSUE) instead and retain the original return value. It also removes io_rw_reissue() from io_iopoll_complete(), which will be useful later.
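With the flag in place, the iopoll completion side keys off REQ_F_DONT_REISSUE rather than a doctored return code, as in this excerpt from the diff below:

	if (READ_ONCE(req->result) == -EAGAIN &&
	    !(req->flags & REQ_F_DONT_REISSUE)) {
		req->iopoll_completed = 0;
		req_ref_get(req);
		io_queue_async_work(req);
		continue;
	}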
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a3a0348ab6eb..9362c9228540 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -696,6 +696,7 @@ enum { REQ_F_LTIMEOUT_ACTIVE_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, + REQ_F_DONT_REISSUE_BIT, /* keep async read/write and isreg together and in order */ REQ_F_ASYNC_READ_BIT, REQ_F_ASYNC_WRITE_BIT, @@ -741,6 +742,8 @@ enum { REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), + /* don't attempt request reissue, see io_rw_reissue() */ + REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT), /* supports async reads */ REQ_F_ASYNC_READ = BIT(REQ_F_ASYNC_READ_BIT), /* supports async writes */ @@ -1016,7 +1019,6 @@ static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( struct io_ring_ctx *ctx); static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); -static bool io_rw_reissue(struct io_kiocb *req); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req, int nr); @@ -2253,10 +2255,12 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); - if (READ_ONCE(req->result) == -EAGAIN) { + if (READ_ONCE(req->result) == -EAGAIN && + !(req->flags & REQ_F_DONT_REISSUE)) { req->iopoll_completed = 0; - if (io_rw_reissue(req)) - continue; + req_ref_get(req); + io_queue_async_work(req); + continue; } if (req->flags & REQ_F_BUFFER_SELECTED) @@ -2471,24 +2475,6 @@ static bool io_rw_should_reissue(struct io_kiocb *req) } #endif -static bool io_rw_reissue(struct io_kiocb *req) -{ -#ifdef CONFIG_BLOCK - if (!io_rw_should_reissue(req)) - return false; - - lockdep_assert_held(&req->ctx->uring_lock); - - if (io_resubmit_prep(req)) { - req_ref_get(req); - io_queue_async_work(req); - return true; - } - req_set_fail_links(req); -#endif - return false; -} - static void __io_complete_rw(struct io_kiocb *req, long res, long res2, unsigned int issue_flags) { @@ -2527,15 +2513,17 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter)); else if (!io_resubmit_prep(req)) - res = -EIO; + req->flags |= REQ_F_DONT_REISSUE; } #endif if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (res != -EAGAIN && res != req->result) + if (res != -EAGAIN && res != req->result) { + req->flags |= REQ_F_DONT_REISSUE; req_set_fail_links(req); + } WRITE_ONCE(req->result, res); /* order with io_iopoll_complete() checking ->result */ @@ -2776,7 +2764,10 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (check_reissue && req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; - if (!io_rw_reissue(req)) { + if (!io_resubmit_prep(req)) { + req_ref_get(req); + io_queue_async_work(req); + } else { int cflags = 0; req_set_fail_links(req); -- cgit v1.2.3 From ab454438aa8dc9eb113df7d00f2cf9ec628a26ce Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:58:33 +0000 Subject: io_uring: hide iter revert in resubmit_prep Move iov_iter_revert() resetting iterator in case of -EIOCBQUEUED into io_resubmit_prep(), so we don't do heavy revert in hot 
path, also saves a couple of checks. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9362c9228540..a7239c86326e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2445,8 +2445,13 @@ static void kiocb_end_write(struct io_kiocb *req) #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req) { - /* either already prepared or successfully done */ - return req->async_data || !io_req_prep_async(req); + struct io_async_rw *rw = req->async_data; + + if (!rw) + return !io_req_prep_async(req); + /* may have left rw->iter inconsistent on -EIOCBQUEUED */ + iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter)); + return true; } static bool io_rw_should_reissue(struct io_kiocb *req) @@ -2505,14 +2510,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); #ifdef CONFIG_BLOCK - /* Rewind iter, if we have one. iopoll path resubmits as usual */ if (res == -EAGAIN && io_rw_should_reissue(req)) { - struct io_async_rw *rw = req->async_data; - - if (rw) - iov_iter_revert(&rw->iter, - req->result - iov_iter_count(&rw->iter)); - else if (!io_resubmit_prep(req)) + if (!io_resubmit_prep(req)) req->flags |= REQ_F_DONT_REISSUE; } #endif -- cgit v1.2.3 From 9532b99bd9ca3f8f2f17b38500a8901ac1e7baee Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:58:34 +0000 Subject: io_uring: optimise rw complete error handling Expect read/write to succeed and create a hot path for this case, in particular hide all error handling with resubmission under a single check with the desired result. 
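The resulting hot path (from the diff below) is a single comparison, with reissue and failure marking tucked behind it:

	if (res != req->result) {
		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
		    io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE;
			return;
		}
		req_set_fail_links(req);
	}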
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a7239c86326e..c791e4031fb2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2487,12 +2487,14 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, if (req->rw.kiocb.ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE; - return; - } - if (res != req->result) + if (res != req->result) { + if ((res == -EAGAIN || res == -EOPNOTSUPP) && + io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE; + return; + } req_set_fail_links(req); + } if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); __io_req_complete(req, issue_flags, res, cflags); @@ -2509,19 +2511,20 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); -#ifdef CONFIG_BLOCK - if (res == -EAGAIN && io_rw_should_reissue(req)) { - if (!io_resubmit_prep(req)) - req->flags |= REQ_F_DONT_REISSUE; - } -#endif - if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); + if (unlikely(res != req->result)) { + bool fail = true; - if (res != -EAGAIN && res != req->result) { - req->flags |= REQ_F_DONT_REISSUE; - req_set_fail_links(req); +#ifdef CONFIG_BLOCK + if (res == -EAGAIN && io_rw_should_reissue(req) && + io_resubmit_prep(req)) + fail = false; +#endif + if (fail) { + req_set_fail_links(req); + req->flags |= REQ_F_DONT_REISSUE; + } } WRITE_ONCE(req->result, res); -- cgit v1.2.3 From 464dca612bc6bceceafadfb4bf28f1a27ccc4632 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 Mar 2021 14:06:24 -0600 Subject: io_uring: mask in error/nval/hangup consistently for poll Instead of masking these in as part of regular POLL_ADD prep, do it in io_init_poll_iocb(), and include NVAL as that's generally unmaskable, and RDHUP alongside the HUP that is already set. Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c791e4031fb2..5c000057a2ee 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4977,7 +4977,9 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, poll->head = NULL; poll->done = false; poll->canceled = false; - poll->events = events; +#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) + /* mask in events that we always want/need */ + poll->events = events | IO_POLL_UNMASK; INIT_LIST_HEAD(&poll->wait.entry); init_waitqueue_func_entry(&poll->wait, wake_func); } @@ -5339,8 +5341,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe #ifdef __BIG_ENDIAN events = swahw32(events); #endif - poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP | - (events & EPOLLEXCLUSIVE); + poll->events = demangle_poll(events) | (events & EPOLLEXCLUSIVE); return 0; } -- cgit v1.2.3 From 6c2450ae55656f6b0370bfd4cb52ec8a4ecd0916 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 23 Feb 2021 12:40:22 +0000 Subject: io_uring: allocate memory for overflowed CQEs Instead of using a request itself for overflowed CQE stashing, allocate a separate entry. 
The disadvantage is that the allocation may fail and it will be accounted as lost (see rings->cq_overflow), so we lose reliability in case of memory pressure if the application is driving the CQ ring into overflow. However, it opens the way to multiple CQEs per SQE and even to generating SQE-less CQEs. Signed-off-by: Pavel Begunkov [axboe: use GFP_ATOMIC | __GFP_ACCOUNT] Signed-off-by: Jens Axboe --- fs/io_uring.c | 101 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 46 insertions(+), 55 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5c000057a2ee..05e8e274a918 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -202,6 +202,11 @@ struct io_mapped_ubuf { struct io_ring_ctx; +struct io_overflow_cqe { + struct io_uring_cqe cqe; + struct list_head list; +}; + struct io_rsrc_put { struct list_head list; union { @@ -1401,41 +1406,33 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) } /* Returns true if there are no backlogged entries after the flush */ -static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, - struct task_struct *tsk, - struct files_struct *files) +static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { struct io_rings *rings = ctx->rings; - struct io_kiocb *req, *tmp; - struct io_uring_cqe *cqe; unsigned long flags; bool all_flushed, posted; - LIST_HEAD(list); if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries) return false; posted = false; spin_lock_irqsave(&ctx->completion_lock, flags); - list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { - if (!io_match_task(req, tsk, files)) - continue; + while (!list_empty(&ctx->cq_overflow_list)) { + struct io_uring_cqe *cqe = io_get_cqring(ctx); + struct io_overflow_cqe *ocqe; - cqe = io_get_cqring(ctx); if (!cqe && !force) break; - - list_move(&req->compl.list, &list); - if (cqe) { - WRITE_ONCE(cqe->user_data, req->user_data); - WRITE_ONCE(cqe->res, req->result); - WRITE_ONCE(cqe->flags, req->compl.cflags); - } else { - ctx->cached_cq_overflow++; + ocqe = list_first_entry(&ctx->cq_overflow_list, + struct io_overflow_cqe, list); + if (cqe) + memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); + else WRITE_ONCE(ctx->rings->cq_overflow, - ctx->cached_cq_overflow); - } + ++ctx->cached_cq_overflow); posted = true; + list_del(&ocqe->list); + kfree(ocqe); } all_flushed = list_empty(&ctx->cq_overflow_list); @@ -1450,19 +1447,10 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, spin_unlock_irqrestore(&ctx->completion_lock, flags); if (posted) io_cqring_ev_posted(ctx); - - while (!list_empty(&list)) { - req = list_first_entry(&list, struct io_kiocb, compl.list); - list_del(&req->compl.list); - io_put_req(req); - } - return all_flushed; } -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, - struct task_struct *tsk, - struct files_struct *files) +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { bool ret = true; @@ -1470,7 +1458,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); - ret = __io_cqring_overflow_flush(ctx, force, tsk, files); + ret = __io_cqring_overflow_flush(ctx, force); if (ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&ctx->uring_lock); } @@ -1531,29 +1519,33 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res,
WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - } else if (ctx->cq_overflow_flushed || - atomic_read(&req->task->io_uring->in_idle)) { - /* - * If we're in ring overflow flush mode, or in task cancel mode, - * then we cannot store the request for later flushing, we need - * to drop it on the floor. - */ - ctx->cached_cq_overflow++; - WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow); - } else { + return; + } + if (!ctx->cq_overflow_flushed && + !atomic_read(&req->task->io_uring->in_idle)) { + struct io_overflow_cqe *ocqe; + + ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); + if (!ocqe) + goto overflow; if (list_empty(&ctx->cq_overflow_list)) { set_bit(0, &ctx->sq_check_overflow); set_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } - if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) - io_clean_op(req); - - req->result = res; - req->compl.cflags = cflags; - req_ref_get(req); - list_add_tail(&req->compl.list, &ctx->cq_overflow_list); + ocqe->cqe.user_data = req->user_data; + ocqe->cqe.res = res; + ocqe->cqe.flags = cflags; + list_add_tail(&ocqe->list, &ctx->cq_overflow_list); + return; } +overflow: + /* + * If we're in ring overflow flush mode, or in task cancel mode, + * or cannot allocate an overflow entry, then we need to drop it + * on the floor. + */ + WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow); } static void io_cqring_fill_event(struct io_kiocb *req, long res) @@ -2398,7 +2390,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * already triggered a CQE (eg in error). */ if (test_bit(0, &ctx->cq_check_overflow)) - __io_cqring_overflow_flush(ctx, false, NULL, NULL); + __io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx)) break; @@ -6581,7 +6573,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) /* if we have a backlog and couldn't flush it all, return BUSY */ if (test_bit(0, &ctx->sq_check_overflow)) { - if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL)) + if (!__io_cqring_overflow_flush(ctx, false)) return -EBUSY; } @@ -6881,7 +6873,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, int ret; do { - io_cqring_overflow_flush(ctx, false, NULL, NULL); + io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx) >= min_events) return 0; if (!io_run_task_work()) @@ -6913,7 +6905,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, trace_io_uring_cqring_wait(ctx, min_events); do { /* if we can't even flush overflow, don't wait for more */ - if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) { + if (!io_cqring_overflow_flush(ctx, false)) { ret = -EBUSY; break; } @@ -8616,7 +8608,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) /* if force is set, the ring is going away. 
always drop after that */ ctx->cq_overflow_flushed = 1; if (ctx->rings) - __io_cqring_overflow_flush(ctx, true, NULL, NULL); + __io_cqring_overflow_flush(ctx, true); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); @@ -8766,7 +8758,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_kill_timeouts(ctx, task, files); ret |= io_run_task_work(); ret |= io_run_ctx_fallback(ctx); - io_cqring_overflow_flush(ctx, true, task, files); if (!ret) break; cond_resched(); @@ -9185,7 +9176,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx, false, NULL, NULL); + io_cqring_overflow_flush(ctx, false); ret = -EOWNERDEAD; if (unlikely(ctx->sq_data->thread == NULL)) { -- cgit v1.2.3 From 7471e1afabf8a9adcb4659170f4e198c05f5b5a6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Feb 2021 22:05:00 -0700 Subject: io_uring: include cflags in completion trace event We should be including the completion flags for better introspection on exactly what completion event was logged. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 05e8e274a918..e5c2bb258db0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1507,7 +1507,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; - trace_io_uring_complete(ctx, req->user_data, res); + trace_io_uring_complete(ctx, req->user_data, res, cflags); /* * If we can't get a cq entry, userspace overflowed the -- cgit v1.2.3 From 88e41cf928a6e1a0eb5a9492e2d091ec6193cce4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Feb 2021 22:08:01 -0700 Subject: io_uring: add multishot mode for IORING_OP_POLL_ADD The default io_uring poll mode is one-shot, where once the event triggers, the poll command is completed and won't trigger any further events. If we're doing repeated polling on the same file or socket, then it can be more efficient to do multishot, where we keep triggering whenever the event becomes true. This deviates from the usual norm of having one CQE per SQE submitted. Add a CQE flag, IORING_CQE_F_MORE, which tells the application to expect further completion events from the submitted SQE. Right now the only user of this is POLL_ADD in multishot mode. Since sqe->poll_events is using the space that we normally use for adding flags to commands, use sqe->len for the flag space for POLL_ADD. Multishot mode is selected by setting IORING_POLL_ADD_MULTI in sqe->len. An application should expect more CQEs for the specified SQE if the CQE is flagged with IORING_CQE_F_MORE. In multishot mode, only cancelation or an error will terminate the poll request, in which case the flag will be cleared.
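A minimal userspace sketch of the intended usage, assuming liburing for the boilerplate (liburing had no multishot prep helper at the time, hence setting sqe->len directly; error handling omitted):

	#include <liburing.h>
	#include <poll.h>

	static void poll_multishot(struct io_uring *ring, int fd)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
		struct io_uring_cqe *cqe;
		unsigned flags;

		io_uring_prep_poll_add(sqe, fd, POLLIN);
		sqe->len = IORING_POLL_ADD_MULTI;	/* request multishot mode */
		io_uring_submit(ring);

		do {
			io_uring_wait_cqe(ring, &cqe);
			flags = cqe->flags;
			/* cqe->res holds the signalled poll mask (or -error) */
			io_uring_cqe_seen(ring, cqe);
		} while (flags & IORING_CQE_F_MORE);	/* more CQEs coming */
	}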
Signed-off-by: Jens Axboe --- fs/io_uring.c | 64 ++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 18 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e5c2bb258db0..80db5898e119 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4898,17 +4898,25 @@ static void io_poll_remove_double(struct io_kiocb *req) } } -static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) +static bool io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; + unsigned flags = IORING_CQE_F_MORE; - if (!error && req->poll.canceled) + if (!error && req->poll.canceled) { error = -ECANCELED; - - io_poll_remove_double(req); - req->poll.done = true; - io_cqring_fill_event(req, error ? error : mangle_poll(mask)); + req->poll.events |= EPOLLONESHOT; + } + if (error || (req->poll.events & EPOLLONESHOT)) { + io_poll_remove_double(req); + req->poll.done = true; + flags = 0; + } + if (!error) + error = mangle_poll(mask); + __io_cqring_fill_event(req, error, flags); io_commit_cqring(ctx); + return !(flags & IORING_CQE_F_MORE); } static void io_poll_task_func(struct callback_head *cb) @@ -4920,14 +4928,25 @@ static void io_poll_task_func(struct callback_head *cb) if (io_poll_rewait(req, &req->poll)) { spin_unlock_irq(&ctx->completion_lock); } else { - hash_del(&req->hash_node); - io_poll_complete(req, req->result, 0); + bool done, post_ev; + + post_ev = done = io_poll_complete(req, req->result, 0); + if (done) { + hash_del(&req->hash_node); + } else if (!(req->poll.events & EPOLLONESHOT)) { + post_ev = true; + req->result = 0; + add_wait_queue(req->poll.head, &req->poll.wait); + } spin_unlock_irq(&ctx->completion_lock); - nxt = io_put_req_find_next(req); - io_cqring_ev_posted(ctx); - if (nxt) - __io_req_task_submit(nxt); + if (post_ev) + io_cqring_ev_posted(ctx); + if (done) { + nxt = io_put_req_find_next(req); + if (nxt) + __io_req_task_submit(nxt); + } } } @@ -4941,6 +4960,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) return 0; + if (!(poll->events & EPOLLONESHOT)) + return poll->wait.func(&poll->wait, mode, sync, key); list_del_init(&wait->entry); @@ -5106,7 +5127,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, ipt->error = 0; mask = 0; } - if (mask || ipt->error) + if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error) list_del_init(&poll->wait.entry); else if (cancel) WRITE_ONCE(poll->canceled, true); @@ -5149,7 +5170,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) req->flags |= REQ_F_POLLED; req->apoll = apoll; - mask = 0; + mask = EPOLLONESHOT; if (def->pollin) mask |= POLLIN | POLLRDNORM; if (def->pollout) @@ -5322,18 +5343,24 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - u32 events; + u32 events, flags; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + if (sqe->addr || sqe->ioprio || sqe->off || sqe->buf_index) + return -EINVAL; + flags = READ_ONCE(sqe->len); + if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; events = READ_ONCE(sqe->poll32_events); #ifdef __BIG_ENDIAN events = swahw32(events); #endif - poll->events = demangle_poll(events) | 
(events & EPOLLEXCLUSIVE); + if (!flags) + events |= EPOLLONESHOT; + poll->events = demangle_poll(events) | + (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); return 0; } @@ -5357,7 +5384,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) if (mask) { io_cqring_ev_posted(ctx); - io_put_req(req); + if (poll->events & EPOLLONESHOT) + io_put_req(req); } return ipt.error; } -- cgit v1.2.3 From b2c3f7e1715605c045f46fb369d850ada4749388 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Feb 2021 08:58:04 -0700 Subject: io_uring: abstract out helper for removing poll waitqs/hashes No functional changes in this patch, just preparation for kill multishot poll on CQ overflow. Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 80db5898e119..12f686283ade 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5216,7 +5216,7 @@ static bool __io_poll_remove_one(struct io_kiocb *req, return do_complete; } -static bool io_poll_remove_one(struct io_kiocb *req) +static bool io_poll_remove_waitqs(struct io_kiocb *req) { bool do_complete; @@ -5236,6 +5236,14 @@ static bool io_poll_remove_one(struct io_kiocb *req) } } + return do_complete; +} + +static bool io_poll_remove_one(struct io_kiocb *req) +{ + bool do_complete; + + do_complete = io_poll_remove_waitqs(req); if (do_complete) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(req->ctx); -- cgit v1.2.3 From 5082620fb2cab74b623c3bf5da5a222add564871 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Feb 2021 09:02:26 -0700 Subject: io_uring: terminate multishot poll for CQ ring overflow If we hit overflow and fail to allocate an overflow entry for the completion, terminate the multishot poll mode. Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 12f686283ade..e23549330904 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1044,6 +1044,7 @@ static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); static void io_submit_flush_completions(struct io_comp_state *cs, struct io_ring_ctx *ctx); +static bool io_poll_remove_waitqs(struct io_kiocb *req); static int io_req_prep_async(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -1501,7 +1502,7 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } -static void __io_cqring_fill_event(struct io_kiocb *req, long res, +static bool __io_cqring_fill_event(struct io_kiocb *req, long res, unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; @@ -1519,7 +1520,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - return; + return true; } if (!ctx->cq_overflow_flushed && !atomic_read(&req->task->io_uring->in_idle)) { @@ -1537,7 +1538,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, ocqe->cqe.res = res; ocqe->cqe.flags = cflags; list_add_tail(&ocqe->list, &ctx->cq_overflow_list); - return; + return true; } overflow: /* @@ -1546,6 +1547,7 @@ overflow: * on the floor. 
*/ WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow); + return false; } static void io_cqring_fill_event(struct io_kiocb *req, long res) @@ -4907,14 +4909,14 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) error = -ECANCELED; req->poll.events |= EPOLLONESHOT; } - if (error || (req->poll.events & EPOLLONESHOT)) { - io_poll_remove_double(req); + if (!error) + error = mangle_poll(mask); + if (!__io_cqring_fill_event(req, error, flags) || + (req->poll.events & EPOLLONESHOT)) { + io_poll_remove_waitqs(req); req->poll.done = true; flags = 0; } - if (!error) - error = mangle_poll(mask); - __io_cqring_fill_event(req, error, flags); io_commit_cqring(ctx); return !(flags & IORING_CQE_F_MORE); } @@ -5205,6 +5207,8 @@ static bool __io_poll_remove_one(struct io_kiocb *req, { bool do_complete = false; + if (!poll->head) + return false; spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { -- cgit v1.2.3 From b2cb805f6dd40938c0398c94787741a08ed5e921 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Mar 2021 08:17:19 -0600 Subject: io_uring: abstract out a io_poll_find_helper() We'll need this helper for another purpose, for now just abstract it out and have io_poll_cancel() use it for lookups. Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e23549330904..3c54e8c9f81f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5286,7 +5286,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, return posted != 0; } -static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) +static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr) { struct hlist_head *list; struct io_kiocb *req; @@ -5295,12 +5295,23 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) hlist_for_each_entry(req, list, hash_node) { if (sqe_addr != req->user_data) continue; - if (io_poll_remove_one(req)) - return 0; - return -EALREADY; + return req; } - return -ENOENT; + return NULL; +} + +static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) +{ + struct io_kiocb *req; + + req = io_poll_find(ctx, sqe_addr); + if (!req) + return -ENOENT; + if (io_poll_remove_one(req)) + return 0; + + return -EALREADY; } static int io_poll_remove_prep(struct io_kiocb *req, -- cgit v1.2.3 From b69de288e913030082bed3a324ddc58be6c1e983 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Mar 2021 08:37:41 -0600 Subject: io_uring: allow events and user_data update of running poll requests This adds two new POLL_ADD flags, IORING_POLL_UPDATE_EVENTS and IORING_POLL_UPDATE_USER_DATA. As with the other POLL_ADD flag, these are masked into sqe->len. If set, the POLL_ADD will have the following behavior: - sqe->addr must contain the user_data of the poll request that needs to be modified. This field is otherwise invalid for a POLL_ADD command. - If IORING_POLL_UPDATE_EVENTS is set, sqe->poll_events must contain the new mask for the existing poll request. There are no checks for whether these are identical or not; if a matching poll request is found, it is re-armed with the new mask. - If IORING_POLL_UPDATE_USER_DATA is set, sqe->off must contain the new user_data for the existing poll request.
A POLL_ADD with any of these flags set may complete with any of the following results: 1) 0, which means that we successfully found the existing poll request specified, and performed the re-arm procedure. Any error from that re-arm will be exposed as a completion event for that original poll request, not for the update request. 2) -ENOENT, if no existing poll request was found with the given user_data. 3) -EALREADY, if the existing poll request was already in the process of being removed/canceled/completing. 4) -EACCES, if an attempt was made to modify an internal poll request (e.g. not one originally issued as IORING_OP_POLL_ADD). The usual -EINVAL cases apply as well, if any invalid fields are set in the sqe for this command type. Signed-off-by: Jens Axboe --- fs/io_uring.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3c54e8c9f81f..eeb165253491 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -486,7 +486,15 @@ struct io_poll_iocb { __poll_t events; bool done; bool canceled; - struct wait_queue_entry wait; + bool update_events; + bool update_user_data; + union { + struct wait_queue_entry wait; + struct { + u64 old_user_data; + u64 new_user_data; + }; + }; }; struct io_poll_remove { @@ -4911,8 +4919,9 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) } if (!error) error = mangle_poll(mask); - if (!__io_cqring_fill_event(req, error, flags) || - (req->poll.events & EPOLLONESHOT)) { + if (req->poll.events & EPOLLONESHOT) + flags = 0; + if (!__io_cqring_fill_event(req, error, flags)) { io_poll_remove_waitqs(req); req->poll.done = true; flags = 0; @@ -4992,6 +5001,7 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, poll->head = NULL; poll->done = false; poll->canceled = false; + poll->update_events = poll->update_user_data = false; #define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) /* mask in events that we always want/need */ poll->events = events | IO_POLL_UNMASK; @@ -5370,24 +5380,36 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->addr || sqe->ioprio || sqe->off || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index) return -EINVAL; flags = READ_ONCE(sqe->len); - if (flags & ~IORING_POLL_ADD_MULTI) + if (flags & ~(IORING_POLL_ADD_MULTI | IORING_POLL_UPDATE_EVENTS | + IORING_POLL_UPDATE_USER_DATA)) return -EINVAL; events = READ_ONCE(sqe->poll32_events); #ifdef __BIG_ENDIAN events = swahw32(events); #endif - if (!flags) + if (!(flags & IORING_POLL_ADD_MULTI)) events |= EPOLLONESHOT; + poll->update_events = poll->update_user_data = false; + if (flags & IORING_POLL_UPDATE_EVENTS) { + poll->update_events = true; + poll->old_user_data = READ_ONCE(sqe->addr); + } + if (flags & IORING_POLL_UPDATE_USER_DATA) { + poll->update_user_data = true; + poll->new_user_data = READ_ONCE(sqe->off); + } + if (!(poll->update_events || poll->update_user_data) && + (sqe->off || sqe->addr)) + return -EINVAL; poll->events = demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); return 0; } -static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) +static int __io_poll_add(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; @@ -5413,6 +5435,63 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
return ipt.error; } +static int io_poll_update(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *preq; + int ret; + + spin_lock_irq(&ctx->completion_lock); + preq = io_poll_find(ctx, req->poll.old_user_data); + if (!preq) { + ret = -ENOENT; + goto err; + } else if (preq->opcode != IORING_OP_POLL_ADD) { + /* don't allow internal poll updates */ + ret = -EACCES; + goto err; + } + if (!__io_poll_remove_one(preq, &preq->poll)) { + /* in process of completing/removal */ + ret = -EALREADY; + goto err; + } + /* we now have a detached poll request. reissue. */ + ret = 0; +err: + spin_unlock_irq(&ctx->completion_lock); + if (ret < 0) { + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; + } + /* only mask one event flags, keep behavior flags */ + if (req->poll.update_events) { + preq->poll.events &= ~0xffff; + preq->poll.events |= req->poll.events & 0xffff; + preq->poll.events |= IO_POLL_UNMASK; + } + if (req->poll.update_user_data) + preq->user_data = req->poll.new_user_data; + + /* complete update request, we're done with it */ + io_req_complete(req, ret); + + ret = __io_poll_add(preq); + if (ret < 0) { + req_set_fail_links(preq); + io_req_complete(preq, ret); + } + return 0; +} + +static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) +{ + if (!req->poll.update_events && !req->poll.update_user_data) + return __io_poll_add(req); + return io_poll_update(req); +} + static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, -- cgit v1.2.3 From 685fe7feedb96771683437107ba72131410e2350 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Mar 2021 09:37:51 -0700 Subject: io-wq: eliminate the need for a manager thread io-wq relies on a manager thread to create/fork new workers, as needed. But there's really no strong need for it anymore. We have the following cases that fork a new worker: 1) Work queue. This is done from the task itself always, and it's trivial to create a worker off that path, if needed. 2) All workers have gone to sleep, and we have more work. This is called off the sched out path. For this case, use a task_work item to queue a fork-worker operation. 3) Hashed work completion. Don't think we need to do anything off this case. If need be, it could just use approach 2 as well. Part of this change is incrementing the running worker count before the fork, to avoid cases where we observe we need a worker and then queue creation of one. Then new work comes in, we fork a new one. That last queue operation should have waited for the previous worker to come up; it's quite possible we don't even need it. Hence move the worker running from before we fork it off to more efficiently handle that case. Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index eeb165253491..9a8504294ab7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7911,7 +7911,8 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work) return req ? 
&req->work : NULL; } -static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx) +static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, + struct task_struct *task) { struct io_wq_hash *hash; struct io_wq_data data; @@ -7928,6 +7929,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx) } data.hash = hash; + data.task = task; data.free_work = io_free_work; data.do_work = io_wq_submit_work; @@ -7953,7 +7955,7 @@ static int io_uring_alloc_task_context(struct task_struct *task, return ret; } - tctx->io_wq = io_init_wq_offload(ctx); + tctx->io_wq = io_init_wq_offload(ctx, task); if (IS_ERR(tctx->io_wq)) { ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); -- cgit v1.2.3 From 548d819d1eed7b6bf86d36c8de2fbc54b69db571 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Mar 2021 10:21:35 -0600 Subject: io_uring: allow SQPOLL without CAP_SYS_ADMIN or CAP_SYS_NICE Now that we have any worker being attached to the original task as threads, accounting of CPU time is directly attributed to the original task as well. This means that we no longer have to restrict SQPOLL to needing elevated privileges, as it's really no different from just having the task spawn a busy looping thread in userspace. Reported-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9a8504294ab7..852f9e908904 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8011,10 +8011,6 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_sq_data *sqd; bool attached; - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) - goto err; - sqd = io_get_sq_data(p, &attached); if (IS_ERR(sqd)) { ret = PTR_ERR(sqd); -- cgit v1.2.3 From 50e96989d736b8e5623059815247be01ca6713c1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Mar 2021 22:59:01 +0000 Subject: io_uring: reg buffer overflow checks hardening We are safe with overflows in io_sqe_buffer_register() because it will just yield alloc failure, but it's nicer to check explicitly. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2b0625551be3d97b80a5fd21c8cd79dc1c91f0b5.1616624589.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 852f9e908904..2be6f3f9578f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8404,6 +8404,8 @@ static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) static int io_buffer_validate(struct iovec *iov) { + unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is @@ -8416,6 +8418,9 @@ static int io_buffer_validate(struct iovec *iov) if (iov->iov_len > SZ_1G) return -EFAULT; + if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) + return -EOVERFLOW; + return 0; } -- cgit v1.2.3 From b2e720ace221f9be75fefdba7d0ebab9d05fc561 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 31 Mar 2021 09:03:03 -0600 Subject: io_uring: fix race around poll update and poll triggering Joakim reports that in some conditions he sees a multishot poll request being canceled, and that it coincides with getting -EALREADY on modification. 
As part of the poll update procedure, there's a small window where the request is marked as canceled, and if this coincides with the event actually triggering, then we can get a spurious -ECANCELED and termination of the multishot request. Don't mark the poll request as being canceled for update. We also don't care if we race on removal unless it's a one-shot request; we can safely update in either case. Fixes: b69de288e913 ("io_uring: allow events and user_data update of running poll requests") Reported-by: Joakim Hassila Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2be6f3f9578f..c417708bc441 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5213,14 +5213,15 @@ static bool io_arm_poll_handler(struct io_kiocb *req) } static bool __io_poll_remove_one(struct io_kiocb *req, - struct io_poll_iocb *poll) + struct io_poll_iocb *poll, bool do_cancel) { bool do_complete = false; if (!poll->head) return false; spin_lock(&poll->head->lock); - WRITE_ONCE(poll->canceled, true); + if (do_cancel) + WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); do_complete = true; @@ -5237,12 +5238,12 @@ static bool io_poll_remove_waitqs(struct io_kiocb *req) io_poll_remove_double(req); if (req->opcode == IORING_OP_POLL_ADD) { - do_complete = __io_poll_remove_one(req, &req->poll); + do_complete = __io_poll_remove_one(req, &req->poll, true); } else { struct async_poll *apoll = req->apoll; /* non-poll requests have submit ref still */ - do_complete = __io_poll_remove_one(req, &apoll->poll); + do_complete = __io_poll_remove_one(req, &apoll->poll, true); if (do_complete) { io_put_req(req); kfree(apoll->double_poll); kfree(apoll); } @@ -5451,10 +5452,11 @@ static int io_poll_update(struct io_kiocb *req) ret = -EACCES; goto err; } - if (!__io_poll_remove_one(preq, &preq->poll)) { - /* in process of completing/removal */ - ret = -EALREADY; - goto err; + if (!__io_poll_remove_one(preq, &preq->poll, false)) { + if (preq->poll.events & EPOLLONESHOT) { + ret = -EALREADY; + goto err; + } } /* we now have a detached poll request. reissue. */ ret = 0; -- cgit v1.2.3 From b895c9a632e70ad977c1c0e31e640be5c98b56c6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:40 +0100 Subject: io_uring: name rsrc bits consistently Keep resource-related structs' and functions' naming consistent, in particular use "io_rsrc" prefix for everything. 
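For quick reference, the renames performed by this patch (all visible in the diff below) are: fixed_rsrc_ref_node -> io_rsrc_node, fixed_rsrc_data -> io_rsrc_data, io_set_resource_node() -> io_req_set_rsrc_node(), alloc_fixed_rsrc_ref_node() -> io_rsrc_node_alloc(), destroy_fixed_rsrc_ref_node() -> io_rsrc_node_destroy(), io_sqe_rsrc_set_node() -> io_rsrc_node_set(), io_sqe_rsrc_kill_node() -> io_rsrc_node_kill(), io_rsrc_refnode_prealloc() -> io_rsrc_node_prealloc(), io_rsrc_refnode_get() -> io_rsrc_node_get(), alloc_fixed_rsrc_data() -> io_rsrc_data_alloc(), free_fixed_rsrc_data() -> io_rsrc_data_free().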
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/962f5acdf810f3a62831e65da3932cde24f6d9df.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 150 +++++++++++++++++++++++++++------------------------------- 1 file changed, 71 insertions(+), 79 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c417708bc441..938d8236cc3c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -219,22 +219,22 @@ struct fixed_rsrc_table { struct file **files; }; -struct fixed_rsrc_ref_node { +struct io_rsrc_node { struct percpu_ref refs; struct list_head node; struct list_head rsrc_list; - struct fixed_rsrc_data *rsrc_data; + struct io_rsrc_data *rsrc_data; void (*rsrc_put)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); struct llist_node llist; bool done; }; -struct fixed_rsrc_data { +struct io_rsrc_data { struct fixed_rsrc_table *table; struct io_ring_ctx *ctx; - struct fixed_rsrc_ref_node *node; + struct io_rsrc_node *node; struct percpu_ref refs; struct completion done; bool quiesce; @@ -393,7 +393,7 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct fixed_rsrc_data *file_data; + struct io_rsrc_data *file_data; unsigned nr_user_files; /* if used, fixed mapped user buffers */ @@ -447,7 +447,7 @@ struct io_ring_ctx { struct llist_head rsrc_put_llist; struct list_head rsrc_ref_list; spinlock_t rsrc_ref_lock; - struct fixed_rsrc_ref_node *rsrc_backup_node; + struct io_rsrc_node *rsrc_backup_node; struct io_restriction restrictions; @@ -1027,9 +1027,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files); static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx); -static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node); -static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( - struct io_ring_ctx *ctx); +static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static void io_cqring_fill_event(struct io_kiocb *req, long res); @@ -1075,7 +1074,7 @@ EXPORT_SYMBOL(io_uring_get_socket); #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -static inline void io_set_resource_node(struct io_kiocb *req) +static inline void io_req_set_rsrc_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -6287,7 +6286,7 @@ static void io_wq_submit_work(struct io_wq_work *work) #endif #define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) -static inline struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data, +static inline struct file **io_fixed_file_slot(struct io_rsrc_data *file_data, unsigned i) { struct fixed_rsrc_table *table; @@ -6321,7 +6320,7 @@ static struct file *io_file_get(struct io_submit_state *state, file_ptr &= ~FFS_MASK; /* mask in overlapping REQ_F and FFS bits */ req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); - io_set_resource_node(req); + io_req_set_rsrc_node(req); } else { trace_io_uring_file_get(ctx, fd); file = __io_file_get(state, fd); @@ -7078,9 +7077,8 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) static void io_rsrc_data_ref_zero(struct percpu_ref *ref) { - struct fixed_rsrc_data *data; + struct io_rsrc_data *data = container_of(ref, struct io_rsrc_data, refs); - data = 
container_of(ref, struct fixed_rsrc_data, refs); complete(&data->done); } @@ -7094,20 +7092,20 @@ static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx) spin_unlock_bh(&ctx->rsrc_ref_lock); } -static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx, - struct fixed_rsrc_data *rsrc_data, - struct fixed_rsrc_ref_node *ref_node) +static void io_rsrc_node_set(struct io_ring_ctx *ctx, + struct io_rsrc_data *rsrc_data, + struct io_rsrc_node *rsrc_node) { io_rsrc_ref_lock(ctx); - rsrc_data->node = ref_node; - list_add_tail(&ref_node->node, &ctx->rsrc_ref_list); + rsrc_data->node = rsrc_node; + list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); io_rsrc_ref_unlock(ctx); percpu_ref_get(&rsrc_data->refs); } -static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data) +static void io_rsrc_node_kill(struct io_ring_ctx *ctx, struct io_rsrc_data *data) { - struct fixed_rsrc_ref_node *ref_node = NULL; + struct io_rsrc_node *ref_node = NULL; io_rsrc_ref_lock(ctx); ref_node = data->node; @@ -7117,21 +7115,21 @@ static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_dat percpu_ref_kill(&ref_node->refs); } -static int io_rsrc_refnode_prealloc(struct io_ring_ctx *ctx) +static int io_rsrc_node_prealloc(struct io_ring_ctx *ctx) { if (ctx->rsrc_backup_node) return 0; - ctx->rsrc_backup_node = alloc_fixed_rsrc_ref_node(ctx); + ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); return ctx->rsrc_backup_node ? 0 : -ENOMEM; } -static struct fixed_rsrc_ref_node * -io_rsrc_refnode_get(struct io_ring_ctx *ctx, - struct fixed_rsrc_data *rsrc_data, - void (*rsrc_put)(struct io_ring_ctx *ctx, - struct io_rsrc_put *prsrc)) +static struct io_rsrc_node * +io_rsrc_node_get(struct io_ring_ctx *ctx, + struct io_rsrc_data *rsrc_data, + void (*rsrc_put)(struct io_ring_ctx *ctx, + struct io_rsrc_put *prsrc)) { - struct fixed_rsrc_ref_node *node = ctx->rsrc_backup_node; + struct io_rsrc_node *node = ctx->rsrc_backup_node; WARN_ON_ONCE(!node); @@ -7141,12 +7139,12 @@ io_rsrc_refnode_get(struct io_ring_ctx *ctx, return node; } -static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, +static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx, void (*rsrc_put)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)) { - struct fixed_rsrc_ref_node *node; + struct io_rsrc_node *node; int ret; if (data->quiesce) @@ -7154,10 +7152,10 @@ static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, data->quiesce = true; do { - ret = io_rsrc_refnode_prealloc(ctx); + ret = io_rsrc_node_prealloc(ctx); if (ret) break; - io_sqe_rsrc_kill_node(ctx, data); + io_rsrc_node_kill(ctx, data); percpu_ref_kill(&data->refs); flush_delayed_work(&ctx->rsrc_put_work); @@ -7166,8 +7164,8 @@ static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, break; percpu_ref_resurrect(&data->refs); - node = io_rsrc_refnode_get(ctx, data, rsrc_put); - io_sqe_rsrc_set_node(ctx, data, node); + node = io_rsrc_node_get(ctx, data, rsrc_put); + io_rsrc_node_set(ctx, data, node); reinit_completion(&data->done); mutex_unlock(&ctx->uring_lock); @@ -7179,9 +7177,9 @@ static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, return ret; } -static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx) +static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx) { - struct fixed_rsrc_data *data; + struct io_rsrc_data *data; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) @@ -7197,7 +7195,7 @@ static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx 
*ctx) return data; } -static void free_fixed_rsrc_data(struct fixed_rsrc_data *data) +static void io_rsrc_data_free(struct io_rsrc_data *data) { percpu_ref_exit(&data->refs); kfree(data->table); @@ -7206,7 +7204,7 @@ static void free_fixed_rsrc_data(struct fixed_rsrc_data *data) static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { - struct fixed_rsrc_data *data = ctx->file_data; + struct io_rsrc_data *data = ctx->file_data; unsigned nr_tables, i; int ret; @@ -7225,7 +7223,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) kfree(data->table[i].files); - free_fixed_rsrc_data(data); + io_rsrc_data_free(data); ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; @@ -7454,7 +7452,7 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif -static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data, +static int io_sqe_alloc_file_tables(struct io_rsrc_data *file_data, unsigned nr_tables, unsigned nr_files) { int i; @@ -7544,9 +7542,9 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) #endif } -static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node) +static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { - struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data; + struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; @@ -7570,10 +7568,10 @@ static void io_rsrc_put_work(struct work_struct *work) node = llist_del_all(&ctx->rsrc_put_llist); while (node) { - struct fixed_rsrc_ref_node *ref_node; + struct io_rsrc_node *ref_node; struct llist_node *next = node->next; - ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist); + ref_node = llist_entry(node, struct io_rsrc_node, llist); __io_rsrc_put_work(ref_node); node = next; } @@ -7581,27 +7579,23 @@ static void io_rsrc_put_work(struct work_struct *work) static void io_rsrc_node_ref_zero(struct percpu_ref *ref) { - struct fixed_rsrc_ref_node *ref_node; - struct fixed_rsrc_data *data; - struct io_ring_ctx *ctx; + struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); + struct io_rsrc_data *data = node->rsrc_data; + struct io_ring_ctx *ctx = data->ctx; bool first_add = false; int delay = HZ; - ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs); - data = ref_node->rsrc_data; - ctx = data->ctx; - io_rsrc_ref_lock(ctx); - ref_node->done = true; + node->done = true; while (!list_empty(&ctx->rsrc_ref_list)) { - ref_node = list_first_entry(&ctx->rsrc_ref_list, - struct fixed_rsrc_ref_node, node); + node = list_first_entry(&ctx->rsrc_ref_list, + struct io_rsrc_node, node); /* recycle ref nodes in order */ - if (!ref_node->done) + if (!node->done) break; - list_del(&ref_node->node); - first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist); + list_del(&node->node); + first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); } io_rsrc_ref_unlock(ctx); @@ -7614,10 +7608,9 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref) queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay); } -static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( - struct io_ring_ctx *ctx) +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { - struct fixed_rsrc_ref_node *ref_node; + struct io_rsrc_node *ref_node; ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); if (!ref_node) @@ -7635,19 +7628,18 @@ static struct 
fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( } static void init_fixed_file_ref_node(struct io_ring_ctx *ctx, - struct fixed_rsrc_ref_node *ref_node) + struct io_rsrc_node *ref_node) { ref_node->rsrc_data = ctx->file_data; ref_node->rsrc_put = io_ring_file_put; } -static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node) +static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) { percpu_ref_exit(&ref_node->refs); kfree(ref_node); } - static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { @@ -7655,8 +7647,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_tables, i; struct file *file; int fd, ret = -ENOMEM; - struct fixed_rsrc_ref_node *ref_node; - struct fixed_rsrc_data *file_data; + struct io_rsrc_node *ref_node; + struct io_rsrc_data *file_data; if (ctx->file_data) return -EBUSY; @@ -7665,7 +7657,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; - file_data = alloc_fixed_rsrc_data(ctx); + file_data = io_rsrc_data_alloc(ctx); if (!file_data) return -ENOMEM; ctx->file_data = file_data; @@ -7722,14 +7714,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } - ref_node = alloc_fixed_rsrc_ref_node(ctx); + ref_node = io_rsrc_node_alloc(ctx); if (!ref_node) { io_sqe_files_unregister(ctx); return -ENOMEM; } init_fixed_file_ref_node(ctx, ref_node); - io_sqe_rsrc_set_node(ctx, file_data, ref_node); + io_rsrc_node_set(ctx, file_data, ref_node); return ret; out_fput: for (i = 0; i < ctx->nr_user_files; i++) { @@ -7741,7 +7733,7 @@ out_fput: kfree(file_data->table[i].files); ctx->nr_user_files = 0; out_free: - free_fixed_rsrc_data(ctx->file_data); + io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; return ret; } @@ -7789,10 +7781,10 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc) +static int io_queue_rsrc_removal(struct io_rsrc_data *data, void *rsrc) { struct io_rsrc_put *prsrc; - struct fixed_rsrc_ref_node *ref_node = data->node; + struct io_rsrc_node *ref_node = data->node; prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); if (!prsrc) @@ -7804,7 +7796,7 @@ static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc) return 0; } -static inline int io_queue_file_removal(struct fixed_rsrc_data *data, +static inline int io_queue_file_removal(struct io_rsrc_data *data, struct file *file) { return io_queue_rsrc_removal(data, (void *)file); @@ -7814,8 +7806,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update *up, unsigned nr_args) { - struct fixed_rsrc_data *data = ctx->file_data; - struct fixed_rsrc_ref_node *ref_node; + struct io_rsrc_data *data = ctx->file_data; + struct io_rsrc_node *ref_node; struct file *file, **file_slot; __s32 __user *fds; int fd, i, err; @@ -7826,7 +7818,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; - err = io_rsrc_refnode_prealloc(ctx); + err = io_rsrc_node_prealloc(ctx); if (err) return err; @@ -7882,8 +7874,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (needs_switch) { percpu_ref_kill(&data->node->refs); - ref_node = io_rsrc_refnode_get(ctx, data, io_ring_file_put); - io_sqe_rsrc_set_node(ctx, data, ref_node); + ref_node = io_rsrc_node_get(ctx, data, io_ring_file_put); + 
io_rsrc_node_set(ctx, data, ref_node); } return done ? done : err; } @@ -8559,7 +8551,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_destroy_buffers(ctx); if (ctx->rsrc_backup_node) - destroy_fixed_rsrc_ref_node(ctx->rsrc_backup_node); + io_rsrc_node_destroy(ctx->rsrc_backup_node); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { -- cgit v1.2.3 From 221aa92409f945a19ce28c5cb54b4d9957f90715 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:41 +0100 Subject: io_uring: simplify io_rsrc_node_ref_zero Replace queue_delayed_work() with mod_delayed_work() in io_rsrc_node_ref_zero() as the latter can schedule new work, and clean it up further for better readability. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3b2b23e3a1ea4bbf789cd61815d33e05d9ff945e.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 938d8236cc3c..ed7685caa8f8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7583,7 +7583,7 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref) struct io_rsrc_data *data = node->rsrc_data; struct io_ring_ctx *ctx = data->ctx; bool first_add = false; - int delay = HZ; + int delay; io_rsrc_ref_lock(ctx); node->done = true; @@ -7599,13 +7599,9 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref) } io_rsrc_ref_unlock(ctx); - if (percpu_ref_is_dying(&data->refs)) - delay = 0; - - if (!delay) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0); - else if (first_add) - queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay); + delay = percpu_ref_is_dying(&data->refs) ? 0 : HZ; + if (first_add || !delay) + mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); } static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) -- cgit v1.2.3 From f3baed39929edc5fa0ce7a897567153c87551776 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:42 +0100 Subject: io_uring: use rsrc prealloc infra for files reg Keep it consistent with update and use io_rsrc_node_prealloc() + io_rsrc_node_get() in io_sqe_files_register() as well; it will be used in future patches, is not as error prone, and allows deduplicating rsrc_node init. 
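For orientation, the register path after this patch follows the same two-step shape as the update path (a condensed sketch of the calls as they appear in the diff below, error paths trimmed, not a verbatim excerpt):

    ret = io_rsrc_node_prealloc(ctx);       /* the only step that can fail */
    if (ret)
        return ret;
    /* ... install the file table ... */
    ref_node = io_rsrc_node_get(ctx, ctx->file_data, io_ring_file_put);
    io_rsrc_node_set(ctx, file_data, ref_node);  /* cannot fail at this point */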
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cf87321e6be5e38f4dc7fe5079d2aa6945b1ace0.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ed7685caa8f8..8b4394f1285d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7623,13 +7623,6 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) return ref_node; } -static void init_fixed_file_ref_node(struct io_ring_ctx *ctx, - struct io_rsrc_node *ref_node) -{ - ref_node->rsrc_data = ctx->file_data; - ref_node->rsrc_put = io_ring_file_put; -} - static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) { percpu_ref_exit(&ref_node->refs); @@ -7642,7 +7635,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, __s32 __user *fds = (__s32 __user *) arg; unsigned nr_tables, i; struct file *file; - int fd, ret = -ENOMEM; + int fd, ret; struct io_rsrc_node *ref_node; struct io_rsrc_data *file_data; @@ -7652,12 +7645,16 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + ret = io_rsrc_node_prealloc(ctx); + if (ret) + return ret; file_data = io_rsrc_data_alloc(ctx); if (!file_data) return -ENOMEM; ctx->file_data = file_data; + ret = -ENOMEM; nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); file_data->table = kcalloc(nr_tables, sizeof(*file_data->table), GFP_KERNEL); @@ -7710,13 +7707,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } - ref_node = io_rsrc_node_alloc(ctx); - if (!ref_node) { - io_sqe_files_unregister(ctx); - return -ENOMEM; - } - init_fixed_file_ref_node(ctx, ref_node); - + ref_node = io_rsrc_node_get(ctx, ctx->file_data, io_ring_file_put); io_rsrc_node_set(ctx, file_data, ref_node); return ret; out_fput: -- cgit v1.2.3 From 82fbcfa996e0b0f66ae0187082b0704d0ba50bdd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:43 +0100 Subject: io_uring: encapsulate rsrc node manipulations io_rsrc_node_get() and io_rsrc_node_set() are always used together, merge them into one so most users don't even see io_rsrc_node and don't need to care about it. It helped to catch io_sqe_files_register() inferring rsrc data argument for get and set differently, not a problem but a good sign. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0827b080b2e61b3dec795380f7e1a1995595d41f.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8b4394f1285d..4012958cdfc8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7094,8 +7094,17 @@ static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx) static void io_rsrc_node_set(struct io_ring_ctx *ctx, struct io_rsrc_data *rsrc_data, - struct io_rsrc_node *rsrc_node) + void (*rsrc_put)(struct io_ring_ctx *ctx, + struct io_rsrc_put *prsrc)) { + struct io_rsrc_node *rsrc_node = ctx->rsrc_backup_node; + + WARN_ON_ONCE(!rsrc_node); + + ctx->rsrc_backup_node = NULL; + rsrc_node->rsrc_data = rsrc_data; + rsrc_node->rsrc_put = rsrc_put; + io_rsrc_ref_lock(ctx); rsrc_data->node = rsrc_node; list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); @@ -7123,28 +7132,11 @@ static int io_rsrc_node_prealloc(struct io_ring_ctx *ctx) return ctx->rsrc_backup_node ? 0 : -ENOMEM; } -static struct io_rsrc_node * -io_rsrc_node_get(struct io_ring_ctx *ctx, - struct io_rsrc_data *rsrc_data, - void (*rsrc_put)(struct io_ring_ctx *ctx, - struct io_rsrc_put *prsrc)) -{ - struct io_rsrc_node *node = ctx->rsrc_backup_node; - - WARN_ON_ONCE(!node); - - ctx->rsrc_backup_node = NULL; - node->rsrc_data = rsrc_data; - node->rsrc_put = rsrc_put; - return node; -} - static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx, void (*rsrc_put)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)) { - struct io_rsrc_node *node; int ret; if (data->quiesce) @@ -7164,8 +7156,7 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, break; percpu_ref_resurrect(&data->refs); - node = io_rsrc_node_get(ctx, data, rsrc_put); - io_rsrc_node_set(ctx, data, node); + io_rsrc_node_set(ctx, data, rsrc_put); reinit_completion(&data->done); mutex_unlock(&ctx->uring_lock); @@ -7636,7 +7627,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_tables, i; struct file *file; int fd, ret; - struct io_rsrc_node *ref_node; struct io_rsrc_data *file_data; if (ctx->file_data) @@ -7707,8 +7697,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } - ref_node = io_rsrc_node_get(ctx, ctx->file_data, io_ring_file_put); - io_rsrc_node_set(ctx, file_data, ref_node); + io_rsrc_node_set(ctx, file_data, io_ring_file_put); return ret; out_fput: for (i = 0; i < ctx->nr_user_files; i++) { @@ -7794,7 +7783,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, unsigned nr_args) { struct io_rsrc_data *data = ctx->file_data; - struct io_rsrc_node *ref_node; struct file *file, **file_slot; __s32 __user *fds; int fd, i, err; @@ -7861,8 +7849,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (needs_switch) { percpu_ref_kill(&data->node->refs); - ref_node = io_rsrc_node_get(ctx, data, io_ring_file_put); - io_rsrc_node_set(ctx, data, ref_node); + io_rsrc_node_set(ctx, data, io_ring_file_put); } return done ? done : err; } -- cgit v1.2.3 From 40ae0ff70fb1379cb00041ef4061681e5e84e7f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:44 +0100 Subject: io_uring: move rsrc_put callback into io_rsrc_data io_rsrc_node's callback operates only on a single io_rsrc_data and only with its resources, so rsrc_put() callback is actually a property of io_rsrc_data. 
Move it there; it makes the code much nicer. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9417c2fba3c09e8668f05747006a603d416d34b4.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4012958cdfc8..c7875bb9b126 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -224,16 +224,17 @@ struct io_rsrc_node { struct list_head node; struct list_head rsrc_list; struct io_rsrc_data *rsrc_data; - void (*rsrc_put)(struct io_ring_ctx *ctx, - struct io_rsrc_put *prsrc); struct llist_node llist; bool done; }; +typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); + struct io_rsrc_data { struct fixed_rsrc_table *table; struct io_ring_ctx *ctx; + rsrc_put_fn *do_put; struct io_rsrc_node *node; struct percpu_ref refs; struct completion done; @@ -7093,9 +7094,7 @@ static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx) } static void io_rsrc_node_set(struct io_ring_ctx *ctx, - struct io_rsrc_data *rsrc_data, - void (*rsrc_put)(struct io_ring_ctx *ctx, - struct io_rsrc_put *prsrc)) + struct io_rsrc_data *rsrc_data) { struct io_rsrc_node *rsrc_node = ctx->rsrc_backup_node; @@ -7103,7 +7102,6 @@ static void io_rsrc_node_set(struct io_ring_ctx *ctx, ctx->rsrc_backup_node = NULL; rsrc_node->rsrc_data = rsrc_data; - rsrc_node->rsrc_put = rsrc_put; io_rsrc_ref_lock(ctx); rsrc_data->node = rsrc_node; @@ -7132,10 +7130,7 @@ static int io_rsrc_node_prealloc(struct io_ring_ctx *ctx) return ctx->rsrc_backup_node ? 0 : -ENOMEM; } -static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, - struct io_ring_ctx *ctx, - void (*rsrc_put)(struct io_ring_ctx *ctx, - struct io_rsrc_put *prsrc)) +static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { int ret; @@ -7156,7 +7151,7 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, break; percpu_ref_resurrect(&data->refs); - io_rsrc_node_set(ctx, data, rsrc_put); + io_rsrc_node_set(ctx, data); reinit_completion(&data->done); mutex_unlock(&ctx->uring_lock); @@ -7168,7 +7163,8 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, return ret; } -static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx) +static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, + rsrc_put_fn *do_put) { struct io_rsrc_data *data; @@ -7182,6 +7178,7 @@ static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx) return NULL; } data->ctx = ctx; + data->do_put = do_put; init_completion(&data->done); return data; } @@ -7206,7 +7203,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) */ if (!data || percpu_ref_is_dying(&data->refs)) return -ENXIO; - ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put); + ret = io_rsrc_ref_quiesce(data, ctx); if (ret) return ret; @@ -7541,7 +7538,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { list_del(&prsrc->list); - ref_node->rsrc_put(ctx, prsrc); + rsrc_data->do_put(ctx, prsrc); kfree(prsrc); } @@ -7639,7 +7636,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (ret) return ret; - file_data = io_rsrc_data_alloc(ctx); + file_data = io_rsrc_data_alloc(ctx, io_ring_file_put); if (!file_data) return -ENOMEM; ctx->file_data = file_data; @@ -7697,7 +7694,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user 
*arg, return ret; } - io_rsrc_node_set(ctx, file_data, io_ring_file_put); + io_rsrc_node_set(ctx, file_data); return ret; out_fput: for (i = 0; i < ctx->nr_user_files; i++) { @@ -7849,7 +7846,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (needs_switch) { percpu_ref_kill(&data->node->refs); - io_rsrc_node_set(ctx, data, io_ring_file_put); + io_rsrc_node_set(ctx, data); } return done ? done : err; } -- cgit v1.2.3 From e7c78371bbf749087ff6b1f37c0d60f0ae82572c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:45 +0100 Subject: io_uring: refactor io_queue_rsrc_removal() Pass rsrc_node into io_queue_rsrc_removal() explicitly. Just a simple preparation patch, makes following changes nicer. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/002889ce4de7baf287f2b010eef86ffe889174c6.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c7875bb9b126..2ac69d2ad4b3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7754,27 +7754,20 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_queue_rsrc_removal(struct io_rsrc_data *data, void *rsrc) +static int io_queue_rsrc_removal(struct io_rsrc_data *data, + struct io_rsrc_node *node, void *rsrc) { struct io_rsrc_put *prsrc; - struct io_rsrc_node *ref_node = data->node; prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); if (!prsrc) return -ENOMEM; prsrc->rsrc = rsrc; - list_add(&prsrc->list, &ref_node->rsrc_list); - + list_add(&prsrc->list, &node->rsrc_list); return 0; } -static inline int io_queue_file_removal(struct io_rsrc_data *data, - struct file *file) -{ - return io_queue_rsrc_removal(data, (void *)file); -} - static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update *up, unsigned nr_args) @@ -7809,7 +7802,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (*file_slot) { file = (struct file *) ((unsigned long) *file_slot & FFS_MASK); - err = io_queue_file_removal(data, file); + err = io_queue_rsrc_removal(data, data->node, file); if (err) break; *file_slot = NULL; -- cgit v1.2.3 From a7f0ed5acdc9ce251c66b9380e08766e59fa4ee8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:46 +0100 Subject: io_uring: ctx-wide rsrc nodes If we're ever going to support multiple types of resources, we need shared rsrc nodes to not bloat requests; that is implemented in this patch. It also gives a nicer API and saves one pointer dereference in io_req_set_rsrc_node(). We may say that all requests bound to a resource belong to one and only one rsrc node, and considering that nodes are removed and recycled strictly in-order, this separates requests into generations, where generations are changed on each node switch (i.e. io_rsrc_node_switch()). The API is simple: io_rsrc_node_switch() switches to a new generation if needed, and also optionally kills a passed-in io_rsrc_data. Each call to io_rsrc_node_switch() has to be preceded with io_rsrc_node_switch_start(). The start function is idempotent and should not necessarily be followed by switch. One difference is that once a node was set it will always retain a valid rsrc node, even on unregister. It may be a nuisance at the moment, but makes much sense for multiple types of resources. 
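A condensed usage sketch of the resulting API, reconstructed from the register/update paths in the diff below (not a verbatim excerpt):

    ret = io_rsrc_node_switch_start(ctx);  /* idempotent, preallocates a backup node */
    if (ret)
        return ret;
    /* ... queue resource removals/updates against ctx->rsrc_node ... */
    io_rsrc_node_switch(ctx, data);        /* kill the old generation, install a fresh node */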
Another thing changed is that nodes are bound to/associated with an io_rsrc_data later, just before killing (i.e. switching). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7e9c693b4b9a2f47aa784b616ce29843021bb65a.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 73 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2ac69d2ad4b3..81298999fd93 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -235,7 +235,6 @@ struct io_rsrc_data { struct io_ring_ctx *ctx; rsrc_put_fn *do_put; - struct io_rsrc_node *node; struct percpu_ref refs; struct completion done; bool quiesce; @@ -448,6 +447,7 @@ struct io_ring_ctx { struct llist_head rsrc_put_llist; struct list_head rsrc_ref_list; spinlock_t rsrc_ref_lock; + struct io_rsrc_node *rsrc_node; struct io_rsrc_node *rsrc_backup_node; struct io_restriction restrictions; @@ -1080,7 +1080,7 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; if (!req->fixed_rsrc_refs) { - req->fixed_rsrc_refs = &ctx->file_data->node->refs; + req->fixed_rsrc_refs = &ctx->rsrc_node->refs; percpu_ref_get(req->fixed_rsrc_refs); } } @@ -7093,36 +7093,32 @@ static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx) spin_unlock_bh(&ctx->rsrc_ref_lock); } -static void io_rsrc_node_set(struct io_ring_ctx *ctx, - struct io_rsrc_data *rsrc_data) +static void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill) { - struct io_rsrc_node *rsrc_node = ctx->rsrc_backup_node; + WARN_ON_ONCE(!ctx->rsrc_backup_node); + WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); - WARN_ON_ONCE(!rsrc_node); + if (data_to_kill) { + struct io_rsrc_node *rsrc_node = ctx->rsrc_node; - ctx->rsrc_backup_node = NULL; - rsrc_node->rsrc_data = rsrc_data; + rsrc_node->rsrc_data = data_to_kill; + io_rsrc_ref_lock(ctx); + list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); + io_rsrc_ref_unlock(ctx); - io_rsrc_ref_lock(ctx); - rsrc_data->node = rsrc_node; - list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - io_rsrc_ref_unlock(ctx); - percpu_ref_get(&rsrc_data->refs); -} - -static void io_rsrc_node_kill(struct io_ring_ctx *ctx, struct io_rsrc_data *data) -{ - struct io_rsrc_node *ref_node = NULL; + percpu_ref_get(&data_to_kill->refs); + percpu_ref_kill(&rsrc_node->refs); + ctx->rsrc_node = NULL; + } - io_rsrc_ref_lock(ctx); - ref_node = data->node; - data->node = NULL; - io_rsrc_ref_unlock(ctx); - if (ref_node) - percpu_ref_kill(&ref_node->refs); + if (!ctx->rsrc_node) { + ctx->rsrc_node = ctx->rsrc_backup_node; + ctx->rsrc_backup_node = NULL; + } } -static int io_rsrc_node_prealloc(struct io_ring_ctx *ctx) +static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { if (ctx->rsrc_backup_node) return 0; @@ -7139,10 +7135,11 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct data->quiesce = true; do { - ret = io_rsrc_node_prealloc(ctx); + ret = io_rsrc_node_switch_start(ctx); if (ret) break; - io_rsrc_node_kill(ctx, data); + io_rsrc_node_switch(ctx, data); + percpu_ref_kill(&data->refs); flush_delayed_work(&ctx->rsrc_put_work); @@ -7151,7 +7148,6 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct break; percpu_ref_resurrect(&data->refs); - io_rsrc_node_set(ctx, data); reinit_completion(&data->done); mutex_unlock(&ctx->uring_lock); @@ -7632,7 +7628,7 @@ static int 
io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; - ret = io_rsrc_node_prealloc(ctx); + ret = io_rsrc_node_switch_start(ctx); if (ret) return ret; @@ -7694,7 +7690,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } - io_rsrc_node_set(ctx, file_data); + io_rsrc_node_switch(ctx, NULL); return ret; out_fput: for (i = 0; i < ctx->nr_user_files; i++) { @@ -7783,7 +7779,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; - err = io_rsrc_node_prealloc(ctx); + err = io_rsrc_node_switch_start(ctx); if (err) return err; @@ -7802,7 +7798,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (*file_slot) { file = (struct file *) ((unsigned long) *file_slot & FFS_MASK); - err = io_queue_rsrc_removal(data, data->node, file); + err = io_queue_rsrc_removal(data, ctx->rsrc_node, file); if (err) break; *file_slot = NULL; @@ -7837,10 +7833,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, } } - if (needs_switch) { - percpu_ref_kill(&data->node->refs); - io_rsrc_node_set(ctx, data); - } + if (needs_switch) + io_rsrc_node_switch(ctx, data); return done ? done : err; } @@ -8514,8 +8508,15 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_eventfd_unregister(ctx); io_destroy_buffers(ctx); + /* there are no registered resources left, nobody uses it */ + if (ctx->rsrc_node) + io_rsrc_node_destroy(ctx->rsrc_node); if (ctx->rsrc_backup_node) io_rsrc_node_destroy(ctx->rsrc_backup_node); + flush_delayed_work(&ctx->rsrc_put_work); + + WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); + WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { -- cgit v1.2.3 From 28a9fe2521348ee350b65ae89e63c1def87b0cb6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:47 +0100 Subject: io_uring: reuse io_rsrc_node_destroy() Reuse io_rsrc_node_destroy() in __io_rsrc_put_work(). Also move it to a more appropriate place -- to the other node routines, and remove forward declaration. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cccafba41aee1e5bb59988704885b1340aef3a27.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 81298999fd93..59e666151754 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1028,7 +1028,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files); static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx); -static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); @@ -7093,6 +7092,12 @@ static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx) spin_unlock_bh(&ctx->rsrc_ref_lock); } +static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) +{ + percpu_ref_exit(&ref_node->refs); + kfree(ref_node); +} + static void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) { @@ -7538,8 +7543,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) kfree(prsrc); } - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); + io_rsrc_node_destroy(ref_node); percpu_ref_put(&rsrc_data->refs); } @@ -7607,12 +7611,6 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) return ref_node; } -static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) -{ - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); -} - static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { -- cgit v1.2.3 From 215c39026023dbfb4026b670c318371252be909f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:48 +0100 Subject: io_uring: remove useless is_dying check on quiesce rsrc_data refs should always be valid for potential submitters; io_rsrc_ref_quiesce() restores them before unlocking, so the percpu_ref_is_dying() check in io_sqe_files_unregister() does nothing and is misleading. Concurrent quiesce is prevented with struct io_rsrc_data::quiesce. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bf97055e1748ee3a382e66daf384a469eb90b931.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 59e666151754..373fe2b9e1f2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7135,6 +7135,7 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct { int ret; + /* As we may drop ->uring_lock, other task may have started quiesce */ if (data->quiesce) return -ENXIO; @@ -7197,12 +7198,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) unsigned nr_tables, i; int ret; - /* - * percpu_ref_is_dying() is to stop parallel files unregister - * Since we possibly drop uring lock later in this function to - * run task work. 
- */ - if (!data || percpu_ref_is_dying(&data->refs)) + if (!data) return -ENXIO; ret = io_rsrc_ref_quiesce(data, ctx); if (ret) return ret; -- cgit v1.2.3 From 89b5066ea1d96b321c0743259169c599d3f4f969 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:50 +0100 Subject: io_uring: combine lock/unlock sections on exit io_ring_exit_work() already does uring_lock lock/unlock, no need to repeat it for the lock waiting trick in io_ring_ctx_free(). Move the waiting with comments and spinlocking into io_ring_exit_work. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a8ae0589b0ea64ad4791e2c282e4e9b713dd7024.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 373fe2b9e1f2..edb5b9d0fd1a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8478,16 +8478,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) static void io_ring_ctx_free(struct io_ring_ctx *ctx) { - /* - * Some may use context even when all refs and requests have been put, - * and they are free to do so while still holding uring_lock or - * completion_lock, see __io_req_task_submit(). Wait for them to finish. - */ - mutex_lock(&ctx->uring_lock); - mutex_unlock(&ctx->uring_lock); - spin_lock_irq(&ctx->completion_lock); - spin_unlock_irq(&ctx->completion_lock); - io_sq_thread_finish(ctx); io_sqe_buffers_unregister(ctx); @@ -8638,6 +8628,12 @@ static void io_ring_exit_work(struct work_struct *work) WARN_ON_ONCE(time_after(jiffies, timeout)); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); + /* + * Some may use context even when all refs and requests have been put, + * and they are free to do so while still holding uring_lock or + * completion_lock, see __io_req_task_submit(). Apart from other work, + * this lock/unlock section also waits them to finish. + */ mutex_lock(&ctx->uring_lock); while (!list_empty(&ctx->tctx_list)) { WARN_ON_ONCE(time_after(jiffies, timeout)); @@ -8658,6 +8654,8 @@ static void io_ring_exit_work(struct work_struct *work) mutex_lock(&ctx->uring_lock); } mutex_unlock(&ctx->uring_lock); + spin_lock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->completion_lock); io_ring_ctx_free(ctx); } -- cgit v1.2.3 From 7394161cb8bd26be43ebf0075e3b0197a6c3ca01 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:51 +0100 Subject: io_uring: better ref handling in poll_remove_one Instead of using io_put_req() to drop a non-final ref, use req_ref_put(), which is slimmer and will also check the invariant. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/85b5774ce13ae55cc2e705abdc8cbafe1212f1bd.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index edb5b9d0fd1a..e92de7e206c0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5244,7 +5244,7 @@ static bool io_poll_remove_waitqs(struct io_kiocb *req) /* non-poll requests have submit ref still */ do_complete = __io_poll_remove_one(req, &apoll->poll, true); if (do_complete) { - io_put_req(req); + req_ref_put(req); kfree(apoll->double_poll); kfree(apoll); } -- cgit v1.2.3 From e146a4a3f69e843a2153735875c64990aca244b1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:52 +0100 Subject: io_uring: remove unused hash_wait No users of io_uring_ctx::hash_wait left, kill it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e25cb83c233a5f75f15275596b49fbafbea606fa.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e92de7e206c0..6a0317e851f7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -455,8 +455,6 @@ struct io_ring_ctx { /* exit task_work */ struct callback_head *exit_task_work; - struct wait_queue_head hash_wait; - /* Keep this last, we don't need it for the fast path */ struct work_struct exit_work; struct list_head tctx_list; -- cgit v1.2.3 From 0aec38fda2b6e36c0b066a87ff727ace3666cade Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:53 +0100 Subject: io_uring: refactor io_async_cancel() Remove extra tctx==NULL checks that are already done by io_async_cancel_one(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/70c2a8b958d942e86958a28af0452966ce1095b0.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6a0317e851f7..f53d93261e2b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5824,8 +5824,6 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; - if (!tctx || !tctx->io_wq) - continue; ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); if (ret != -ENOENT) break; -- cgit v1.2.3 From 75769e3f7357171dbe040a5ed55445c2642295d1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:54 +0100 Subject: io_uring: improve import_fixed overflow checks Replace a hand-coded overflow check with a specialised function. Even though compilers are smart enough to generate identical binary (i.e. check the carry bit), it's more foolproof and conveys the intention better. 
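For illustration, check_add_overflow() (from <linux/overflow.h>) writes the sum into its third argument and returns true if the addition wrapped, so the hand-coded test turns into the following (sketch of the change shown in the diff below):

    u64 buf_end;
    /* before: if (buf_addr + len < buf_addr) return -EFAULT; */
    if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
        return -EFAULT;  /* u64 addition overflowed */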
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e437dcdc929bacbb6f11a4824ecbbf17225cb82a.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index f53d93261e2b..e6508b19e19e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2785,8 +2785,8 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) size_t len = req->rw.len; struct io_mapped_ubuf *imu; u16 index, buf_index = req->buf_index; + u64 buf_end, buf_addr = req->rw.addr; size_t offset; - u64 buf_addr; if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; @@ -2794,11 +2794,10 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) imu = &ctx->user_bufs[index]; buf_addr = req->rw.addr; - /* overflow */ - if (buf_addr + len < buf_addr) + if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ - if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len) + if (buf_addr < imu->ubuf || buf_end > imu->ubuf + imu->len) return -EFAULT; /* -- cgit v1.2.3 From 4751f53d74a688137de6a2a0b12ee591288c6dc8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:55 +0100 Subject: io_uring: store reg buffer end instead of length It's a bit more convenient for us to store a registered buffer end address instead of length, see struct io_mapped_ubuf, as it allows us to not recompute it every time. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/39164403fe92f1dc437af134adeec2423cdf9395.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e6508b19e19e..47b02e652d84 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -194,7 +194,7 @@ enum io_uring_cmd_flags { struct io_mapped_ubuf { u64 ubuf; - size_t len; + u64 ubuf_end; struct bio_vec *bvec; unsigned int nr_bvecs; unsigned long acct_pages; @@ -2797,7 +2797,7 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ - if (buf_addr < imu->ubuf || buf_end > imu->ubuf + imu->len) + if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) return -EFAULT; /* @@ -8319,7 +8319,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, } /* store original address for later verification */ imu->ubuf = ubuf; - imu->len = iov->iov_len; + imu->ubuf_end = ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; ret = 0; done: @@ -9378,9 +9378,9 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { struct io_mapped_ubuf *buf = &ctx->user_bufs[i]; + unsigned int len = buf->ubuf_end - buf->ubuf; - seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, - (unsigned int) buf->len); + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); } if (has_lock && !xa_empty(&ctx->personalities)) { unsigned long index; -- cgit v1.2.3 From 47e90392c8ad982c25f58125e9be3fc4d476b9ed Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:56 +0100 Subject: io_uring: kill unused forward decls Kill unused forward declarations for io_ring_file_put() and 
io_queue_next(). Also btw rename the first one. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/64aa27c3f9662e14615cc119189f5eaf12989671.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 47b02e652d84..913929d9d05a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1027,14 +1027,12 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct files_struct *files); static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx); static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); -static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req, int nr); static void io_dismantle_req(struct io_kiocb *req); static void io_put_task(struct task_struct *task, int nr); -static void io_queue_next(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, @@ -7459,7 +7457,7 @@ static int io_sqe_alloc_file_tables(struct io_rsrc_data *file_data, return 1; } -static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { struct file *file = prsrc->file; #if defined(CONFIG_UNIX) @@ -7621,7 +7619,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (ret) return ret; - file_data = io_rsrc_data_alloc(ctx, io_ring_file_put); + file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put); if (!file_data) return -ENOMEM; ctx->file_data = file_data; -- cgit v1.2.3 From e07785b0029165fdb1c72ac12fe42801ba5f9f61 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:57 +0100 Subject: io_uring: lock annotate timeouts and poll Add timeout and poll ->completion_lock annotations for Sparse; makes life easier while looking at the functions. 
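For illustration, __must_hold tells Sparse that a function must be entered with the given lock held, so a "make C=1" build reports a context imbalance for callers that don't hold it (minimal sketch, not from the patch):

    static void io_example(struct io_ring_ctx *ctx)
        __must_hold(&ctx->completion_lock)
    {
        /* sparse warns if a caller is not provably holding completion_lock */
    }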
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2345325643093d41543383ba985a735aeb899eac.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 913929d9d05a..7c5c3d46c6b7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4885,6 +4885,7 @@ static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) } static void io_poll_remove_double(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) { struct io_poll_iocb *poll = io_poll_get_double(req); @@ -4903,6 +4904,7 @@ static void io_poll_remove_double(struct io_kiocb *req) } static bool io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) + __must_hold(&req->ctx->completion_lock) { struct io_ring_ctx *ctx = req->ctx; unsigned flags = IORING_CQE_F_MORE; @@ -5208,6 +5210,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) static bool __io_poll_remove_one(struct io_kiocb *req, struct io_poll_iocb *poll, bool do_cancel) + __must_hold(&req->ctx->completion_lock) { bool do_complete = false; @@ -5226,6 +5229,7 @@ static bool __io_poll_remove_one(struct io_kiocb *req, } static bool io_poll_remove_waitqs(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) { bool do_complete; @@ -5249,6 +5253,7 @@ static bool io_poll_remove_waitqs(struct io_kiocb *req) } static bool io_poll_remove_one(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) { bool do_complete; @@ -5292,6 +5297,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr) + __must_hold(&ctx->completion_lock) { struct hlist_head *list; struct io_kiocb *req; @@ -5307,6 +5313,7 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr) } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) + __must_hold(&ctx->completion_lock) { struct io_kiocb *req; @@ -5513,6 +5520,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, __u64 user_data) + __must_hold(&ctx->completion_lock) { struct io_timeout_data *io; struct io_kiocb *req; @@ -5537,6 +5545,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, } static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) + __must_hold(&ctx->completion_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); @@ -5551,6 +5560,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->completion_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); struct io_timeout_data *data; -- cgit v1.2.3 From c4ea060e85eabe40f3572969daff4fc2f242b7b8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:58 +0100 Subject: io_uring: simplify overflow handling Overflowed CQEs don't lock requests anymore, so we don't care so much about cancelling them; kill cq_overflow_flushed and simplify the code.
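As a reminder of the mechanism being simplified, a distilled sketch of the overflow fallback this patch relies on (types abridged from the surrounding code, not a complete implementation): when no CQE slot is free, the completion is stashed on a list and replayed into the ring later by __io_cqring_overflow_flush().

static bool stash_overflowed_cqe(struct io_ring_ctx *ctx, u64 user_data,
				 long res, unsigned int cflags)
{
	struct io_overflow_cqe *ocqe;

	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
	if (!ocqe)
		return false;	/* event dropped; only the overflow count grows */
	ocqe->cqe.user_data = user_data;
	ocqe->cqe.res = res;
	ocqe->cqe.flags = cflags;
	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
	return true;
}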
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5799867aeba9e713c32f49aef78e5e1aef9fbc43.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7c5c3d46c6b7..99f5252ff2dc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -335,7 +335,6 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int compat: 1; - unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; @@ -1525,8 +1524,7 @@ static bool __io_cqring_fill_event(struct io_kiocb *req, long res, WRITE_ONCE(cqe->flags, cflags); return true; } - if (!ctx->cq_overflow_flushed && - !atomic_read(&req->task->io_uring->in_idle)) { + if (!atomic_read(&req->task->io_uring->in_idle)) { struct io_overflow_cqe *ocqe; ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); @@ -8491,6 +8489,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); io_sqe_files_unregister(ctx); + if (ctx->rings) + __io_cqring_overflow_flush(ctx, true); mutex_unlock(&ctx->uring_lock); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); @@ -8692,8 +8692,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); - /* if force is set, the ring is going away. always drop after that */ - ctx->cq_overflow_flushed = 1; if (ctx->rings) __io_cqring_overflow_flush(ctx, true); xa_for_each(&ctx->personalities, index, creds) -- cgit v1.2.3 From df9727affa058f4f18e388b30247650f8ae13cd8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:43:59 +0100 Subject: io_uring: put link timeout req consistently Don't put the linked timeout req in io_async_find_and_cancel() but do it in io_link_timeout_fn(), so we have only one point for that and won't have to handle it differently as is the case now (put vs put_deferred). Also improve io_async_find_and_cancel()'s locking a bit.
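For context, the userspace shape of the linked-timeout pair whose put path this patch unifies; a liburing sketch with error handling elided (the function and variable names here are illustrative):

#include <liburing.h>

static void read_with_timeout(struct io_uring *ring, int fd, void *buf,
			      unsigned int len)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, fd, buf, len, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* link the next SQE to this one */

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_link_timeout(sqe, &ts, 0);	/* fires io_link_timeout_fn() */

	io_uring_submit(ring);
	/* if the read doesn't finish in time it completes with -ECANCELED,
	 * and the timeout CQE carries -ETIME */
}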
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d75b70957f245275ab7cba83e0ac9c1b86aae78a.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 99f5252ff2dc..d575d0d15df5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5766,12 +5766,9 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, int ret; ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - if (ret != -ENOENT) { - spin_lock_irqsave(&ctx->completion_lock, flags); - goto done; - } - spin_lock_irqsave(&ctx->completion_lock, flags); + if (ret != -ENOENT) + goto done; ret = io_timeout_cancel(ctx, sqe_addr); if (ret != -ENOENT) goto done; @@ -5786,7 +5783,6 @@ done: if (ret < 0) req_set_fail_links(req); - io_put_req(req); } static int io_async_cancel_prep(struct io_kiocb *req, @@ -6361,8 +6357,8 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) io_put_req_deferred(prev, 1); } else { io_req_complete_post(req, -ETIME, 0); - io_put_req_deferred(req, 1); } + io_put_req_deferred(req, 1); return HRTIMER_NORESTART; } -- cgit v1.2.3 From 044118069a23fdfb31677631cfdfc5e33b488752 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:44:00 +0100 Subject: io_uring: deduplicate NOSIGNAL setting Set MSG_NOSIGNAL and REQ_F_NOWAIT in send/recv prep routines and don't duplicate it in all four send/recv handlers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e1133a3ed1c0e192975b7341ea4b0bf91f63b132.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d575d0d15df5..5fb664e8ae49 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4323,9 +4323,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + if (sr->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -4354,12 +4356,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } - flags = req->sr_msg.msg_flags | MSG_NOSIGNAL; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (issue_flags & IO_URING_F_NONBLOCK) + flags = req->sr_msg.msg_flags; + if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); @@ -4402,12 +4401,9 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_controllen = 0; msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags | MSG_NOSIGNAL; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (issue_flags & IO_URING_F_NONBLOCK) + flags = req->sr_msg.msg_flags; + if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) min_ret = iov_iter_count(&msg.msg_iter); @@ -4550,10 +4546,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->umsg = 
u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->bgid = READ_ONCE(sqe->buf_group); + sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + if (sr->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -4594,12 +4592,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) 1, req->sr_msg.len); } - flags = req->sr_msg.msg_flags | MSG_NOSIGNAL; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) + flags = req->sr_msg.msg_flags; + if (force_nonblock) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); @@ -4657,12 +4652,9 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) msg.msg_iocb = NULL; msg.msg_flags = 0; - flags = req->sr_msg.msg_flags | MSG_NOSIGNAL; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) + flags = req->sr_msg.msg_flags; + if (force_nonblock) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) min_ret = iov_iter_count(&msg.msg_iter); -- cgit v1.2.3 From 9a321c98490c70653a4f0a10b28c45edbcf7a93d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:44:01 +0100 Subject: io_uring: set proper FFS* flags on reg file update Set FFS_* flags (e.g. FFS_ASYNC_READ) not only in initial registration but also on registered files update. Not a bug, but without it we miss out on the benefit of the feature. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/df29a841a2d3d3695b509cdffce5070777d9d942.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5fb664e8ae49..ede9d01efb3b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6292,6 +6292,19 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, return (struct file *) ((unsigned long) *file_slot & FFS_MASK); } +static void io_fixed_file_set(struct file **file_slot, struct file *file) +{ + unsigned long file_ptr = (unsigned long) file; + + if (__io_file_supports_async(file, READ)) + file_ptr |= FFS_ASYNC_READ; + if (__io_file_supports_async(file, WRITE)) + file_ptr |= FFS_ASYNC_WRITE; + if (S_ISREG(file_inode(file)->i_mode)) + file_ptr |= FFS_ISREG; + *file_slot = (struct file *)file_ptr; +} + static struct file *io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, bool fixed) { @@ -7631,8 +7644,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, goto out_free; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - unsigned long file_ptr; - if (copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto out_fput; } @@ -7657,14 +7668,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto out_fput; } - file_ptr = (unsigned long) file; - if (__io_file_supports_async(file, READ)) - file_ptr |= FFS_ASYNC_READ; - if (__io_file_supports_async(file, WRITE)) - file_ptr |= FFS_ASYNC_WRITE; - if (S_ISREG(file_inode(file)->i_mode)) - file_ptr |= FFS_ISREG; - *io_fixed_file_slot(file_data, i) = (struct file *) file_ptr; + io_fixed_file_set(io_fixed_file_slot(file_data, i), file); } ret = io_sqe_files_scm(ctx); @@ -7806,7 +7810,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *file_slot = file; + io_fixed_file_set(file_slot, file); err = io_sqe_file_register(ctx, file, i); if (err)
{ *file_slot = NULL; -- cgit v1.2.3 From f4f7d21ce46474128934caeb80dfb1e5396b596e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:44:02 +0100 Subject: io_uring: don't quiesce initial files register There is no reason why we would want to fully quiesce the ring on IORING_REGISTER_FILES; if files are already registered we fail. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/563bb8060bb2d3efbc32fce6101678281c574d2a.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ede9d01efb3b..4700de36a8cc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9815,6 +9815,7 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) static bool io_register_op_must_quiesce(int op) { switch (op) { + case IORING_REGISTER_FILES: case IORING_UNREGISTER_FILES: case IORING_REGISTER_FILES_UPDATE: case IORING_REGISTER_PROBE: -- cgit v1.2.3 From 846a4ef22bf6d6ede4547fe8fa500385a90c64ba Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:44:03 +0100 Subject: io_uring: refactor file tables alloc/free Introduce a helper io_free_file_tables() doing all the cleaning; there are several places where it's hand-coded. Also move all allocations into io_sqe_alloc_file_tables() and rename it, so all of it is in one place. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/502a84ebf41ff119b095e59661e678eacb752bf8.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4700de36a8cc..663e1cf445ca 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7054,6 +7054,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ?
ret : 0; } +static void io_free_file_tables(struct io_rsrc_data *data, unsigned nr_files) +{ + unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); + + for (i = 0; i < nr_tables; i++) + kfree(data->table[i].files); + kfree(data->table); + data->table = NULL; +} + static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { #if defined(CONFIG_UNIX) @@ -7190,14 +7200,12 @@ static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, static void io_rsrc_data_free(struct io_rsrc_data *data) { percpu_ref_exit(&data->refs); - kfree(data->table); kfree(data); } static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct io_rsrc_data *data = ctx->file_data; - unsigned nr_tables, i; int ret; if (!data) @@ -7207,9 +7215,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return ret; __io_sqe_files_unregister(ctx); - nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); - for (i = 0; i < nr_tables; i++) - kfree(data->table[i].files); + io_free_file_tables(data, ctx->nr_user_files); io_rsrc_data_free(data); ctx->file_data = NULL; ctx->nr_user_files = 0; @@ -7439,16 +7445,20 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif -static int io_sqe_alloc_file_tables(struct io_rsrc_data *file_data, - unsigned nr_tables, unsigned nr_files) +static bool io_alloc_file_tables(struct io_rsrc_data *file_data, + unsigned nr_files) { - int i; + unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); + + file_data->table = kcalloc(nr_tables, sizeof(*file_data->table), + GFP_KERNEL); + if (!file_data->table) + return false; for (i = 0; i < nr_tables; i++) { struct fixed_rsrc_table *table = &file_data->table[i]; - unsigned this_files; + unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE); - this_files = min(nr_files, IORING_MAX_FILES_TABLE); table->files = kcalloc(this_files, sizeof(struct file *), GFP_KERNEL); if (!table->files) @@ -7457,13 +7467,10 @@ static int io_sqe_alloc_file_tables(struct io_rsrc_data *file_data, } if (i == nr_tables) - return 0; + return true; - for (i = 0; i < nr_tables; i++) { - struct fixed_rsrc_table *table = &file_data->table[i]; - kfree(table->files); - } - return 1; + io_free_file_tables(file_data, nr_tables * IORING_MAX_FILES_TABLE); + return false; } static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) @@ -7613,9 +7620,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; - unsigned nr_tables, i; struct file *file; int fd, ret; + unsigned i; struct io_rsrc_data *file_data; if (ctx->file_data) @@ -7634,13 +7641,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, ctx->file_data = file_data; ret = -ENOMEM; - nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - file_data->table = kcalloc(nr_tables, sizeof(*file_data->table), - GFP_KERNEL); - if (!file_data->table) - goto out_free; - - if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args)) + if (!io_alloc_file_tables(file_data, nr_args)) goto out_free; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { @@ -7685,8 +7686,7 @@ out_fput: if (file) fput(file); } - for (i = 0; i < nr_tables; i++) - kfree(file_data->table[i].files); + io_free_file_tables(file_data, nr_args); ctx->nr_user_files = 0; out_free: io_rsrc_data_free(ctx->file_data); -- cgit v1.2.3 From a04b0ac0cb64fc403822de9288d68e6511ce6dc2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 
15:44:04 +0100 Subject: io_uring: encapsulate fixed files into struct Add struct io_fixed_file representing a single registered file: first to hide the ugly struct file **, which may be misleading, and second to retype it to unsigned long, as conversions to it and back to file * for handling and masking FFS_* flags are getting nasty. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/78669731a605a7614c577c3de552631cfaf0869a.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 663e1cf445ca..4e5f93052a6b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -207,6 +207,11 @@ struct io_overflow_cqe { struct list_head list; }; +struct io_fixed_file { + /* file * with additional FFS_* flags */ + unsigned long file_ptr; +}; + struct io_rsrc_put { struct list_head list; union { @@ -216,7 +221,7 @@ struct io_rsrc_put { }; struct fixed_rsrc_table { - struct file **files; + struct io_fixed_file *files; }; struct io_rsrc_node { @@ -6275,8 +6280,8 @@ static void io_wq_submit_work(struct io_wq_work *work) #endif #define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) -static inline struct file **io_fixed_file_slot(struct io_rsrc_data *file_data, - unsigned i) +static inline struct io_fixed_file *io_fixed_file_slot(struct io_rsrc_data *file_data, + unsigned i) { struct fixed_rsrc_table *table; @@ -6287,12 +6292,12 @@ static inline struct file **io_fixed_file_slot(struct io_rsrc_data *file_data, static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, int index) { - struct file **file_slot = io_fixed_file_slot(ctx->file_data, index); + struct io_fixed_file *slot = io_fixed_file_slot(ctx->file_data, index); - return (struct file *) ((unsigned long) *file_slot & FFS_MASK); + return (struct file *) (slot->file_ptr & FFS_MASK); } -static void io_fixed_file_set(struct file **file_slot, struct file *file) +static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) { unsigned long file_ptr = (unsigned long) file; @@ -6302,7 +6307,7 @@ static void io_fixed_file_set(struct file **file_slot, struct file *file) file_ptr |= FFS_ASYNC_WRITE; if (S_ISREG(file_inode(file)->i_mode)) file_ptr |= FFS_ISREG; - *file_slot = (struct file *)file_ptr; + file_slot->file_ptr = file_ptr; } static struct file *io_file_get(struct io_submit_state *state, @@ -6317,7 +6322,7 @@ static struct file *io_file_get(struct io_submit_state *state, if (unlikely((unsigned int)fd >= ctx->nr_user_files)) return NULL; fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = (unsigned long) *io_fixed_file_slot(ctx->file_data, fd); + file_ptr = io_fixed_file_slot(ctx->file_data, fd)->file_ptr; file = (struct file *) (file_ptr & FFS_MASK); file_ptr &= ~FFS_MASK; /* mask in overlapping REQ_F and FFS bits */ @@ -7756,7 +7761,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, unsigned nr_args) { struct io_rsrc_data *data = ctx->file_data; - struct file *file, **file_slot; + struct io_fixed_file *file_slot; + struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; @@ -7783,12 +7789,12 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, i = array_index_nospec(up->offset + done, ctx->nr_user_files); file_slot = io_fixed_file_slot(ctx->file_data, i); - if (*file_slot) { - file = (struct file *) ((unsigned long) *file_slot & FFS_MASK); + if (file_slot->file_ptr) { + file = (struct file
*)(file_slot->file_ptr & FFS_MASK); err = io_queue_rsrc_removal(data, ctx->rsrc_node, file); if (err) break; - *file_slot = NULL; + file_slot->file_ptr = 0; needs_switch = true; } if (fd != -1) { @@ -7813,7 +7819,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, io_fixed_file_set(file_slot, file); err = io_sqe_file_register(ctx, file, i); if (err) { - *file_slot = NULL; + file_slot->file_ptr = 0; fput(file); break; } -- cgit v1.2.3 From 53a3126756d6edfe4fd5fa9037cd949df94dfe55 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 15:44:05 +0100 Subject: io_uring: kill outdated comment about splice punt The splice/tee comment in io_prep_async_work() isn't relevant since the section was moved; delete it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/892a549c89c3d422b679677b8e68ffd3fcb736b6.1617287883.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4e5f93052a6b..ceb5ddd36826 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1227,10 +1227,6 @@ static void io_prep_async_work(struct io_kiocb *req) switch (req->opcode) { case IORING_OP_SPLICE: case IORING_OP_TEE: - /* - * Splice operation will be punted aync, and here need to - * modify io_wq_work.flags, so initialize io_wq_work firstly. - */ if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) req->work.flags |= IO_WQ_WORK_UNBOUND; break; -- cgit v1.2.3 From cb3b200e4f66524d03d6410dd51bcf42f265a4d0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 6 Apr 2021 09:49:31 -0600 Subject: io_uring: don't attempt re-add of multishot poll request if racing We currently allow racy updates to multishot requests, but we can end up double adding the poll request if both completion and update do it. Ensure that we skip re-add on the update side if someone else is completing it. Fixes: b69de288e913 ("io_uring: allow events and user_data update of running poll requests") Reported-by: Joakim Hassila Signed-off-by: Jens Axboe --- fs/io_uring.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ceb5ddd36826..0fcb0f477d9e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5432,6 +5432,7 @@ static int io_poll_update(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; + bool completing; int ret; spin_lock_irq(&ctx->completion_lock); @@ -5444,17 +5445,22 @@ static int io_poll_update(struct io_kiocb *req) ret = -EACCES; goto err; } - if (!__io_poll_remove_one(preq, &preq->poll, false)) { - if (preq->poll.events & EPOLLONESHOT) { - ret = -EALREADY; - goto err; - } + + /* + * Don't allow racy completion with singleshot, as we cannot safely + * update those. For multishot, if we're racing with completion, just + * let completion re-add it. + */ + completing = !__io_poll_remove_one(preq, &preq->poll, false); + if (completing && (preq->poll.events & EPOLLONESHOT)) { + ret = -EALREADY; + goto err; } /* we now have a detached poll request. reissue.
*/ ret = 0; err: - spin_unlock_irq(&ctx->completion_lock); if (ret < 0) { + spin_unlock_irq(&ctx->completion_lock); req_set_fail_links(req); io_req_complete(req, ret); return 0; } @@ -5468,13 +5474,17 @@ err: if (req->poll.update_user_data) preq->user_data = req->poll.new_user_data; + spin_unlock_irq(&ctx->completion_lock); + /* complete update request, we're done with it */ io_req_complete(req, ret); - ret = __io_poll_add(preq); - if (ret < 0) { - req_set_fail_links(preq); - io_req_complete(preq, ret); + if (!completing) { + ret = __io_poll_add(preq); + if (ret < 0) { + req_set_fail_links(preq); + io_req_complete(preq, ret); + } } return 0; } -- cgit v1.2.3 From f40b964a66ace54cda811d8ba96eccec210cd7ad Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 Apr 2021 09:13:19 +0100 Subject: io_uring: clean up io_poll_task_func() io_poll_complete() always fills an event (even an overflowed one), so we should always do io_cqring_ev_posted() afterwards. And that's what is currently happening, because the second EPOLLONESHOT check is always true: it can't return !done for oneshots. Remove that branching; it's much easier to read. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0fcb0f477d9e..33bb14313084 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4926,20 +4926,18 @@ static void io_poll_task_func(struct callback_head *cb) if (io_poll_rewait(req, &req->poll)) { spin_unlock_irq(&ctx->completion_lock); } else { - bool done, post_ev; + bool done; - post_ev = done = io_poll_complete(req, req->result, 0); + done = io_poll_complete(req, req->result, 0); if (done) { hash_del(&req->hash_node); - } else if (!(req->poll.events & EPOLLONESHOT)) { - post_ev = true; + } else { req->result = 0; add_wait_queue(req->poll.head, &req->poll.wait); } spin_unlock_irq(&ctx->completion_lock); + io_cqring_ev_posted(ctx); - if (post_ev) - io_cqring_ev_posted(ctx); if (done) { nxt = io_put_req_find_next(req); if (nxt) -- cgit v1.2.3 From e27414bef7b4f25f4569401e42bc68d9fdfc3125 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 Apr 2021 09:13:20 +0100 Subject: io_uring: refactor io_poll_complete() Remove the error parameter from io_poll_complete(); 0 is always passed. Also do a bit of cleaning on top.
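The oneshot/multishot distinction these poll patches keep juggling looks like this from userspace; a liburing sketch (assumes a kernel with IORING_POLL_ADD_MULTI support, names illustrative):

#include <liburing.h>
#include <poll.h>

static void arm_multishot_poll(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_poll_add(sqe, fd, POLLIN);
	sqe->len = IORING_POLL_ADD_MULTI;	/* stay armed after each event */
	io_uring_submit(ring);
	/* each readiness event posts a CQE with IORING_CQE_F_MORE set;
	 * the flag is clear on the final, terminating completion */
}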
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 33bb14313084..88c94627c743 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4894,18 +4894,19 @@ static void io_poll_remove_double(struct io_kiocb *req) } } -static bool io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) +static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) __must_hold(&req->ctx->completion_lock) { struct io_ring_ctx *ctx = req->ctx; unsigned flags = IORING_CQE_F_MORE; + int error; - if (!error && req->poll.canceled) { + if (READ_ONCE(req->poll.canceled)) { error = -ECANCELED; req->poll.events |= EPOLLONESHOT; - } - if (!error) + } else { error = mangle_poll(mask); + } if (req->poll.events & EPOLLONESHOT) flags = 0; if (!__io_cqring_fill_event(req, error, flags)) { @@ -4928,7 +4929,7 @@ static void io_poll_task_func(struct callback_head *cb) } else { bool done; - done = io_poll_complete(req, req->result, 0); + done = io_poll_complete(req, req->result); if (done) { hash_del(&req->hash_node); } else { @@ -5414,7 +5415,7 @@ static int __io_poll_add(struct io_kiocb *req) if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - io_poll_complete(req, mask, 0); + io_poll_complete(req, mask); } spin_unlock_irq(&ctx->completion_lock); -- cgit v1.2.3 From 0ea13b448ee75ef0c68c18d207f6c488f143e725 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 Apr 2021 09:13:21 +0100 Subject: io_uring: simplify apoll hash removal hash_del() works well with non-hashed nodes, there's no need to check if it is hashed first. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 88c94627c743..abab95767307 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5059,10 +5059,7 @@ static void io_async_task_func(struct callback_head *cb) return; } - /* If req is still hashed, it cannot have been canceled. Don't check. */ - if (hash_hashed(&req->hash_node)) - hash_del(&req->hash_node); - + hash_del(&req->hash_node); io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); -- cgit v1.2.3 From 368b2080853f4694db780528c942f191f1c1687c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:25 +0100 Subject: io_uring: unify task and files cancel loops Move tracked inflight number check up the stack into __io_uring_files_cancel() so it's similar to task cancel. Will be used for further cleaning. 
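Both cancel loops rely on the same lost-wakeup-safe drain idiom, distilled below; this is a sketch of the pattern, not the exact kernel code:

static void drain_inflight(struct io_uring_task *tctx)
{
	DEFINE_WAIT(wait);
	s64 inflight;

	do {
		inflight = tctx_inflight(tctx);	/* snapshot before cancelling */
		if (!inflight)
			break;
		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
		/* sleep only if nothing completed since the snapshot, so a
		 * completion racing with prepare_to_wait() can't be lost */
		if (inflight == tctx_inflight(tctx))
			schedule();
		finish_wait(&tctx->wait, &wait);
	} while (1);
}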
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dca5a395efebd1e3e0f3bbc6b9640c5e8aa7e468.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 74 +++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 33 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index abab95767307..20420164cb13 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8863,28 +8863,6 @@ static int io_uring_count_inflight(struct io_ring_ctx *ctx, return cnt; } -static void io_uring_cancel_files(struct io_ring_ctx *ctx, - struct task_struct *task, - struct files_struct *files) -{ - while (!list_empty_careful(&ctx->inflight_list)) { - DEFINE_WAIT(wait); - int inflight; - - inflight = io_uring_count_inflight(ctx, task, files); - if (!inflight) - break; - - io_uring_try_cancel_requests(ctx, task, files); - - prepare_to_wait(&task->io_uring->wait, &wait, - TASK_UNINTERRUPTIBLE); - if (inflight == io_uring_count_inflight(ctx, task, files)) - schedule(); - finish_wait(&task->io_uring->wait, &wait); - } -} - static int __io_uring_add_task_file(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; @@ -8970,6 +8948,19 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) } } +static s64 tctx_inflight_tracked(struct task_struct *task, + struct files_struct *files) +{ + struct io_uring_task *tctx = task->io_uring; + struct io_tctx_node *node; + unsigned long index; + s64 cnt = 0; + + xa_for_each(&tctx->xa, index, node) + cnt += io_uring_count_inflight(node->ctx, task, files); + return cnt; +} + static s64 tctx_inflight(struct io_uring_task *tctx) { return percpu_counter_sum(&tctx->inflight); @@ -9008,14 +8999,12 @@ static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx) wait_for_completion(&work.completion); } -void __io_uring_files_cancel(struct files_struct *files) +static void io_uring_try_cancel(struct files_struct *files) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; unsigned long index; - /* make sure overflow events are dropped */ - atomic_inc(&tctx->in_idle); xa_for_each(&tctx->xa, index, node) { struct io_ring_ctx *ctx = node->ctx; @@ -9023,14 +9012,8 @@ void __io_uring_files_cancel(struct files_struct *files) io_sqpoll_cancel_sync(ctx); continue; } - io_uring_cancel_files(ctx, current, files); - if (!files) - io_uring_try_cancel_requests(ctx, current, NULL); + io_uring_try_cancel_requests(ctx, current, files); } - atomic_dec(&tctx->in_idle); - - if (files) - io_uring_clean_tctx(tctx); } /* should only be called by SQPOLL task */ @@ -9064,6 +9047,31 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) atomic_dec(&tctx->in_idle); } +void __io_uring_files_cancel(struct files_struct *files) +{ + struct io_uring_task *tctx = current->io_uring; + DEFINE_WAIT(wait); + s64 inflight; + + /* make sure overflow events are dropped */ + atomic_inc(&tctx->in_idle); + do { + /* read completions before cancelations */ + inflight = tctx_inflight_tracked(current, files); + if (!inflight) + break; + io_uring_try_cancel(files); + + prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); + if (inflight == tctx_inflight_tracked(current, files)) + schedule(); + finish_wait(&tctx->wait, &wait); + } while (1); + atomic_dec(&tctx->in_idle); + + io_uring_clean_tctx(tctx); +} + /* * Find any io_uring fd that this task has registered or done IO on, and cancel * requests. 
@@ -9083,7 +9091,7 @@ void __io_uring_task_cancel(void) inflight = tctx_inflight(tctx); if (!inflight) break; - __io_uring_files_cancel(NULL); + io_uring_try_cancel(NULL); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); -- cgit v1.2.3 From b303fe2e5a3802b0b1fb8d997e5c9caef48f6dd8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:26 +0100 Subject: io_uring: track inflight requests through counter Instead of keeping requests in an inflight_list, just track them with a per tctx atomic counter. Apart from it being much easier and more consistent with task cancel, it frees ->inflight_entry from being shared between iopoll and cancel-track, so less headache for us. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3c2ee0863cd7eeefa605f3eaff4c1c461a6f1157.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 56 ++++++++++----------------------------------------------- 1 file changed, 10 insertions(+), 46 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 20420164cb13..99d617c7eae9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -442,9 +442,6 @@ struct io_ring_ctx { struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_file; - - spinlock_t inflight_lock; - struct list_head inflight_list; } ____cacheline_aligned_in_smp; struct delayed_work rsrc_put_work; @@ -471,6 +468,7 @@ struct io_uring_task { const struct io_ring_ctx *last; struct io_wq *io_wq; struct percpu_counter inflight; + atomic_t inflight_tracked; atomic_t in_idle; spinlock_t task_lock; @@ -831,10 +829,7 @@ struct io_kiocb { struct io_kiocb *link; struct percpu_ref *fixed_rsrc_refs; - /* - * 1. used with ctx->iopoll_list with reads/writes - * 2. to track reqs with ->files (see io_op_def::file_table) - */ + /* used with ctx->iopoll_list with reads/writes */ struct list_head inflight_entry; union { struct io_task_work io_task_work; @@ -1162,8 +1157,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); - spin_lock_init(&ctx->inflight_lock); - INIT_LIST_HEAD(&ctx->inflight_list); spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); @@ -1192,14 +1185,9 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) static void io_req_track_inflight(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; - - spin_lock_irq(&ctx->inflight_lock); - list_add(&req->inflight_entry, &ctx->inflight_list); - spin_unlock_irq(&ctx->inflight_lock); + atomic_inc(&current->io_uring->inflight_tracked); } } @@ -1717,12 +1705,9 @@ static void io_dismantle_req(struct io_kiocb *req) io_clean_op(req); if (req->flags & REQ_F_INFLIGHT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; + struct io_uring_task *tctx = req->task->io_uring; - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); + atomic_dec(&tctx->inflight_tracked); req->flags &= ~REQ_F_INFLIGHT; } } @@ -7914,6 +7899,7 @@ static int io_uring_alloc_task_context(struct task_struct *task, init_waitqueue_head(&tctx->wait); tctx->last = NULL; atomic_set(&tctx->in_idle, 0); + atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@
-8849,20 +8835,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } -static int io_uring_count_inflight(struct io_ring_ctx *ctx, - struct task_struct *task, - struct files_struct *files) -{ - struct io_kiocb *req; - int cnt = 0; - - spin_lock_irq(&ctx->inflight_lock); - list_for_each_entry(req, &ctx->inflight_list, inflight_entry) - cnt += io_match_task(req, task, files); - spin_unlock_irq(&ctx->inflight_lock); - return cnt; -} - static int __io_uring_add_task_file(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; @@ -8948,17 +8920,9 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) } } -static s64 tctx_inflight_tracked(struct task_struct *task, - struct files_struct *files) +static s64 tctx_inflight_tracked(struct io_uring_task *tctx) { - struct io_uring_task *tctx = task->io_uring; - struct io_tctx_node *node; - unsigned long index; - s64 cnt = 0; - - xa_for_each(&tctx->xa, index, node) - cnt += io_uring_count_inflight(node->ctx, task, files); - return cnt; + return atomic_read(&tctx->inflight_tracked); } static s64 tctx_inflight(struct io_uring_task *tctx) @@ -9057,13 +9021,13 @@ void __io_uring_files_cancel(struct files_struct *files) atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ - inflight = tctx_inflight_tracked(current, files); + inflight = tctx_inflight_tracked(tctx); if (!inflight) break; io_uring_try_cancel(files); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); - if (inflight == tctx_inflight_tracked(current, files)) + if (inflight == tctx_inflight_tracked(tctx)) schedule(); finish_wait(&tctx->wait, &wait); } while (1); -- cgit v1.2.3 From 3f48cf18f886c97a7e775af10696bfed9ddcff31 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:27 +0100 Subject: io_uring: unify files and task cancel Now __io_uring_cancel() and __io_uring_files_cancel() are very similar and mostly differ by how we count requests, merge them and allow tctx_inflight() to handle counting. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1a5986a97df4dc1378f3fe0ca1eb483dbcf42112.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 56 ++++++++++++++------------------------------------------ 1 file changed, 14 insertions(+), 42 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 99d617c7eae9..8e2a0f9acba8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8920,13 +8920,10 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) } } -static s64 tctx_inflight_tracked(struct io_uring_task *tctx) -{ - return atomic_read(&tctx->inflight_tracked); -} - -static s64 tctx_inflight(struct io_uring_task *tctx) +static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) { + if (tracked) + return atomic_read(&tctx->inflight_tracked); return percpu_counter_sum(&tctx->inflight); } @@ -8993,7 +8990,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ - inflight = tctx_inflight(tctx); + inflight = tctx_inflight(tctx, false); if (!inflight) break; io_uring_try_cancel_requests(ctx, current, NULL); @@ -9004,43 +9001,18 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) * avoids a race where a completion comes in before we did * prepare_to_wait(). 
*/ - if (inflight == tctx_inflight(tctx)) + if (inflight == tctx_inflight(tctx, false)) schedule(); finish_wait(&tctx->wait, &wait); } while (1); atomic_dec(&tctx->in_idle); } -void __io_uring_files_cancel(struct files_struct *files) -{ - struct io_uring_task *tctx = current->io_uring; - DEFINE_WAIT(wait); - s64 inflight; - - /* make sure overflow events are dropped */ - atomic_inc(&tctx->in_idle); - do { - /* read completions before cancelations */ - inflight = tctx_inflight_tracked(tctx); - if (!inflight) - break; - io_uring_try_cancel(files); - - prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); - if (inflight == tctx_inflight_tracked(tctx)) - schedule(); - finish_wait(&tctx->wait, &wait); - } while (1); - atomic_dec(&tctx->in_idle); - - io_uring_clean_tctx(tctx); -} - /* * Find any io_uring fd that this task has registered or done IO on, and cancel * requests. */ -void __io_uring_task_cancel(void) +void __io_uring_cancel(struct files_struct *files) { struct io_uring_task *tctx = current->io_uring; DEFINE_WAIT(wait); @@ -9048,15 +9020,14 @@ void __io_uring_task_cancel(void) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); - __io_uring_files_cancel(NULL); + io_uring_try_cancel(files); do { /* read completions before cancelations */ - inflight = tctx_inflight(tctx); + inflight = tctx_inflight(tctx, !!files); if (!inflight) break; - io_uring_try_cancel(NULL); - + io_uring_try_cancel(files); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); /* @@ -9064,16 +9035,17 @@ void __io_uring_task_cancel(void) * avoids a race where a completion comes in before we did * prepare_to_wait(). */ - if (inflight == tctx_inflight(tctx)) + if (inflight == tctx_inflight(tctx, !!files)) schedule(); finish_wait(&tctx->wait, &wait); } while (1); - atomic_dec(&tctx->in_idle); + atomic_dec(&tctx->in_idle); io_uring_clean_tctx(tctx); - /* all current's requests should be gone, we can kill tctx */ - __io_uring_free(current); + if (!files) { + /* for exec all current's requests should be gone, kill tctx */ + __io_uring_free(current); + } } static void *io_uring_validate_mmap_request(struct file *file, -- cgit v1.2.3 From a1fde923e3065a89abccfeef95096c933f6a954c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:28 +0100 Subject: io_uring: refactor io_close A small refactoring shrinking it and making it easier to read.
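For reference, the userspace counterpart of the opcode being refactored; a liburing sketch with error handling elided:

#include <liburing.h>

static void close_via_ring(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_close(sqe, fd);
	io_uring_submit(ring);
	/* CQE res is 0 on success, or -EBADF for a bad fd or an attempt
	 * to close the ring fd itself (the f_op == &io_uring_fops check) */
}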
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/19b24eed7cd491a0243b50366dd2a23b558e2665.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8e2a0f9acba8..f886a5b2ea57 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4181,11 +4181,9 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) struct files_struct *files = current->files; struct io_close *close = &req->close; struct fdtable *fdt; - struct file *file; - int ret; + struct file *file = NULL; + int ret = -EBADF; - file = NULL; - ret = -EBADF; spin_lock(&files->file_lock); fdt = files_fdtable(files); if (close->fd >= fdt->max_fds) { @@ -4193,12 +4191,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) goto err; } file = fdt->fd[close->fd]; - if (!file) { - spin_unlock(&files->file_lock); - goto err; - } - - if (file->f_op == &io_uring_fops) { + if (!file || file->f_op == &io_uring_fops) { spin_unlock(&files->file_lock); file = NULL; goto err; } -- cgit v1.2.3 From 0bdf3398b06ef1082b7d796039d34fc61a1285ea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:29 +0100 Subject: io_uring: enable inline completion for more cases Take advantage of delayed/inline completion flushing and pass the right issue flags for completion of the open, open2, fadvise and poll remove opcodes. All others either already use it or are always punted and never executed inline. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0badc7512e82f7350b73bb09abbebbecbdd5dab8.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index f886a5b2ea57..9aaf3233961e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3845,7 +3845,7 @@ err: req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4123,7 +4123,7 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5319,7 +5319,7 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, issue_flags, ret, 0); return 0; } -- cgit v1.2.3 From 4af3417a347d06c8632346a6a9035c28b1dd94b4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:30 +0100 Subject: io_uring: refactor compat_msghdr import Add an entry for a user pointer to compat_msghdr into io_sr_msg, so it's explicit that we may use it as such, which removes annoying casts.
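The trick is just an anonymous union giving one user pointer several typed views, so the compat path can read the same storage without a cast; a stripped-down sketch (the struct name here is illustrative):

struct msg_ptr_views {
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user			*buf;
	};
};

/* a writer stores ->umsg; the compat path may then read ->umsg_compat
 * directly, which is well-defined here since all members share the
 * same storage and pointer representation */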
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/73fd644dea1518f528d3648981cf777ce6e537e9.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9aaf3233961e..1779306c50d2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -570,8 +570,9 @@ struct io_connect { struct io_sr_msg { struct file *file; union { - struct user_msghdr __user *umsg; - void __user *buf; + struct compat_msghdr __user *umsg_compat; + struct user_msghdr __user *umsg; + void __user *buf; }; int msg_flags; int bgid; @@ -4435,16 +4436,14 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { - struct compat_msghdr __user *msg_compat; struct io_sr_msg *sr = &req->sr_msg; struct compat_iovec __user *uiov; compat_uptr_t ptr; compat_size_t len; int ret; - msg_compat = (struct compat_msghdr __user *) sr->umsg; - ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr, - &ptr, &len); + ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, + &ptr, &len); if (ret) return ret; -- cgit v1.2.3 From 44c769de6ffc3f1ea524fc9b7517c97078796e29 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:31 +0100 Subject: io_uring: optimise non-eventfd post-event Eventfd is not the canonical way of using io_uring, so annotate io_should_trigger_evfd() with likely to improve code generation for the non-eventfd branch. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/42fdaa51c68d39479f02cef4fe5bcb24624d60fa.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1779306c50d2..108b0c49db64 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1355,13 +1355,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) { - if (!ctx->cq_ev_fd) + if (likely(!ctx->cq_ev_fd)) return false; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) return false; - if (!ctx->eventfd_async) - return true; - return io_wq_current_is_worker(); + return !ctx->eventfd_async || io_wq_current_is_worker(); } static void io_cqring_ev_posted(struct io_ring_ctx *ctx) -- cgit v1.2.3 From ff64216423d46396db2ca8b92fc75cc00ee6df4f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:32 +0100 Subject: io_uring: always pass cflags into fill_event() A simple preparation patch inlining io_cqring_fill_event(), whose only role was to pass cflags=0 into an actual fill event. It helps to keep the number of related helpers sane in following patches.
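On the userspace side, the cflags that fill_event() now always receives end up in cqe->flags; a sketch of how a consumer decodes them (assumes the <linux/io_uring.h> definitions):

#include <stdio.h>
#include <linux/io_uring.h>

static void decode_cqe_flags(const struct io_uring_cqe *cqe)
{
	if (cqe->flags & IORING_CQE_F_BUFFER)	/* provided-buffer completion */
		printf("buffer id %u\n", cqe->flags >> IORING_CQE_BUFFER_SHIFT);
	if (cqe->flags & IORING_CQE_F_MORE)	/* multishot request still armed */
		printf("more completions to come\n");
}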
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/704f9c85b7d9843e4ad50a9f057200c58f5adc6e.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 108b0c49db64..71e771106c10 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1028,7 +1028,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx); static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); -static void io_cqring_fill_event(struct io_kiocb *req, long res); +static bool io_cqring_fill_event(struct io_kiocb *req, long res, unsigned cflags); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req, int nr); static void io_dismantle_req(struct io_kiocb *req); @@ -1258,7 +1258,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_cqring_fill_event(req, status); + io_cqring_fill_event(req, status, 0); io_put_req_deferred(req, 1); } } @@ -1492,8 +1492,8 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } -static bool __io_cqring_fill_event(struct io_kiocb *req, long res, - unsigned int cflags) +static bool io_cqring_fill_event(struct io_kiocb *req, long res, + unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; @@ -1539,11 +1539,6 @@ overflow: return false; } -static void io_cqring_fill_event(struct io_kiocb *req, long res) -{ - __io_cqring_fill_event(req, res, 0); -} - static void io_req_complete_post(struct io_kiocb *req, long res, unsigned int cflags) { @@ -1551,7 +1546,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - __io_cqring_fill_event(req, res, cflags); + io_cqring_fill_event(req, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. 
@@ -1767,7 +1762,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) link->timeout.head = NULL; ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { - io_cqring_fill_event(link, -ECANCELED); + io_cqring_fill_event(link, -ECANCELED, 0); io_put_req_deferred(link, 1); return true; } @@ -1786,7 +1781,7 @@ static void io_fail_links(struct io_kiocb *req) link->link = NULL; trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link, -ECANCELED); + io_cqring_fill_event(link, -ECANCELED, 0); io_put_req_deferred(link, 2); link = nxt; } @@ -2106,7 +2101,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs, spin_lock_irq(&ctx->completion_lock); for (i = 0; i < nr; i++) { req = cs->reqs[i]; - __io_cqring_fill_event(req, req->result, req->compl.cflags); + io_cqring_fill_event(req, req->result, req->compl.cflags); } io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); @@ -2246,7 +2241,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); - __io_cqring_fill_event(req, req->result, cflags); + io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; if (req_ref_put_and_test(req)) @@ -4884,7 +4879,7 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) } if (req->poll.events & EPOLLONESHOT) flags = 0; - if (!__io_cqring_fill_event(req, error, flags)) { + if (!io_cqring_fill_event(req, error, flags)) { io_poll_remove_waitqs(req); req->poll.done = true; flags = 0; @@ -5221,7 +5216,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) do_complete = io_poll_remove_waitqs(req); if (do_complete) { - io_cqring_fill_event(req, -ECANCELED); + io_cqring_fill_event(req, -ECANCELED, 0); io_commit_cqring(req->ctx); req_set_fail_links(req); io_put_req_deferred(req, 1); @@ -5480,7 +5475,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - io_cqring_fill_event(req, -ETIME); + io_cqring_fill_event(req, -ETIME, 0); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -5525,7 +5520,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return PTR_ERR(req); req_set_fail_links(req); - io_cqring_fill_event(req, -ECANCELED); + io_cqring_fill_event(req, -ECANCELED, 0); io_put_req_deferred(req, 1); return 0; } @@ -5598,7 +5593,7 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) ret = io_timeout_update(ctx, tr->addr, &tr->ts, io_translate_timeout_mode(tr->flags)); - io_cqring_fill_event(req, ret); + io_cqring_fill_event(req, ret, 0); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -5750,7 +5745,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, done: if (!ret) ret = success_ret; - io_cqring_fill_event(req, ret); + io_cqring_fill_event(req, ret, 0); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -5807,7 +5802,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) spin_lock_irq(&ctx->completion_lock); done: - io_cqring_fill_event(req, ret); + io_cqring_fill_event(req, ret, 0); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); -- cgit v1.2.3 From 8d13326e56c1a2b4e3af89843e1376b72a2ae6b7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:33 +0100 Subject: io_uring: optimise fill_event() by 
inlining There are three cases where we care much about the performance of io_cqring_fill_event() -- flushing inline completions, iopoll and io_req_complete_post(). Inline a hot part of fill_event() into them. All others are not as important and we don't want to bloat the binary for them, so add a noinline version of the function for all other use cases. nops test(batch=32): 16.932 vs 17.822 KIOPS Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a11d59424bf4417aca33f5ec21008bb3b0ebd11e.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 57 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 21 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 71e771106c10..3a837d2b8331 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1336,7 +1336,7 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } -static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) +static inline struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; unsigned tail; @@ -1492,26 +1492,11 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } -static bool io_cqring_fill_event(struct io_kiocb *req, long res, - unsigned int cflags) +static bool io_cqring_event_overflow(struct io_kiocb *req, long res, + unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; - struct io_uring_cqe *cqe; - trace_io_uring_complete(ctx, req->user_data, res, cflags); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqring(ctx); - if (likely(cqe)) { - WRITE_ONCE(cqe->user_data, req->user_data); - WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, cflags); - return true; - } if (!atomic_read(&req->task->io_uring->in_idle)) { struct io_overflow_cqe *ocqe; @@ -1539,6 +1524,36 @@ overflow: return false; } +static inline bool __io_cqring_fill_event(struct io_kiocb *req, long res, + unsigned int cflags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_cqe *cqe; + + trace_io_uring_complete(ctx, req->user_data, res, cflags); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqring(ctx); + if (likely(cqe)) { + WRITE_ONCE(cqe->user_data, req->user_data); + WRITE_ONCE(cqe->res, res); + WRITE_ONCE(cqe->flags, cflags); + return true; + } + return io_cqring_event_overflow(req, res, cflags); +} + +/* not as hot to bloat with inlining */ +static noinline bool io_cqring_fill_event(struct io_kiocb *req, long res, + unsigned int cflags) +{ + return __io_cqring_fill_event(req, res, cflags); +} + static void io_req_complete_post(struct io_kiocb *req, long res, unsigned int cflags) { @@ -1546,7 +1561,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - io_cqring_fill_event(req, res, cflags); + __io_cqring_fill_event(req, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache.
@@ -2101,7 +2116,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs,
 	spin_lock_irq(&ctx->completion_lock);
 	for (i = 0; i < nr; i++) {
 		req = cs->reqs[i];
-		io_cqring_fill_event(req, req->result, req->compl.cflags);
+		__io_cqring_fill_event(req, req->result, req->compl.cflags);
 	}
 	io_commit_cqring(ctx);
 	spin_unlock_irq(&ctx->completion_lock);
@@ -2241,7 +2256,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		if (req->flags & REQ_F_BUFFER_SELECTED)
 			cflags = io_put_rw_kbuf(req);
 
-		io_cqring_fill_event(req, req->result, cflags);
+		__io_cqring_fill_event(req, req->result, cflags);
 		(*nr_events)++;
 
 		if (req_ref_put_and_test(req))
-- 
cgit v1.2.3

From a1ff1e3f0e1cb8e314220e7af8eb3155da343bf9 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 12 Apr 2021 06:40:02 -0600
Subject: io_uring: provide io_resubmit_prep() stub for !CONFIG_BLOCK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Randy reports the following error with CONFIG_BLOCK not being set:

../fs/io_uring.c: In function ‘kiocb_done’:
../fs/io_uring.c:2766:7: error: implicit declaration of function ‘io_resubmit_prep’; did you mean ‘io_put_req’? [-Werror=implicit-function-declaration]
   if (io_resubmit_prep(req)) {

Provide a dummy stub for io_resubmit_prep() like we do for
io_rw_should_reissue(), which also helps remove an ifdef sequence from
io_complete_rw_iopoll().

Fixes: 8c130827f417 ("io_uring: don't alter iopoll reissue fail ret code")
Reported-by: Randy Dunlap
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3a837d2b8331..aa29918944f6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2464,6 +2464,10 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
 	return true;
 }
 #else
+static bool io_resubmit_prep(struct io_kiocb *req)
+{
+	return false;
+}
 static bool io_rw_should_reissue(struct io_kiocb *req)
 {
 	return false;
@@ -2504,14 +2508,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 	if (kiocb->ki_flags & IOCB_WRITE)
 		kiocb_end_write(req);
 	if (unlikely(res != req->result)) {
-		bool fail = true;
-
-#ifdef CONFIG_BLOCK
-		if (res == -EAGAIN && io_rw_should_reissue(req) &&
-		    io_resubmit_prep(req))
-			fail = false;
-#endif
-		if (fail) {
+		if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
+		      io_resubmit_prep(req))) {
 			req_set_fail_links(req);
 			req->flags |= REQ_F_DONT_REISSUE;
 		}
-- 
cgit v1.2.3

From 3e9424989b59fbab5b46d1db29b271cd29643ab4 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 11 Apr 2021 01:46:34 +0100
Subject: io_uring: simplify io_rsrc_data refcounting

We don't take many references of struct io_rsrc_data, only one per
io_rsrc_node, so using percpu refs is overkill. Use an atomic ref
instead, which is much simpler.
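As an illustration, a minimal userspace sketch of the same scheme
(hypothetical names; C11 atomics and a POSIX semaphore standing in for
atomic_t and struct completion -- this is not the kernel code): the
data starts with one initial reference, every node takes another one,
and the last put signals the waiter.

	#include <stdatomic.h>
	#include <semaphore.h>

	struct rsrc_data {
		atomic_int refs;	/* starts at 1: the "initial" ref */
		sem_t done;		/* posted when refs drops to zero */
	};

	static void rsrc_data_init(struct rsrc_data *d)
	{
		atomic_init(&d->refs, 1);
		sem_init(&d->done, 0, 0);
	}

	static void rsrc_data_get(struct rsrc_data *d)
	{
		atomic_fetch_add(&d->refs, 1);
	}

	static void rsrc_data_put(struct rsrc_data *d)
	{
		/* fetch_sub returns the old value: 1 means last ref */
		if (atomic_fetch_sub(&d->refs, 1) == 1)
			sem_post(&d->done);
	}

	static void rsrc_data_quiesce(struct rsrc_data *d)
	{
		rsrc_data_put(d);	/* kill the initial ref... */
		sem_wait(&d->done);	/* ...and wait out the users */
	}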
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1551d90f7c9b183cf2f0d7b5e5b923430acb03fa.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 48 ++++++++++++++++-------------------------------- 1 file changed, 16 insertions(+), 32 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index aa29918944f6..1eb815660fe8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -240,7 +240,7 @@ struct io_rsrc_data { struct io_ring_ctx *ctx; rsrc_put_fn *do_put; - struct percpu_ref refs; + atomic_t refs; struct completion done; bool quiesce; }; @@ -7077,13 +7077,6 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #endif } -static void io_rsrc_data_ref_zero(struct percpu_ref *ref) -{ - struct io_rsrc_data *data = container_of(ref, struct io_rsrc_data, refs); - - complete(&data->done); -} - static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx) { spin_lock_bh(&ctx->rsrc_ref_lock); @@ -7114,7 +7107,7 @@ static void io_rsrc_node_switch(struct io_ring_ctx *ctx, list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); io_rsrc_ref_unlock(ctx); - percpu_ref_get(&data_to_kill->refs); + atomic_inc(&data_to_kill->refs); percpu_ref_kill(&rsrc_node->refs); ctx->rsrc_node = NULL; } @@ -7148,14 +7141,17 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct break; io_rsrc_node_switch(ctx, data); - percpu_ref_kill(&data->refs); + /* kill initial ref, already quiesced if zero */ + if (atomic_dec_and_test(&data->refs)) + break; flush_delayed_work(&ctx->rsrc_put_work); - ret = wait_for_completion_interruptible(&data->done); if (!ret) break; - percpu_ref_resurrect(&data->refs); + atomic_inc(&data->refs); + /* wait for all works potentially completing data->done */ + flush_delayed_work(&ctx->rsrc_put_work); reinit_completion(&data->done); mutex_unlock(&ctx->uring_lock); @@ -7176,23 +7172,13 @@ static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, if (!data) return NULL; - if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { - kfree(data); - return NULL; - } + atomic_set(&data->refs, 1); data->ctx = ctx; data->do_put = do_put; init_completion(&data->done); return data; } -static void io_rsrc_data_free(struct io_rsrc_data *data) -{ - percpu_ref_exit(&data->refs); - kfree(data); -} - static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct io_rsrc_data *data = ctx->file_data; @@ -7206,7 +7192,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) __io_sqe_files_unregister(ctx); io_free_file_tables(data, ctx->nr_user_files); - io_rsrc_data_free(data); + kfree(data); ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; @@ -7539,7 +7525,8 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) } io_rsrc_node_destroy(ref_node); - percpu_ref_put(&rsrc_data->refs); + if (atomic_dec_and_test(&rsrc_data->refs)) + complete(&rsrc_data->done); } static void io_rsrc_put_work(struct work_struct *work) @@ -7563,10 +7550,8 @@ static void io_rsrc_put_work(struct work_struct *work) static void io_rsrc_node_ref_zero(struct percpu_ref *ref) { struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_rsrc_data *data = node->rsrc_data; - struct io_ring_ctx *ctx = data->ctx; + struct io_ring_ctx *ctx = node->rsrc_data->ctx; bool first_add = false; - int delay; io_rsrc_ref_lock(ctx); node->done = true; @@ -7582,9 +7567,8 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref) } 
io_rsrc_ref_unlock(ctx); - delay = percpu_ref_is_dying(&data->refs) ? 0 : HZ; - if (first_add || !delay) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); + if (first_add) + mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); } static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) @@ -7679,7 +7663,7 @@ out_fput: io_free_file_tables(file_data, nr_args); ctx->nr_user_files = 0; out_free: - io_rsrc_data_free(ctx->file_data); + kfree(ctx->file_data); ctx->file_data = NULL; return ret; } -- cgit v1.2.3 From 7f61a1e9ef511660d66ea926b5899559fe94b1d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:35 +0100 Subject: io_uring: add buffer unmap helper Add a helper for unmapping registered buffers, better than double indexing and will be reused in the future. Suggested-by: Bijan Mottahedeh Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/66cbc6ea863be865bac7b7080ed6a3d5c542b71f.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1eb815660fe8..e9a2f8f39eb3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8093,25 +8093,27 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, return off; } +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) +{ + unsigned int i; + + for (i = 0; i < imu->nr_bvecs; i++) + unpin_user_page(imu->bvec[i].bv_page); + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages); + kvfree(imu->bvec); + imu->nr_bvecs = 0; +} + static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { - int i, j; + unsigned int i; if (!ctx->user_bufs) return -ENXIO; - for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; - - for (j = 0; j < imu->nr_bvecs; j++) - unpin_user_page(imu->bvec[j].bv_page); - - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu->bvec); - imu->nr_bvecs = 0; - } - + for (i = 0; i < ctx->nr_user_bufs; i++) + io_buffer_unmap(ctx, &ctx->user_bufs[i]); kfree(ctx->user_bufs); ctx->user_bufs = NULL; ctx->nr_user_bufs = 0; -- cgit v1.2.3 From 87094465d01a248cd888b81da0e6bc10324d4dc0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:36 +0100 Subject: io_uring: cleanup buffer register In preparation for more changes do a little cleanup of io_sqe_buffers_register(). Move all args/invariant checking into it from io_buffers_map_alloc(), because it's confusing. And add a bit more cleaning for the loop. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/93292cb9708c8455e5070cc855861d94e11ca042.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e9a2f8f39eb3..76c8bd6a25d1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8306,17 +8306,8 @@ done: static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) { - if (ctx->user_bufs) - return -EBUSY; - if (!nr_args || nr_args > UIO_MAXIOV) - return -EINVAL; - - ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf), - GFP_KERNEL); - if (!ctx->user_bufs) - return -ENOMEM; - - return 0; + ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); + return ctx->user_bufs ? 
0 : -ENOMEM; } static int io_buffer_validate(struct iovec *iov) @@ -8348,26 +8339,26 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, struct iovec iov; struct page *last_hpage = NULL; + if (ctx->user_bufs) + return -EBUSY; + if (!nr_args || nr_args > UIO_MAXIOV) + return -EINVAL; ret = io_buffers_map_alloc(ctx, nr_args); if (ret) return ret; - for (i = 0; i < nr_args; i++) { + for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; ret = io_copy_iov(ctx, &iov, arg, i); if (ret) break; - ret = io_buffer_validate(&iov); if (ret) break; - ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage); if (ret) break; - - ctx->nr_user_bufs++; } if (ret) -- cgit v1.2.3 From aeca241b0bdd831ad5706605f5e09b44fe940220 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 11 Apr 2021 01:46:37 +0100 Subject: io_uring: split file table from rsrc nodes We don't need to store file tables in rsrc nodes, for now it's easier to handle tables not generically, so move file tables into the context. A nice side effect is having one less pointer dereference for request with fixed file initialisation. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/de9fc4cd3545f24c26c03be4556f58ba3d18b9c3.1618101759.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 76c8bd6a25d1..880c7743ab09 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -220,8 +220,9 @@ struct io_rsrc_put { }; }; -struct fixed_rsrc_table { - struct io_fixed_file *files; +struct io_file_table { + /* two level table */ + struct io_fixed_file **files; }; struct io_rsrc_node { @@ -236,7 +237,6 @@ struct io_rsrc_node { typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); struct io_rsrc_data { - struct fixed_rsrc_table *table; struct io_ring_ctx *ctx; rsrc_put_fn *do_put; @@ -398,6 +398,7 @@ struct io_ring_ctx { * used. Only updated through io_uring_register(2). 
*/ struct io_rsrc_data *file_data; + struct io_file_table file_table; unsigned nr_user_files; /* if used, fixed mapped user buffers */ @@ -6265,19 +6266,19 @@ static void io_wq_submit_work(struct io_wq_work *work) #endif #define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) -static inline struct io_fixed_file *io_fixed_file_slot(struct io_rsrc_data *file_data, +static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, unsigned i) { - struct fixed_rsrc_table *table; + struct io_fixed_file *table_l2; - table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT]; - return &table->files[i & IORING_FILE_TABLE_MASK]; + table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT]; + return &table_l2[i & IORING_FILE_TABLE_MASK]; } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, int index) { - struct io_fixed_file *slot = io_fixed_file_slot(ctx->file_data, index); + struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); return (struct file *) (slot->file_ptr & FFS_MASK); } @@ -6307,7 +6308,7 @@ static struct file *io_file_get(struct io_submit_state *state, if (unlikely((unsigned int)fd >= ctx->nr_user_files)) return NULL; fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(ctx->file_data, fd)->file_ptr; + file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; file = (struct file *) (file_ptr & FFS_MASK); file_ptr &= ~FFS_MASK; /* mask in overlapping REQ_F and FFS bits */ @@ -7044,14 +7045,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -static void io_free_file_tables(struct io_rsrc_data *data, unsigned nr_files) +static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) { unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) - kfree(data->table[i].files); - kfree(data->table); - data->table = NULL; + kfree(table->files[i]); + kfree(table->files); + table->files = NULL; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -7191,7 +7192,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return ret; __io_sqe_files_unregister(ctx); - io_free_file_tables(data, ctx->nr_user_files); + io_free_file_tables(&ctx->file_table, ctx->nr_user_files); kfree(data); ctx->file_data = NULL; ctx->nr_user_files = 0; @@ -7421,23 +7422,20 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif -static bool io_alloc_file_tables(struct io_rsrc_data *file_data, - unsigned nr_files) +static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); - file_data->table = kcalloc(nr_tables, sizeof(*file_data->table), - GFP_KERNEL); - if (!file_data->table) + table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL); + if (!table->files) return false; for (i = 0; i < nr_tables; i++) { - struct fixed_rsrc_table *table = &file_data->table[i]; unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE); - table->files = kcalloc(this_files, sizeof(struct file *), + table->files[i] = kcalloc(this_files, sizeof(*table->files[i]), GFP_KERNEL); - if (!table->files) + if (!table->files[i]) break; nr_files -= this_files; } @@ -7445,7 +7443,7 @@ static bool io_alloc_file_tables(struct io_rsrc_data *file_data, if (i == nr_tables) return true; - io_free_file_tables(file_data, nr_tables * IORING_MAX_FILES_TABLE); + 
io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
 	return false;
 }
 
@@ -7613,9 +7611,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	if (!file_data)
 		return -ENOMEM;
 	ctx->file_data = file_data;
-
 	ret = -ENOMEM;
-	if (!io_alloc_file_tables(file_data, nr_args))
+	if (!io_alloc_file_tables(&ctx->file_table, nr_args))
 		goto out_free;
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
@@ -7643,7 +7640,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(file);
 			goto out_fput;
 		}
-		io_fixed_file_set(io_fixed_file_slot(file_data, i), file);
+		io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
 	}
 
 	ret = io_sqe_files_scm(ctx);
@@ -7660,7 +7657,7 @@ out_fput:
 		if (file)
 			fput(file);
 	}
-	io_free_file_tables(file_data, nr_args);
+	io_free_file_tables(&ctx->file_table, nr_args);
 	ctx->nr_user_files = 0;
 out_free:
 	kfree(ctx->file_data);
@@ -7756,7 +7753,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 			continue;
 
 		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-		file_slot = io_fixed_file_slot(ctx->file_data, i);
+		file_slot = io_fixed_file_slot(&ctx->file_table, i);
 
 		if (file_slot->file_ptr) {
 			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
-- 
cgit v1.2.3

From 88885f66e8c66311923c16caf1ccb6415ebfef72 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 11 Apr 2021 01:46:38 +0100
Subject: io_uring: improve sqo stop

Set IO_SQ_THREAD_SHOULD_STOP before taking the sqd lock, so the sqpoll
task sees it earlier. Not a problem, it would stop eventually anyway.
Also check the invariant that it's stopped only once.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/653b24ee93843a50ff65a45847d9138f5adb76d7.1618101759.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 880c7743ab09..7f2590c5d194 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7229,9 +7229,10 @@ static void io_sq_thread_park(struct io_sq_data *sqd)
 static void io_sq_thread_stop(struct io_sq_data *sqd)
 {
 	WARN_ON_ONCE(sqd->thread == current);
+	WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
 
-	mutex_lock(&sqd->lock);
 	set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+	mutex_lock(&sqd->lock);
 	if (sqd->thread)
 		wake_up_process(sqd->thread);
 	mutex_unlock(&sqd->lock);
-- 
cgit v1.2.3

From e4335ed33eb54ba00c58557753dc84c0ee762ef1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 11 Apr 2021 01:46:39 +0100
Subject: io_uring: improve hardlink code generation

req_set_fail_links() condition checking is bulky. Even though it's
always in a slow path, it's inlined and generates lots of extra code;
simplify it by moving the HARDLINK check into the helpers killing
linked requests.
          text    data    bss     dec    hex  filename
before:  79318   12330      8   91656  16608  ./fs/io_uring.o
after:   79126   12330      8   91464  16548  ./fs/io_uring.o

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/96a9387db658a9d5a44ecbfd57c2a62cb888c9b6.1618101759.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7f2590c5d194..15e18c03fba8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1101,7 +1101,7 @@ static bool io_match_task(struct io_kiocb *head,
 
 static inline void req_set_fail_links(struct io_kiocb *req)
 {
-	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+	if (req->flags & REQ_F_LINK)
 		req->flags |= REQ_F_FAIL_LINK;
 }
 
@@ -1810,7 +1810,8 @@ static bool io_disarm_next(struct io_kiocb *req)
 
 	if (likely(req->flags & REQ_F_LINK_TIMEOUT))
 		posted = io_kill_linked_timeout(req);
-	if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+	if (unlikely((req->flags & REQ_F_FAIL_LINK) &&
+		     !(req->flags & REQ_F_HARDLINK))) {
 		posted |= (req->link != NULL);
 		io_fail_links(req);
 	}
-- 
cgit v1.2.3

From f70865db5ff35f5ed0c7e9ef63e7cca3d4947f04 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 11 Apr 2021 01:46:40 +0100
Subject: io_uring: return back safer resurrect

Bring back "io_uring: wait potential ->release() on resurrect", which
adds a helper so that resurrecting ctx refs doesn't race with the
completion being reinitialised. It had been removed because of a
strange bug with no clear root cause or link to the patch. The approach
is also improved: instead of rcu_synchronize(), just
wait_for_completion(), because we're at 0 refs and it will happen very
shortly. Specifically, use the non-interruptible version to ignore all
pending signals that may have ended a prior interruptible wait.

This reverts commit cb5e1b81304e089ee3ca948db4d29f71902eb575.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/7a080c20f686d026efade810b116b72f88abaff9.1618101759.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 15e18c03fba8..8564c7908126 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1081,6 +1081,18 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req)
 	}
 }
 
+static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
+{
+	bool got = percpu_ref_tryget(ref);
+
+	/* already at zero, wait for ->release() */
+	if (!got)
+		wait_for_completion(compl);
+	percpu_ref_resurrect(ref);
+	if (got)
+		percpu_ref_put(ref);
+}
+
 static bool io_match_task(struct io_kiocb *head,
 			  struct task_struct *task,
 			  struct files_struct *files)
@@ -9788,12 +9800,11 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			if (ret < 0)
 				break;
 		} while (1);
-
 		mutex_lock(&ctx->uring_lock);
 
 		if (ret) {
-			percpu_ref_resurrect(&ctx->refs);
-			goto out_quiesce;
+			io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
+			return ret;
 		}
 	}
 
@@ -9886,7 +9897,6 @@ out:
 	if (io_register_op_must_quiesce(opcode)) {
 		/* bring the ctx back to life */
 		percpu_ref_reinit(&ctx->refs);
-out_quiesce:
 		reinit_completion(&ctx->ref_comp);
 	}
 	return ret;
-- 
cgit v1.2.3

From 084804002e512427bfe52b448cb7cac0d4209b64 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 13 Apr 2021 02:58:38 +0100
Subject: io_uring: fix leaking reg files on exit

If io_sqe_files_unregister() faults on io_rsrc_ref_quiesce(), it will
fail to do the unregister, leaving files referenced.
And that may well happen because of a strayed signal or just because it does allocations inside. In io_ring_ctx_free() do an unsafe version of unregister, as it's guaranteed to not have requests by that point and so quiesce is useless. Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e696e9eade571b51997d0dc1d01f144c6d685c05.1618278933.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8564c7908126..1af8bb5f7d56 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7089,6 +7089,10 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) fput(file); } #endif + io_free_file_tables(&ctx->file_table, ctx->nr_user_files); + kfree(ctx->file_data); + ctx->file_data = NULL; + ctx->nr_user_files = 0; } static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx) @@ -7195,21 +7199,14 @@ static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { - struct io_rsrc_data *data = ctx->file_data; int ret; - if (!data) + if (!ctx->file_data) return -ENXIO; - ret = io_rsrc_ref_quiesce(data, ctx); - if (ret) - return ret; - - __io_sqe_files_unregister(ctx); - io_free_file_tables(&ctx->file_table, ctx->nr_user_files); - kfree(data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; - return 0; + ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); + if (!ret) + __io_sqe_files_unregister(ctx); + return ret; } static void io_sq_thread_unpark(struct io_sq_data *sqd) @@ -7659,7 +7656,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, ret = io_sqe_files_scm(ctx); if (ret) { - io_sqe_files_unregister(ctx); + __io_sqe_files_unregister(ctx); return ret; } @@ -8460,7 +8457,11 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) } mutex_lock(&ctx->uring_lock); - io_sqe_files_unregister(ctx); + if (ctx->file_data) { + if (!atomic_dec_and_test(&ctx->file_data->refs)) + wait_for_completion(&ctx->file_data->done); + __io_sqe_files_unregister(ctx); + } if (ctx->rings) __io_cqring_overflow_flush(ctx, true); mutex_unlock(&ctx->uring_lock); -- cgit v1.2.3 From 66d2d00d0ac44f98499dc7ec61e2289eb8b138e7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 Apr 2021 02:58:39 +0100 Subject: io_uring: fix uninit old data for poll event upd Both IORING_POLL_UPDATE_EVENTS and IORING_POLL_UPDATE_USER_DATA need old_user_data to find/cancel a poll request, but it's set only for the first one. 
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/ab08fd35b7652e977f9a475f01741b04102297f1.1618278933.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1af8bb5f7d56..57ee3d29182f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5379,17 +5379,17 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	if (!(flags & IORING_POLL_ADD_MULTI))
 		events |= EPOLLONESHOT;
 	poll->update_events = poll->update_user_data = false;
-	if (flags & IORING_POLL_UPDATE_EVENTS) {
-		poll->update_events = true;
+
+	if (flags & (IORING_POLL_UPDATE_EVENTS|IORING_POLL_UPDATE_USER_DATA)) {
 		poll->old_user_data = READ_ONCE(sqe->addr);
+		poll->update_events = flags & IORING_POLL_UPDATE_EVENTS;
+		poll->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
+		if (poll->update_user_data)
+			poll->new_user_data = READ_ONCE(sqe->off);
+	} else {
+		if (sqe->off || sqe->addr)
+			return -EINVAL;
 	}
-	if (flags & IORING_POLL_UPDATE_USER_DATA) {
-		poll->update_user_data = true;
-		poll->new_user_data = READ_ONCE(sqe->off);
-	}
-	if (!(poll->update_events || poll->update_user_data) &&
-	    (sqe->off || sqe->addr))
-		return -EINVAL;
 	poll->events = demangle_poll(events) |
 		       (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
 	return 0;
-- 
cgit v1.2.3

From 9d8058926be7008c1dd49a4e5fb33044f17873c1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 13 Apr 2021 02:58:40 +0100
Subject: io_uring: split poll and poll update structures

struct io_poll_iocb became pretty nasty by also combining the update
fields. Split them out into a separate structure for more clarity.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/b2f74d64ffebb57a648f791681af086c7211e3a4.1618278933.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 55 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 23 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 57ee3d29182f..7afb2f57b6ac 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -488,15 +488,16 @@ struct io_poll_iocb {
 	__poll_t			events;
 	bool				done;
 	bool				canceled;
+	struct wait_queue_entry		wait;
+};
+
+struct io_poll_update {
+	struct file			*file;
+	u64				old_user_data;
+	u64				new_user_data;
+	__poll_t			events;
 	bool				update_events;
 	bool				update_user_data;
-	union {
-		struct wait_queue_entry	wait;
-		struct {
-			u64		old_user_data;
-			u64		new_user_data;
-		};
-	};
 };
 
 struct io_poll_remove {
@@ -713,6 +714,7 @@ enum {
 	REQ_F_COMPLETE_INLINE_BIT,
 	REQ_F_REISSUE_BIT,
 	REQ_F_DONT_REISSUE_BIT,
+	REQ_F_POLL_UPDATE_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_ASYNC_READ_BIT,
 	REQ_F_ASYNC_WRITE_BIT,
@@ -760,6 +762,8 @@ enum {
 	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
 	/* don't attempt request reissue, see io_rw_reissue() */
 	REQ_F_DONT_REISSUE	= BIT(REQ_F_DONT_REISSUE_BIT),
+	/* switches between poll and poll update */
+	REQ_F_POLL_UPDATE	= BIT(REQ_F_POLL_UPDATE_BIT),
 	/* supports async reads */
 	REQ_F_ASYNC_READ	= BIT(REQ_F_ASYNC_READ_BIT),
 	/* supports async writes */
@@ -789,6 +793,7 @@ struct io_kiocb {
 		struct file		*file;
 		struct io_rw		rw;
 		struct io_poll_iocb	poll;
+		struct io_poll_update	poll_update;
 		struct io_poll_remove	poll_remove;
 		struct io_accept	accept;
 		struct io_sync		sync;
@@ -4984,7 +4989,6 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
 	poll->head = NULL;
 	poll->done = false;
 	poll->canceled = false;
- poll->update_events = poll->update_user_data = false; #define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) /* mask in events that we always want/need */ poll->events = events | IO_POLL_UNMASK; @@ -5361,7 +5365,6 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_poll_iocb *poll = &req->poll; u32 events, flags; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -5378,20 +5381,26 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe #endif if (!(flags & IORING_POLL_ADD_MULTI)) events |= EPOLLONESHOT; - poll->update_events = poll->update_user_data = false; + events = demangle_poll(events) | + (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); if (flags & (IORING_POLL_UPDATE_EVENTS|IORING_POLL_UPDATE_USER_DATA)) { - poll->old_user_data = READ_ONCE(sqe->addr); - poll->update_events = flags & IORING_POLL_UPDATE_EVENTS; - poll->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; - if (poll->update_user_data) - poll->new_user_data = READ_ONCE(sqe->off); + struct io_poll_update *poll_upd = &req->poll_update; + + req->flags |= REQ_F_POLL_UPDATE; + poll_upd->events = events; + poll_upd->old_user_data = READ_ONCE(sqe->addr); + poll_upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; + poll_upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; + if (poll_upd->update_user_data) + poll_upd->new_user_data = READ_ONCE(sqe->off); } else { + struct io_poll_iocb *poll = &req->poll; + + poll->events = events; if (sqe->off || sqe->addr) return -EINVAL; } - poll->events = demangle_poll(events) | - (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); return 0; } @@ -5429,7 +5438,7 @@ static int io_poll_update(struct io_kiocb *req) int ret; spin_lock_irq(&ctx->completion_lock); - preq = io_poll_find(ctx, req->poll.old_user_data); + preq = io_poll_find(ctx, req->poll_update.old_user_data); if (!preq) { ret = -ENOENT; goto err; @@ -5459,13 +5468,13 @@ err: return 0; } /* only mask one event flags, keep behavior flags */ - if (req->poll.update_events) { + if (req->poll_update.update_events) { preq->poll.events &= ~0xffff; - preq->poll.events |= req->poll.events & 0xffff; + preq->poll.events |= req->poll_update.events & 0xffff; preq->poll.events |= IO_POLL_UNMASK; } - if (req->poll.update_user_data) - preq->user_data = req->poll.new_user_data; + if (req->poll_update.update_user_data) + preq->user_data = req->poll_update.new_user_data; spin_unlock_irq(&ctx->completion_lock); @@ -5484,7 +5493,7 @@ err: static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { - if (!req->poll.update_events && !req->poll.update_user_data) + if (!(req->flags & REQ_F_POLL_UPDATE)) return __io_poll_add(req); return io_poll_update(req); } -- cgit v1.2.3 From 8c855885b8b35af24f45cdd288a9b6ba6274a8ac Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 Apr 2021 02:58:41 +0100 Subject: io_uring: add timeout completion_lock annotation Add one more sparse locking annotation for readability in io_kill_timeout(). 
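For context: under sparse these annotations expand to context
attributes and compile to nothing otherwise. A simplified sketch of the
definitions (condensed from the kernel's compiler annotation headers):

	#ifdef __CHECKER__
	# define __must_hold(x)	__attribute__((context(x, 1, 1)))
	# define __acquires(x)	__attribute__((context(x, 0, 1)))
	# define __releases(x)	__attribute__((context(x, 1, 0)))
	#else
	# define __must_hold(x)
	# define __acquires(x)
	# define __releases(x)
	#endif

The two numbers are the expected lock context on entry and exit, so
sparse (make C=1) can warn about context imbalances around the
annotated function.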
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/bdbb22026024eac29203c1aa0045c4954a2488d1.1618278933.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7afb2f57b6ac..640e26bf3f4f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1267,6 +1267,7 @@ static void io_queue_async_work(struct io_kiocb *req)
 }
 
 static void io_kill_timeout(struct io_kiocb *req, int status)
+	__must_hold(&req->ctx->completion_lock)
 {
 	struct io_timeout_data *io = req->async_data;
 	int ret;
-- 
cgit v1.2.3

From fd9c7bc542dae7cca3b02c77f7863823d54ddee0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 13 Apr 2021 02:58:42 +0100
Subject: io_uring: refactor hrtimer_try_to_cancel uses

Don't save the return value of hrtimer_try_to_cancel() in a variable;
use it right away. It's in general safer not to have an intermediate
variable, which may be reused and passed out wrongly, when it can
simply be contracted out. Also clean up io_timeout_extract().

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/d2566ef7ce632e6882dc13e022a26249b3fd30b5.1618278933.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 640e26bf3f4f..158ad55a6c14 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1270,10 +1270,8 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
 	__must_hold(&req->ctx->completion_lock)
 {
 	struct io_timeout_data *io = req->async_data;
-	int ret;
 
-	ret = hrtimer_try_to_cancel(&io->timer);
-	if (ret != -1) {
+	if (hrtimer_try_to_cancel(&io->timer) != -1) {
 		atomic_set(&req->ctx->cq_timeouts,
 			atomic_read(&req->ctx->cq_timeouts) + 1);
 		list_del_init(&req->timeout.list);
@@ -1790,12 +1788,10 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
 	 */
 	if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
 		struct io_timeout_data *io = link->async_data;
-		int ret;
 
 		io_remove_next_linked(req);
 		link->timeout.head = NULL;
-		ret = hrtimer_try_to_cancel(&io->timer);
-		if (ret != -1) {
+		if (hrtimer_try_to_cancel(&io->timer) != -1) {
 			io_cqring_fill_event(link, -ECANCELED, 0);
 			io_put_req_deferred(link, 1);
 			return true;
@@ -5528,21 +5524,18 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
 {
 	struct io_timeout_data *io;
 	struct io_kiocb *req;
-	int ret = -ENOENT;
+	bool found = false;
 
 	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
-		if (user_data == req->user_data) {
-			ret = 0;
+		found = user_data == req->user_data;
+		if (found)
 			break;
-		}
 	}
-
-	if (ret == -ENOENT)
-		return ERR_PTR(ret);
+	if (!found)
+		return ERR_PTR(-ENOENT);
 
 	io = req->async_data;
-	ret = hrtimer_try_to_cancel(&io->timer);
-	if (ret == -1)
+	if (hrtimer_try_to_cancel(&io->timer) == -1)
 		return ERR_PTR(-EALREADY);
 	list_del_init(&req->timeout.list);
 	return req;
-- 
cgit v1.2.3

From e31001a3abb81a2dba976b842b8ab65d123bca2a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 13 Apr 2021 02:58:43 +0100
Subject: io_uring: clean up io_poll_remove_waitqs()

Move out the parts of io_poll_remove_waitqs() that are opcode
independent. Looks better and stresses that both paths do
__io_poll_remove_one().
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bbc717f82117cc335c89cbe67ec8d72608178732.1618278933.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 158ad55a6c14..1627d69a570c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5219,21 +5219,16 @@ static bool io_poll_remove_waitqs(struct io_kiocb *req) bool do_complete; io_poll_remove_double(req); + do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); - if (req->opcode == IORING_OP_POLL_ADD) { - do_complete = __io_poll_remove_one(req, &req->poll, true); - } else { + if (req->opcode != IORING_OP_POLL_ADD && do_complete) { struct async_poll *apoll = req->apoll; /* non-poll requests have submit ref still */ - do_complete = __io_poll_remove_one(req, &apoll->poll, true); - if (do_complete) { - req_ref_put(req); - kfree(apoll->double_poll); - kfree(apoll); - } + req_ref_put(req); + kfree(apoll->double_poll); + kfree(apoll); } - return do_complete; } -- cgit v1.2.3 From cce4b8b0ce1f9fdf67f4f73ed12a2da2a085d5e3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 Apr 2021 02:58:44 +0100 Subject: io_uring: don't fail overflow on in_idle As CQE overflows are now untied from requests and so don't hold any ref, we don't need to handle exiting/exec'ing cases there anymore. Moreover, it's much nicer in regards to userspace to save overflowed CQEs whenever possible, so remove failing on in_idle. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d873b7dab75c7f3039ead9628a745bea01f2cfd2.1618278933.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1627d69a570c..3632b5a4f13f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1513,32 +1513,28 @@ static bool io_cqring_event_overflow(struct io_kiocb *req, long res, unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; + struct io_overflow_cqe *ocqe; - if (!atomic_read(&req->task->io_uring->in_idle)) { - struct io_overflow_cqe *ocqe; - - ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); - if (!ocqe) - goto overflow; - if (list_empty(&ctx->cq_overflow_list)) { - set_bit(0, &ctx->sq_check_overflow); - set_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; - } - ocqe->cqe.user_data = req->user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - list_add_tail(&ocqe->list, &ctx->cq_overflow_list); - return true; + ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); + if (!ocqe) { + /* + * If we're in ring overflow flush mode, or in task cancel mode, + * or cannot allocate an overflow entry, then we need to drop it + * on the floor. + */ + WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow); + return false; } -overflow: - /* - * If we're in ring overflow flush mode, or in task cancel mode, - * or cannot allocate an overflow entry, then we need to drop it - * on the floor. 
- */
-	WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
-	return false;
+	if (list_empty(&ctx->cq_overflow_list)) {
+		set_bit(0, &ctx->sq_check_overflow);
+		set_bit(0, &ctx->cq_check_overflow);
+		ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+	}
+	ocqe->cqe.user_data = req->user_data;
+	ocqe->cqe.res = res;
+	ocqe->cqe.flags = cflags;
+	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
+	return true;
 }
 
 static inline bool __io_cqring_fill_event(struct io_kiocb *req, long res,
-- 
cgit v1.2.3

From e9979b36a467dcdb2073ec8391a2c167971bee46 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 13 Apr 2021 02:58:45 +0100
Subject: io_uring: skip futile iopoll iterations

The only way to get out of io_iopoll_getevents() and continue iterating
is to have an empty iopoll_list, otherwise the main loop would just
exit. So, instead of the unlock-on-every-8th-iteration heuristic, do
that based on iopoll_list. Also, as no one can add new requests to
iopoll_list while io_iopoll_check() holds uring_lock, it's useless to
spin with the list empty, so return in that case.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/5b8ebe84f5fff7ffa1f708952dfef7fc78b668e2.1618278933.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3632b5a4f13f..ffcb3ec4de95 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2385,7 +2385,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 {
 	unsigned int nr_events = 0;
-	int iters = 0, ret = 0;
+	int ret = 0;
 
 	/*
	 * We disallow the app entering submit/complete with polling, but we
@@ -2414,10 +2414,13 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 		 * forever, while the workqueue is stuck trying to acquire the
 		 * very same mutex.
 		 */
-		if (!(++iters & 7)) {
+		if (list_empty(&ctx->iopoll_list)) {
 			mutex_unlock(&ctx->uring_lock);
 			io_run_task_work();
 			mutex_lock(&ctx->uring_lock);
+
+			if (list_empty(&ctx->iopoll_list))
+				break;
 		}
 
 		ret = io_iopoll_getevents(ctx, &nr_events, min);
-- 
cgit v1.2.3

From f39c8a5b1130fe17db9c66d08aa473d9587543a9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 13 Apr 2021 02:58:46 +0100
Subject: io_uring: inline io_iopoll_getevents()

io_iopoll_getevents() is of no use to us anymore, io_iopoll_check()
handles all the cases.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/7e50b8917390f38bee4f822c6f4a6a98a27be037.1618278933.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 52 +++++++++++++---------------------------------------
 1 file changed, 13 insertions(+), 39 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ffcb3ec4de95..a807c3f22778 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2329,27 +2329,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	return ret;
 }
 
-/*
- * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
- * non-spinning poll check - we'll still enter the driver poll loop, but only
- * as a non-spinning completion check.
- */ -static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, - long min) -{ - while (!list_empty(&ctx->iopoll_list) && !need_resched()) { - int ret; - - ret = io_do_iopoll(ctx, nr_events, min); - if (ret < 0) - return ret; - if (*nr_events >= min) - return 0; - } - - return 1; -} - /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. @@ -2393,17 +2372,16 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * that got punted to a workqueue. */ mutex_lock(&ctx->uring_lock); + /* + * Don't enter poll loop if we already have events pending. + * If we do, we can potentially be spinning for commands that + * already triggered a CQE (eg in error). + */ + if (test_bit(0, &ctx->cq_check_overflow)) + __io_cqring_overflow_flush(ctx, false); + if (io_cqring_events(ctx)) + goto out; do { - /* - * Don't enter poll loop if we already have events pending. - * If we do, we can potentially be spinning for commands that - * already triggered a CQE (eg in error). - */ - if (test_bit(0, &ctx->cq_check_overflow)) - __io_cqring_overflow_flush(ctx, false); - if (io_cqring_events(ctx)) - break; - /* * If a submit got punted to a workqueue, we can have the * application entering polling for a command before it gets @@ -2422,13 +2400,9 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) if (list_empty(&ctx->iopoll_list)) break; } - - ret = io_iopoll_getevents(ctx, &nr_events, min); - if (ret <= 0) - break; - ret = 0; - } while (min && !nr_events && !need_resched()); - + ret = io_do_iopoll(ctx, &nr_events, min); + } while (!ret && nr_events < min && !need_resched()); +out: mutex_unlock(&ctx->uring_lock); return ret; } @@ -2539,7 +2513,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) /* * After the iocb has been issued, it's safe to be found on the poll list. * Adding the kiocb to the list AFTER submission ensures that we don't - * find it from a io_iopoll_getevents() thread before the issuer is done + * find it from a io_do_iopoll() thread before the issuer is done * accessing the kiocb cookie. */ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) -- cgit v1.2.3 From 7f00651aebc9af600be1d9df2a775eeeaee6bebb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 14 Apr 2021 13:38:34 +0100 Subject: io_uring: refactor io_ring_exit_work() Don't reinit io_ring_exit_work()'s exit work/completions on each iteration, that's wasteful. Also add list_rotate_left(), so if we failed to complete the task job, we don't try it again and again but defer it until others are processed. 
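For reference, list_rotate_left() (from include/linux/list.h) just
moves the first entry to the tail, so A->B->C becomes B->C->A and a
task whose cancellation keeps failing is retried only after the others
have been visited:

	static inline void list_rotate_left(struct list_head *head)
	{
		struct list_head *first;

		if (!list_empty(head)) {
			first = head->next;
			list_move_tail(first, head);
		}
	}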
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a807c3f22778..f0e6349f9919 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8575,6 +8575,9 @@ static void io_ring_exit_work(struct work_struct *work)
 		WARN_ON_ONCE(time_after(jiffies, timeout));
 	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
 
+	init_completion(&exit.completion);
+	init_task_work(&exit.task_work, io_tctx_exit_cb);
+	exit.ctx = ctx;
 	/*
	 * Some may use context even when all refs and requests have been put,
	 * and they are free to do so while still holding uring_lock or
@@ -8587,9 +8590,8 @@ static void io_ring_exit_work(struct work_struct *work)
 
 		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
 					ctx_node);
-		exit.ctx = ctx;
-		init_completion(&exit.completion);
-		init_task_work(&exit.task_work, io_tctx_exit_cb);
+		/* don't spin on a single task if cancellation failed */
+		list_rotate_left(&ctx->tctx_list);
 		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
 		if (WARN_ON_ONCE(ret))
 			continue;
@@ -8597,7 +8599,6 @@ static void io_ring_exit_work(struct work_struct *work)
 
 		mutex_unlock(&ctx->uring_lock);
 		wait_for_completion(&exit.completion);
-		cond_resched();
 		mutex_lock(&ctx->uring_lock);
 	}
 	mutex_unlock(&ctx->uring_lock);
-- 
cgit v1.2.3

From 9ba5fac8cf3b607652397f863dc229bbc8c3cbc1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 14 Apr 2021 13:38:35 +0100
Subject: io_uring: fix POLL_REMOVE removing apoll

Don't allow IORING_OP_POLL_REMOVE to kill apoll requests; users should
not know about them. Also, remove the weird -EACCES in
io_poll_update(): it shouldn't know anything about apoll, and it has to
work even if a poll request and an async poll'ed request happen to have
the same user_data.
Fixes: b69de288e913 ("io_uring: allow events and user_data update of running poll requests") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index f0e6349f9919..28fc99bf9ede 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5249,7 +5249,8 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, return posted != 0; } -static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr) +static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, + bool poll_only) __must_hold(&ctx->completion_lock) { struct hlist_head *list; @@ -5259,18 +5260,20 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr) hlist_for_each_entry(req, list, hash_node) { if (sqe_addr != req->user_data) continue; + if (poll_only && req->opcode != IORING_OP_POLL_ADD) + continue; return req; } - return NULL; } -static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) +static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, + bool poll_only) __must_hold(&ctx->completion_lock) { struct io_kiocb *req; - req = io_poll_find(ctx, sqe_addr); + req = io_poll_find(ctx, sqe_addr, poll_only); if (!req) return -ENOENT; if (io_poll_remove_one(req)) @@ -5302,7 +5305,7 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) int ret; spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, req->poll_remove.addr); + ret = io_poll_cancel(ctx, req->poll_remove.addr, true); spin_unlock_irq(&ctx->completion_lock); if (ret < 0) @@ -5403,14 +5406,10 @@ static int io_poll_update(struct io_kiocb *req) int ret; spin_lock_irq(&ctx->completion_lock); - preq = io_poll_find(ctx, req->poll_update.old_user_data); + preq = io_poll_find(ctx, req->poll_update.old_user_data, true); if (!preq) { ret = -ENOENT; goto err; - } else if (preq->opcode != IORING_OP_POLL_ADD) { - /* don't allow internal poll updates */ - ret = -EACCES; - goto err; } /* @@ -5739,7 +5738,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, ret = io_timeout_cancel(ctx, sqe_addr); if (ret != -ENOENT) goto done; - ret = io_poll_cancel(ctx, sqe_addr); + ret = io_poll_cancel(ctx, sqe_addr, false); done: if (!ret) ret = success_ret; @@ -5781,7 +5780,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) ret = io_timeout_cancel(ctx, sqe_addr); if (ret != -ENOENT) goto done; - ret = io_poll_cancel(ctx, sqe_addr); + ret = io_poll_cancel(ctx, sqe_addr, false); if (ret != -ENOENT) goto done; spin_unlock_irq(&ctx->completion_lock); -- cgit v1.2.3 From 9096af3e9c8734a34703bd9fb5ab14292296f911 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 14 Apr 2021 13:38:36 +0100 Subject: io_uring: add helper for parsing poll events Isolate poll mask SQE parsing and preparations into a new function, which will be reused shortly. 
Fixes: b69de288e913 ("io_uring: allow events and user_data update of running poll requests")
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 28fc99bf9ede..9db4c99dfbf4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5282,6 +5282,20 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
 	return -EALREADY;
 }
 
+static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
+				     unsigned int flags)
+{
+	u32 events;
+
+	events = READ_ONCE(sqe->poll32_events);
+#ifdef __BIG_ENDIAN
+	events = swahw32(events);
+#endif
+	if (!(flags & IORING_POLL_ADD_MULTI))
+		events |= EPOLLONESHOT;
+	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
+}
+
 static int io_poll_remove_prep(struct io_kiocb *req,
 			       const struct io_uring_sqe *sqe)
 {
@@ -5343,14 +5357,8 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	if (flags & ~(IORING_POLL_ADD_MULTI | IORING_POLL_UPDATE_EVENTS |
 			IORING_POLL_UPDATE_USER_DATA))
 		return -EINVAL;
-	events = READ_ONCE(sqe->poll32_events);
-#ifdef __BIG_ENDIAN
-	events = swahw32(events);
-#endif
-	if (!(flags & IORING_POLL_ADD_MULTI))
-		events |= EPOLLONESHOT;
-	events = demangle_poll(events) |
-				(events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
+
+	events = io_poll_parse_events(sqe, flags);
 
 	if (flags & (IORING_POLL_UPDATE_EVENTS|IORING_POLL_UPDATE_USER_DATA)) {
 		struct io_poll_update *poll_upd = &req->poll_update;
-- 
cgit v1.2.3

From c5de00366e3e675f9e321983d9bd357c1fbea0e9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 14 Apr 2021 13:38:37 +0100
Subject: io_uring: move poll update into remove not add

Having the poll update function as a part of IORING_OP_POLL_ADD is not
great: we have to hack around the struct layouts and add some overhead
in the way of the more popular plain POLL_ADD. An even more serious
drawback is that POLL_ADD requires a file and always grabs it, and so
does poll update, which doesn't need one.

Incorporate poll update into IORING_OP_POLL_REMOVE instead of
IORING_OP_POLL_ADD. It's also more consistent with timeout
remove/update.
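From the userspace side, the resulting ABI can be sketched like this
(raw SQE fields matching io_poll_update_prep() in the diff below; the
helper name is hypothetical, and liburing may wrap this differently):

	#include <string.h>
	#include <poll.h>
	#include <linux/io_uring.h>

	/* update a pending poll request's events and user_data */
	static void prep_poll_update(struct io_uring_sqe *sqe,
				     __u64 old_user_data,
				     __u64 new_user_data, __u32 new_events)
	{
		memset(sqe, 0, sizeof(*sqe));
		sqe->opcode = IORING_OP_POLL_REMOVE;
		sqe->fd = -1;			/* no file is grabbed */
		sqe->addr = old_user_data;	/* request to look up */
		sqe->off = new_user_data;	/* replacement user_data */
		sqe->poll32_events = new_events;	/* e.g. POLLIN */
		sqe->len = IORING_POLL_UPDATE_EVENTS |
			   IORING_POLL_UPDATE_USER_DATA;
	}

With no update flags set in sqe->len, the same opcode keeps its old
meaning: plain removal of the request identified by sqe->addr.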
Fixes: b69de288e913 ("io_uring: allow events and user_data update of running poll requests") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 104 +++++++++++++++++++++------------------------------------- 1 file changed, 38 insertions(+), 66 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9db4c99dfbf4..ab14692b05b4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -500,11 +500,6 @@ struct io_poll_update { bool update_user_data; }; -struct io_poll_remove { - struct file *file; - u64 addr; -}; - struct io_close { struct file *file; int fd; @@ -714,7 +709,6 @@ enum { REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_DONT_REISSUE_BIT, - REQ_F_POLL_UPDATE_BIT, /* keep async read/write and isreg together and in order */ REQ_F_ASYNC_READ_BIT, REQ_F_ASYNC_WRITE_BIT, @@ -762,8 +756,6 @@ enum { REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), /* don't attempt request reissue, see io_rw_reissue() */ REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT), - /* switches between poll and poll update */ - REQ_F_POLL_UPDATE = BIT(REQ_F_POLL_UPDATE_BIT), /* supports async reads */ REQ_F_ASYNC_READ = BIT(REQ_F_ASYNC_READ_BIT), /* supports async writes */ @@ -794,7 +786,6 @@ struct io_kiocb { struct io_rw rw; struct io_poll_iocb poll; struct io_poll_update poll_update; - struct io_poll_remove poll_remove; struct io_accept accept; struct io_sync sync; struct io_cancel cancel; @@ -5296,35 +5287,36 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); } -static int io_poll_remove_prep(struct io_kiocb *req, +static int io_poll_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_poll_update *upd = &req->poll_update; + u32 flags; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->poll_events) + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + flags = READ_ONCE(sqe->len); + if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | + IORING_POLL_ADD_MULTI)) + return -EINVAL; + /* meaningless without update */ + if (flags == IORING_POLL_ADD_MULTI) return -EINVAL; - req->poll_remove.addr = READ_ONCE(sqe->addr); - return 0; -} - -/* - * Find a running poll command that matches one specified in sqe->addr, - * and remove it if found. 
- */ -static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; + upd->old_user_data = READ_ONCE(sqe->addr); + upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; + upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; - spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, req->poll_remove.addr, true); - spin_unlock_irq(&ctx->completion_lock); + upd->new_user_data = READ_ONCE(sqe->off); + if (!upd->update_user_data && upd->new_user_data) + return -EINVAL; + if (upd->update_events) + upd->events = io_poll_parse_events(sqe, flags); + else if (sqe->poll32_events) + return -EINVAL; - if (ret < 0) - req_set_fail_links(req); - __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5347,40 +5339,22 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - u32 events, flags; + struct io_poll_iocb *poll = &req->poll; + u32 flags; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); - if (flags & ~(IORING_POLL_ADD_MULTI | IORING_POLL_UPDATE_EVENTS | - IORING_POLL_UPDATE_USER_DATA)) + if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; - events = io_poll_parse_events(sqe, flags); - - if (flags & (IORING_POLL_UPDATE_EVENTS|IORING_POLL_UPDATE_USER_DATA)) { - struct io_poll_update *poll_upd = &req->poll_update; - - req->flags |= REQ_F_POLL_UPDATE; - poll_upd->events = events; - poll_upd->old_user_data = READ_ONCE(sqe->addr); - poll_upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; - poll_upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; - if (poll_upd->update_user_data) - poll_upd->new_user_data = READ_ONCE(sqe->off); - } else { - struct io_poll_iocb *poll = &req->poll; - - poll->events = events; - if (sqe->off || sqe->addr) - return -EINVAL; - } + poll->events = io_poll_parse_events(sqe, flags); return 0; } -static int __io_poll_add(struct io_kiocb *req) +static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; @@ -5406,7 +5380,7 @@ static int __io_poll_add(struct io_kiocb *req) return ipt.error; } -static int io_poll_update(struct io_kiocb *req) +static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; @@ -5420,6 +5394,12 @@ static int io_poll_update(struct io_kiocb *req) goto err; } + if (!req->poll_update.update_events && !req->poll_update.update_user_data) { + completing = true; + ret = io_poll_remove_one(preq) ? 0 : -EALREADY; + goto err; + } + /* * Don't allow racy completion with singleshot, as we cannot safely * update those. 
For multishot, if we're racing with completion, just @@ -5447,14 +5427,13 @@ err: } if (req->poll_update.update_user_data) preq->user_data = req->poll_update.new_user_data; - spin_unlock_irq(&ctx->completion_lock); /* complete update request, we're done with it */ io_req_complete(req, ret); if (!completing) { - ret = __io_poll_add(preq); + ret = io_poll_add(preq, issue_flags); if (ret < 0) { req_set_fail_links(preq); io_req_complete(preq, ret); @@ -5463,13 +5442,6 @@ err: return 0; } -static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) -{ - if (!(req->flags & REQ_F_POLL_UPDATE)) - return __io_poll_add(req); - return io_poll_update(req); -} - static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -5874,7 +5846,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_POLL_ADD: return io_poll_add_prep(req, sqe); case IORING_OP_POLL_REMOVE: - return io_poll_remove_prep(req, sqe); + return io_poll_update_prep(req, sqe); case IORING_OP_FSYNC: return io_fsync_prep(req, sqe); case IORING_OP_SYNC_FILE_RANGE: @@ -6105,7 +6077,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ret = io_poll_add(req, issue_flags); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, issue_flags); + ret = io_poll_update(req, issue_flags); break; case IORING_OP_SYNC_FILE_RANGE: ret = io_sync_file_range(req, issue_flags); -- cgit v1.2.3 From ea6a693d862d4f0edd748a1fa3fc6faf2c39afb2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Apr 2021 09:47:13 -0600 Subject: io_uring: disable multishot poll for double poll add cases The re-add handling isn't correct for the multi wait case, so let's just disable it for now explicitly until we can get that sorted out. This just turns it into a one-shot request. Since we pass back whether or not a poll request terminates in multishot mode on completion, this should not break properly behaving applications that check for IORING_CQE_F_MORE on completion. Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ab14692b05b4..4803e31e9301 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4976,6 +4976,12 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -EINVAL; return; } + /* + * Can't handle multishot for double wait for now, turn it + * into one-shot mode. + */ + if (!(req->poll.events & EPOLLONESHOT)) + req->poll.events |= EPOLLONESHOT; /* double add on the same waitqueue head, ignore */ if (poll->head == head) return; -- cgit v1.2.3 From 4e3d9ff905cd3e6fc80a1f54b89c3aca67bc72be Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Apr 2021 17:44:34 -0600 Subject: io_uring: put flag checking for needing req cleanup in one spot We have this in two spots right now, which is a bit fragile. In preparation for moving REQ_F_POLLED cleanup into the same spot, move the check into a separate helper so we only have it once. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4803e31e9301..8e6dcb69f3e9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1598,10 +1598,15 @@ static void io_req_complete_post(struct io_kiocb *req, long res, } } +static inline bool io_req_needs_clean(struct io_kiocb *req) +{ + return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP); +} + static void io_req_complete_state(struct io_kiocb *req, long res, unsigned int cflags) { - if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) + if (io_req_needs_clean(req)) io_clean_op(req); req->result = res; req->compl.cflags = cflags; @@ -1713,10 +1718,8 @@ static void io_dismantle_req(struct io_kiocb *req) if (!(flags & REQ_F_FIXED_FILE)) io_put_file(req->file); - if (flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | - REQ_F_INFLIGHT)) { + if (io_req_needs_clean(req) || (req->flags & REQ_F_INFLIGHT)) { io_clean_op(req); - if (req->flags & REQ_F_INFLIGHT) { struct io_uring_task *tctx = req->task->io_uring; -- cgit v1.2.3 From 75652a30ff67539999148859da071ede862090ca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Apr 2021 09:52:40 -0600 Subject: io_uring: tie req->apoll to request lifetime We manage these separately right now, just tie it to the request lifetime and make it be part of the usual REQ_F_NEED_CLEANUP logic. Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8e6dcb69f3e9..10b2367138be 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1600,7 +1600,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res, static inline bool io_req_needs_clean(struct io_kiocb *req) { - return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP); + return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | + REQ_F_POLLED); } static void io_req_complete_state(struct io_kiocb *req, long res, @@ -5038,9 +5039,6 @@ static void io_async_task_func(struct callback_head *cb) __io_req_task_submit(req); else io_req_complete_failed(req, -ECANCELED); - - kfree(apoll->double_poll); - kfree(apoll); } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -5156,8 +5154,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (ret || ipt.error) { io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); - kfree(apoll->double_poll); - kfree(apoll); return false; } spin_unlock_irq(&ctx->completion_lock); @@ -5195,12 +5191,8 @@ static bool io_poll_remove_waitqs(struct io_kiocb *req) do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); if (req->opcode != IORING_OP_POLL_ADD && do_complete) { - struct async_poll *apoll = req->apoll; - /* non-poll requests have submit ref still */ req_ref_put(req); - kfree(apoll->double_poll); - kfree(apoll); } return do_complete; } @@ -6054,6 +6046,11 @@ static void io_clean_op(struct io_kiocb *req) } req->flags &= ~REQ_F_NEED_CLEANUP; } + if ((req->flags & REQ_F_POLLED) && req->apoll) { + kfree(req->apoll->double_poll); + kfree(req->apoll); + req->apoll = NULL; + } } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) -- cgit v1.2.3 From a7be7c23cfdd2cb57609fd2d607923a9cb2a305d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Apr 2021 11:31:14 -0600 Subject: io_uring: fix merge error for async resubmit A hand-edit while 
applying this patch on top of a new base resulted in an inverted check for re-issue, causing spurious -EAGAIN errors. Fixes: 8c130827f417 ("io_uring: don't alter iopoll reissue fail ret code") Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 10b2367138be..55892e0227dd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2738,7 +2738,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (check_reissue && req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; - if (!io_resubmit_prep(req)) { + if (io_resubmit_prep(req)) { req_ref_get(req); io_queue_async_work(req); } else { -- cgit v1.2.3 From c82d5bc703825a47af5c600e82e1e0c1db49e036 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 15 Apr 2021 13:40:50 +0100 Subject: io_uring: don't fail submit with overflow backlog Don't fail submission attempts if there are CQEs in the overflow backlog, but leave the decision making to userspace. Failing there might be very inconvenient for userspace, especially if submission and completion are done by different threads. We can remove the check because of recent changes: requests are no longer pinned by the backlog, and backlog entries are allocated separately, so they take less space and are cgroup accounted. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 55892e0227dd..81bfe91aed0a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6664,12 +6664,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { int submitted = 0; - /* if we have a backlog and couldn't flush it all, return BUSY */ - if (test_bit(0, &ctx->sq_check_overflow)) { - if (!__io_cqring_overflow_flush(ctx, false)) - return -EBUSY; - } - /* make sure SQ entry isn't read before tail */ nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); -- cgit v1.2.3 From 38134ada0ceea3e848fe993263c0ff6207fd46e7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 15 Apr 2021 13:07:39 +0100 Subject: io_uring: fix overflow checks in provide buffers Colin previously reported possible overflow and sign-extension problems in io_provide_buffers_prep(). As Linus pointed out, the previous attempt did nothing useful, see d81269fecb8ce ("io_uring: fix provide_buffers sign extension"). Do the checks with the help of the check_{mul,add}_overflow() helpers. Also fix the type of struct io_provide_buf::len, as it doesn't make much sense to keep it signed.
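For reference, the check_*_overflow() helpers from <linux/overflow.h> return true when the operation wrapped and store the result through the last argument; a minimal sketch of the pattern used in the diff below (the variable names here are illustrative, not from the patch):

	unsigned long size, end;

	/* size = len * nbufs, rejecting a wrapped product */
	if (check_mul_overflow((unsigned long)len, (unsigned long)nbufs, &size))
		return -EOVERFLOW;
	/* end = addr + size, rejecting address-space wrap-around */
	if (check_add_overflow((unsigned long)addr, size, &end))
		return -EOVERFLOW;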
Reported-by: Colin Ian King Fixes: efe68c1ca8f49 ("io_uring: validate the full range of provided buffers for access") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46538827e70fce5f6cdb50897cff4cacc490f380.1618488258.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 81bfe91aed0a..bae13811d2f3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -626,7 +626,7 @@ struct io_splice { struct io_provide_buf { struct file *file; __u64 addr; - __s32 len; + __u32 len; __u32 bgid; __u16 nbufs; __u16 bid; @@ -3918,7 +3918,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) static int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - unsigned long size; + unsigned long size, tmp_check; struct io_provide_buf *p = &req->pbuf; u64 tmp; @@ -3932,6 +3932,12 @@ static int io_provide_buffers_prep(struct io_kiocb *req, p->addr = READ_ONCE(sqe->addr); p->len = READ_ONCE(sqe->len); + if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, + &size)) + return -EOVERFLOW; + if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) + return -EOVERFLOW; + size = (unsigned long)p->len * p->nbufs; if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; -- cgit v1.2.3 From 75c4021aacbd9b5cc13b173d32b49007fd8ccada Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 15 Apr 2021 13:07:40 +0100 Subject: io_uring: check register restriction before quiesce Move the restriction checks of __io_uring_register() before quiesce; this saves waiting for requests in the failure case and simplifies the code a bit. Also add array_index_nospec() for safety. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/88d7913c9280ee848fdb7b584eea37a465391cee.1618488258.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index bae13811d2f3..e491b815df8c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9738,6 +9738,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; + if (ctx->restricted) { + if (opcode >= IORING_REGISTER_LAST) + return -EINVAL; + opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); + if (!test_bit(opcode, ctx->restrictions.register_op)) + return -EACCES; + } + if (io_register_op_must_quiesce(opcode)) { percpu_ref_kill(&ctx->refs); @@ -9766,18 +9774,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, } } - if (ctx->restricted) { - if (opcode >= IORING_REGISTER_LAST) { - ret = -EINVAL; - goto out; - } - - if (!test_bit(opcode, ctx->restrictions.register_op)) { - ret = -EACCES; - goto out; - } - } - switch (opcode) { case IORING_REGISTER_BUFFERS: ret = io_sqe_buffers_register(ctx, arg, nr_args); @@ -9851,7 +9847,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; } -out: if (io_register_op_must_quiesce(opcode)) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); -- cgit v1.2.3 From 3b763ba1c77da5806e4fdc5684285814fe970c98 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 18 Apr 2021 14:52:08 +0100 Subject: io_uring: remove extra sqpoll submission halting The SQPOLL task won't submit requests for a context that is currently dying, so no need to
remove the ctx from sqd_list prior to the main loop of io_ring_exit_work(). Kill it; the ctx will be removed by io_sq_thread_finish() anyway, and the extra removal only brings confusion and lockups. Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f220c2b786ba0f9499bebc9f3cd9714d29efb6a5.1618752958.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e491b815df8c..4fc54cd40470 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6750,6 +6750,10 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (!list_empty(&ctx->iopoll_list)) io_do_iopoll(ctx, &nr_events, 0); + /* + * Don't submit if refs are dying, good for io_uring_register(), + * but also it is relied upon by io_ring_exit_work() + */ if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && !(ctx->flags & IORING_SETUP_R_DISABLED)) ret = io_submit_sqes(ctx, to_submit); @@ -8540,14 +8544,6 @@ static void io_ring_exit_work(struct work_struct *work) struct io_tctx_node *node; int ret; - /* prevent SQPOLL from submitting new requests */ - if (ctx->sq_data) { - io_sq_thread_park(ctx->sq_data); - list_del_init(&ctx->sqd_list); - io_sqd_update_thread_idle(ctx->sq_data); - io_sq_thread_unpark(ctx->sq_data); - } - /* * If we're doing polled IO and end up having requests being * submitted async (out-of-line), then completions can come in while -- cgit v1.2.3 From 734551df6f9bedfbefcd113ede665945e9de0b99 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 18 Apr 2021 14:52:09 +0100 Subject: io_uring: fix shared sqpoll cancellation hangs [ 736.982891] INFO: task iou-sqp-4294:4295 blocked for more than 122 seconds. [ 736.982897] Call Trace: [ 736.982901] schedule+0x68/0xe0 [ 736.982903] io_uring_cancel_sqpoll+0xdb/0x110 [ 736.982908] io_sqpoll_cancel_cb+0x24/0x30 [ 736.982911] io_run_task_work_head+0x28/0x50 [ 736.982913] io_sq_thread+0x4e3/0x720 We call io_uring_cancel_sqpoll() one by one for each ctx either in sq_thread() itself or via task works, and it's intended to cancel all requests of a specified context. However the function uses per-task counters to track the number of inflight requests, so it counts more requests than are available via the current io_uring ctx and goes to sleep waiting for them to appear (e.g. from IRQ), which will never happen. Cancel a bit more than before, i.e. all ctxs that share the sqpoll, and continue to use the shared counters. Don't forget that we should not remove a ctx from the list before running that task_work sqpoll-cancel, otherwise the function wouldn't be able to find the context and will hang.
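To illustrate the accounting: tctx->inflight is per-task and spans every ring the SQPOLL task serves, so cancelling a single ctx can never be relied on to drain it. A simplified sketch of the fixed cancellation loop, condensed from the diff below:

	do {
		/* counts requests across ALL rings of this task */
		inflight = tctx_inflight(tctx, false);
		if (!inflight)
			break;
		/* hence cancel on every ctx sharing this sqd, not just one */
		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
			io_uring_try_cancel_requests(ctx, current, NULL);
		/* wait for the counter to drop, then re-check */
	} while (1);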
Reported-by: Joakim Hassila Reported-by: Jens Axboe Fixes: 37d1e2e3642e2 ("io_uring: move SQPOLL thread io-wq forked worker") Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1bded7e6c6b32e0bae25fce36be2868e46b116a0.1618752958.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4fc54cd40470..58f12bfbfb44 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1022,7 +1022,7 @@ static void io_uring_del_task_file(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files); -static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx); +static void io_uring_cancel_sqpoll(struct io_sq_data *sqd); static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); static bool io_cqring_fill_event(struct io_kiocb *req, long res, unsigned cflags); @@ -6870,15 +6870,14 @@ static int io_sq_thread(void *data) timeout = jiffies + sqd->sq_thread_idle; } - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_uring_cancel_sqpoll(ctx); + io_uring_cancel_sqpoll(sqd); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); - mutex_unlock(&sqd->lock); - io_run_task_work(); io_run_task_work_head(&sqd->park_task_work); + mutex_unlock(&sqd->lock); + complete(&sqd->exited); do_exit(0); } @@ -8870,11 +8869,11 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) static void io_sqpoll_cancel_cb(struct callback_head *cb) { struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work); - struct io_ring_ctx *ctx = work->ctx; - struct io_sq_data *sqd = ctx->sq_data; + struct io_sq_data *sqd = work->ctx->sq_data; if (sqd->thread) - io_uring_cancel_sqpoll(ctx); + io_uring_cancel_sqpoll(sqd); + list_del_init(&work->ctx->sqd_list); complete(&work->completion); } @@ -8885,7 +8884,6 @@ static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx) struct task_struct *task; io_sq_thread_park(sqd); - list_del_init(&ctx->sqd_list); io_sqd_update_thread_idle(sqd); task = sqd->thread; if (task) { @@ -8893,6 +8891,8 @@ static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx) init_task_work(&work.task_work, io_sqpoll_cancel_cb); io_task_work_add_head(&sqd->park_task_work, &work.task_work); wake_up_process(task); + } else { + list_del_init(&ctx->sqd_list); } io_sq_thread_unpark(sqd); @@ -8918,14 +8918,14 @@ static void io_uring_try_cancel(struct files_struct *files) } /* should only be called by SQPOLL task */ -static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) +static void io_uring_cancel_sqpoll(struct io_sq_data *sqd) { - struct io_sq_data *sqd = ctx->sq_data; struct io_uring_task *tctx = current->io_uring; + struct io_ring_ctx *ctx; s64 inflight; DEFINE_WAIT(wait); - WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current); + WARN_ON_ONCE(!sqd || sqd->thread != current); atomic_inc(&tctx->in_idle); do { @@ -8933,7 +8933,8 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) inflight = tctx_inflight(tctx, false); if (!inflight) break; - io_uring_try_cancel_requests(ctx, current, NULL); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_uring_try_cancel_requests(ctx, current, NULL); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); /* -- cgit v1.2.3 From 3a0a690235923b838390500fd46edc23bed092e0 Mon Sep 17 00:00:00 2001 From: 
Pavel Begunkov Date: Tue, 20 Apr 2021 12:03:31 +0100 Subject: io_uring: move inflight un-tracking into cleanup REQ_F_INFLIGHT deaccounting doesn't do any spinlocking or resource freeing anymore, so it's safe to move it into the normal cleanup flow, i.e. into io_clean_op(), so making it cleaner. Also move io_req_needs_clean() to be first in io_dismantle_req() so it doesn't reload req->flags. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/90653a3a5de4107e3a00536fa4c2ea5f2c38a4ac.1618916549.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 58f12bfbfb44..34c1864eee21 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1601,7 +1601,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, static inline bool io_req_needs_clean(struct io_kiocb *req) { return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | - REQ_F_POLLED); + REQ_F_POLLED | REQ_F_INFLIGHT); } static void io_req_complete_state(struct io_kiocb *req, long res, @@ -1717,17 +1717,10 @@ static void io_dismantle_req(struct io_kiocb *req) { unsigned int flags = req->flags; + if (io_req_needs_clean(req)) + io_clean_op(req); if (!(flags & REQ_F_FIXED_FILE)) io_put_file(req->file); - if (io_req_needs_clean(req) || (req->flags & REQ_F_INFLIGHT)) { - io_clean_op(req); - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - req->flags &= ~REQ_F_INFLIGHT; - } - } if (req->fixed_rsrc_refs) percpu_ref_put(req->fixed_rsrc_refs); if (req->async_data) @@ -6057,6 +6050,12 @@ static void io_clean_op(struct io_kiocb *req) kfree(req->apoll); req->apoll = NULL; } + if (req->flags & REQ_F_INFLIGHT) { + struct io_uring_task *tctx = req->task->io_uring; + + atomic_dec(&tctx->inflight_tracked); + req->flags &= ~REQ_F_INFLIGHT; + } } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) -- cgit v1.2.3 From 07db298a1c96bdba2102d60ad51fcecb961177c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 20 Apr 2021 12:03:32 +0100 Subject: io_uring: safer sq_creds putting Put sq_creds as a part of io_ring_ctx_free(), it's easy to miss doing it in io_sq_thread_finish(), especially considering past mistakes related to ring creation failures. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3becb1866467a1de82a97345a0a90d7fb8ff875e.1618916549.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 34c1864eee21..e621c9f5729f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7229,8 +7229,6 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) io_put_sq_data(sqd); ctx->sq_data = NULL; - if (ctx->sq_creds) - put_cred(ctx->sq_creds); } } @@ -8425,6 +8423,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); + if (ctx->sq_creds) + put_cred(ctx->sq_creds); /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) -- cgit v1.2.3 From f2a48dd09b8e933f59570692e1382b81d4fddc49 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 20 Apr 2021 12:03:33 +0100 Subject: io_uring: refactor io_sq_offload_create() Just a bit of code tossing in io_sq_offload_create(), so it looks a bit better. 
No functional changes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/939776f90de8d2cdd0414e1baa29c8ec0926b561.1618916549.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e621c9f5729f..2d49197845e8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7879,11 +7879,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, f = fdget(p->wq_fd); if (!f.file) return -ENXIO; - if (f.file->f_op != &io_uring_fops) { - fdput(f); - return -EINVAL; - } fdput(f); + if (f.file->f_op != &io_uring_fops) + return -EINVAL; } if (ctx->flags & IORING_SETUP_SQPOLL) { struct task_struct *tsk; @@ -7902,13 +7900,11 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, if (!ctx->sq_thread_idle) ctx->sq_thread_idle = HZ; - ret = 0; io_sq_thread_park(sqd); list_add(&ctx->sqd_list, &sqd->ctx_list); io_sqd_update_thread_idle(sqd); /* don't attach to a dying SQPOLL thread, would be racy */ - if (attached && !sqd->thread) - ret = -ENXIO; + ret = (attached && !sqd->thread) ? -ENXIO : 0; io_sq_thread_unpark(sqd); if (ret < 0) @@ -7920,11 +7916,8 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, int cpu = p->sq_thread_cpu; ret = -EINVAL; - if (cpu >= nr_cpu_ids) - goto err_sqpoll; - if (!cpu_online(cpu)) + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) goto err_sqpoll; - sqd->sq_cpu = cpu; } else { sqd->sq_cpu = -1; @@ -7950,12 +7943,11 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, } return 0; +err_sqpoll: + complete(&ctx->sq_data->exited); err: io_sq_thread_finish(ctx); return ret; -err_sqpoll: - complete(&ctx->sq_data->exited); - goto err; } static inline void __io_unaccount_mem(struct user_struct *user, -- cgit v1.2.3 From 724cb4f9ec905173f32c5bd08fec26abaecc6a1d Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Wed, 21 Apr 2021 23:19:11 +0800 Subject: io_uring: check sqring and iopoll_list before schedule Do this to avoid the race below:

	userspace                        kernel
	                               | check sqring and iopoll_list
	submit sqe                     |
	check IORING_SQ_NEED_WAKEUP    |
	(which is not set)             |
	                               |
	                               | set IORING_SQ_NEED_WAKEUP
	wait cqe                       |
	                               | schedule(never wakeup again)

Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/1619018351-75883-1-git-send-email-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2d49197845e8..40f38256499c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6839,27 +6839,29 @@ static int io_sq_thread(void *data) continue; } - needs_sched = true; prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->iopoll_list)) { - needs_sched = false; - break; - } - if (io_sqring_entries(ctx)) { - needs_sched = false; - break; - } - } - - if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { + if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); - mutex_unlock(&sqd->lock); - schedule(); - mutex_lock(&sqd->lock); + needs_sched = true; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !list_empty_careful(&ctx->iopoll_list)) { + needs_sched = false; + break; + } + if (io_sqring_entries(ctx)) { +
needs_sched = false; + break; + } + } + + if (needs_sched) { + mutex_unlock(&sqd->lock); + schedule(); + mutex_lock(&sqd->lock); + } list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); } -- cgit v1.2.3 From fff4db76be297bd4124a503948435a3917d7a702 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 25 Apr 2021 14:32:15 +0100 Subject: io_uring: move __io_sqe_files_unregister A preparation patch moving __io_sqe_files_unregister() definition closer to other "files" functions without any modification. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/95caf17fe837e67bd1f878395f07049062a010d4.1619356238.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 40f38256499c..0a35881cdd0b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7028,33 +7028,6 @@ static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) table->files = NULL; } -static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) - kfree_skb(skb); - } -#else - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - - file = io_file_from_index(ctx, i); - if (file) - fput(file); - } -#endif - io_free_file_tables(&ctx->file_table, ctx->nr_user_files); - kfree(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; -} - static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx) { spin_lock_bh(&ctx->rsrc_ref_lock); @@ -7157,6 +7130,33 @@ static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, return data; } +static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) { + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) + kfree_skb(skb); + } +#else + int i; + + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file; + + file = io_file_from_index(ctx, i); + if (file) + fput(file); + } +#endif + io_free_file_tables(&ctx->file_table, ctx->nr_user_files); + kfree(ctx->file_data); + ctx->file_data = NULL; + ctx->nr_user_files = 0; +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { int ret; -- cgit v1.2.3 From 44b31f2fa2c4b6479a578e74e4ed6bf7ad243955 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 25 Apr 2021 14:32:16 +0100 Subject: io_uring: return back rsrc data free helper Add io_rsrc_data_free() helper for destroying rsrc_data, easier for search and the function will get more stuff to destroy shortly. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/562d1d53b5ff184f15b8949a63d76ef19c4ba9ec.1619356238.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0a35881cdd0b..03f4b461c429 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7114,6 +7114,11 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct return ret; } +static void io_rsrc_data_free(struct io_rsrc_data *data) +{ + kfree(data); +} + static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put) { @@ -7152,7 +7157,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) } #endif io_free_file_tables(&ctx->file_table, ctx->nr_user_files); - kfree(ctx->file_data); + io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; ctx->nr_user_files = 0; } @@ -7629,7 +7634,7 @@ out_fput: io_free_file_tables(&ctx->file_table, nr_args); ctx->nr_user_files = 0; out_free: - kfree(ctx->file_data); + io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; return ret; } -- cgit v1.2.3 From d4d19c19d6ae93f99a57c50ccf6d084213e964bd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 25 Apr 2021 14:32:17 +0100 Subject: io_uring: decouple CQE filling from requests Make __io_cqring_fill_event() agnostic of struct io_kiocb, pass all the data needed directly into it. Will be used to post rsrc removal completions, which don't have an associated request. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c9b8da9e42772db2033547dfebe479dc972a0f2c.1619356238.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 55 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 03f4b461c429..cb10d3266d1a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1025,7 +1025,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_uring_cancel_sqpoll(struct io_sq_data *sqd); static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); -static bool io_cqring_fill_event(struct io_kiocb *req, long res, unsigned cflags); +static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, + long res, unsigned int cflags); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req, int nr); static void io_dismantle_req(struct io_kiocb *req); @@ -1266,7 +1267,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_cqring_fill_event(req, status, 0); + io_cqring_fill_event(req->ctx, req->user_data, status, 0); io_put_req_deferred(req, 1); } } @@ -1500,10 +1501,9 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } -static bool io_cqring_event_overflow(struct io_kiocb *req, long res, - unsigned int cflags) +static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, + long res, unsigned int cflags) { - struct io_ring_ctx *ctx = req->ctx; struct io_overflow_cqe *ocqe; ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); @@ -1521,20 +1521,19 @@ static bool io_cqring_event_overflow(struct io_kiocb *req, long res, set_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } - ocqe->cqe.user_data = 
req->user_data; + ocqe->cqe.user_data = user_data; ocqe->cqe.res = res; ocqe->cqe.flags = cflags; list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } -static inline bool __io_cqring_fill_event(struct io_kiocb *req, long res, - unsigned int cflags) +static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, + long res, unsigned int cflags) { - struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; - trace_io_uring_complete(ctx, req->user_data, res, cflags); + trace_io_uring_complete(ctx, user_data, res, cflags); /* * If we can't get a cq entry, userspace overflowed the @@ -1543,19 +1542,19 @@ static inline bool __io_cqring_fill_event(struct io_kiocb *req, long res, */ cqe = io_get_cqring(ctx); if (likely(cqe)) { - WRITE_ONCE(cqe->user_data, req->user_data); + WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); return true; } - return io_cqring_event_overflow(req, res, cflags); + return io_cqring_event_overflow(ctx, user_data, res, cflags); } /* not as hot to bloat with inlining */ -static noinline bool io_cqring_fill_event(struct io_kiocb *req, long res, - unsigned int cflags) +static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, + long res, unsigned int cflags) { - return __io_cqring_fill_event(req, res, cflags); + return __io_cqring_fill_event(ctx, user_data, res, cflags); } static void io_req_complete_post(struct io_kiocb *req, long res, @@ -1565,7 +1564,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - __io_cqring_fill_event(req, res, cflags); + __io_cqring_fill_event(ctx, req->user_data, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. 
@@ -1776,7 +1775,8 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) io_remove_next_linked(req); link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { - io_cqring_fill_event(link, -ECANCELED, 0); + io_cqring_fill_event(link->ctx, link->user_data, + -ECANCELED, 0); io_put_req_deferred(link, 1); return true; } @@ -1795,7 +1795,7 @@ static void io_fail_links(struct io_kiocb *req) link->link = NULL; trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link, -ECANCELED, 0); + io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); io_put_req_deferred(link, 2); link = nxt; } @@ -2116,7 +2116,8 @@ static void io_submit_flush_completions(struct io_comp_state *cs, spin_lock_irq(&ctx->completion_lock); for (i = 0; i < nr; i++) { req = cs->reqs[i]; - __io_cqring_fill_event(req, req->result, req->compl.cflags); + __io_cqring_fill_event(ctx, req->user_data, req->result, + req->compl.cflags); } io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); @@ -2256,7 +2257,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); - __io_cqring_fill_event(req, req->result, cflags); + __io_cqring_fill_event(ctx, req->user_data, req->result, cflags); (*nr_events)++; if (req_ref_put_and_test(req)) @@ -4875,7 +4876,7 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) } if (req->poll.events & EPOLLONESHOT) flags = 0; - if (!io_cqring_fill_event(req, error, flags)) { + if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { io_poll_remove_waitqs(req); req->poll.done = true; flags = 0; @@ -5203,7 +5204,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) do_complete = io_poll_remove_waitqs(req); if (do_complete) { - io_cqring_fill_event(req, -ECANCELED, 0); + io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); io_commit_cqring(req->ctx); req_set_fail_links(req); io_put_req_deferred(req, 1); @@ -5455,7 +5456,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - io_cqring_fill_event(req, -ETIME, 0); + io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -5497,7 +5498,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return PTR_ERR(req); req_set_fail_links(req); - io_cqring_fill_event(req, -ECANCELED, 0); + io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); io_put_req_deferred(req, 1); return 0; } @@ -5570,7 +5571,7 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) ret = io_timeout_update(ctx, tr->addr, &tr->ts, io_translate_timeout_mode(tr->flags)); - io_cqring_fill_event(req, ret, 0); + io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -5722,7 +5723,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, done: if (!ret) ret = success_ret; - io_cqring_fill_event(req, ret, 0); + io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -5779,7 +5780,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) spin_lock_irq(&ctx->completion_lock); done: - io_cqring_fill_event(req, ret, 0); + io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); 
spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); -- cgit v1.2.3 From b60c8dce33895f79cbb54700fbeffc7db8aee3f7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 25 Apr 2021 14:32:18 +0100 Subject: io_uring: preparation for rsrc tagging We need a way to notify userspace when a lazily removed resource actually died out. This will be done by associating a tag, which is u64 exactly like req->user_data, with each rsrc (e.g. buffer or file). A CQE will be posted once a resource is actually put down. Tag 0 is a special value set by default, for which no CQE is generated, preserving the old behaviour. Don't expose it to userspace yet, but prepare internally: allocate buffers, add all the posting hooks, etc. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2e6beec5eabe7216bb61fb93cdf5aaf65812a9b0.1619356238.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index cb10d3266d1a..9796998ef100 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -214,6 +214,7 @@ struct io_fixed_file { struct io_rsrc_put { struct list_head list; + u64 tag; union { void *rsrc; struct file *file; @@ -239,6 +240,7 @@ typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); struct io_rsrc_data { struct io_ring_ctx *ctx; + u64 *tags; rsrc_put_fn *do_put; atomic_t refs; struct completion done; @@ -7117,11 +7119,13 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct static void io_rsrc_data_free(struct io_rsrc_data *data) { + kvfree(data->tags); kfree(data); } static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, - rsrc_put_fn *do_put) + rsrc_put_fn *do_put, + unsigned nr) { struct io_rsrc_data *data; @@ -7129,6 +7133,12 @@ static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, if (!data) return NULL; + data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL); + if (!data->tags) { + kfree(data); + return NULL; + } + atomic_set(&data->refs, 1); data->ctx = ctx; data->do_put = do_put; @@ -7493,6 +7503,20 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { list_del(&prsrc->list); + + if (prsrc->tag) { + bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; + unsigned long flags; + + io_ring_submit_lock(ctx, lock_ring); + spin_lock_irqsave(&ctx->completion_lock, flags); + io_cqring_fill_event(ctx, prsrc->tag, 0, 0); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + io_cqring_ev_posted(ctx); + io_ring_submit_unlock(ctx, lock_ring); + } + rsrc_data->do_put(ctx, prsrc); kfree(prsrc); } @@ -7582,7 +7606,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (ret) return ret; - file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put); + file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args); if (!file_data) return -ENOMEM; ctx->file_data = file_data; @@ -7683,7 +7707,7 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_queue_rsrc_removal(struct io_rsrc_data *data, +static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { struct io_rsrc_put *prsrc; @@ -7692,6 +7716,7 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, if (!prsrc) return -ENOMEM; + prsrc->tag =
data->tags[idx]; prsrc->rsrc = rsrc; list_add(&prsrc->list, &node->rsrc_list); return 0; @@ -7732,7 +7757,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (file_slot->file_ptr) { file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, ctx->rsrc_node, file); + err = io_queue_rsrc_removal(data, up->offset + done, + ctx->rsrc_node, file); if (err) break; file_slot->file_ptr = 0; -- cgit v1.2.3 From 98f0b3b4f1d51911492b9d6eda4add0ec562179b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 25 Apr 2021 14:32:19 +0100 Subject: io_uring: add generic path for rsrc update Extract some common parts for rsrc update; they will be used by the registered-buffers support for dynamic (i.e. quiesce-free) management. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b49c3ff6b9ff0e530295767604fe4de64d349e04.1619356238.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 79 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9796998ef100..86a957d1059b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1035,9 +1035,9 @@ static void io_dismantle_req(struct io_kiocb *req); static void io_put_task(struct task_struct *task, int nr); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); -static int __io_sqe_files_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update *ip, - unsigned nr_args); +static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode, + struct io_uring_rsrc_update *up, + unsigned nr_args); static void io_clean_op(struct io_kiocb *req); static struct file *io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, bool fixed); @@ -5824,7 +5824,8 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.data = req->rsrc_update.arg; mutex_lock(&ctx->uring_lock); - ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args); + ret = __io_register_rsrc_update(ctx, IORING_REGISTER_FILES_UPDATE, + &up, req->rsrc_update.nr_args); mutex_unlock(&ctx->uring_lock); if (ret < 0) @@ -7726,25 +7727,20 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update *up, unsigned nr_args) { + __s32 __user *fds = u64_to_user_ptr(up->data); struct io_rsrc_data *data = ctx->file_data; struct io_fixed_file *file_slot; struct file *file; - __s32 __user *fds; - int fd, i, err; - __u32 done; + int fd, i, err = 0; + unsigned int done; bool needs_switch = false; - if (check_add_overflow(up->offset, nr_args, &done)) - return -EOVERFLOW; - if (done > ctx->nr_user_files) + if (!ctx->file_data) + return -ENXIO; + if (up->offset + nr_args > ctx->nr_user_files) return -EINVAL; - err = io_rsrc_node_switch_start(ctx); - if (err) - return err; - fds = u64_to_user_ptr(up->data); for (done = 0; done < nr_args; done++) { - err = 0; if (copy_from_user(&fd, &fds[done], sizeof(fd))) { err = -EFAULT; break; @@ -7798,23 +7794,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return done ?
done : err; } -static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update up; - - if (!ctx->file_data) - return -ENXIO; - if (!nr_args) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (up.resv) - return -EINVAL; - - return __io_sqe_files_update(ctx, &up, nr_args); -} - static struct io_wq_work *io_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -9730,6 +9709,40 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) return 0; } +static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode, + struct io_uring_rsrc_update *up, + unsigned nr_args) +{ + __u32 tmp; + int err; + + if (check_add_overflow(up->offset, nr_args, &tmp)) + return -EOVERFLOW; + err = io_rsrc_node_switch_start(ctx); + if (err) + return err; + + switch (opcode) { + case IORING_REGISTER_FILES_UPDATE: + return __io_sqe_files_update(ctx, up, nr_args); + } + return -EINVAL; +} + +static int io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode, + void __user *arg, unsigned nr_args) +{ + struct io_uring_rsrc_update up; + + if (!nr_args) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (up.resv) + return -EINVAL; + return __io_register_rsrc_update(ctx, opcode, &up, nr_args); +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -9829,7 +9829,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_files_unregister(ctx); break; case IORING_REGISTER_FILES_UPDATE: - ret = io_sqe_files_update(ctx, arg, nr_args); + ret = io_register_rsrc_update(ctx, opcode, arg, nr_args); break; case IORING_REGISTER_EVENTFD: case IORING_REGISTER_EVENTFD_ASYNC: -- cgit v1.2.3 From fdecb66281e165927059419c3b1de09ffe4f8369 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 25 Apr 2021 14:32:20 +0100 Subject: io_uring: enumerate dynamic resources As resources are getting more supporting and common code, it'll be more convenient to have an enumeration of resource types and use it for indexing.
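For illustration, the enumeration turns the update path into a plain type dispatch; a sketch of where this is heading (the IORING_RSRC_BUFFER case is an assumption about later patches, not part of this one):

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	/* further resource types, e.g. IORING_RSRC_BUFFER, slot in here */
	}
	return -EINVAL;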
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f0be63e9310212d5601d36277c2946ff7a040485.1619356238.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 86a957d1059b..36b9651588f9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1035,7 +1035,7 @@ static void io_dismantle_req(struct io_kiocb *req); static void io_put_task(struct task_struct *task, int nr); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode, +static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update *up, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); @@ -5824,7 +5824,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.data = req->rsrc_update.arg; mutex_lock(&ctx->uring_lock); - ret = __io_register_rsrc_update(ctx, IORING_REGISTER_FILES_UPDATE, + ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, req->rsrc_update.nr_args); mutex_unlock(&ctx->uring_lock); @@ -9709,7 +9709,7 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) return 0; } -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode, +static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update *up, unsigned nr_args) { @@ -9722,14 +9722,14 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode, if (err) return err; - switch (opcode) { - case IORING_REGISTER_FILES_UPDATE: + switch (type) { + case IORING_RSRC_FILE: return __io_sqe_files_update(ctx, up, nr_args); } return -EINVAL; } -static int io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode, +static int io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, void __user *arg, unsigned nr_args) { struct io_uring_rsrc_update up; @@ -9740,7 +9740,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode, return -EFAULT; if (up.resv) return -EINVAL; - return __io_register_rsrc_update(ctx, opcode, &up, nr_args); + return __io_register_rsrc_update(ctx, type, &up, nr_args); } static bool io_register_op_must_quiesce(int op) @@ -9829,7 +9829,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_files_unregister(ctx); break; case IORING_REGISTER_FILES_UPDATE: - ret = io_register_rsrc_update(ctx, opcode, arg, nr_args); + ret = io_register_rsrc_update(ctx, IORING_RSRC_FILE, arg, nr_args); break; case IORING_REGISTER_EVENTFD: case IORING_REGISTER_EVENTFD_ASYNC: -- cgit v1.2.3 From 792e35824be9af9fb4dac956229fb97bda04e25e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 25 Apr 2021 14:32:21 +0100 Subject: io_uring: add IORING_REGISTER_RSRC Add a new io_uring_register() opcode for rsrc registration. Instead of accepting a raw pointer to resources, fds or iovecs, @arg now points to a struct io_uring_rsrc_register, and the second argument tells how large that struct is, to make it easily extendible by adding new fields. All that is done mainly to be able to pass in a pointer with tags. Pass it in and enable CQE posting for file resources. Doesn't support setting tags on update yet. A design choice made here is to not post CQEs on rsrc de-registration, but only when a resource is removed through a dynamic rsrc update.
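A rough userspace sketch of the new call, assuming the field layout implied by the diff below (type, nr, data and tags) and a plain io_uring_register() syscall wrapper (the wrapper itself is hypothetical):

	__s32 fds[2] = { fd0, -1 };	/* -1 keeps a slot sparse */
	__u64 tags[2] = { 0xcafe, 0 };	/* 0 means no CQE when the file is put */
	struct io_uring_rsrc_register rr = {
		.type = IORING_RSRC_FILE,
		.nr   = 2,
		.data = (__u64)(uintptr_t)fds,
		.tags = (__u64)(uintptr_t)tags,
	};

	ret = io_uring_register(ring_fd, IORING_REGISTER_RSRC, &rr, sizeof(rr));

Once slot 0 is later removed by a dynamic update and its last user drops the file, a CQE with user_data == 0xcafe and res == 0 is posted.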
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c498aaec32a4bb277b2406b9069662c02cdda98c.1619356238.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 45 ++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 36b9651588f9..0088eeef82ec 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7589,7 +7589,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) } static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) + unsigned nr_args, u64 __user *tags) { __s32 __user *fds = (__s32 __user *) arg; struct file *file; @@ -7616,17 +7616,24 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, goto out_free; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { + u64 tag = 0; + + if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) || + copy_from_user(&fd, &fds[i], sizeof(fd))) { - if (copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto out_fput; } /* allow sparse sets */ - if (fd == -1) + if (fd == -1) { + ret = -EINVAL; + if (unlikely(tag)) + goto out_fput; continue; + } file = fget(fd); ret = -EBADF; - if (!file) + if (unlikely(!file)) goto out_fput; /* @@ -7640,6 +7647,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto out_fput; } + ctx->file_data->tags[i] = tag; io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); } @@ -9743,6 +9751,29 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, return __io_register_rsrc_update(ctx, type, &up, nr_args); } +static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, + unsigned int size) +{ + struct io_uring_rsrc_register rr; + + /* keep it extendible */ + if (size != sizeof(rr)) + return -EINVAL; + + memset(&rr, 0, sizeof(rr)); + if (copy_from_user(&rr, arg, size)) + return -EFAULT; + if (!rr.nr) + return -EINVAL; + + switch (rr.type) { + case IORING_RSRC_FILE: + return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), + rr.nr, u64_to_user_ptr(rr.tags)); + } + return -EINVAL; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -9752,6 +9783,7 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_PROBE: case IORING_REGISTER_PERSONALITY: case IORING_UNREGISTER_PERSONALITY: + case IORING_REGISTER_RSRC: return false; default: return true; @@ -9820,7 +9852,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: - ret = io_sqe_files_register(ctx, arg, nr_args); + ret = io_sqe_files_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_FILES: ret = -EINVAL; @@ -9877,6 +9909,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_RESTRICTIONS: ret = io_register_restrictions(ctx, arg, nr_args); break; + case IORING_REGISTER_RSRC: + ret = io_register_rsrc(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; -- cgit v1.2.3 From c3bdad0271834214be01c1d687c262bf80da6eb0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 25 Apr 2021 14:32:22 +0100 Subject: io_uring: add generic rsrc update with tags Add IORING_REGISTER_RSRC_UPDATE, which also supports passing in rsrc tags.
Implement it for registered files. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d4dc66df204212f64835ffca2c4eb5e8363f2f05.1619356238.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 52 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0088eeef82ec..a4c37a2cc690 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1036,7 +1036,7 @@ static void io_put_task(struct task_struct *task, int nr); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update *up, + struct io_uring_rsrc_update2 *up, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); static struct file *io_file_get(struct io_submit_state *state, @@ -5814,7 +5814,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_uring_rsrc_update up; + struct io_uring_rsrc_update2 up; int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -5822,6 +5822,8 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.offset = req->rsrc_update.offset; up.data = req->rsrc_update.arg; + up.nr = 0; + up.tags = 0; mutex_lock(&ctx->uring_lock); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, @@ -7732,9 +7734,10 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, } static int __io_sqe_files_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update *up, + struct io_uring_rsrc_update2 *up, unsigned nr_args) { + u64 __user *tags = u64_to_user_ptr(up->tags); __s32 __user *fds = u64_to_user_ptr(up->data); struct io_rsrc_data *data = ctx->file_data; struct io_fixed_file *file_slot; @@ -7749,10 +7752,17 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return -EINVAL; for (done = 0; done < nr_args; done++) { - if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + u64 tag = 0; + + if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || + copy_from_user(&fd, &fds[done], sizeof(fd))) { err = -EFAULT; break; } + if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { + err = -EINVAL; + break; + } if (fd == IORING_REGISTER_FILES_SKIP) continue; @@ -7787,6 +7797,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } + data->tags[up->offset + done] = tag; io_fixed_file_set(file_slot, file); err = io_sqe_file_register(ctx, file, i); if (err) { @@ -9718,12 +9729,14 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) } static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update *up, + struct io_uring_rsrc_update2 *up, unsigned nr_args) { __u32 tmp; int err; + if (up->resv) + return -EINVAL; if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; err = io_rsrc_node_switch_start(ctx); @@ -9737,18 +9750,31 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, return -EINVAL; } -static int io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - void __user *arg, unsigned nr_args) +static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) { - struct io_uring_rsrc_update up; + struct io_uring_rsrc_update2 up; if (!nr_args) return -EINVAL; + 
memset(&up, 0, sizeof(up)); + if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) + return -EFAULT; + return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); +} + +static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned size) +{ + struct io_uring_rsrc_update2 up; + + if (size != sizeof(up)) + return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; - if (up.resv) + if (!up.nr) return -EINVAL; - return __io_register_rsrc_update(ctx, type, &up, nr_args); + return __io_register_rsrc_update(ctx, up.type, &up, up.nr); } static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, @@ -9784,6 +9810,7 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_PERSONALITY: case IORING_UNREGISTER_PERSONALITY: case IORING_REGISTER_RSRC: + case IORING_REGISTER_RSRC_UPDATE: return false; default: return true; @@ -9861,7 +9888,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_files_unregister(ctx); break; case IORING_REGISTER_FILES_UPDATE: - ret = io_register_rsrc_update(ctx, IORING_RSRC_FILE, arg, nr_args); + ret = io_register_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: case IORING_REGISTER_EVENTFD_ASYNC: @@ -9912,6 +9939,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_RSRC: ret = io_register_rsrc(ctx, arg, nr_args); break; + case IORING_REGISTER_RSRC_UPDATE: + ret = io_register_rsrc_update(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; -- cgit v1.2.3 From 41edf1a5ec967bf4bddedb83c48e02dfea8315b4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 25 Apr 2021 14:32:23 +0100 Subject: io_uring: keep table of pointers to ubufs Instead of keeping a table of ubufs convert them into pointers to ubuf, so we can atomically read one pointer and be sure that the content of ubuf won't change. Because it was already dynamically allocating imu->bvec, throw both imu and bvec into a single structure so they can be allocated together. 
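The single-allocation layout is the usual flexible-array pattern; a minimal sketch matching the struct in the diff below (struct_size() is from <linux/overflow.h>):

	struct io_mapped_ubuf {
		u64		ubuf;
		u64		ubuf_end;
		unsigned int	nr_bvecs;
		unsigned long	acct_pages;
		struct bio_vec	bvec[];	/* tail-allocated together with the struct */
	};

	/* one kvmalloc() now covers both the header and nr_pages bio_vecs */
	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);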
From 41edf1a5ec967bf4bddedb83c48e02dfea8315b4 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 25 Apr 2021 14:32:23 +0100
Subject: io_uring: keep table of pointers to ubufs

Instead of keeping a table of ubufs, convert it into a table of pointers
to ubuf, so we can atomically read one pointer and be sure that the
content of the ubuf won't change. Because it was already dynamically
allocating imu->bvec, throw both imu and bvec into a single structure so
they can be allocated together.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/b96efa4c5febadeccf41d0e849ac099f4c83b0d3.1619356238.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a4c37a2cc690..b6ec14d26673 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -195,9 +195,9 @@ enum io_uring_cmd_flags {
 struct io_mapped_ubuf {
 	u64		ubuf;
 	u64		ubuf_end;
-	struct bio_vec	*bvec;
 	unsigned int	nr_bvecs;
 	unsigned long	acct_pages;
+	struct bio_vec	bvec[];
 };
 
 struct io_ring_ctx;
@@ -405,7 +405,7 @@ struct io_ring_ctx {
 	/* if used, fixed mapped user buffers */
 	unsigned		nr_user_bufs;
-	struct io_mapped_ubuf	*user_bufs;
+	struct io_mapped_ubuf	**user_bufs;
 
 	struct user_struct	*user;
 
@@ -2760,7 +2760,7 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
 	if (unlikely(buf_index >= ctx->nr_user_bufs))
 		return -EFAULT;
 	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
-	imu = &ctx->user_bufs[index];
+	imu = ctx->user_bufs[index];
 	buf_addr = req->rw.addr;
 
 	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
@@ -8081,16 +8081,17 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
 	return off;
 }
 
-static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
 {
+	struct io_mapped_ubuf *imu = *slot;
 	unsigned int i;
 
 	for (i = 0; i < imu->nr_bvecs; i++)
 		unpin_user_page(imu->bvec[i].bv_page);
 	if (imu->acct_pages)
 		io_unaccount_mem(ctx, imu->acct_pages);
-	kvfree(imu->bvec);
-	imu->nr_bvecs = 0;
+	kvfree(imu);
+	*slot = NULL;
 }
 
 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
@@ -8157,7 +8158,7 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
 
 	/* check previously registered pages */
 	for (i = 0; i < ctx->nr_user_bufs; i++) {
-		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
 
 		for (j = 0; j < imu->nr_bvecs; j++) {
 			if (!PageCompound(imu->bvec[j].bv_page))
@@ -8202,9 +8203,10 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 }
 
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
-				  struct io_mapped_ubuf *imu,
+				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage)
 {
+	struct io_mapped_ubuf *imu = NULL;
 	struct vm_area_struct **vmas = NULL;
 	struct page **pages = NULL;
 	unsigned long off, start, end, ubuf;
@@ -8216,6 +8218,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	start = ubuf >> PAGE_SHIFT;
 	nr_pages = end - start;
 
+	*pimu = NULL;
 	ret = -ENOMEM;
 
 	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
@@ -8227,8 +8230,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	if (!vmas)
 		goto done;
 
-	imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
-				   GFP_KERNEL);
+	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
 	if (!imu->bvec)
 		goto done;
 
@@ -8258,14 +8260,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		 */
 		if (pret > 0)
 			unpin_user_pages(pages, pret);
-		kvfree(imu->bvec);
 		goto done;
 	}
 
 	ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
 	if (ret) {
 		unpin_user_pages(pages, pret);
-		kvfree(imu->bvec);
 		goto done;
 	}
 
@@ -8285,8 +8285,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	imu->ubuf = ubuf;
 	imu->ubuf_end = ubuf + iov->iov_len;
 	imu->nr_bvecs = nr_pages;
+	*pimu = imu;
 	ret = 0;
 done:
+	if (ret)
+		kvfree(imu);
 	kvfree(pages);
 	kvfree(vmas);
 	return ret;
@@ -8336,15 +8339,15 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		return ret;
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
-		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
-
 		ret = io_copy_iov(ctx, &iov, arg, i);
 		if (ret)
 			break;
 		ret = io_buffer_validate(&iov);
 		if (ret)
 			break;
-		ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
+
+		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
+					     &last_hpage);
 		if (ret)
 			break;
 	}
@@ -9291,7 +9294,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	}
 	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
 	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
-		struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
+		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
 		unsigned int len = buf->ubuf_end - buf->ubuf;
 
 		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
--
cgit v1.2.3

From eae071c9b4cefbcc3f985c5abf9a6e32c1608ca9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 25 Apr 2021 14:32:24 +0100
Subject: io_uring: prepare fixed rw for dynamic buffers

With dynamic buffer updates, registered buffers in the table may change
at any moment. First of all we want to prevent future races between
updating and importing (i.e. io_import_fixed()), where the latter one
may happen without uring_lock held, e.g. from io-wq. Save the first
loaded io_mapped_ubuf buffer and reuse it.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/21a2302d07766ae956640b6f753292c45200fe8f.1619356238.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b6ec14d26673..4f427cf53537 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -839,6 +839,8 @@ struct io_kiocb {
 	struct hlist_node		hash_node;
 	struct async_poll		*apoll;
 	struct io_wq_work		work;
+	/* store used ubuf, so we can prevent reloading */
+	struct io_mapped_ubuf		*imu;
 };
 
 struct io_tctx_node {
@@ -2683,6 +2685,12 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		kiocb->ki_complete = io_complete_rw;
 	}
 
+	if (req->opcode == IORING_OP_READ_FIXED ||
+	    req->opcode == IORING_OP_WRITE_FIXED) {
+		req->imu = NULL;
+		io_req_set_rsrc_node(req);
+	}
+
 	req->rw.addr = READ_ONCE(sqe->addr);
 	req->rw.len = READ_ONCE(sqe->len);
 	req->buf_index = READ_ONCE(sqe->buf_index);
@@ -2748,21 +2756,13 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
 	}
 }
 
-static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
+static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
+			     struct io_mapped_ubuf *imu)
 {
-	struct io_ring_ctx *ctx = req->ctx;
 	size_t len = req->rw.len;
-	struct io_mapped_ubuf *imu;
-	u16 index, buf_index = req->buf_index;
 	u64 buf_end, buf_addr = req->rw.addr;
 	size_t offset;
 
-	if (unlikely(buf_index >= ctx->nr_user_bufs))
-		return -EFAULT;
-	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
-	imu = ctx->user_bufs[index];
-	buf_addr = req->rw.addr;
-
 	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
 		return -EFAULT;
 	/* not inside the mapped region */
@@ -2814,6 +2814,22 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
 	return 0;
 }
 
+static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_mapped_ubuf *imu = req->imu;
+	u16 index, buf_index = req->buf_index;
+
+	if (likely(!imu)) {
+		if (unlikely(buf_index >= ctx->nr_user_bufs))
+			return -EFAULT;
+		index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+		imu = READ_ONCE(ctx->user_bufs[index]);
+		req->imu = imu;
+	}
+	return __io_import_fixed(req, rw, iter, imu);
+}
+
 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
 {
 	if (needs_lock)
@@ -9506,6 +9522,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	ret = io_sq_offload_create(ctx, p);
 	if (ret)
 		goto err;
+	/* always set a rsrc node */
+	io_rsrc_node_switch_start(ctx);
+	io_rsrc_node_switch(ctx, NULL);
 
 	memset(&p->sq_off, 0, sizeof(p->sq_off));
 	p->sq_off.head = offsetof(struct io_rings, sq.head);
--
cgit v1.2.3
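
[Illustration, not part of the patch series: taken together, the two
patches above pair a table of pointers with a cache-on-first-use load.
An updater publishes a whole io_mapped_ubuf by swinging one table
pointer, and a request snapshots that pointer on first import, so a
later import (e.g. an io-wq retry) never rereads a slot that may have
been swapped. A minimal userspace sketch of the pattern; all names are
made up and C11 atomics stand in for the kernel's READ_ONCE.]

#include <stdatomic.h>
#include <stddef.h>

struct mapped_buf {
	unsigned long start, end;
	size_t nr_vecs;
	struct { void *base; size_t len; } vec[]; /* one allocation, like bvec[] */
};

#define NR_SLOTS 16
/* table of pointers: an update swaps one pointer; the pointee is immutable */
static _Atomic(struct mapped_buf *) user_bufs[NR_SLOTS];

struct req {
	struct mapped_buf *imu;	/* like req->imu: NULL until first import */
	unsigned buf_index;
};

static struct mapped_buf *import_fixed(struct req *req)
{
	struct mapped_buf *imu = req->imu;

	if (!imu) {
		if (req->buf_index >= NR_SLOTS)
			return NULL;
		/* single acquire load; keep this snapshot for the req's lifetime */
		imu = atomic_load_explicit(&user_bufs[req->buf_index],
					   memory_order_acquire);
		req->imu = imu;
	}
	return imu;	/* repeat calls reuse the cached pointer */
}

int main(void)
{
	static struct mapped_buf buf = { .start = 0, .end = 4096, .nr_vecs = 0 };
	struct req r = { .imu = NULL, .buf_index = 3 };

	atomic_store(&user_bufs[3], &buf);
	return import_fixed(&r) == &buf ? 0 : 1;
}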
From bd54b6fe3316ec1d469513b888ced31eec20032a Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh
Date: Sun, 25 Apr 2021 14:32:25 +0100
Subject: io_uring: implement fixed buffers registration similar to fixed files

Apply fixed_rsrc functionality for fixed buffers support.

Signed-off-by: Bijan Mottahedeh
[rebase, remove multi-level tables, fix unregister on exit]
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/17035f4f75319dc92962fce4fc04bc0afb5a68dc.1619356238.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 15 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4f427cf53537..0680d241fedc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -218,6 +218,7 @@ struct io_rsrc_put {
 	union {
 		void *rsrc;
 		struct file *file;
+		struct io_mapped_ubuf *buf;
 	};
 };
 
@@ -404,6 +405,7 @@ struct io_ring_ctx {
 	unsigned		nr_user_files;
 
 	/* if used, fixed mapped user buffers */
+	struct io_rsrc_data	*buf_data;
 	unsigned		nr_user_bufs;
 	struct io_mapped_ubuf	**user_bufs;
 
@@ -5927,7 +5929,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
 			req->opcode);
-	return-EINVAL;
+	return -EINVAL;
 }
 
 static int io_req_prep_async(struct io_kiocb *req)
@@ -8110,19 +8112,36 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 	*slot = NULL;
 }
 
-static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
 {
-	unsigned int i;
+	/* no updates yet, so not used */
+	WARN_ON_ONCE(1);
+}
 
-	if (!ctx->user_bufs)
-		return -ENXIO;
+static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+{
+	unsigned int i;
 
 	for (i = 0; i < ctx->nr_user_bufs; i++)
 		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
 	kfree(ctx->user_bufs);
+	kfree(ctx->buf_data);
 	ctx->user_bufs = NULL;
+	ctx->buf_data = NULL;
 	ctx->nr_user_bufs = 0;
-	return 0;
+}
+
+static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+{
+	int ret;
+
+	if (!ctx->buf_data)
+		return -ENXIO;
+
+	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
+	if (!ret)
+		__io_sqe_buffers_unregister(ctx);
+	return ret;
 }
 
 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
@@ -8342,17 +8361,26 @@ static int io_buffer_validate(struct iovec *iov)
 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 				   unsigned int nr_args)
 {
+	struct page *last_hpage = NULL;
+	struct io_rsrc_data *data;
 	int i, ret;
 	struct iovec iov;
-	struct page *last_hpage = NULL;
 
 	if (ctx->user_bufs)
 		return -EBUSY;
 	if (!nr_args || nr_args > UIO_MAXIOV)
 		return -EINVAL;
-	ret = io_buffers_map_alloc(ctx, nr_args);
+	ret = io_rsrc_node_switch_start(ctx);
 	if (ret)
 		return ret;
+	data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args);
+	if (!data)
+		return -ENOMEM;
+	ret = io_buffers_map_alloc(ctx, nr_args);
+	if (ret) {
+		kfree(data);
+		return ret;
+	}
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
 		ret = io_copy_iov(ctx, &iov, arg, i);
@@ -8368,9 +8396,13 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		break;
 	}
 
-	if (ret)
-		io_sqe_buffers_unregister(ctx);
+	WARN_ON_ONCE(ctx->buf_data);
+	ctx->buf_data = data;
+	if (ret)
+		__io_sqe_buffers_unregister(ctx);
+	else
+		io_rsrc_node_switch(ctx, NULL);
 	return ret;
 }
 
@@ -8445,10 +8477,18 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 	mutex_unlock(&ctx->uring_lock);
 }
 
+static bool io_wait_rsrc_data(struct io_rsrc_data *data)
+{
+	if (!data)
+		return false;
+	if (!atomic_dec_and_test(&data->refs))
+		wait_for_completion(&data->done);
+	return true;
+}
+
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_sq_thread_finish(ctx);
-	io_sqe_buffers_unregister(ctx);
 
 	if (ctx->mm_account) {
 		mmdrop(ctx->mm_account);
@@ -8456,11 +8496,10 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	}
 
 	mutex_lock(&ctx->uring_lock);
-	if (ctx->file_data) {
-		if (!atomic_dec_and_test(&ctx->file_data->refs))
-			wait_for_completion(&ctx->file_data->done);
+	if (io_wait_rsrc_data(ctx->buf_data))
+		__io_sqe_buffers_unregister(ctx);
+	if (io_wait_rsrc_data(ctx->file_data))
 		__io_sqe_files_unregister(ctx);
-	}
 	if (ctx->rings)
 		__io_cqring_overflow_flush(ctx, true);
 	mutex_unlock(&ctx->uring_lock);
@@ -9825,6 +9864,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 static bool io_register_op_must_quiesce(int op)
 {
 	switch (op) {
+	case IORING_REGISTER_BUFFERS:
+	case IORING_UNREGISTER_BUFFERS:
 	case IORING_REGISTER_FILES:
 	case IORING_UNREGISTER_FILES:
 	case IORING_REGISTER_FILES_UPDATE:
--
cgit v1.2.3
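
[Illustration, not part of the patch series: the registration path above
is a classic allocate-then-unwind ladder. Bookkeeping (the io_rsrc_data)
is set up before the buffer table, and each failure frees exactly what
was established before it. A freestanding sketch of the same shape, with
invented names; only a sketch of the error-handling structure, not of
io_uring itself.]

#include <errno.h>
#include <stddef.h>
#include <stdlib.h>

struct rsrc_data { size_t nr; unsigned long long *tags; };

/* allocate bookkeeping first, the table second, unwind in reverse order */
static int buffers_register(struct rsrc_data **out, void ***table, size_t nr)
{
	struct rsrc_data *data;
	void **bufs;

	if (*out)
		return -EBUSY;		/* already registered */
	if (!nr)
		return -EINVAL;

	data = calloc(1, sizeof(*data));
	if (!data)
		return -ENOMEM;
	data->tags = calloc(nr, sizeof(*data->tags));
	if (!data->tags)
		goto free_data;
	bufs = calloc(nr, sizeof(*bufs));
	if (!bufs)
		goto free_tags;

	data->nr = nr;
	*table = bufs;
	*out = data;
	return 0;

free_tags:
	free(data->tags);
free_data:
	free(data);
	return -ENOMEM;
}

int main(void)
{
	struct rsrc_data *data = NULL;
	void **table = NULL;

	if (buffers_register(&data, &table, 4))
		return 1;
	free(table);
	free(data->tags);
	free(data);
	return 0;
}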
From 634d00df5e1cfc4a707b629a814bd607f726bd52 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 25 Apr 2021 14:32:26 +0100
Subject: io_uring: add full-fledged dynamic buffers support

Hook buffers into all rsrc infrastructure, including tagging and
updates.

Suggested-by: Bijan Mottahedeh
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/119ed51d68a491dae87eb55fb467a47870c86aad.1619356238.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 72 insertions(+), 4 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0680d241fedc..0ab09f7a44e7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8114,8 +8114,8 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 
 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
 {
-	/* no updates yet, so not used */
-	WARN_ON_ONCE(1);
+	io_buffer_unmap(ctx, &prsrc->buf);
+	prsrc->buf = NULL;
 }
 
 static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
@@ -8359,7 +8359,7 @@ static int io_buffer_validate(struct iovec *iov)
 }
 
 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
-				   unsigned int nr_args)
+				   unsigned int nr_args, u64 __user *tags)
 {
 	struct page *last_hpage = NULL;
 	struct io_rsrc_data *data;
@@ -8383,6 +8383,12 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+		u64 tag = 0;
+
+		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) {
+			ret = -EFAULT;
+			break;
+		}
 		ret = io_copy_iov(ctx, &iov, arg, i);
 		if (ret)
 			break;
@@ -8394,6 +8400,7 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 				&last_hpage);
 		if (ret)
 			break;
+		data->tags[i] = tag;
 	}
 
 	WARN_ON_ONCE(ctx->buf_data);
@@ -8406,6 +8413,62 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
+static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
+				   struct io_uring_rsrc_update2 *up,
+				   unsigned int nr_args)
+{
+	u64 __user *tags = u64_to_user_ptr(up->tags);
+	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
+	struct io_mapped_ubuf *imu;
+	struct page *last_hpage = NULL;
+	bool needs_switch = false;
+	__u32 done;
+	int i, err;
+
+	if (!ctx->buf_data)
+		return -ENXIO;
+	if (up->offset + nr_args > ctx->nr_user_bufs)
+		return -EINVAL;
+
+	for (done = 0; done < nr_args; done++) {
+		u64 tag = 0;
+
+		err = io_copy_iov(ctx, &iov, iovs, done);
+		if (err)
+			break;
+		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
+			err = -EFAULT;
+			break;
+		}
+
+		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
+		imu = ctx->user_bufs[i];
+		if (imu) {
+			err = io_queue_rsrc_removal(ctx->buf_data, up->offset + done,
+						    ctx->rsrc_node, imu);
+			if (err)
+				break;
+			ctx->user_bufs[i] = NULL;
+			needs_switch = true;
+		}
+
+		if (iov.iov_base || iov.iov_len) {
+			err = io_buffer_validate(&iov);
+			if (err)
+				break;
+			err = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
+						     &last_hpage);
+			if (err)
+				break;
+			ctx->buf_data->tags[up->offset + done] = tag;
+		}
+	}
+
+	if (needs_switch)
+		io_rsrc_node_switch(ctx, ctx->buf_data);
+	return done ? done : err;
+}
+
 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
 {
 	__s32 __user *fds = arg;
@@ -9807,6 +9870,8 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
 	switch (type) {
 	case IORING_RSRC_FILE:
 		return __io_sqe_files_update(ctx, up, nr_args);
+	case IORING_RSRC_BUFFER:
+		return __io_sqe_buffers_update(ctx, up, nr_args);
 	}
 	return -EINVAL;
 }
@@ -9857,6 +9922,9 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 	case IORING_RSRC_FILE:
 		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
 					     rr.nr, u64_to_user_ptr(rr.tags));
+	case IORING_RSRC_BUFFER:
+		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
+					       rr.nr, u64_to_user_ptr(rr.tags));
 	}
 	return -EINVAL;
 }
@@ -9933,7 +10001,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 
 	switch (opcode) {
 	case IORING_REGISTER_BUFFERS:
-		ret = io_sqe_buffers_register(ctx, arg, nr_args);
+		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
 		break;
 	case IORING_UNREGISTER_BUFFERS:
 		ret = -EINVAL;
--
cgit v1.2.3
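
[Illustration, not part of the patch series: a hedged sketch of how
userspace might drive the tagged buffer update this patch enables. The
liburing helper names and signatures below are assumptions based on
later liburing releases (2.1+), not something introduced here; check
your liburing version. The semantics sketched: when an update replaces
a tagged buffer, a CQE carrying the old tag is posted once the old
buffer is fully released.]

#include <liburing.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct io_uring ring;
	struct iovec iov;
	__u64 tag = 42;
	int ret;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	iov.iov_base = malloc(4096);
	iov.iov_len = 4096;
	if (!iov.iov_base)
		return 1;

	/* register a one-slot tagged buffer table (assumed helper) */
	ret = io_uring_register_buffers_tags(&ring, &iov, &tag, 1);
	if (ret < 0)
		fprintf(stderr, "register: %d\n", ret);

	/* replace slot 0; releasing the old buffer posts a CQE with tag 42 */
	tag = 43;
	ret = io_uring_register_buffers_update_tag(&ring, 0, &iov, &tag, 1);
	if (ret < 0)
		fprintf(stderr, "update: %d\n", ret);

	io_uring_queue_exit(&ring);
	free(iov.iov_base);
	return 0;
}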
From 2b4ae19c6d4842dc24d9e0cbec5c98d2766643d5 Mon Sep 17 00:00:00 2001
From: Hao Xu
Date: Sat, 24 Apr 2021 17:26:20 +0800
Subject: io_uring: update sq_thread_idle after ctx deleted

We should update sq_thread_idle any time we delete a ctx from the
ctx_list.

Fixes: 734551df6f9b ("io_uring: fix shared sqpoll cancellation hangs")
Signed-off-by: Hao Xu
Link: https://lore.kernel.org/r/1619256380-236460-1-git-send-email-haoxu@linux.alibaba.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0ab09f7a44e7..be2be0530798 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9018,6 +9018,7 @@ static void io_sqpoll_cancel_cb(struct callback_head *cb)
 	if (sqd->thread)
 		io_uring_cancel_sqpoll(sqd);
 	list_del_init(&work->ctx->sqd_list);
+	io_sqd_update_thread_idle(sqd);
 	complete(&work->completion);
 }
 
@@ -9028,7 +9029,6 @@ static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
 	struct task_struct *task;
 
 	io_sq_thread_park(sqd);
-	io_sqd_update_thread_idle(sqd);
 	task = sqd->thread;
 	if (task) {
 		init_completion(&work.completion);
@@ -9037,6 +9037,7 @@ static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
 		wake_up_process(task);
 	} else {
 		list_del_init(&ctx->sqd_list);
+		io_sqd_update_thread_idle(sqd);
 	}
 	io_sq_thread_unpark(sqd);
 
--
cgit v1.2.3

From a2a7cc32a5e8cd983912f25a242820107e5613dc Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Sun, 25 Apr 2021 01:26:04 +0200
Subject: io_uring: io_sq_thread() no longer needs to reset current->pf_io_worker

This is done by create_io_thread() now.

Signed-off-by: Stefan Metzmacher
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index be2be0530798..eef373394ee9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6809,7 +6809,6 @@ static int io_sq_thread(void *data)
 
 	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
 	set_task_comm(current, buf);
-	current->pf_io_worker = NULL;
 
 	if (sqd->sq_cpu != -1)
 		set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
--
cgit v1.2.3

From a2b4198cab7e3edcb78fce77e0e8aca130435403 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 26 Apr 2021 00:16:31 +0100
Subject: io_uring: fix invalid error check after malloc

Now we allocate io_mapped_ubuf instead of bvec, so we clearly have to
check its address after allocation.

Fixes: 41edf1a5ec967 ("io_uring: keep table of pointers to ubufs")
Reported-by: kernel test robot
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/d28eb1bc4384284f69dbce35b9f70c115ff6176f.1619392565.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index eef373394ee9..b65a25384019 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8265,7 +8265,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 
 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
-	if (!imu->bvec)
+	if (!imu)
 		goto done;
 
 	ret = 0;
--
cgit v1.2.3
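
[Illustration, not part of the patch series: the bug class fixed above
appears whenever a header and its flexible array move from two
allocations to one. The old NULL check still compiles but now
dereferences the unchecked pointer. A small standalone demonstration of
the correct check:]

#include <stdio.h>
#include <stdlib.h>

struct buf_desc {
	size_t nr;
	int vals[];	/* flexible array member, sized at allocation time */
};

int main(void)
{
	size_t nr = 8;
	/* one allocation covers header and array, like struct_size() above */
	struct buf_desc *d = malloc(sizeof(*d) + nr * sizeof(d->vals[0]));

	/* testing d->vals here would dereference a NULL d on allocation
	 * failure; test the allocation itself instead */
	if (!d)
		return 1;
	d->nr = nr;
	printf("allocated %zu slots\n", d->nr);
	free(d);
	return 0;
}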
From 615cee49b3ca55f54d527f7a6a7d0fd4fd6fef6b Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Mon, 26 Apr 2021 10:47:35 +0100
Subject: io_uring: Fix uninitialized variable up.resv

The variable up.resv is not initialized and is checked for a non-zero
value in the call to __io_register_rsrc_update(). Fix this by explicitly
setting the variable to 0.

Addresses-Coverity: ("Uninitialized scalar variable")
Fixes: c3bdad027183 ("io_uring: add generic rsrc update with tags")
Signed-off-by: Colin Ian King
Link: https://lore.kernel.org/r/20210426094735.8320-1-colin.king@canonical.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b65a25384019..577520445fa0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5842,6 +5842,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 	up.data = req->rsrc_update.arg;
 	up.nr = 0;
 	up.tags = 0;
+	up.resv = 0;
 
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
--
cgit v1.2.3

From 28090c133869b461c5366195a856d73469ab87d9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 25 Apr 2021 23:34:45 +0100
Subject: io_uring: fix work_exit sqpoll cancellations

After closing an SQPOLL ring, io_ring_exit_work() kicks in and starts
doing cancellations via io_uring_try_cancel_requests(). It will go
through io_uring_try_cancel_iowq(), which uses ctx->tctx_list, but as
the SQPOLL task doesn't have a ctx node there, its io-wq won't be
reachable and so is left uncancelled. It will eventually be cancelled
when one of the tasks dies, but if a thread group survives for long and
keeps changing rings, it will accumulate lots of unreclaimed resources
and live-locked works.

Cancel the SQPOLL task's io-wq separately in io_ring_exit_work().

Cc: stable@vger.kernel.org
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/a71a7fe345135d684025bb529d5cb1d8d6b46e10.1619389911.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 577520445fa0..f20622bd963b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8679,6 +8679,13 @@ static void io_tctx_exit_cb(struct callback_head *cb)
 	complete(&work->completion);
 }
 
+static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+	return req->ctx == data;
+}
+
 static void io_ring_exit_work(struct work_struct *work)
 {
 	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
@@ -8695,6 +8702,17 @@ static void io_ring_exit_work(struct work_struct *work)
 	 */
 	do {
 		io_uring_try_cancel_requests(ctx, NULL, NULL);
+		if (ctx->sq_data) {
+			struct io_sq_data *sqd = ctx->sq_data;
+			struct task_struct *tsk;
+
+			io_sq_thread_park(sqd);
+			tsk = sqd->thread;
+			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
+				io_wq_cancel_cb(tsk->io_uring->io_wq,
+						io_cancel_ctx_cb, ctx, true);
+			io_sq_thread_unpark(sqd);
+		}
 		WARN_ON_ONCE(time_after(jiffies, timeout));
 	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
 
@@ -8844,13 +8862,6 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
 	return true;
 }
 
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
-{
-	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
-	return req->ctx == data;
-}
-
 static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
 {
 	struct io_tctx_node *node;
--
cgit v1.2.3
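
[Illustration, not part of the patch series: io_wq_cancel_cb() used
above takes a match callback plus opaque data, so one cancellation walk
can serve any predicate (here: "the work belongs to this ctx"). A
simplified userspace sketch of that pattern over a singly linked work
list; all names invented.]

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct work {
	int ctx_id;		/* stand-in for req->ctx */
	struct work *next;
};

typedef bool (*match_fn)(struct work *w, void *data);

static bool match_ctx(struct work *w, void *data)
{
	return w->ctx_id == *(int *)data;
}

/* walk the queue and cancel every entry the match callback claims */
static int cancel_matching(struct work **head, match_fn match, void *data)
{
	int cancelled = 0;

	while (*head) {
		if (match(*head, data)) {
			struct work *victim = *head;
			*head = victim->next;	/* unlink and "cancel" */
			free(victim);
			cancelled++;
		} else {
			head = &(*head)->next;
		}
	}
	return cancelled;
}

int main(void)
{
	struct work *head = NULL;
	int target = 1;

	for (int i = 0; i < 4; i++) {
		struct work *w = malloc(sizeof(*w));
		if (!w)
			return 1;
		w->ctx_id = i % 2;	/* two interleaved "contexts" */
		w->next = head;
		head = w;
	}
	printf("cancelled %d works\n", cancel_matching(&head, match_ctx, &target));
	while (head) {			/* free the survivors */
		struct work *n = head->next;
		free(head);
		head = n;
	}
	return 0;
}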
From 9f59a9d88d3bb2708d08e0e1d03899c469c27190 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 25 Apr 2021 23:34:46 +0100
Subject: io_uring: simplify SQPOLL cancellations

All sqpoll rings (even those sharing an sqpoll task) are currently
dead-bound to the task that created them; iow, when the owner task dies
it kills all its SQPOLL rings and their inflight requests via the
task_work infra. It's neither the nicest way nor the most convenient, as
it adds extra locking/waiting and dependencies.

Leave that alone and rely on SIGKILL being delivered on thread group
exit, so there are only two cases left:

1) the thread group is dying, so the sqpoll task gets a signal and exits
itself, cancelling all requests.

2) an sqpoll ring is dying. Because refs_kill() is called, the sqpoll
task is not going to submit any new requests, and that's what we need.
And io_ring_exit_work() will do all the cancellation itself before
actually killing the ctx, so sqpoll doesn't need to worry about it.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/3cd7f166b9c326a2c932b70e71a655b03257b366.1619389911.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 45 +++------------------------------------------
 1 file changed, 3 insertions(+), 42 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f20622bd963b..6b578c380e73 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9021,41 +9021,6 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
 	return percpu_counter_sum(&tctx->inflight);
 }
 
-static void io_sqpoll_cancel_cb(struct callback_head *cb)
-{
-	struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work);
-	struct io_sq_data *sqd = work->ctx->sq_data;
-
-	if (sqd->thread)
-		io_uring_cancel_sqpoll(sqd);
-	list_del_init(&work->ctx->sqd_list);
-	io_sqd_update_thread_idle(sqd);
-	complete(&work->completion);
-}
-
-static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
-{
-	struct io_sq_data *sqd = ctx->sq_data;
-	struct io_tctx_exit work = { .ctx = ctx, };
-	struct task_struct *task;
-
-	io_sq_thread_park(sqd);
-	task = sqd->thread;
-	if (task) {
-		init_completion(&work.completion);
-		init_task_work(&work.task_work, io_sqpoll_cancel_cb);
-		io_task_work_add_head(&sqd->park_task_work, &work.task_work);
-		wake_up_process(task);
-	} else {
-		list_del_init(&ctx->sqd_list);
-		io_sqd_update_thread_idle(sqd);
-	}
-	io_sq_thread_unpark(sqd);
-
-	if (task)
-		wait_for_completion(&work.completion);
-}
-
 static void io_uring_try_cancel(struct files_struct *files)
 {
 	struct io_uring_task *tctx = current->io_uring;
@@ -9065,11 +9030,9 @@ static void io_uring_try_cancel(struct files_struct *files)
 	xa_for_each(&tctx->xa, index, node) {
 		struct io_ring_ctx *ctx = node->ctx;
 
-		if (ctx->sq_data) {
-			io_sqpoll_cancel_sync(ctx);
-			continue;
-		}
-		io_uring_try_cancel_requests(ctx, current, files);
+		/* sqpoll task will cancel all its requests */
+		if (!ctx->sq_data)
+			io_uring_try_cancel_requests(ctx, current, files);
 	}
 }
 
@@ -9117,8 +9080,6 @@ void __io_uring_cancel(struct files_struct *files)
 	/* make sure overflow events are dropped */
 	atomic_inc(&tctx->in_idle);
 
-	io_uring_try_cancel(files);
-
 	do {
 		/* read completions before cancelations */
 		inflight = tctx_inflight(tctx, !!files);
--
cgit v1.2.3
From 0b8c0e7c9692cfcfa02c9052d4d53ae67901c400 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 26 Apr 2021 15:17:38 +0100
Subject: io_uring: fix NULL reg-buffer

io_import_fixed() doesn't expect a registered buffer slot to be NULL and
would fail stumbling on it. We don't allow it, but if during
__io_sqe_buffers_update() rsrc removal succeeds while the following
register fails, we'll get exactly that situation.

Do it atomically and don't remove buffers until we are sure that a new
one can be set.

Fixes: 634d00df5e1cf ("io_uring: add full-fledged dynamic buffers support")
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/830020f9c387acddd51962a3123b5566571b8c6d.1619446608.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6b578c380e73..863420e184cf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8419,7 +8419,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 {
 	u64 __user *tags = u64_to_user_ptr(up->tags);
 	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
-	struct io_mapped_ubuf *imu;
 	struct page *last_hpage = NULL;
 	bool needs_switch = false;
 	__u32 done;
@@ -8431,6 +8430,8 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
+		struct io_mapped_ubuf *imu;
+		int offset = up->offset + done;
 		u64 tag = 0;
 
 		err = io_copy_iov(ctx, &iov, iovs, done);
@@ -8440,28 +8441,27 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			err = -EFAULT;
 			break;
 		}
+		err = io_buffer_validate(&iov);
+		if (err)
+			break;
+		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
+		if (err)
+			break;
 
-		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
-		imu = ctx->user_bufs[i];
-		if (imu) {
-			err = io_queue_rsrc_removal(ctx->buf_data, up->offset + done,
-						    ctx->rsrc_node, imu);
-			if (err)
+		i = array_index_nospec(offset, ctx->nr_user_bufs);
+		if (ctx->user_bufs[i]) {
+			err = io_queue_rsrc_removal(ctx->buf_data, offset,
						    ctx->rsrc_node, ctx->user_bufs[i]);
+			if (unlikely(err)) {
+				io_buffer_unmap(ctx, &imu);
 				break;
+			}
 			ctx->user_bufs[i] = NULL;
 			needs_switch = true;
 		}
 
-		if (iov.iov_base || iov.iov_len) {
-			err = io_buffer_validate(&iov);
-			if (err)
-				break;
-			err = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
-						     &last_hpage);
-			if (err)
-				break;
-			ctx->buf_data->tags[up->offset + done] = tag;
-		}
+		ctx->user_bufs[i] = imu;
+		ctx->buf_data->tags[offset] = tag;
 	}
 
 	if (needs_switch)
--
cgit v1.2.3
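
[Illustration, not part of the patch series: the fix reorders the update
so nothing that can fail happens after the live slot is touched. Build
the replacement first, queue the old entry for removal, and only then
install the new pointer; a reader then sees either the old or the new
buffer, never NULL. A standalone sketch of the ordering, with invented
names.]

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* prepare the replacement before touching the live slot */
static int replace_slot(char **slot, const char *new_val,
			int (*queue_removal)(char *old))
{
	char *fresh = strdup(new_val);	/* stand-in for ..._register() */

	if (!fresh)
		return -ENOMEM;
	if (*slot && queue_removal(*slot)) {
		free(fresh);		/* removal failed: roll back, slot intact */
		return -ENOMEM;
	}
	*slot = fresh;			/* install last: one pointer store */
	return 0;
}

static int queue_removal_ok(char *old)
{
	free(old);	/* a real implementation would defer this put */
	return 0;
}

int main(void)
{
	char *slot = strdup("old");

	if (!slot || replace_slot(&slot, "new", queue_removal_ok))
		return 1;
	puts(slot);	/* "new": the slot was never observable as NULL */
	free(slot);
	return 0;
}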
From 6d042ffb598ed83e7d5623cc961d249def5b9829 Mon Sep 17 00:00:00 2001
From: Palash Oswal
Date: Tue, 27 Apr 2021 18:21:49 +0530
Subject: io_uring: Check current->io_uring in io_uring_cancel_sqpoll

syzkaller identified "KASAN: null-ptr-deref Write in
io_uring_cancel_sqpoll". io_uring_cancel_sqpoll is called by
io_sq_thread before calling io_uring_alloc_task_context. This leads to
current->io_uring being NULL. io_uring_cancel_sqpoll should not have to
deal with threads where current->io_uring is NULL.

In order to cast a wider safety net, perform input sanitisation directly
in io_uring_cancel_sqpoll and return early when current->io_uring is
NULL. This is safe since if current->io_uring isn't set, then there's no
way for the task to have submitted any requests.

Reported-by: syzbot+be51ca5a4d97f017cd50@syzkaller.appspotmail.com
Cc: stable@vger.kernel.org
Signed-off-by: Palash Oswal
Link: https://lore.kernel.org/r/20210427125148.21816-1-hello@oswalpalash.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 863420e184cf..81096f3b01ea 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9044,6 +9044,8 @@ static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
 	s64 inflight;
 	DEFINE_WAIT(wait);
 
+	if (!current->io_uring)
+		return;
 	WARN_ON_ONCE(!sqd || sqd->thread != current);
 
 	atomic_inc(&tctx->in_idle);
--
cgit v1.2.3

From 7b289c38335ec7bebe45ed31137d596c808e23ac Mon Sep 17 00:00:00 2001
From: Hao Xu
Date: Tue, 13 Apr 2021 15:20:39 +0800
Subject: io_uring: maintain drain logic for multishot poll requests

Now that we have multishot poll requests, one SQE can emit multiple
CQEs. Consider the example below:

    sqe0(multishot poll)-->sqe1-->sqe2(drain req)

sqe2 is designed to issue after sqe0 and sqe1 have completed, but since
sqe0 is a multishot poll request, sqe2 may be issued once sqe0's event
has triggered twice, before sqe1 completes. This isn't what users
leverage drain requests for.

Here the solution is to wait for multishot poll requests to fully
complete. To achieve this, we should reconsider the req_need_defer
equation; the original one is:

    all_sqes(excluding dropped ones) == all_cqes(including dropped ones)

This means we issue a drain request when all the previously submitted
SQEs have generated their CQEs. Now we should consider multishot
requests: we deduct all the multishot CQEs except the final
(cancellation) one. In this way a multishot poll request behaves like a
normal request, so:

    all_sqes == all_cqes - multishot_cqes(except cancellations)

Here we introduce cq_extra for it.

Signed-off-by: Hao Xu
Link: https://lore.kernel.org/r/1618298439-136286-1-git-send-email-haoxu@linux.alibaba.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 81096f3b01ea..63ff70587d4f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -428,6 +428,7 @@ struct io_ring_ctx {
 	unsigned		cq_mask;
 	atomic_t		cq_timeouts;
 	unsigned		cq_last_tm_flush;
+	unsigned		cq_extra;
 	unsigned long		cq_check_overflow;
 	struct wait_queue_head	cq_wait;
 	struct fasync_struct	*cq_fasync;
@@ -1193,7 +1194,7 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
 	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
 		struct io_ring_ctx *ctx = req->ctx;
 
-		return seq != ctx->cached_cq_tail
+		return seq + ctx->cq_extra != ctx->cached_cq_tail
 				+ READ_ONCE(ctx->cached_cq_overflow);
 	}
 
@@ -4901,6 +4902,9 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
 		req->poll.done = true;
 		flags = 0;
 	}
+	if (flags & IORING_CQE_F_MORE)
+		ctx->cq_extra++;
+
 	io_commit_cqring(ctx);
 	return !(flags & IORING_CQE_F_MORE);
 }
--
cgit v1.2.3
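
[Illustration, not part of the patch series: the drain arithmetic above
is small enough to check in isolation. The sketch below models
req_need_defer() with cq_extra, dropping the overflow accounting for
brevity. A non-final multishot CQE bumps both cached_cq_tail and
cq_extra, so it never brings a drain request closer to running; only the
final CQE, posted without IORING_CQE_F_MORE, does.]

#include <stdbool.h>
#include <stdio.h>

struct cq_state {
	unsigned cached_cq_tail;	/* all CQEs posted so far */
	unsigned cq_extra;		/* multishot CQEs, minus the final one */
};

static bool req_need_defer(const struct cq_state *cq, unsigned seq)
{
	/* drain req may run when seq + cq_extra == cached_cq_tail */
	return seq + cq->cq_extra != cq->cached_cq_tail;
}

int main(void)
{
	/* sqe0 (multishot poll) and sqe1 submitted; sqe2 is a drain req
	 * with seq == 2, i.e. it may run once both predecessors are done */
	struct cq_state cq = { .cached_cq_tail = 0, .cq_extra = 0 };

	cq.cached_cq_tail += 2;		/* sqe0 fires twice, F_MORE set */
	cq.cq_extra += 2;
	cq.cached_cq_tail += 1;		/* sqe1 completes normally */
	printf("defer? %d\n", req_need_defer(&cq, 2)); /* 1: sqe0 still live */

	cq.cached_cq_tail += 1;		/* sqe0's final CQE, no F_MORE */
	printf("defer? %d\n", req_need_defer(&cq, 2)); /* 0: drain may run */
	return 0;
}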