From fe315e76fc3a3f9f7e1581dc22fec7e7719f0896 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Mar 2009 14:10:21 -0400 Subject: SUNRPC: Avoid spurious wake-up during UDP connect processing To clear out old state, the UDP connect workers unconditionally invoke xs_close() before proceeding with a new connect. Nowadays this causes a spurious wake-up of the task waiting for the connect to complete. This is a little racey, but usually harmless. The waiting task immediately retries the connect via a call_bind/call_connect sequence, which usually finds the transport already in the connected state because the connect worker has finished in the background. To avoid a spurious wake-up, factor the xs_close() logic that resets the underlying socket into a helper, and have the UDP connect workers call that helper instead of xs_close(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 29c71e645b27..1127eb934136 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -767,23 +767,13 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_error_report = transport->old_error_report; } -/** - * xs_close - close a socket - * @xprt: transport - * - * This is used when all requests are complete; ie, no DRC state remains - * on the server we want to save. - */ -static void xs_close(struct rpc_xprt *xprt) +static void xs_reset_transport(struct sock_xprt *transport) { - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct socket *sock = transport->sock; struct sock *sk = transport->inet; - if (!sk) - goto clear_close_wait; - - dprintk("RPC: xs_close xprt %p\n", xprt); + if (sk == NULL) + return; write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; @@ -797,7 +787,23 @@ static void xs_close(struct rpc_xprt *xprt) sk->sk_no_check = 0; sock_release(sock); -clear_close_wait: +} + +/** + * xs_close - close a socket + * @xprt: transport + * + * This is used when all requests are complete; ie, no DRC state remains + * on the server we want to save. + */ +static void xs_close(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + dprintk("RPC: xs_close xprt %p\n", xprt); + + xs_reset_transport(transport); + smp_mb__before_clear_bit(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); @@ -1537,9 +1543,10 @@ static void xs_udp_connect_worker4(struct work_struct *work) goto out; /* Start by resetting any existing state */ - xs_close(xprt); + xs_reset_transport(transport); - if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { + err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (err < 0) { dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } @@ -1578,9 +1585,10 @@ static void xs_udp_connect_worker6(struct work_struct *work) goto out; /* Start by resetting any existing state */ - xs_close(xprt); + xs_reset_transport(transport); - if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { + err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (err < 0) { dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } -- cgit v1.2.3 From b1e1e158779f1d99c2cc18e466f6bf9099fc0853 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Wed, 11 Mar 2009 14:37:55 -0400 Subject: SVCRDMA: remove faulty assertions in rpc/rdma chunk validation. Certain client-provided RPCRDMA chunk alignments result in an additional scatter/gather entry, which triggered nfs/rdma server assertions incorrectly. OpenSolaris nfs/rdma client connectathon testing was blocked by these in the special/locking section. Signed-off-by: Tom Talpey Cc: Tom Tucker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index a3334e3b73cc..d0bea987d80e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -191,7 +191,6 @@ static int map_xdr(struct svcxprt_rdma *xprt, struct xdr_buf *xdr, struct svc_rdma_req_map *vec) { - int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; int sge_no; u32 sge_bytes; u32 page_bytes; @@ -235,7 +234,11 @@ static int map_xdr(struct svcxprt_rdma *xprt, sge_no++; } - BUG_ON(sge_no > sge_max); + dprintk("svcrdma: map_xdr: sge_no %d page_no %d " + "page_base %zd page_len %zd head_len %d tail_len %d\n", + sge_no, page_no, xdr->page_base, xdr->page_len, + xdr->head[0].iov_len, xdr->tail[0].iov_len); + vec->count = sge_no; return 0; } @@ -579,7 +582,6 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[page_no+1].length = 0; } BUG_ON(sge_no > rdma->sc_max_sge); - BUG_ON(sge_no > ctxt->count); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; send_wr.wr_id = (unsigned long)ctxt; -- cgit v1.2.3 From b38ab40ad58c1fc43ea590d6342f6a6763ac8fb6 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Wed, 11 Mar 2009 14:37:55 -0400 Subject: XPRTRDMA: correct an rpc/rdma inline send marshaling error Certain client rpc's which contain both lengthy page-contained metadata and a non-empty xdr_tail buffer require careful handling to avoid overlapped memory copying. Rearranging of existing rpcrdma marshaling code avoids it; this fixes an NFSv4 symlink creation error detected with connectathon basic/test8 to multiple servers. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 14106d26bb95..e5e28d1946a4 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -310,6 +310,19 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) __func__, pad, destp, rqst->rq_slen, curlen); copy_len = rqst->rq_snd_buf.page_len; + + if (rqst->rq_snd_buf.tail[0].iov_len) { + curlen = rqst->rq_snd_buf.tail[0].iov_len; + if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) { + memmove(destp + copy_len, + rqst->rq_snd_buf.tail[0].iov_base, curlen); + r_xprt->rx_stats.pullup_copy_count += curlen; + } + dprintk("RPC: %s: tail destp 0x%p len %d\n", + __func__, destp + copy_len, curlen); + rqst->rq_svec[0].iov_len += curlen; + } + r_xprt->rx_stats.pullup_copy_count += copy_len; npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT; for (i = 0; copy_len && i < npages; i++) { @@ -332,17 +345,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) destp += curlen; copy_len -= curlen; } - if (rqst->rq_snd_buf.tail[0].iov_len) { - curlen = rqst->rq_snd_buf.tail[0].iov_len; - if (destp != rqst->rq_snd_buf.tail[0].iov_base) { - memcpy(destp, - rqst->rq_snd_buf.tail[0].iov_base, curlen); - r_xprt->rx_stats.pullup_copy_count += curlen; - } - dprintk("RPC: %s: tail destp 0x%p len %d curlen %d\n", - __func__, destp, copy_len, curlen); - rqst->rq_svec[0].iov_len += curlen; - } /* header now contains entire send message */ return pad; } @@ -656,7 +658,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) curlen = rqst->rq_rcv_buf.tail[0].iov_len; if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) - memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); + memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", __func__, srcp, copy_len, curlen); rqst->rq_rcv_buf.tail[0].iov_len = curlen; -- cgit v1.2.3 From 441e3e242903f9b190d5764bed73edb58f977413 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Wed, 11 Mar 2009 14:37:56 -0400 Subject: SUNRPC: dynamically load RPC transport modules on-demand Provide an api to attempt to load any necessary kernel RPC client transport module automatically. By convention, the desired module name is "xprt"+"transport name". For example, when NFS mounting with "-o proto=rdma", attempt to load the "xprtrdma" module. Signed-off-by: Tom Talpey Cc: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 62098d101a1f..d1afec640394 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -151,6 +151,37 @@ out: } EXPORT_SYMBOL_GPL(xprt_unregister_transport); +/** + * xprt_load_transport - load a transport implementation + * @transport_name: transport to load + * + * Returns: + * 0: transport successfully loaded + * -ENOENT: transport module not available + */ +int xprt_load_transport(const char *transport_name) +{ + struct xprt_class *t; + char module_name[sizeof t->name + 5]; + int result; + + result = 0; + spin_lock(&xprt_list_lock); + list_for_each_entry(t, &xprt_list, list) { + if (strcmp(t->name, transport_name) == 0) { + spin_unlock(&xprt_list_lock); + goto out; + } + } + spin_unlock(&xprt_list_lock); + strcpy(module_name, "xprt"); + strncat(module_name, transport_name, sizeof t->name); + result = request_module(module_name); +out: + return result; +} +EXPORT_SYMBOL_GPL(xprt_load_transport); + /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport -- cgit v1.2.3 From 15f081ca8ddfe150fb639c591b18944a539da0fc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:37:57 -0400 Subject: SUNRPC: Avoid an unnecessary task reschedule on ENOTCONN If the socket is unconnected, and xprt_transmit() returns ENOTCONN, we currently give up the lock on the transport channel. Doing so means that the lock automatically gets assigned to the next task in the xprt->sending queue, and so that task needs to be woken up to do the actual connect. The following patch aims to avoid that unnecessary task switch. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 836f15c0c4a3..07e9b05321e6 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1105,14 +1105,24 @@ static void call_transmit_status(struct rpc_task *task) { task->tk_action = call_status; - /* - * Special case: if we've been waiting on the socket's write_space() - * callback, then don't call xprt_end_transmit(). - */ - if (task->tk_status == -EAGAIN) - return; - xprt_end_transmit(task); - rpc_task_force_reencode(task); + switch (task->tk_status) { + case -EAGAIN: + break; + default: + xprt_end_transmit(task); + /* + * Special cases: if we've been waiting on the + * socket's write_space() callback, or if the + * socket just returned a connection error, + * then hold onto the transport lock. + */ + case -ECONNREFUSED: + case -ENOTCONN: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + rpc_task_force_reencode(task); + } } /* -- cgit v1.2.3 From 670f94573104b4a25525d3fcdcd6496c678df172 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:37:58 -0400 Subject: SUNRPC: Ensure we set XPRT_CLOSING only after we've sent a tcp FIN... ...so that we can distinguish between when we need to shutdown and when we don't. Also remove the call to xs_tcp_shutdown() from xs_tcp_connect(), since xprt_connect() makes the same test. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1127eb934136..cb4bd93b9211 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1180,7 +1180,6 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ - set_bit(XPRT_CLOSING, &xprt->state); xprt_force_disconnect(xprt); case TCP_SYN_SENT: xprt->connect_cookie++; @@ -1193,6 +1192,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; break; case TCP_LAST_ACK: + set_bit(XPRT_CLOSING, &xprt->state); smp_mb__before_clear_bit(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_clear_bit(); @@ -1836,9 +1836,6 @@ static void xs_tcp_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - /* Initiate graceful shutdown of the socket if not already done */ - if (test_bit(XPRT_CONNECTED, &xprt->state)) - xs_tcp_shutdown(xprt); /* Exit if we need to wait for socket shutdown to complete */ if (test_bit(XPRT_CLOSING, &xprt->state)) return; -- cgit v1.2.3 From 40d2549db5f515e415894def98b49db7d4c56714 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:37:58 -0400 Subject: SUNRPC: Don't disconnect if a connection is still in progress. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index cb4bd93b9211..9d1898f6ee87 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1613,10 +1613,9 @@ out: * We need to preserve the port number so the reply cache on the server can * find our cached RPC replies when we get around to reconnecting. */ -static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) +static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) { int result; - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct sockaddr any; dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt); @@ -1633,6 +1632,17 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) result); } +static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) +{ + unsigned int state = transport->inet->sk_state; + + if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) + return; + if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) + return; + xs_abort_connection(xprt, transport); +} + static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1706,7 +1716,7 @@ static void xs_tcp_connect_worker4(struct work_struct *work) } } else /* "close" the socket, preserving the local port */ - xs_tcp_reuse_connection(xprt); + xs_tcp_reuse_connection(xprt, transport); dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1766,7 +1776,7 @@ static void xs_tcp_connect_worker6(struct work_struct *work) } } else /* "close" the socket, preserving the local port */ - xs_tcp_reuse_connection(xprt); + xs_tcp_reuse_connection(xprt, transport); dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); -- cgit v1.2.3 From c8485e4d634f6df155040293928707f127f0d06d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:37:59 -0400 Subject: SUNRPC: Handle ECONNREFUSED correctly in xprt_transmit() If we get an ECONNREFUSED error, we currently go to sleep on the 'xprt->sending' wait queue. The problem is that no timeout is set there, and there is nothing else that will wake the task up later. We should deal with ECONNREFUSED in call_status, given that is where we also deal with -EHOSTDOWN, and friends. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 7 ++++++- net/sunrpc/xprt.c | 38 ++++++++++++++++---------------------- net/sunrpc/xprtsock.c | 26 ++++++++++++-------------- 3 files changed, 34 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 07e9b05321e6..145715b53115 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1117,10 +1117,12 @@ call_transmit_status(struct rpc_task *task) * then hold onto the transport lock. */ case -ECONNREFUSED: + case -ECONNRESET: case -ENOTCONN: case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + case -EPIPE: rpc_task_force_reencode(task); } } @@ -1162,9 +1164,12 @@ call_status(struct rpc_task *task) xprt_conditional_disconnect(task->tk_xprt, req->rq_connect_cookie); break; + case -ECONNRESET: case -ECONNREFUSED: - case -ENOTCONN: rpc_force_rebind(clnt); + rpc_delay(task, 3*HZ); + case -EPIPE: + case -ENOTCONN: task->tk_action = call_bind; break; case -EAGAIN: diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d1afec640394..d588e755e107 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -901,32 +901,26 @@ void xprt_transmit(struct rpc_task *task) req->rq_connect_cookie = xprt->connect_cookie; req->rq_xtime = jiffies; status = xprt->ops->send_request(task); - if (status == 0) { - dprintk("RPC: %5u xmit complete\n", task->tk_pid); - spin_lock_bh(&xprt->transport_lock); + if (status != 0) { + task->tk_status = status; + return; + } - xprt->ops->set_retrans_timeout(task); + dprintk("RPC: %5u xmit complete\n", task->tk_pid); + spin_lock_bh(&xprt->transport_lock); - xprt->stat.sends++; - xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; - xprt->stat.bklog_u += xprt->backlog.qlen; + xprt->ops->set_retrans_timeout(task); - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (!req->rq_received) - rpc_sleep_on(&xprt->pending, task, xprt_timer); - spin_unlock_bh(&xprt->transport_lock); - return; - } + xprt->stat.sends++; + xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; + xprt->stat.bklog_u += xprt->backlog.qlen; - /* Note: at this point, task->tk_sleeping has not yet been set, - * hence there is no danger of the waking up task being put on - * schedq, and being picked up by a parallel run of rpciod(). - */ - task->tk_status = status; - if (status == -ECONNREFUSED) - rpc_sleep_on(&xprt->sending, task, NULL); + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_received) + rpc_sleep_on(&xprt->pending, task, xprt_timer); + spin_unlock_bh(&xprt->transport_lock); } static inline void do_xprt_reserve(struct rpc_task *task) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9d1898f6ee87..5e8198bede81 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -594,6 +594,8 @@ static int xs_udp_send_request(struct rpc_task *task) /* Still some bytes left; set up for a retry later. */ status = -EAGAIN; } + if (!transport->sock) + goto out; switch (status) { case -ENOTSOCK: @@ -603,19 +605,17 @@ static int xs_udp_send_request(struct rpc_task *task) case -EAGAIN: xs_nospace(task); break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); case -ENETUNREACH: case -EPIPE: case -ECONNREFUSED: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - break; - default: - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); } - +out: return status; } @@ -697,6 +697,8 @@ static int xs_tcp_send_request(struct rpc_task *task) status = -EAGAIN; break; } + if (!transport->sock) + goto out; switch (status) { case -ENOTSOCK: @@ -706,21 +708,17 @@ static int xs_tcp_send_request(struct rpc_task *task) case -EAGAIN: xs_nospace(task); break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); case -ECONNRESET: xs_tcp_shutdown(xprt); case -ECONNREFUSED: case -ENOTCONN: case -EPIPE: - status = -ENOTCONN; - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - xs_tcp_shutdown(xprt); } - +out: return status; } -- cgit v1.2.3 From 482f32e65d31cbf88d08306fa5d397cc945c3c26 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:00 -0400 Subject: SUNRPC: Handle socket errors correctly Ensure that we pick up and handle socket errors as they occur. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5e8198bede81..879af6f27b4c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1208,23 +1208,20 @@ static void xs_tcp_state_change(struct sock *sk) } /** - * xs_tcp_error_report - callback mainly for catching RST events + * xs_error_report - callback mainly for catching socket errors * @sk: socket */ -static void xs_tcp_error_report(struct sock *sk) +static void xs_error_report(struct sock *sk) { struct rpc_xprt *xprt; read_lock(&sk->sk_callback_lock); - if (sk->sk_err != ECONNRESET || sk->sk_state != TCP_ESTABLISHED) - goto out; if (!(xprt = xprt_from_sock(sk))) goto out; dprintk("RPC: %s client %p...\n" "RPC: error %d\n", __func__, xprt, sk->sk_err); - - xprt_force_disconnect(xprt); + xprt_wake_pending_tasks(xprt, -EAGAIN); out: read_unlock(&sk->sk_callback_lock); } @@ -1509,6 +1506,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; + sk->sk_error_report = xs_error_report; sk->sk_no_check = UDP_CSUM_NORCV; sk->sk_allocation = GFP_ATOMIC; @@ -1656,7 +1654,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; - sk->sk_error_report = xs_tcp_error_report; + sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_ATOMIC; /* socket options */ -- cgit v1.2.3 From 2a4919919a97911b0aa4b9f5ac1eab90ba87652b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:00 -0400 Subject: SUNRPC: Return EAGAIN instead of ENOTCONN when waking up xprt->pending While we should definitely return socket errors to the task that is currently trying to send data, there is no need to propagate the same error to all the other tasks on xprt->pending. Doing so actually slows down recovery, since it causes more than one tasks to attempt socket recovery. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 15 ++++--------- net/sunrpc/xprt.c | 20 ++++++------------ net/sunrpc/xprtsock.c | 58 +++++++++++++++++++++++++++------------------------ 3 files changed, 41 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 145715b53115..5abab094441f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1032,27 +1032,20 @@ call_connect_status(struct rpc_task *task) dprint_status(task); task->tk_status = 0; - if (status >= 0) { + if (status >= 0 || status == -EAGAIN) { clnt->cl_stats->netreconn++; task->tk_action = call_transmit; return; } - /* Something failed: remote service port may have changed */ - rpc_force_rebind(clnt); - switch (status) { - case -ENOTCONN: - case -EAGAIN: - task->tk_action = call_bind; - if (!RPC_IS_SOFT(task)) - return; /* if soft mounted, test if we've timed out */ case -ETIMEDOUT: task->tk_action = call_timeout; - return; + break; + default: + rpc_exit(task, -EIO); } - rpc_exit(task, -EIO); } /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d588e755e107..a0bfe53f1621 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -611,7 +611,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); - xprt_wake_pending_tasks(xprt, -ENOTCONN); + xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_disconnect_done); @@ -629,7 +629,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(rpciod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -ENOTCONN); + xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } @@ -656,7 +656,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(rpciod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -ENOTCONN); + xprt_wake_pending_tasks(xprt, -EAGAIN); out: spin_unlock_bh(&xprt->transport_lock); } @@ -726,9 +726,8 @@ static void xprt_connect_status(struct rpc_task *task) } switch (task->tk_status) { - case -ENOTCONN: - dprintk("RPC: %5u xprt_connect_status: connection broken\n", - task->tk_pid); + case -EAGAIN: + dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); break; case -ETIMEDOUT: dprintk("RPC: %5u xprt_connect_status: connect attempt timed " @@ -849,15 +848,8 @@ int xprt_prepare_transmit(struct rpc_task *task) err = req->rq_received; goto out_unlock; } - if (!xprt->ops->reserve_xprt(task)) { + if (!xprt->ops->reserve_xprt(task)) err = -EAGAIN; - goto out_unlock; - } - - if (!xprt_connected(xprt)) { - err = -ENOTCONN; - goto out_unlock; - } out_unlock: spin_unlock_bh(&xprt->transport_lock); return err; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 879af6f27b4c..8e58b0b5460b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1162,7 +1162,7 @@ static void xs_tcp_state_change(struct sock *sk) transport->tcp_flags = TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; - xprt_wake_pending_tasks(xprt, 0); + xprt_wake_pending_tasks(xprt, -EAGAIN); } spin_unlock_bh(&xprt->transport_lock); break; @@ -1721,20 +1721,22 @@ static void xs_tcp_connect_worker4(struct work_struct *work) dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); - if (status < 0) { - switch (status) { - case -EINPROGRESS: - case -EALREADY: - goto out_clear; - case -ECONNREFUSED: - case -ECONNRESET: - /* retry with existing socket, after a delay */ - break; - default: - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - } + switch (status) { + case 0: + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + printk("%s: connect returned unhandled error %d\n", + __func__, status); } + status = -EAGAIN; out: xprt_wake_pending_tasks(xprt, status); out_clear: @@ -1780,20 +1782,22 @@ static void xs_tcp_connect_worker6(struct work_struct *work) status = xs_tcp_finish_connecting(xprt, sock); dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); - if (status < 0) { - switch (status) { - case -EINPROGRESS: - case -EALREADY: - goto out_clear; - case -ECONNREFUSED: - case -ECONNRESET: - /* retry with existing socket, after a delay */ - break; - default: - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - } + switch (status) { + case 0: + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + printk("%s: connect returned unhandled error %d\n", + __func__, status); } + status = -EAGAIN; out: xprt_wake_pending_tasks(xprt, status); out_clear: -- cgit v1.2.3 From 8a2cec295f4499cc9d4452e9b02d4ed071bb42d3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:01 -0400 Subject: SUNRPC: Delay, then retry on connection errors. Enforce the comment in xs_tcp_connect_worker4/xs_tcp_connect_worker6 that we should delay, then retry on certain connection errors. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8e58b0b5460b..9f3e615d3e09 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1722,20 +1722,19 @@ static void xs_tcp_connect_worker4(struct work_struct *work) xprt, -status, xprt_connected(xprt), sock->sk->sk_state); switch (status) { + case -ECONNREFUSED: + case -ECONNRESET: + case -ENETUNREACH: + /* retry with existing socket, after a delay */ case 0: case -EINPROGRESS: case -EALREADY: goto out_clear; - case -ECONNREFUSED: - case -ECONNRESET: - /* retry with existing socket, after a delay */ - break; - default: - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - printk("%s: connect returned unhandled error %d\n", - __func__, status); } + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + printk("%s: connect returned unhandled error %d\n", + __func__, status); status = -EAGAIN; out: xprt_wake_pending_tasks(xprt, status); @@ -1783,20 +1782,19 @@ static void xs_tcp_connect_worker6(struct work_struct *work) dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); switch (status) { + case -ECONNREFUSED: + case -ECONNRESET: + case -ENETUNREACH: + /* retry with existing socket, after a delay */ case 0: case -EINPROGRESS: case -EALREADY: goto out_clear; - case -ECONNREFUSED: - case -ECONNRESET: - /* retry with existing socket, after a delay */ - break; - default: - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - printk("%s: connect returned unhandled error %d\n", - __func__, status); } + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + printk("%s: connect returned unhandled error %d\n", + __func__, status); status = -EAGAIN; out: xprt_wake_pending_tasks(xprt, status); -- cgit v1.2.3 From 5e3771ce2d6a69e10fcc870cdf226d121d868491 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:01 -0400 Subject: SUNRPC: Ensure that xs_nospace return values are propagated If xs_nospace() finds that the socket has disconnected, it attempts to return ENOTCONN, however that value is then squashed by the callers. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9f3e615d3e09..2e070679ab4a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -521,11 +521,12 @@ static void xs_nospace_callback(struct rpc_task *task) * @task: task to put to sleep * */ -static void xs_nospace(struct rpc_task *task) +static int xs_nospace(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + int ret = 0; dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", task->tk_pid, req->rq_slen - req->rq_bytes_sent, @@ -537,6 +538,7 @@ static void xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { + ret = -EAGAIN; /* * Notify TCP that we're limited by the application * window size @@ -548,10 +550,11 @@ static void xs_nospace(struct rpc_task *task) } } else { clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - task->tk_status = -ENOTCONN; + ret = -ENOTCONN; } spin_unlock_bh(&xprt->transport_lock); + return ret; } /** @@ -603,7 +606,7 @@ static int xs_udp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - xs_nospace(task); + status = xs_nospace(task); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", @@ -706,7 +709,7 @@ static int xs_tcp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - xs_nospace(task); + status = xs_nospace(task); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -- cgit v1.2.3 From 7d1e8255cf959fba7ee2317550dfde39f0b936ae Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:03 -0400 Subject: SUNRPC: Add the equivalent of the linger and linger2 timeouts to RPC sockets This fixes a regression against FreeBSD servers as reported by Tomas Kasparek. Apparently when using RPC over a TCP socket, the FreeBSD servers don't ever react to the client closing the socket, and so commit e06799f958bf7f9f8fae15f0c6f519953fb0257c (SUNRPC: Use shutdown() instead of close() when disconnecting a TCP socket) causes the setup to hang forever whenever the client attempts to close and then reconnect. We break the deadlock by adding a 'linger2' style timeout to the socket, after which, the client will abort the connection using a TCP 'RST'. The default timeout is set to 15 seconds. A subsequent patch will put it under user control by means of a systctl. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 98 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 81 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2e070679ab4a..b51f58b95c39 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -49,6 +49,8 @@ unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; +#define XS_TCP_LINGER_TO (15U * HZ) + /* * We can register our own files under /proc/sys/sunrpc by * calling register_sysctl_table() again. The files in that @@ -806,6 +808,7 @@ static void xs_close(struct rpc_xprt *xprt) xs_reset_transport(transport); smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); smp_mb__after_clear_bit(); @@ -1133,6 +1136,47 @@ out: read_unlock(&sk->sk_callback_lock); } +/* + * Do the equivalent of linger/linger2 handling for dealing with + * broken servers that don't close the socket in a timely + * fashion + */ +static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt, + unsigned long timeout) +{ + struct sock_xprt *transport; + + if (xprt_test_and_set_connecting(xprt)) + return; + set_bit(XPRT_CONNECTION_ABORT, &xprt->state); + transport = container_of(xprt, struct sock_xprt, xprt); + queue_delayed_work(rpciod_workqueue, &transport->connect_worker, + timeout); +} + +static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport; + + transport = container_of(xprt, struct sock_xprt, xprt); + + if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) || + !cancel_delayed_work(&transport->connect_worker)) + return; + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + xprt_clear_connecting(xprt); +} + +static void xs_sock_mark_closed(struct rpc_xprt *xprt) +{ + smp_mb__before_clear_bit(); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_clear_bit(); + /* Mark transport as closed and wake up all pending tasks */ + xprt_disconnect_done(xprt); +} + /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1178,6 +1222,7 @@ static void xs_tcp_state_change(struct sock *sk) clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); smp_mb__after_clear_bit(); + xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ @@ -1194,17 +1239,14 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); + xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); smp_mb__before_clear_bit(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_clear_bit(); break; case TCP_CLOSE: - smp_mb__before_clear_bit(); - clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); - /* Mark transport as closed and wake up all pending tasks */ - xprt_disconnect_done(xprt); + xs_tcp_cancel_linger_timeout(xprt); + xs_sock_mark_closed(xprt); } out: read_unlock(&sk->sk_callback_lock); @@ -1562,8 +1604,8 @@ static void xs_udp_connect_worker4(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** @@ -1604,8 +1646,8 @@ static void xs_udp_connect_worker6(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /* @@ -1626,7 +1668,9 @@ static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transpo memset(&any, 0, sizeof(any)); any.sa_family = AF_UNSPEC; result = kernel_connect(transport->sock, &any, sizeof(any), 0); - if (result) + if (!result) + xs_sock_mark_closed(xprt); + else dprintk("RPC: AF_UNSPEC connect return code %d\n", result); } @@ -1702,6 +1746,7 @@ static void xs_tcp_connect_worker4(struct work_struct *work) goto out; if (!sock) { + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); /* start from scratch */ if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { dprintk("RPC: can't create TCP transport socket (%d).\n", -err); @@ -1713,10 +1758,18 @@ static void xs_tcp_connect_worker4(struct work_struct *work) sock_release(sock); goto out; } - } else + } else { + int abort_and_exit; + + abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, + &xprt->state); /* "close" the socket, preserving the local port */ xs_tcp_reuse_connection(xprt, transport); + if (abort_and_exit) + goto out_eagain; + } + dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1732,17 +1785,18 @@ static void xs_tcp_connect_worker4(struct work_struct *work) case 0: case -EINPROGRESS: case -EALREADY: - goto out_clear; + xprt_clear_connecting(xprt); + return; } /* get rid of existing socket, and retry */ xs_tcp_shutdown(xprt); printk("%s: connect returned unhandled error %d\n", __func__, status); +out_eagain: status = -EAGAIN; out: - xprt_wake_pending_tasks(xprt, status); -out_clear: xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** @@ -1763,6 +1817,7 @@ static void xs_tcp_connect_worker6(struct work_struct *work) goto out; if (!sock) { + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); /* start from scratch */ if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { dprintk("RPC: can't create TCP transport socket (%d).\n", -err); @@ -1774,10 +1829,18 @@ static void xs_tcp_connect_worker6(struct work_struct *work) sock_release(sock); goto out; } - } else + } else { + int abort_and_exit; + + abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, + &xprt->state); /* "close" the socket, preserving the local port */ xs_tcp_reuse_connection(xprt, transport); + if (abort_and_exit) + goto out_eagain; + } + dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1792,17 +1855,18 @@ static void xs_tcp_connect_worker6(struct work_struct *work) case 0: case -EINPROGRESS: case -EALREADY: - goto out_clear; + xprt_clear_connecting(xprt); + return; } /* get rid of existing socket, and retry */ xs_tcp_shutdown(xprt); printk("%s: connect returned unhandled error %d\n", __func__, status); +out_eagain: status = -EAGAIN; out: - xprt_wake_pending_tasks(xprt, status); -out_clear: xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); } /** -- cgit v1.2.3 From 25fe6142a57c720452c5e9ddbc1f32309c1e5c19 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:03 -0400 Subject: SUNRPC: Add a sysctl to control the duration of the socket linger timeout Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index b51f58b95c39..42222b4dd76d 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -50,6 +50,7 @@ unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; #define XS_TCP_LINGER_TO (15U * HZ) +static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; /* * We can register our own files under /proc/sys/sunrpc by @@ -118,6 +119,14 @@ static ctl_table xs_tunables_table[] = { .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, + { + .procname = "tcp_fin_timeout", + .data = &xs_tcp_fin_timeout, + .maxlen = sizeof(xs_tcp_fin_timeout), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = sysctl_jiffies + }, { .ctl_name = 0, }, @@ -1222,7 +1231,7 @@ static void xs_tcp_state_change(struct sock *sk) clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); smp_mb__after_clear_bit(); - xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); + xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ @@ -1239,7 +1248,7 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); - xs_tcp_schedule_linger_timeout(xprt, XS_TCP_LINGER_TO); + xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); smp_mb__before_clear_bit(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_clear_bit(); -- cgit v1.2.3 From b61d59fffd3e5b6037c92b4c840605831de8a251 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 14:38:04 -0400 Subject: =?UTF-8?q?SUNRPC:=C2=A0xs=5Ftcp=5Fconnect=5Fworker{4,6}:=20merge?= =?UTF-8?q?=20common=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 140 ++++++++++++++++++++++++++------------------------ 1 file changed, 72 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 42222b4dd76d..f05a56e597ef 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1738,33 +1738,29 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) } /** - * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint - * @work: RPC transport to connect + * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint + * @xprt: RPC transport to connect + * @transport: socket transport to connect + * @create_sock: function to create a socket of the correct type * * Invoked by a work queue tasklet. */ -static void xs_tcp_connect_worker4(struct work_struct *work) +static void xs_tcp_setup_socket(struct rpc_xprt *xprt, + struct sock_xprt *transport, + struct socket *(*create_sock)(struct rpc_xprt *, + struct sock_xprt *)) { - struct sock_xprt *transport = - container_of(work, struct sock_xprt, connect_worker.work); - struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; - int err, status = -EIO; + int status = -EIO; if (xprt->shutdown) goto out; if (!sock) { clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - /* start from scratch */ - if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", -err); - goto out; - } - xs_reclassify_socket4(sock); - - if (xs_bind4(transport, sock) < 0) { - sock_release(sock); + sock = create_sock(xprt, transport); + if (IS_ERR(sock)) { + status = PTR_ERR(sock); goto out; } } else { @@ -1808,74 +1804,82 @@ out: xprt_wake_pending_tasks(xprt, status); } +static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt, + struct sock_xprt *transport) +{ + struct socket *sock; + int err; + + /* start from scratch */ + err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", + -err); + goto out_err; + } + xs_reclassify_socket4(sock); + + if (xs_bind4(transport, sock) < 0) { + sock_release(sock); + goto out_err; + } + return sock; +out_err: + return ERR_PTR(-EIO); +} + /** - * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint + * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint * @work: RPC transport to connect * * Invoked by a work queue tasklet. */ -static void xs_tcp_connect_worker6(struct work_struct *work) +static void xs_tcp_connect_worker4(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; - struct socket *sock = transport->sock; - int err, status = -EIO; - if (xprt->shutdown) - goto out; - - if (!sock) { - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - /* start from scratch */ - if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", -err); - goto out; - } - xs_reclassify_socket6(sock); + xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock4); +} - if (xs_bind6(transport, sock) < 0) { - sock_release(sock); - goto out; - } - } else { - int abort_and_exit; +static struct socket *xs_create_tcp_sock6(struct rpc_xprt *xprt, + struct sock_xprt *transport) +{ + struct socket *sock; + int err; - abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, - &xprt->state); - /* "close" the socket, preserving the local port */ - xs_tcp_reuse_connection(xprt, transport); + /* start from scratch */ + err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", + -err); + goto out_err; + } + xs_reclassify_socket6(sock); - if (abort_and_exit) - goto out_eagain; + if (xs_bind6(transport, sock) < 0) { + sock_release(sock); + goto out_err; } + return sock; +out_err: + return ERR_PTR(-EIO); +} - dprintk("RPC: worker connecting xprt %p to address: %s\n", - xprt, xprt->address_strings[RPC_DISPLAY_ALL]); +/** + * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint + * @work: RPC transport to connect + * + * Invoked by a work queue tasklet. + */ +static void xs_tcp_connect_worker6(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, connect_worker.work); + struct rpc_xprt *xprt = &transport->xprt; - status = xs_tcp_finish_connecting(xprt, sock); - dprintk("RPC: %p connect status %d connected %d sock state %d\n", - xprt, -status, xprt_connected(xprt), sock->sk->sk_state); - switch (status) { - case -ECONNREFUSED: - case -ECONNRESET: - case -ENETUNREACH: - /* retry with existing socket, after a delay */ - case 0: - case -EINPROGRESS: - case -EALREADY: - xprt_clear_connecting(xprt); - return; - } - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - printk("%s: connect returned unhandled error %d\n", - __func__, status); -out_eagain: - status = -EAGAIN; -out: - xprt_clear_connecting(xprt); - xprt_wake_pending_tasks(xprt, status); + xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock6); } /** -- cgit v1.2.3 From 55420c24a0d4d1fce70ca713f84aa00b6b74a70e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Mar 2009 15:29:24 -0400 Subject: SUNRPC: Ensure we close the socket on EPIPE errors too... As long as one task is holding the socket lock, then calls to xprt_force_disconnect(xprt) will not succeed in shutting down the socket. In particular, this would mean that a server initiated shutdown will not succeed until the lock is relinquished. In order to avoid the deadlock, we should ensure that xs_tcp_send_request() closes the socket on EPIPE errors too. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index f05a56e597ef..fbc8725c20cb 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -726,10 +726,10 @@ static int xs_tcp_send_request(struct rpc_task *task) dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); case -ECONNRESET: + case -EPIPE: xs_tcp_shutdown(xprt); case -ECONNREFUSED: case -ENOTCONN: - case -EPIPE: clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); } out: -- cgit v1.2.3 From 2e3c230bc7149a6af65d26a0c312e230e0c33cc3 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 12 Mar 2009 22:21:21 -0400 Subject: SVCRDMA: fix recent printk format warnings. printk formats in prior commit were reversed/incorrect. Compiled without warning on x86 and x86_64, but detected on ppc. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index d0bea987d80e..6c26a675435a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -235,7 +235,7 @@ static int map_xdr(struct svcxprt_rdma *xprt, } dprintk("svcrdma: map_xdr: sge_no %d page_no %d " - "page_base %zd page_len %zd head_len %d tail_len %d\n", + "page_base %u page_len %u head_len %zu tail_len %zu\n", sge_no, page_no, xdr->page_base, xdr->page_len, xdr->head[0].iov_len, xdr->tail[0].iov_len); -- cgit v1.2.3 From 776bd5c7a207de546918f805090bfc823d2660c8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:45:28 -0400 Subject: SUNRPC: Don't flag empty RPCB_GETADDR reply as bogus In 2007, commit e65fe3976f594603ed7b1b4a99d3e9b867f573ea added additional sanity checking to rpcb_decode_getaddr() to make sure we were getting a reply that was long enough to be an actual universal address. If the uaddr string isn't long enough, the XDR decoder returns EIO. However, an empty string is a valid RPCB_GETADDR response if the requested service isn't registered. Moreover, "::.n.m" is also a valid RPCB_GETADDR response for IPv6 addresses that is shorter than rpcb_decode_getaddr()'s lower limit of 11. So this sanity check introduced a regression for rpcbind requests against IPv6 remotes. So revert the lower bound check added by commit e65fe3976f594603ed7b1b4a99d3e9b867f573ea, and add an explicit check for an empty uaddr string, similar to libtirpc's rpcb_getaddr(3). Pointed-out-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 03ae007641e4..2caa7edeeaba 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -703,11 +703,16 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p, *portp = 0; addr_len = ntohl(*p++); + if (addr_len == 0) { + dprintk("RPC: rpcb_decode_getaddr: " + "service is not registered\n"); + return 0; + } + /* - * Simple sanity check. The smallest possible universal - * address is an IPv4 address string containing 11 bytes. + * Simple sanity check. */ - if (addr_len < 11 || addr_len > RPCBIND_MAXUADDRLEN) + if (addr_len > RPCBIND_MAXUADDRLEN) goto out_err; /* -- cgit v1.2.3 From 156e62094a74cf43f02f56ef96b6cda567501357 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:45:58 -0400 Subject: SUNRPC: Clean up svc_find_xprt() calling sequence Clean up: add documentating comment and use appropriate data types for svc_find_xprt()'s arguments. This also eliminates a mixed sign comparison: @port was an int, while the return value of svc_xprt_local_port() is an unsigned short. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svc_xprt.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e588df5d6b34..c947c93dbc24 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1033,7 +1033,13 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) return dr; } -/* +/** + * svc_find_xprt - find an RPC transport instance + * @serv: pointer to svc_serv to search + * @xcl_name: C string containing transport's class name + * @af: Address family of transport's local address + * @port: transport's IP port number + * * Return the transport instance pointer for the endpoint accepting * connections/peer traffic from the specified transport class, * address family and port. @@ -1042,14 +1048,14 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) * wild-card, and will result in matching the first transport in the * service's list that has a matching class name. */ -struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, - int af, int port) +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, + const sa_family_t af, const unsigned short port) { struct svc_xprt *xprt; struct svc_xprt *found = NULL; /* Sanity check the args */ - if (!serv || !xcl_name) + if (serv == NULL || xcl_name == NULL) return found; spin_lock_bh(&serv->sv_lock); @@ -1058,7 +1064,7 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, continue; if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) continue; - if (port && port != svc_xprt_local_port(xprt)) + if (port != 0 && port != svc_xprt_local_port(xprt)) continue; found = xprt; svc_xprt_get(xprt); -- cgit v1.2.3 From 4b62e58cccff9c5e7ffc7023f7ec24c75fbd549b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:06 -0400 Subject: SUNRPC: Pass a family argument to svc_register() The sv_family field is going away. Instead of using sv_family, have the svc_register() function take a protocol family argument. Since this argument represents a protocol family, and not an address family, this argument takes an int, as this is what is passed to sock_create_kern(). Also make sure svc_register's helpers are checking for PF_FOO instead of AF_FOO. The value of [AP]F_FOO are equivalent; this is simply a symbolic change to reflect the semantics of the value stored in that variable. sock_create_kern() should return EPFNOSUPPORT if the passed-in protocol family isn't supported, but it uses EAFNOSUPPORT for this case. We will stick with that tradition here, as svc_register() is called by the RPC server in the same path as sock_create_kern(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svc.c | 21 +++++++++++---------- net/sunrpc/svcsock.c | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c51fed4d1af1..41bc36ea2224 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -800,17 +800,17 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, * if any error occurs. */ static int __svc_register(const u32 program, const u32 version, - const sa_family_t family, + const int family, const unsigned short protocol, const unsigned short port) { int error; switch (family) { - case AF_INET: + case PF_INET: return __svc_rpcb_register4(program, version, protocol, port); - case AF_INET6: + case PF_INET6: error = __svc_rpcb_register6(program, version, protocol, port); if (error < 0) @@ -840,11 +840,11 @@ static int __svc_register(const u32 program, const u32 version, * if any error occurs. */ static int __svc_register(const u32 program, const u32 version, - sa_family_t family, + const int family, const unsigned short protocol, const unsigned short port) { - if (family != AF_INET) + if (family != PF_INET) return -EAFNOSUPPORT; return rpcb_register(program, version, protocol, port); @@ -855,13 +855,14 @@ static int __svc_register(const u32 program, const u32 version, /** * svc_register - register an RPC service with the local portmapper * @serv: svc_serv struct for the service to register + * @family: protocol family of service's listener socket * @proto: transport protocol number to advertise * @port: port to advertise * - * Service is registered for any address in serv's address family + * Service is registered for any address in the passed-in protocol family */ -int svc_register(const struct svc_serv *serv, const unsigned short proto, - const unsigned short port) +int svc_register(const struct svc_serv *serv, const int family, + const unsigned short proto, const unsigned short port) { struct svc_program *progp; unsigned int i; @@ -879,7 +880,7 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto, i, proto == IPPROTO_UDP? "udp" : "tcp", port, - serv->sv_family, + family, progp->pg_vers[i]->vs_hidden? " (but not telling portmap)" : ""); @@ -887,7 +888,7 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto, continue; error = __svc_register(progp->pg_prog, i, - serv->sv_family, proto, port); + family, proto, port); if (error < 0) break; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5763e6460fea..d00583c1cd04 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1122,7 +1122,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Register socket with portmapper */ if (*errp >= 0 && pmap_register) - *errp = svc_register(serv, inet->sk_protocol, + *errp = svc_register(serv, serv->sv_family, inet->sk_protocol, ntohs(inet_sk(inet)->sport)); if (*errp < 0) { -- cgit v1.2.3 From baf01caf09e87579c2d157e5ee29975db8551522 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:13 -0400 Subject: SUNRPC: svc_setup_socket() gets protocol family from socket Since the sv_family field is going away, modify svc_setup_socket() to extract the protocol family from the passed-in socket instead of from the passed-in svc_serv struct. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svcsock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d00583c1cd04..d00bc3307745 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1122,7 +1122,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Register socket with portmapper */ if (*errp >= 0 && pmap_register) - *errp = svc_register(serv, serv->sv_family, inet->sk_protocol, + *errp = svc_register(serv, inet->sk_family, inet->sk_protocol, ntohs(inet_sk(inet)->sport)); if (*errp < 0) { @@ -1145,13 +1145,13 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* * We start one listener per sv_serv. We want AF_INET - * requests to be automatically shunted to our AF_INET6 + * requests to be automatically shunted to our PF_INET6 * listener using a mapped IPv4 address. Make sure * no-one starts an equivalent IPv4 listener, which * would steal our incoming connections. */ val = 0; - if (serv->sv_family == AF_INET6) + if (inet->sk_family == PF_INET6) kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, (char *)&val, sizeof(val)); -- cgit v1.2.3 From 9652ada3fb5914a67d8422114e8a76388330fa79 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:21 -0400 Subject: SUNRPC: Change svc_create_xprt() to take a @family argument The sv_family field is going away. Pass a protocol family argument to svc_create_xprt() instead of extracting the family from the passed-in svc_serv struct. Again, as this is a listener socket and not an address, we make this new argument an "int" protocol family, instead of an "sa_family_t." Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svc_xprt.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c947c93dbc24..2819ee093f36 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -161,7 +161,9 @@ EXPORT_SYMBOL_GPL(svc_xprt_init); static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct svc_serv *serv, - unsigned short port, int flags) + const int family, + const unsigned short port, + int flags) { struct sockaddr_in sin = { .sin_family = AF_INET, @@ -176,12 +178,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct sockaddr *sap; size_t len; - switch (serv->sv_family) { - case AF_INET: + switch (family) { + case PF_INET: sap = (struct sockaddr *)&sin; len = sizeof(sin); break; - case AF_INET6: + case PF_INET6: sap = (struct sockaddr *)&sin6; len = sizeof(sin6); break; @@ -192,7 +194,8 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return xcl->xcl_ops->xpo_create(serv, sap, len, flags); } -int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, +int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, + const int family, const unsigned short port, int flags) { struct svc_xprt_class *xcl; @@ -209,7 +212,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = __svc_xpo_create(xcl, serv, port, flags); + newxprt = __svc_xpo_create(xcl, serv, family, port, flags); if (IS_ERR(newxprt)) { module_put(xcl->xcl_owner); return PTR_ERR(newxprt); -- cgit v1.2.3 From 49a9072f29a1039f142ec98b44a72d7173651c02 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:29 -0400 Subject: SUNRPC: Remove @family argument from svc_create() and svc_create_pooled() Since an RPC service listener's protocol family is specified now via svc_create_xprt(), it no longer needs to be passed to svc_create() or svc_create_pooled(). Remove that argument from the synopsis of those functions, and remove the sv_family field from the svc_serv struct. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 41bc36ea2224..d72ff44826d8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -359,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - sa_family_t family, void (*shutdown)(struct svc_serv *serv)) + void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; unsigned int vers; @@ -368,7 +368,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; - serv->sv_family = family; serv->sv_name = prog->pg_name; serv->sv_program = prog; serv->sv_nrthreads = 1; @@ -427,21 +426,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - sa_family_t family, void (*shutdown)(struct svc_serv *serv)) + void (*shutdown)(struct svc_serv *serv)) { - return __svc_create(prog, bufsize, /*npools*/1, family, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, shutdown); } EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - sa_family_t family, void (*shutdown)(struct svc_serv *serv), + void (*shutdown)(struct svc_serv *serv), svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, family, shutdown); + serv = __svc_create(prog, bufsize, npools, shutdown); if (serv != NULL) { serv->sv_function = func; -- cgit v1.2.3 From 7d21c0f9845f0ce4e81baac3519fbb2c6c2cc908 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:44 -0400 Subject: SUNRPC: Set IPV6ONLY flag on PF_INET6 RPC listener sockets We are about to convert to using separate RPC listener sockets for PF_INET and PF_INET6. This echoes the way IPv6 is handled in user space by TI-RPC, and eliminates the need for ULPs to worry about mapped IPv4 AF_INET6 addresses when doing address comparisons. Start by setting the IPV6ONLY flag on PF_INET6 RPC listener sockets. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svcsock.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d00bc3307745..ac6cd65220c7 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1144,13 +1144,11 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svc_tcp_init(svsk, serv); /* - * We start one listener per sv_serv. We want AF_INET - * requests to be automatically shunted to our PF_INET6 - * listener using a mapped IPv4 address. Make sure - * no-one starts an equivalent IPv4 listener, which - * would steal our incoming connections. + * If this is a PF_INET6 listener, we want to avoid + * getting requests from IPv4 remotes. Those should + * be shunted to a PF_INET listener via rpcbind. */ - val = 0; + val = 1; if (inet->sk_family == PF_INET6) kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, (char *)&val, sizeof(val)); -- cgit v1.2.3 From fc28decdc93633a65d54e42498e9e819d466329c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:51 -0400 Subject: SUNRPC: Use IPv4 loopback for registering AF_INET6 kernel RPC services The kernel uses an IPv6 loopback address when registering its AF_INET6 RPC services so that it can tell whether the local portmapper is actually IPv6-enabled. Since the legacy portmapper doesn't listen on IPv6, however, this causes a long timeout on older systems if the kernel happens to try creating and registering an AF_INET6 RPC service. Originally I wanted to use a connected transport (either TCP or connected UDP) so that the upcall would fail immediately if the portmapper wasn't listening on IPv6, but we never agreed on what transport to use. In the end, it's of little consequence to the kernel whether the local portmapper is listening on IPv6. It's only important whether the portmapper supports rpcbind v4. And the kernel can't tell that at all if it is sending requests via IPv6 -- the portmapper will just ignore them. So, send both rpcbind v2 and v4 SET/UNSET requests via IPv4 loopback to maintain better backwards compatibility between new kernels and legacy user space, and prevent multi-second hangs in some cases when the kernel attempts to register RPC services. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 2caa7edeeaba..ebce7a5976c9 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -124,12 +124,6 @@ static const struct sockaddr_in rpcb_inaddr_loopback = { .sin_port = htons(RPCBIND_PORT), }; -static const struct sockaddr_in6 rpcb_in6addr_loopback = { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_LOOPBACK_INIT, - .sin6_port = htons(RPCBIND_PORT), -}; - static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr, size_t addrlen, u32 version) { @@ -176,9 +170,10 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, return rpc_create(&args); } -static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, - u32 version, struct rpc_message *msg) +static int rpcb_register_call(u32 version, struct rpc_message *msg) { + struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback; + size_t addrlen = sizeof(rpcb_inaddr_loopback); struct rpc_clnt *rpcb_clnt; int result, error = 0; @@ -254,9 +249,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) if (port) msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET]; - return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, - sizeof(rpcb_inaddr_loopback), - RPCBVERS_2, &msg); + return rpcb_register_call(RPCBVERS_2, &msg); } /* @@ -284,9 +277,7 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register, if (port) msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; - return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, - sizeof(rpcb_inaddr_loopback), - RPCBVERS_4, msg); + return rpcb_register_call(RPCBVERS_4, msg); } /* @@ -318,9 +309,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, if (port) msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; - return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback, - sizeof(rpcb_in6addr_loopback), - RPCBVERS_4, msg); + return rpcb_register_call(RPCBVERS_4, msg); } /** -- cgit v1.2.3 From ba5c35e0c7e30b095636cd58b0854fdbd3c32947 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:46:59 -0400 Subject: SUNRPC: Don't return EPROTONOSUPPORT in svc_register()'s helpers The RPC client returns -EPROTONOSUPPORT if there is a protocol version mismatch (ie the remote RPC server doesn't support the RPC protocol version sent by the client). Helpers for the svc_register() function return -EPROTONOSUPPORT if they don't recognize the passed-in IPPROTO_ value. These are two entirely different failure modes. Have the helpers return -ENOPROTOOPT instead of -EPROTONOSUPPORT. This will allow callers to determine more precisely what the underlying problem is, and decide to report or recover appropriately. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d72ff44826d8..17e0d7265dfd 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -749,7 +749,7 @@ static int __svc_rpcb_register4(const u32 program, const u32 version, netid = RPCBIND_NETID_TCP; break; default: - return -EPROTONOSUPPORT; + return -ENOPROTOOPT; } return rpcb_v4_register(program, version, @@ -785,7 +785,7 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, netid = RPCBIND_NETID_TCP6; break; default: - return -EPROTONOSUPPORT; + return -ENOPROTOOPT; } return rpcb_v4_register(program, version, -- cgit v1.2.3 From 3aba45536fe8f92aa07bcdfd2fb1cf17eec7d786 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:06 -0400 Subject: SUNRPC: Clean up address type casts in rpcb_v4_register() Clean up: Simplify rpcb_v4_register() and its helpers by moving the details of sockaddr type casting to rpcb_v4_register()'s helper functions. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index ebce7a5976c9..44d0732ba874 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -170,7 +170,7 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, return rpc_create(&args); } -static int rpcb_register_call(u32 version, struct rpc_message *msg) +static int rpcb_register_call(const u32 version, struct rpc_message *msg) { struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback; size_t addrlen = sizeof(rpcb_inaddr_loopback); @@ -255,17 +255,17 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) /* * Fill in AF_INET family-specific arguments to register */ -static int rpcb_register_netid4(struct sockaddr_in *address_to_register, +static int rpcb_register_netid4(const struct sockaddr *sap, struct rpc_message *msg) { + const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; struct rpcbind_args *map = msg->rpc_argp; - unsigned short port = ntohs(address_to_register->sin_port); + unsigned short port = ntohs(sin->sin_port); char buf[32]; /* Construct AF_INET universal address */ snprintf(buf, sizeof(buf), "%pI4.%u.%u", - &address_to_register->sin_addr.s_addr, - port >> 8, port & 0xff); + &sin->sin_addr.s_addr, port >> 8, port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -283,21 +283,21 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register, /* * Fill in AF_INET6 family-specific arguments to register */ -static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, +static int rpcb_register_netid6(const struct sockaddr *sap, struct rpc_message *msg) { + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap; struct rpcbind_args *map = msg->rpc_argp; - unsigned short port = ntohs(address_to_register->sin6_port); + unsigned short port = ntohs(sin6->sin6_port); char buf[64]; /* Construct AF_INET6 universal address */ - if (ipv6_addr_any(&address_to_register->sin6_addr)) + if (ipv6_addr_any(&sin6->sin6_addr)) snprintf(buf, sizeof(buf), "::.%u.%u", port >> 8, port & 0xff); else snprintf(buf, sizeof(buf), "%pI6.%u.%u", - &address_to_register->sin6_addr, - port >> 8, port & 0xff); + &sin6->sin6_addr, port >> 8, port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -369,11 +369,9 @@ int rpcb_v4_register(const u32 program, const u32 version, switch (address->sa_family) { case AF_INET: - return rpcb_register_netid4((struct sockaddr_in *)address, - &msg); + return rpcb_register_netid4(address, &msg); case AF_INET6: - return rpcb_register_netid6((struct sockaddr_in6 *)address, - &msg); + return rpcb_register_netid6(address, &msg); } return -EAFNOSUPPORT; -- cgit v1.2.3 From 126e4bc3b3b446482696377f67a634c76eaf2e9c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:14 -0400 Subject: SUNRPC: rpcbind actually interprets r_owner string RFC 1833 has little to say about the contents of r_owner; it only specifies that it is a string, and states that it is used to control who can UNSET an entry. Our port of rpcbind (from Sun) assumes this string contains a numeric UID value, not alphabetical or symbolic characters, but checks this value only for AF_LOCAL RPCB_SET or RPCB_UNSET requests. In all other cases, rpcbind ignores the contents of the r_owner string. The reference user space implementation of rpcb_set(3) uses a numeric UID for all SET/UNSET requests (even via the network) and an empty string for all other requests. We emulate that behavior here to maintain bug-for-bug compatibility. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 44d0732ba874..d550d0b967db 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -63,9 +63,16 @@ enum { * r_owner * * The "owner" is allowed to unset a service in the rpcbind database. - * We always use the following (arbitrary) fixed string. + * + * For AF_LOCAL SET/UNSET requests, rpcbind treats this string as a + * UID which it maps to a local user name via a password lookup. + * In all other cases it is ignored. + * + * For SET/UNSET requests, user space provides a value, even for + * network requests, and GETADDR uses an empty string. We follow + * those precedents here. */ -#define RPCB_OWNER_STRING "rpcb" +#define RPCB_OWNER_STRING "0" #define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) static void rpcb_getport_done(struct rpc_task *, void *); @@ -566,7 +573,7 @@ void rpcb_getport_async(struct rpc_task *task) map->r_xprt = xprt_get(xprt); map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID); map->r_addr = rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR); - map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */ + map->r_owner = ""; map->r_status = -EIO; child = rpcb_call_async(rpcb_clnt, map, proc); -- cgit v1.2.3 From 1673d0de40ab46cac3b456ad50e1c8d6a31bfd66 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:21 -0400 Subject: SUNRPC: Allow callers to pass rpcb_v4_register a NULL address The user space TI-RPC library uses an empty string for the universal address when unregistering all target addresses for [program, version]. The kernel's rpcb client should behave the same way. Here, we are switching between several registration methods based on the protocol family of the incoming address. Rename the other rpcbind v4 registration functions to make it clear that they, as well, are switched on protocol family. In /etc/netconfig, this is either "inet" or "inet6". NB: The loopback protocol families are not supported in the kernel. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index d550d0b967db..8ea8907d4b8d 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -262,8 +262,8 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) /* * Fill in AF_INET family-specific arguments to register */ -static int rpcb_register_netid4(const struct sockaddr *sap, - struct rpc_message *msg) +static int rpcb_register_inet4(const struct sockaddr *sap, + struct rpc_message *msg) { const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; struct rpcbind_args *map = msg->rpc_argp; @@ -290,8 +290,8 @@ static int rpcb_register_netid4(const struct sockaddr *sap, /* * Fill in AF_INET6 family-specific arguments to register */ -static int rpcb_register_netid6(const struct sockaddr *sap, - struct rpc_message *msg) +static int rpcb_register_inet6(const struct sockaddr *sap, + struct rpc_message *msg) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap; struct rpcbind_args *map = msg->rpc_argp; @@ -319,6 +319,20 @@ static int rpcb_register_netid6(const struct sockaddr *sap, return rpcb_register_call(RPCBVERS_4, msg); } +static int rpcb_unregister_all_protofamilies(struct rpc_message *msg) +{ + struct rpcbind_args *map = msg->rpc_argp; + + dprintk("RPC: unregistering [%u, %u, '%s'] with " + "local rpcbind\n", + map->r_prog, map->r_vers, map->r_netid); + + map->r_addr = ""; + msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; + + return rpcb_register_call(RPCBVERS_4, msg); +} + /** * rpcb_v4_register - set or unset a port registration with the local rpcbind * @program: RPC program number of service to (un)register @@ -336,10 +350,11 @@ static int rpcb_register_netid6(const struct sockaddr *sap, * invoke this function once for each [program, version, address, * netid] tuple they wish to advertise. * - * Callers may also unregister RPC services that are no longer - * available by setting the port number in the passed-in address - * to zero. Callers pass a netid of "" to unregister all - * transport netids associated with [program, version, address]. + * Callers may also unregister RPC services that are registered at a + * specific address by setting the port number in @address to zero. + * They may unregister all registered protocol families at once for + * a service by passing a NULL @address argument. If @netid is "" + * then all netids for [program, version, address] are unregistered. * * This function uses rpcbind protocol version 4 to contact the * local rpcbind daemon. The local rpcbind daemon must support @@ -374,11 +389,14 @@ int rpcb_v4_register(const u32 program, const u32 version, .rpc_argp = &map, }; + if (address == NULL) + return rpcb_unregister_all_protofamilies(&msg); + switch (address->sa_family) { case AF_INET: - return rpcb_register_netid4(address, &msg); + return rpcb_register_inet4(address, &msg); case AF_INET6: - return rpcb_register_netid6(address, &msg); + return rpcb_register_inet6(address, &msg); } return -EAFNOSUPPORT; -- cgit v1.2.3 From d5a8620f7c8a5bcade730e2fa1224191f289fb00 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:29 -0400 Subject: SUNRPC: Simplify svc_unregister() Our initial implementation of svc_unregister() assumed that PMAP_UNSET cleared all rpcbind registrations for a [program, version] tuple. However, we now have evidence that PMAP_UNSET clears only "inet" entries, and not "inet6" entries, in the rpcbind database. For backwards compatibility with the legacy portmapper, the svc_unregister() function also must work if user space doesn't support rpcbind version 4 at all. Thus we'll send an rpcbind v4 UNSET, and if that fails, we'll send a PMAP_UNSET. This simplifies the code in svc_unregister() and provides better backwards compatibility with legacy user space that does not support rpcbind version 4. We can get rid of the conditional compilation in here as well. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svc.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 17e0d7265dfd..bd0ee312dac9 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -896,38 +896,31 @@ int svc_register(const struct svc_serv *serv, const int family, return error; } -#ifdef CONFIG_SUNRPC_REGISTER_V4 - +/* + * If user space is running rpcbind, it should take the v4 UNSET + * and clear everything for this [program, version]. If user space + * is running portmap, it will reject the v4 UNSET, but won't have + * any "inet6" entries anyway. So a PMAP_UNSET should be sufficient + * in this case to clear all existing entries for [program, version]. + */ static void __svc_unregister(const u32 program, const u32 version, const char *progname) { - struct sockaddr_in6 sin6 = { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_ANY_INIT, - .sin6_port = 0, - }; int error; - error = rpcb_v4_register(program, version, - (struct sockaddr *)&sin6, ""); - dprintk("svc: %s(%sv%u), error %d\n", - __func__, progname, version, error); -} - -#else /* CONFIG_SUNRPC_REGISTER_V4 */ + error = rpcb_v4_register(program, version, NULL, ""); -static void __svc_unregister(const u32 program, const u32 version, - const char *progname) -{ - int error; + /* + * User space didn't support rpcbind v4, so retry this + * request with the legacy rpcbind v2 protocol. + */ + if (error == -EPROTONOSUPPORT) + error = rpcb_register(program, version, 0, 0); - error = rpcb_register(program, version, 0, 0); dprintk("svc: %s(%sv%u), error %d\n", __func__, progname, version, error); } -#endif /* CONFIG_SUNRPC_REGISTER_V4 */ - /* * All netids, bind addresses and ports registered for [program, version] * are removed from the local rpcbind database (if the service is not -- cgit v1.2.3 From cadc0fa534e51e20fdffe1623913c163a18d71b1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:36 -0400 Subject: SUNRPC: Simplify kernel RPC service registration The kernel registers RPC services with the local portmapper with an rpcbind SET upcall to the local portmapper. Traditionally, this used rpcbind v2 (PMAP), but registering RPC services that support IPv6 requires rpcbind v3 or v4. Since we now want separate PF_INET and PF_INET6 listeners for each kernel RPC service, svc_register() will do only one of those registrations at a time. For PF_INET, it tries an rpcb v4 SET upcall first; if that fails, it does a legacy portmap SET. This makes it entirely backwards compatible with legacy user space, but allows a proper v4 SET to be used if rpcbind is available. For PF_INET6, it does an rpcb v4 SET upcall. If that fails, it fails the registration, and thus the transport creation. This let's the kernel detect if user space is able to support IPv6 RPC services, and thus whether it should maintain a PF_INET6 listener for each service at all. This provides complete backwards compatibilty with legacy user space that only supports rpcbind v2. The only down-side is that registering a new kernel RPC service may take an extra exchange with the local portmapper on legacy systems, but this is an infrequent operation and is done over UDP (no lingering sockets in TIMEWAIT), so it shouldn't be consequential. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/svc.c | 79 ++++++++++++++++++++++++-------------------------------- 1 file changed, 34 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index bd0ee312dac9..142f64745fba 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -718,8 +718,6 @@ svc_exit_thread(struct svc_rqst *rqstp) } EXPORT_SYMBOL_GPL(svc_exit_thread); -#ifdef CONFIG_SUNRPC_REGISTER_V4 - /* * Register an "inet" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. @@ -734,12 +732,13 @@ static int __svc_rpcb_register4(const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { - struct sockaddr_in sin = { + const struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; - char *netid; + const char *netid; + int error; switch (protocol) { case IPPROTO_UDP: @@ -752,10 +751,20 @@ static int __svc_rpcb_register4(const u32 program, const u32 version, return -ENOPROTOOPT; } - return rpcb_v4_register(program, version, - (struct sockaddr *)&sin, netid); + error = rpcb_v4_register(program, version, + (const struct sockaddr *)&sin, netid); + + /* + * User space didn't support rpcbind v4, so retry this + * registration request with the legacy rpcbind v2 protocol. + */ + if (error == -EPROTONOSUPPORT) + error = rpcb_register(program, version, protocol, port); + + return error; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* * Register an "inet6" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. @@ -770,12 +779,13 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { - struct sockaddr_in6 sin6 = { + const struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, .sin6_port = htons(port), }; - char *netid; + const char *netid; + int error; switch (protocol) { case IPPROTO_UDP: @@ -788,9 +798,19 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, return -ENOPROTOOPT; } - return rpcb_v4_register(program, version, - (struct sockaddr *)&sin6, netid); + error = rpcb_v4_register(program, version, + (const struct sockaddr *)&sin6, netid); + + /* + * User space didn't support rpcbind version 4, so we won't + * use a PF_INET6 listener. + */ + if (error == -EPROTONOSUPPORT) + error = -EAFNOSUPPORT; + + return error; } +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ /* * Register a kernel RPC service via rpcbind version 4. @@ -809,48 +829,17 @@ static int __svc_register(const u32 program, const u32 version, case PF_INET: return __svc_rpcb_register4(program, version, protocol, port); + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case PF_INET6: - error = __svc_rpcb_register6(program, version, + return__svc_rpcb_register6(program, version, protocol, port); - if (error < 0) - return error; - - /* - * Work around bug in some versions of Linux rpcbind - * which don't allow registration of both inet and - * inet6 netids. - * - * Error return ignored for now. - */ - __svc_rpcb_register4(program, version, - protocol, port); - return 0; +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ } return -EAFNOSUPPORT; } -#else /* CONFIG_SUNRPC_REGISTER_V4 */ - -/* - * Register a kernel RPC service via rpcbind version 2. - * - * Returns zero on success; a negative errno value is returned - * if any error occurs. - */ -static int __svc_register(const u32 program, const u32 version, - const int family, - const unsigned short protocol, - const unsigned short port) -{ - if (family != PF_INET) - return -EAFNOSUPPORT; - - return rpcb_register(program, version, protocol, port); -} - -#endif /* CONFIG_SUNRPC_REGISTER_V4 */ - /** * svc_register - register an RPC service with the local portmapper * @serv: svc_serv struct for the service to register -- cgit v1.2.3 From 363f724cdd3d2ae554e261be995abdeb15f7bdd9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:44 -0400 Subject: SUNRPC: rpcb_register() should handle errors silently Move error reporting for RPC registration to rpcb_register's caller. This way the caller can choose to recover silently from certain errors, but report errors it does not recognize. Error reporting for kernel RPC service registration is now handled in one place. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 2 +- net/sunrpc/svc.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 8ea8907d4b8d..beee6da33035 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -194,7 +194,7 @@ static int rpcb_register_call(const u32 version, struct rpc_message *msg) error = PTR_ERR(rpcb_clnt); if (error < 0) { - printk(KERN_WARNING "RPC: failed to contact local rpcbind " + dprintk("RPC: failed to contact local rpcbind " "server (errno %d).\n", -error); return error; } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 142f64745fba..8ba654bdd608 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -818,26 +818,30 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, * Returns zero on success; a negative errno value is returned * if any error occurs. */ -static int __svc_register(const u32 program, const u32 version, +static int __svc_register(const char *progname, + const u32 program, const u32 version, const int family, const unsigned short protocol, const unsigned short port) { - int error; + int error = -EAFNOSUPPORT; switch (family) { case PF_INET: - return __svc_rpcb_register4(program, version, + error = __svc_rpcb_register4(program, version, protocol, port); break; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case PF_INET6: - return__svc_rpcb_register6(program, version, + error = __svc_rpcb_register6(program, version, protocol, port); #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ } - return -EAFNOSUPPORT; + if (error < 0) + printk(KERN_WARNING "svc: failed to register %sv%u RPC " + "service (errno %d).\n", progname, version, -error); + return error; } /** @@ -875,8 +879,8 @@ int svc_register(const struct svc_serv *serv, const int family, if (progp->pg_vers[i]->vs_hidden) continue; - error = __svc_register(progp->pg_prog, i, - family, proto, port); + error = __svc_register(progp->pg_name, progp->pg_prog, + i, family, proto, port); if (error < 0) break; } -- cgit v1.2.3 From 9355982830ad67dca35e0f3d43319f3d438f82b4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Mar 2009 20:47:51 -0400 Subject: SUNRPC: Remove CONFIG_SUNRPC_REGISTER_V4 We just augmented the kernel's RPC service registration code so that it automatically adjusts to what is supported in user space. Thus we no longer need the kernel configuration option to enable registering RPC services with v4 -- it's all done automatically. This patch is part of a series that addresses http://bugzilla.kernel.org/show_bug.cgi?id=12256 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/Kconfig | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'net') diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 5592883e1e4a..afd91c78ce8e 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -17,28 +17,6 @@ config SUNRPC_XPRT_RDMA If unsure, say N. -config SUNRPC_REGISTER_V4 - bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)" - depends on SUNRPC && EXPERIMENTAL - default n - help - Sun added support for registering RPC services at an IPv6 - address by creating two new versions of the rpcbind protocol - (RFC 1833). - - This option enables support in the kernel RPC server for - registering kernel RPC services via version 4 of the rpcbind - protocol. If you enable this option, you must run a portmapper - daemon that supports rpcbind protocol version 4. - - Serving NFS over IPv6 from knfsd (the kernel's NFS server) - requires that you enable this option and use a portmapper that - supports rpcbind version 4. - - If unsure, say N to get traditional behavior (register kernel - RPC services using only rpcbind version 2). Distributions - using the legacy Linux portmapper daemon must say N here. - config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL -- cgit v1.2.3 From c69da774b28e01e062e0a3aba7509f2dcfd2a11a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 30 Mar 2009 18:59:17 -0400 Subject: SUNRPC: Ensure IPV6_V6ONLY is set on the socket before binding to a port Also ensure that we use the protocol family instead of the address family when calling sock_create_kern(). Signed-off-by: Trond Myklebust --- net/sunrpc/svcsock.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index ac6cd65220c7..9d504234af4a 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1110,7 +1110,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); - int val; dprintk("svc: svc_setup_socket %p\n", sock); if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { @@ -1143,16 +1142,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); - /* - * If this is a PF_INET6 listener, we want to avoid - * getting requests from IPv4 remotes. Those should - * be shunted to a PF_INET listener via rpcbind. - */ - val = 1; - if (inet->sk_family == PF_INET6) - kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, - (char *)&val, sizeof(val)); - dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); @@ -1220,6 +1209,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, struct sockaddr_storage addr; struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; + int family; + int val; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("svc: svc_create_socket(%s, %d, %s)\n", @@ -1231,14 +1222,35 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, "sockets supported\n"); return ERR_PTR(-EINVAL); } + type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + switch (sin->sa_family) { + case AF_INET6: + family = PF_INET6; + break; + case AF_INET: + family = PF_INET; + break; + default: + return ERR_PTR(-EINVAL); + } - error = sock_create_kern(sin->sa_family, type, protocol, &sock); + error = sock_create_kern(family, type, protocol, &sock); if (error < 0) return ERR_PTR(error); svc_reclassify_socket(sock); + /* + * If this is an PF_INET6 listener, we want to avoid + * getting requests from IPv4 remotes. Those should + * be shunted to a PF_INET listener via rpcbind. + */ + val = 1; + if (family == PF_INET6) + kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, + (char *)&val, sizeof(val)); + if (type == SOCK_STREAM) sock->sk->sk_reuse = 1; /* allow address reuse */ error = kernel_bind(sock, sin, len); -- cgit v1.2.3