From 789f558cfb3680aeb52de137418637f6b04b7d22 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 12 Apr 2015 18:51:09 -0700 Subject: tcp/dccp: get rid of central timewait timer Using a timer wheel for timewait sockets was nice ~15 years ago when memory was expensive and machines had a single processor. This does not scale, code is ugly and source of huge latencies (Typically 30 ms have been seen, cpus spinning on death_lock spinlock.) We can afford to use an extra 64 bytes per timewait sock and spread timewait load to all cpus to have better behavior. Tested: On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1 on the target (lpaa24) Before patch : lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 419594 lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 437171 While test is running, we can observe 25 or even 33 ms latencies. lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2 lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2 After patch : About 90% increase of throughput : lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 810442 lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 800992 And latencies are kept to minimal values during this load, even if network utilization is 90% higher : lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) (limited to 'net/ipv4/tcp_minisocks.c') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2088fdcca141..63d6311b5365 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -34,18 +34,7 @@ int sysctl_tcp_abort_on_overflow __read_mostly; struct inet_timewait_death_row tcp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, - .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, - .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), .hashinfo = &tcp_hashinfo, - .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, - (unsigned long)&tcp_death_row), - .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, - inet_twdr_twkill_work), -/* Short-time timewait calendar */ - - .twcal_hand = -1, - .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, - (unsigned long)&tcp_death_row), }; EXPORT_SYMBOL_GPL(tcp_death_row); @@ -158,7 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: - inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); return TCP_TW_RST; } @@ -174,11 +163,9 @@ kill_with_rst: if (tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, tw->tw_timeout); else - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -211,13 +198,12 @@ kill_with_rst: */ if (sysctl_tcp_rfc1337 == 0) { kill: - inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -267,8 +253,7 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -283,16 +268,15 @@ EXPORT_SYMBOL(tcp_timewait_state_process); */ void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct inet_timewait_sock *tw = NULL; const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); + struct inet_timewait_sock *tw; bool recycle_ok = false; if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tcp_remember_stamp(sk); - if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) - tw = inet_twsk_alloc(sk, state); + tw = inet_twsk_alloc(sk, &tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); @@ -355,8 +339,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = TCP_TIMEWAIT_LEN; } - inet_twsk_schedule(tw, &tcp_death_row, timeo, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, timeo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this -- cgit v1.2.3