From 404e0a8b6a55d5e1cd138c6deb1bca9abdf75d8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Jul 2012 23:20:37 +0000 Subject: net: ipv4: fix RCU races on dst refcounts commit c6cffba4ffa2 (ipv4: Fix input route performance regression.) added various fatal races with dst refcounts. crashes happen on tcp workloads if routes are added/deleted at the same time. The dst_free() calls from free_fib_info_rcu() are clearly racy. We need instead regular dst refcounting (dst_release()) and make sure dst_release() is aware of RCU grace periods : Add DST_RCU_FREE flag so that dst_release() respects an RCU grace period before dst destruction for cached dst Introduce a new inet_sk_rx_dst_set() helper, using atomic_inc_not_zero() to make sure we dont increase a zero refcount (On a dst currently waiting an rcu grace period before destruction) rt_cache_route() must take a reference on the new cached route, and release it if was not able to install it. With this patch, my machines survive various benchmarks. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 7 +------ include/net/inet_sock.h | 13 +++++++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index baf59789006..31a9fd39edb 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -61,6 +61,7 @@ struct dst_entry { #define DST_NOPEER 0x0040 #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 +#define DST_RCU_FREE 0x0200 unsigned short pending_confirm; @@ -382,12 +383,6 @@ static inline void dst_free(struct dst_entry *dst) __dst_free(dst); } -static inline void dst_rcu_free(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_free(dst); -} - static inline void dst_confirm(struct dst_entry *dst) { dst->pending_confirm = 1; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 613cfa40167..e3fd34c83ac 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -249,4 +249,17 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) return flags; } +static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + if (atomic_inc_not_zero(&dst->__refcnt)) { + if (!(dst->flags & DST_RCU_FREE)) + dst->flags |= DST_RCU_FREE; + + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + } +} + #endif /* _INET_SOCK_H */ -- cgit v1.2.3 From 0c7462a2351b4cc502f326aad7fedd04909928be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jul 2012 07:14:29 +0000 Subject: ipv4: remove rt_cache_rebuild_count After IP route cache removal, rt_cache_rebuild_count is no longer used. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 0ffb8e31f3c..1474dd65c66 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -61,8 +61,6 @@ struct netns_ipv4 { int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; int sysctl_icmp_errors_use_inbound_ifaddr; - int sysctl_rt_cache_rebuild_count; - int current_rt_cache_rebuild_count; unsigned int sysctl_ping_group_range[2]; long sysctl_tcp_mem[3]; -- cgit v1.2.3 From 54764bb647b2e847c512acf8d443df965da35000 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Jul 2012 01:08:23 +0000 Subject: ipv4: Restore old dst_free() behavior. commit 404e0a8b6a55 (net: ipv4: fix RCU races on dst refcounts) tried to solve a race but added a problem at device/fib dismantle time : We really want to call dst_free() as soon as possible, even if sockets still have dst in their cache. dst_release() calls in free_fib_info_rcu() are not welcomed. Root of the problem was that now we also cache output routes (in nh_rth_output), we must use call_rcu() instead of call_rcu_bh() in rt_free(), because output route lookups are done in process context. Based on feedback and initial patch from David Miller (adding another call_rcu_bh() call in fib, but it appears it was not the right fix) I left the inet_sk_rx_dst_set() helper and added __rcu attributes to nh_rth_output and nh_rth_input to better document what is going on in this code. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 7 ++++++- include/net/inet_sock.h | 10 +++------- include/net/ip_fib.h | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 31a9fd39edb..baf59789006 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -61,7 +61,6 @@ struct dst_entry { #define DST_NOPEER 0x0040 #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 -#define DST_RCU_FREE 0x0200 unsigned short pending_confirm; @@ -383,6 +382,12 @@ static inline void dst_free(struct dst_entry *dst) __dst_free(dst); } +static inline void dst_rcu_free(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + dst_free(dst); +} + static inline void dst_confirm(struct dst_entry *dst) { dst->pending_confirm = 1; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index e3fd34c83ac..83b567fe194 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -253,13 +253,9 @@ static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb { struct dst_entry *dst = skb_dst(skb); - if (atomic_inc_not_zero(&dst->__refcnt)) { - if (!(dst->flags & DST_RCU_FREE)) - dst->flags |= DST_RCU_FREE; - - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - } + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } #endif /* _INET_SOCK_H */ diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e69c3a47153..e521a03515b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -81,8 +81,8 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; - struct rtable *nh_rth_output; - struct rtable *nh_rth_input; + struct rtable __rcu *nh_rth_output; + struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket *nh_exceptions; }; -- cgit v1.2.3 From d26b3a7c4b3b26319f18bb645de93eba8f4bdcd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Jul 2012 05:45:30 +0000 Subject: ipv4: percpu nh_rth_output cache Input path is mostly run under RCU and doesnt touch dst refcnt But output path on forwarding or UDP workloads hits badly dst refcount, and we have lot of false sharing, for example in ipv4_mtu() when reading rt->rt_pmtu Using a percpu cache for nh_rth_output gives a nice performance increase at a small cost. 24 udpflood test on my 24 cpu machine (dummy0 output device) (each process sends 1.000.000 udp frames, 24 processes are started) before : 5.24 s after : 2.06 s For reference, time on linux-3.5 : 6.60 s Signed-off-by: Eric Dumazet Tested-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e521a03515b..e331746029b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -21,6 +21,7 @@ #include #include #include +#include struct fib_config { u8 fc_dst_len; @@ -81,7 +82,7 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; - struct rtable __rcu *nh_rth_output; + struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket *nh_exceptions; }; -- cgit v1.2.3 From c5038a8327b980a5b279fa193163c468011de009 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 31 Jul 2012 15:02:02 -0700 Subject: ipv4: Cache routes in nexthop exception entries. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e331746029b..926142ed8d7 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -55,6 +55,7 @@ struct fib_nh_exception { u32 fnhe_pmtu; __be32 fnhe_gw; unsigned long fnhe_expires; + struct rtable __rcu *fnhe_rth; unsigned long fnhe_stamp; }; -- cgit v1.2.3 From caacf05e5ad1abf0a2864863da4e33024bc68ec6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 31 Jul 2012 15:06:50 -0700 Subject: ipv4: Properly purge netdev references on uncached routes. When a device is unregistered, we have to purge all of the references to it that may exist in the entire system. If a route is uncached, we currently have no way of accomplishing this. So create a global list that is scanned when a network device goes down. This mirrors the logic in net/core/dst.c's dst_ifdown(). Signed-off-by: David S. Miller --- include/net/route.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index 8c52bc6f1c9..776a27f1ab7 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -57,6 +57,8 @@ struct rtable { /* Miscellaneous cached information */ u32 rt_pmtu; + + struct list_head rt_uncached; }; static inline bool rt_is_input_route(const struct rtable *rt) @@ -107,6 +109,7 @@ extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; extern int ip_rt_init(void); extern void rt_cache_flush(struct net *net, int how); +extern void rt_flush_dev(struct net_device *dev); extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, struct sock *sk); -- cgit v1.2.3