From 89aef8921bfbac22f00e04f8450f6e447db13e42 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 11:00:09 -0700 Subject: ipv4: Delete routing cache. The ipv4 routing cache is non-deterministic, performance wise, and is subject to reasonably easy to launch denial of service attacks. The routing cache works great for well behaved traffic, and the world was a much friendlier place when the tradeoffs that led to the routing cache's design were considered. What it boils down to is that the performance of the routing cache is a product of the traffic patterns seen by a system rather than being a product of the contents of the routing tables. The former of which is controllable by external entitites. Even for "well behaved" legitimate traffic, high volume sites can see hit rates in the routing cache of only ~%10. Signed-off-by: David S. Miller --- include/net/route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index ace3cb44251..5dcfeb621e0 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -109,7 +109,6 @@ extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; extern int ip_rt_init(void); extern void rt_cache_flush(struct net *net, int how); -extern void rt_cache_flush_batch(struct net *net); extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, struct sock *sk); -- cgit v1.2.3 From 38a424e4657462fe9f8b76f01a0e879abde99ab4 Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 1 Jul 2012 02:02:53 +0000 Subject: ipv4: Kill ip_route_input_noref(). The "noref" argument to ip_route_input_common() is now always ignored because we do not cache routes, and in that case we must always grab a reference to the resulting 'dst'. Signed-off-by: David S. Miller --- include/net/route.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 5dcfeb621e0..5c86c4773b2 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -160,20 +160,8 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 return ip_route_output_key(net, fl4); } -extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, bool noref); - -static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) -{ - return ip_route_input_common(skb, dst, src, tos, devin, false); -} - -static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) -{ - return ip_route_input_common(skb, dst, src, tos, devin, true); -} +extern int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin); extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u32 mark, u8 protocol, int flow_flags); -- cgit v1.2.3 From 1a00fee4ffb22312a0ac40045ecd6f222b55eb3d Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 1 Jul 2012 02:02:56 +0000 Subject: ipv4: Remove rt_key_{src,dst,tos} from struct rtable. They are always used in contexts where they can be reconstituted, or where the finally resolved rt->rt_{src,dst} is semantically equivalent. Signed-off-by: David S. Miller --- include/net/route.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 5c86c4773b2..935fa59630b 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -44,14 +44,9 @@ struct fib_info; struct rtable { struct dst_entry dst; - /* Lookup key. */ - __be32 rt_key_dst; - __be32 rt_key_src; - int rt_genid; unsigned int rt_flags; __u16 rt_type; - __u8 rt_key_tos; __be32 rt_dst; /* Path destination */ __be32 rt_src; /* Path source */ -- cgit v1.2.3 From d6c0a4f609847d6e65658913f9ccbcb1c137cff3 Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 1 Jul 2012 02:02:59 +0000 Subject: ipv4: Kill 'rt_src' from 'struct rtable' Signed-off-by: David S. Miller --- include/net/route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 935fa59630b..85d1093e42d 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -49,7 +49,6 @@ struct rtable { __u16 rt_type; __be32 rt_dst; /* Path destination */ - __be32 rt_src; /* Path source */ int rt_route_iif; int rt_iif; int rt_oif; -- cgit v1.2.3 From b48698895de86e07b685f8e4b8db0f1cd5a97e9a Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 1 Jul 2012 02:03:01 +0000 Subject: ipv4: Remove 'rt_mark' from 'struct rtable' Signed-off-by: David S. Miller --- include/net/route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 85d1093e42d..757fe40b6ce 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -52,7 +52,6 @@ struct rtable { int rt_route_iif; int rt_iif; int rt_oif; - __u32 rt_mark; /* Info on neighbour */ __be32 rt_gateway; -- cgit v1.2.3 From f1ce3062c53809d862d8a04e7a0566c3cc4e0bda Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 12 Jul 2012 10:10:17 -0700 Subject: ipv4: Remove 'rt_dst' from 'struct rtable' Signed-off-by: David S. Miller --- include/net/route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 757fe40b6ce..6d111bceb16 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -48,7 +48,6 @@ struct rtable { unsigned int rt_flags; __u16 rt_type; - __be32 rt_dst; /* Path destination */ int rt_route_iif; int rt_iif; int rt_oif; -- cgit v1.2.3 From f8126f1d5136be1ca1a3536d43ad7a710b5620f8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 13 Jul 2012 05:03:45 -0700 Subject: ipv4: Adjust semantics of rt->rt_gateway. In order to allow prefixed routes, we have to adjust how rt_gateway is set and interpreted. The new interpretation is: 1) rt_gateway == 0, destination is on-link, nexthop is iph->daddr 2) rt_gateway != 0, destination requires a nexthop gateway Abstract the fetching of the proper nexthop value using a new inline helper, rt_nexthop(), as suggested by Joe Perches. Signed-off-by: David S. Miller Tested-by: Vijay Subramanian --- include/net/route.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 6d111bceb16..3c1eeab9749 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -70,6 +70,13 @@ static inline bool rt_is_output_route(const struct rtable *rt) return rt->rt_route_iif == 0; } +static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr) +{ + if (rt->rt_gateway) + return rt->rt_gateway; + return daddr; +} + struct ip_rt_acct { __u32 o_bytes; __u32 o_packets; -- cgit v1.2.3 From f5b0a8743601a4477419171f5046bd07d1c080a0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 19 Jul 2012 12:31:33 -0700 Subject: net: Document dst->obsolete better. Add a big comment explaining how the field works, and use defines instead of magic constants for the values assigned to it. Suggested by Joe Perches. Signed-off-by: David S. Miller --- include/net/dst.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 51610468c63..0df661a0c29 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -65,7 +65,19 @@ struct dst_entry { unsigned short pending_confirm; short error; + + /* A non-zero value of dst->obsolete forces by-hand validation + * of the route entry. Positive values are set by the generic + * dst layer to indicate that the entry has been forcefully + * destroyed. + * + * Negative values are used by the implementation layer code to + * force invocation of the dst_ops->check() method. + */ short obsolete; +#define DST_OBSOLETE_NONE 0 +#define DST_OBSOLETE_DEAD 2 +#define DST_OBSOLETE_FORCE_CHK -1 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ #ifdef CONFIG_IP_ROUTE_CLASSID @@ -359,7 +371,7 @@ extern struct dst_entry *dst_destroy(struct dst_entry *dst); static inline void dst_free(struct dst_entry *dst) { - if (dst->obsolete > 1) + if (dst->obsolete > 0) return; if (!atomic_read(&dst->__refcnt)) { dst = dst_destroy(dst); -- cgit v1.2.3 From ceb3320610d6f15ff20dd4c042b36473d77de76f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 11:31:28 -0700 Subject: ipv4: Kill routes during PMTU/redirect updates. Mark them obsolete so there will be a re-lookup to fetch the FIB nexthop exception info. Signed-off-by: David S. Miller --- include/net/dst.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 0df661a0c29..baf59789006 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -78,6 +78,7 @@ struct dst_entry { #define DST_OBSOLETE_NONE 0 #define DST_OBSOLETE_DEAD 2 #define DST_OBSOLETE_FORCE_CHK -1 +#define DST_OBSOLETE_KILL -2 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ #ifdef CONFIG_IP_ROUTE_CLASSID -- cgit v1.2.3 From f2bb4bedf35d5167a073dcdddf16543f351ef3ae Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 12:20:47 -0700 Subject: ipv4: Cache output routes in fib_info nexthops. If we have an output route that lacks nexthop exceptions, we can cache it in the FIB info nexthop. Such routes will have DST_HOST cleared because such routes refer to a family of destinations, rather than just one. The sequence of the handling of exceptions during route lookup is adjusted to make the logic work properly. Before we allocate the route, we lookup the exception. Then we know if we will cache this route or not, and therefore whether DST_HOST should be set on the allocated route. Then we use DST_HOST to key off whether we should store the resulting route, during rt_set_nexthop(), in the FIB nexthop cache. With help from Eric Dumazet. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 2daf096dfc6..fb62c590360 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -46,6 +46,7 @@ struct fib_config { }; struct fib_info; +struct rtable; struct fib_nh_exception { struct fib_nh_exception __rcu *fnhe_next; @@ -80,6 +81,7 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; + struct rtable *nh_rth_output; struct fnhe_hash_bucket *nh_exceptions; }; -- cgit v1.2.3 From d2d68ba9fe8b38eb03124b3176a013bb8aa2b5e5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 12:58:50 -0700 Subject: ipv4: Cache input routes in fib_info nexthops. Caching input routes is slightly simpler than output routes, since we don't need to be concerned with nexthop exceptions. (locally destined, and routed packets, never trigger PMTU events or redirects that will be processed by us). However, we have to elide caching for the DIRECTSRC and non-zero itag cases. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index fb62c590360..e69c3a47153 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -82,6 +82,7 @@ struct fib_nh { __be32 nh_saddr; int nh_saddr_genid; struct rtable *nh_rth_output; + struct rtable *nh_rth_input; struct fnhe_hash_bucket *nh_exceptions; }; -- cgit v1.2.3 From ba3f7f04ef2b19aace38f855aedd17fe43035d50 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 14:02:46 -0700 Subject: ipv4: Kill FLOWI_FLAG_RT_NOCACHE and associated code. Signed-off-by: David S. Miller --- include/net/flow.h | 1 - include/net/inet_connection_sock.h | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index ce9cb7656b4..e1dd5082ec7 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -21,7 +21,6 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_CAN_SLEEP 0x02 -#define FLOWI_FLAG_RT_NOCACHE 0x04 __u32 flowic_secid; }; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 2cf44b4ed2e..5ee66f517b4 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -250,8 +250,7 @@ extern int inet_csk_get_port(struct sock *sk, unsigned short snum); extern struct dst_entry* inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, - const struct request_sock *req, - bool nocache); + const struct request_sock *req); extern struct dst_entry* inet_csk_route_child_sock(struct sock *sk, struct sock *newsk, const struct request_sock *req); -- cgit v1.2.3 From 4fd551d7bed93af60af61c5a324b8f5dff37953a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 14:39:44 -0700 Subject: ipv4: Kill rt->rt_oif Never actually used. It was being set on output routes to the original OIF specified in the flow key used for the lookup. Adjust the only user, ipmr_rt_fib_lookup(), for greater correctness of the flowi4_oif and flowi4_iif values, thanks to feedback from Julian Anastasov. Signed-off-by: David S. Miller --- include/net/route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 3c1eeab9749..e789a92fd60 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -50,7 +50,6 @@ struct rtable { int rt_route_iif; int rt_iif; - int rt_oif; /* Info on neighbour */ __be32 rt_gateway; -- cgit v1.2.3 From 9917e1e8762745191eba5a3bf2040278cbddbee1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 14:44:26 -0700 Subject: ipv4: Turn rt->rt_route_iif into rt->rt_is_input. That is this value's only use, as a boolean to indicate whether a route is an input route or not. So implement it that way, using a u16 gap present in the struct already. Signed-off-by: David S. Miller --- include/net/route.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index e789a92fd60..4bafe0bfe82 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -47,8 +47,8 @@ struct rtable { int rt_genid; unsigned int rt_flags; __u16 rt_type; + __u16 rt_is_input; - int rt_route_iif; int rt_iif; /* Info on neighbour */ @@ -61,12 +61,12 @@ struct rtable { static inline bool rt_is_input_route(const struct rtable *rt) { - return rt->rt_route_iif != 0; + return rt->rt_is_input != 0; } static inline bool rt_is_output_route(const struct rtable *rt) { - return rt->rt_route_iif == 0; + return rt->rt_is_input == 0; } static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr) -- cgit v1.2.3 From 2860583fe840d972573363dfa190b2149a604534 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 14:55:59 -0700 Subject: ipv4: Kill rt->fi It's not really needed. We only grabbed a reference to the fib_info for the sake of fib_info local metrics. However, fib_info objects are freed using RCU, as are therefore their private metrics (if any). We would have triggered a route cache flush if we eliminated a reference to a fib_info object in the routing tables. Therefore, any existing cached routes will first check and see that they have been invalidated before an errant reference to these metric values would occur. Signed-off-by: David S. Miller --- include/net/route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 4bafe0bfe82..60d611dc5ce 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -56,7 +56,6 @@ struct rtable { /* Miscellaneous cached information */ u32 rt_pmtu; - struct fib_info *fi; /* for client ref to shared metrics */ }; static inline bool rt_is_input_route(const struct rtable *rt) -- cgit v1.2.3