author    Ben Pfaff <blp@nicira.com>    2009-09-01 10:31:32 -0700
committer Ben Pfaff <blp@nicira.com>    2009-09-01 10:36:42 -0700
commit    6fa58f7a1533b96d8c958581ed18e0e5a245157b (patch)
tree      2b2a3ee3bf08ea3bd17dd8f86924cc68a27f5150 /datapath/table.c
parent    3c4fae5f4563848dd6392434424e10e32e6b6f83 (diff)
datapath: Use hash table more tolerant of collisions for flow table.
The hash table used until now in the kernel datapath for storing the flow table provides only two slots that a given flow can occupy. If both of those slots are already full for a given flow, then that flow cannot be added at all and its packets must be handled entirely in userspace, taking a performance hit. The code does attempt to compensate for this by making the flow table rather large: 8 slots per flow actually in the flow table. In practice this is usually good enough, but some of the tests that we have run show performance degradation, or even timeouts of various kinds, bad enough that we want to implement something better.

This commit replaces the existing hash table with one of a completely different design, in which buckets are flexibly sized and can accept any number of collisions. By use of suitable levels of indirection, this design is both simple and RCU-compatible. I did consider other schemes, but none of the ones that I came up with shared both of those properties.

This commit also adds kerneldoc comments for all of the flow table's non-static functions and data structures.

This has been lightly tested for correctness. It has not been tested for performance.

Bug #1656. Bug #1851.
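As an aid to reading the patch below, here is a minimal userspace sketch of the layout just described: a two-level array of bucket pointers in which each bucket carries a count and a flexibly sized array of flow pointers, and an insert builds a one-entry-larger copy of the affected bucket before publishing it. The names (toy_*), the L1_SIZE/L2_SIZE values, and the hash-splitting arithmetic are illustrative only, not the definitions in datapath/table.h, and plain assignment plus free() stand in for the kernel's rcu_assign_pointer() and call_rcu().

/* Illustrative userspace model of the bucketed flow table.  All names and
 * constants here are hypothetical; the real definitions live in
 * datapath/table.h and use RCU primitives instead of plain pointers. */
#include <stdlib.h>
#include <string.h>

#define L1_SIZE 4                  /* top-level entries (toy value) */
#define L2_SIZE 4                  /* bucket pointers per second-level block */

struct toy_flow { unsigned int key; };

struct toy_bucket {
	unsigned int n_flows;
	struct toy_flow *flows[];  /* flexibly sized, like dp_bucket */
};

/* Two levels of indirection: each l1[] entry points to a block of bucket
 * pointers, so replacing one bucket never moves any other part of the table. */
struct toy_table {
	struct toy_bucket **l1[L1_SIZE];
};

static int toy_table_init(struct toy_table *t)
{
	/* (cleanup on partial failure omitted for brevity) */
	for (int i = 0; i < L1_SIZE; i++)
		if (!(t->l1[i] = calloc(L2_SIZE, sizeof(struct toy_bucket *))))
			return -1;
	return 0;
}

static struct toy_bucket **toy_find_bucket(struct toy_table *t, unsigned int hash)
{
	return &t->l1[(hash / L2_SIZE) % L1_SIZE][hash % L2_SIZE];
}

/* Copy-on-write insert: allocate a bucket with room for one more flow, copy
 * the old contents, append the new flow, then publish the new bucket.  In the
 * kernel patch the publish step is rcu_assign_pointer() and the old bucket is
 * reclaimed with call_rcu() after a grace period. */
static int toy_insert(struct toy_table *t, unsigned int hash, struct toy_flow *flow)
{
	struct toy_bucket **bucketp = toy_find_bucket(t, hash);
	struct toy_bucket *old = *bucketp;
	unsigned int n = old ? old->n_flows : 0;
	struct toy_bucket *new = malloc(sizeof(*new) + (n + 1) * sizeof(flow));

	if (!new)
		return -1;
	new->n_flows = n + 1;
	if (old)
		memcpy(new->flows, old->flows, n * sizeof(flow));
	new->flows[n] = flow;

	*bucketp = new;            /* rcu_assign_pointer() in the real code */
	free(old);                 /* call_rcu() in the real code */
	return 0;
}

Deletion works the same way in reverse: dp_table_delete() below builds a bucket one entry smaller (or simply clears the bucket pointer when the last flow in it goes away), and lookups walking the old bucket under RCU keep seeing a consistent snapshot until the grace period ends.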
Diffstat (limited to 'datapath/table.c')
-rw-r--r--  datapath/table.c  337
1 file changed, 228 insertions, 109 deletions
diff --git a/datapath/table.c b/datapath/table.c
index 11aeb888..23ae8abe 100644
--- a/datapath/table.c
+++ b/datapath/table.c
@@ -11,50 +11,76 @@
#include <linux/gfp.h>
#include <linux/jhash.h>
+#include <linux/random.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <asm/pgtable.h>
-static void free_table(struct sw_flow ***flows, unsigned int n_buckets,
- int free_flows)
+static inline int bucket_size(int n_flows)
+{
+ return sizeof(struct dp_bucket) + sizeof(struct sw_flow*) * n_flows;
+}
+
+static struct dp_bucket *dp_bucket_alloc(int n_flows)
+{
+ return kmalloc(bucket_size(n_flows), GFP_KERNEL);
+}
+
+static void free_buckets(struct dp_bucket ***l1, unsigned int n_buckets,
+ int free_flows)
{
unsigned int i;
for (i = 0; i < n_buckets >> DP_L1_BITS; i++) {
- struct sw_flow **l2 = flows[i];
- if (free_flows) {
- unsigned int j;
- for (j = 0; j < DP_L1_SIZE; j++) {
- if (l2[j])
- flow_free(l2[j]);
+ struct dp_bucket **l2 = l1[i];
+ unsigned int j;
+
+ for (j = 0; j < DP_L1_SIZE; j++) {
+ struct dp_bucket *bucket = l2[j];
+ if (!bucket)
+ continue;
+
+ if (free_flows) {
+ unsigned int k;
+ for (k = 0; k < bucket->n_flows; k++)
+ flow_free(bucket->flows[k]);
}
+ kfree(bucket);
}
free_page((unsigned long)l2);
}
- kfree(flows);
+ kfree(l1);
}
-static struct sw_flow ***alloc_table(unsigned int n_buckets)
+static struct dp_bucket ***alloc_buckets(unsigned int n_buckets)
{
- struct sw_flow ***flows;
+ struct dp_bucket ***l1;
unsigned int i;
- flows = kmalloc((n_buckets >> DP_L1_BITS) * sizeof(struct sw_flow**),
- GFP_KERNEL);
- if (!flows)
+ l1 = kmalloc((n_buckets >> DP_L1_BITS) * sizeof(struct dp_bucket**),
+ GFP_KERNEL);
+ if (!l1)
return NULL;
for (i = 0; i < n_buckets >> DP_L1_BITS; i++) {
- flows[i] = (struct sw_flow **)get_zeroed_page(GFP_KERNEL);
- if (!flows[i]) {
- free_table(flows, i << DP_L1_BITS, 0);
+ l1[i] = (struct dp_bucket **)get_zeroed_page(GFP_KERNEL);
+ if (!l1[i]) {
+ free_buckets(l1, i << DP_L1_BITS, 0);
return NULL;
}
}
- return flows;
+ return l1;
}
+/**
+ * dp_table_create - create and return a new flow table
+ * @n_buckets: number of buckets in the new table
+ *
+ * Creates and returns a new flow table, or %NULL if memory cannot be
+ * allocated. @n_buckets must be a power of 2 in the range %DP_L1_SIZE to
+ * %DP_MAX_BUCKETS.
+ */
struct dp_table *dp_table_create(unsigned int n_buckets)
{
struct dp_table *table;
@@ -64,95 +90,124 @@ struct dp_table *dp_table_create(unsigned int n_buckets)
goto err;
table->n_buckets = n_buckets;
- table->flows[0] = alloc_table(n_buckets);
- if (!table[0].flows)
- goto err_free_tables;
-
- table->flows[1] = alloc_table(n_buckets);
- if (!table->flows[1])
- goto err_free_flows0;
+ table->buckets = alloc_buckets(n_buckets);
+ if (!table->buckets)
+ goto err_free_table;
+ get_random_bytes(&table->hash_seed, sizeof table->hash_seed);
return table;
-err_free_flows0:
- free_table(table->flows[0], table->n_buckets, 0);
-err_free_tables:
+err_free_table:
kfree(table);
err:
return NULL;
}
+/**
+ * dp_table_destroy - destroy flow table and optionally the flows it contains
+ * @table: table to destroy (must not be %NULL)
+ * @free_flows: whether to destroy the flows
+ *
+ * If @free_flows is zero, then the buckets in @table are destroyed but not the
+ * flows within those buckets. This behavior is useful when a table is being
+ * replaced by a larger or smaller one without destroying the flows.
+ *
+ * If @free_flows is nonzero, then the flows in @table are destroyed as well as
+ * the buckets.
+ */
void dp_table_destroy(struct dp_table *table, int free_flows)
{
- int i;
- for (i = 0; i < 2; i++)
- free_table(table->flows[i], table->n_buckets, free_flows);
+ free_buckets(table->buckets, table->n_buckets, free_flows);
kfree(table);
}
-static struct sw_flow **find_bucket(struct dp_table *table,
- struct sw_flow ***flows, u32 hash)
+static struct dp_bucket **find_bucket(struct dp_table *table, u32 hash)
{
unsigned int l1 = (hash & (table->n_buckets - 1)) >> DP_L1_SHIFT;
unsigned int l2 = hash & ((1 << DP_L2_BITS) - 1);
- return &flows[l1][l2];
+ return &table->buckets[l1][l2];
}
-static struct sw_flow *lookup_table(struct dp_table *table,
- struct sw_flow ***flows, u32 hash,
- const struct odp_flow_key *key)
+static int search_bucket(const struct dp_bucket *bucket, const struct odp_flow_key *key)
{
- struct sw_flow **bucket = find_bucket(table, flows, hash);
- struct sw_flow *flow = rcu_dereference(*bucket);
- if (flow && !memcmp(&flow->key, key, sizeof(struct odp_flow_key)))
- return flow;
- return NULL;
-}
+ int i;
-static u32 flow_hash0(const struct odp_flow_key *key)
-{
- return jhash2((u32*)key, sizeof *key / sizeof(u32), 0xaaaaaaaa);
+ for (i = 0; i < bucket->n_flows; i++) {
+ struct sw_flow *flow = rcu_dereference(bucket->flows[i]);
+ if (!memcmp(&flow->key, key, sizeof(struct odp_flow_key)))
+ return i;
+ }
+
+ return -1;
}
-static u32 flow_hash1(const struct odp_flow_key *key)
+static struct sw_flow *lookup_flow(struct dp_table *table, u32 hash,
+ const struct odp_flow_key *key)
{
- return jhash2((u32*)key, sizeof *key / sizeof(u32), 0x55555555);
+ struct dp_bucket **bucketp = find_bucket(table, hash);
+ struct dp_bucket *bucket = rcu_dereference(*bucketp);
+ int index;
+
+ if (!bucket)
+ return NULL;
+
+ index = search_bucket(bucket, key);
+ if (index < 0)
+ return NULL;
+
+ return bucket->flows[index];
}
-static void find_buckets(struct dp_table *table,
- const struct odp_flow_key *key,
- struct sw_flow **buckets[2])
+static u32 flow_hash(const struct dp_table *table,
+ const struct odp_flow_key *key)
{
- buckets[0] = find_bucket(table, table->flows[0], flow_hash0(key));
- buckets[1] = find_bucket(table, table->flows[1], flow_hash1(key));
+ return jhash2((u32*)key, sizeof *key / sizeof(u32), table->hash_seed);
}
+/**
+ * dp_table_lookup - searches flow table for a matching flow
+ * @table: flow table to search
+ * @key: flow key for which to search
+ *
+ * Searches @table for a flow whose key is equal to @key. Returns the flow if
+ * successful, otherwise %NULL.
+ */
struct sw_flow *dp_table_lookup(struct dp_table *table,
const struct odp_flow_key *key)
{
- struct sw_flow *flow;
- flow = lookup_table(table, table->flows[0], flow_hash0(key), key);
- if (!flow)
- flow = lookup_table(table, table->flows[1],
- flow_hash1(key), key);
- return flow;
+ return lookup_flow(table, flow_hash(table, key), key);
}
+/**
+ * dp_table_foreach - iterate through flow table
+ * @table: table to iterate
+ * @callback: function to call for each flow entry
+ * @aux: Extra data to pass to @callback
+ *
+ * Iterates through all of the flows in @table in hash order, passing each of
+ * them in turn to @callback. If @callback returns nonzero, this terminates
+ * the iteration and dp_table_foreach() returns the same value. Returns 0 if
+ * @callback never returns nonzero.
+ *
+ * This function does not try to intelligently handle the case where @callback
+ * adds or removes flows in @table.
+ */
int dp_table_foreach(struct dp_table *table,
int (*callback)(struct sw_flow *flow, void *aux),
void *aux)
{
unsigned int i, j, k;
- for (i = 0; i < 2; i++) {
- for (j = 0; j < table->n_buckets >> DP_L1_BITS; j++) {
- struct sw_flow **l2 = table->flows[i][j];
- for (k = 0; k < DP_L1_SIZE; k++) {
- struct sw_flow *flow = rcu_dereference(l2[k]);
- if (flow) {
- int error = callback(flow, aux);
- if (error)
- return error;
- }
+ for (i = 0; i < table->n_buckets >> DP_L1_BITS; i++) {
+ struct dp_bucket **l2 = table->buckets[i];
+ for (j = 0; j < DP_L1_SIZE; j++) {
+ struct dp_bucket *bucket = rcu_dereference(l2[j]);
+ if (!bucket)
+ continue;
+
+ for (k = 0; k < bucket->n_flows; k++) {
+ int error = (*callback)(bucket->flows[k], aux);
+ if (error)
+ return error;
}
}
}
@@ -162,18 +217,7 @@ int dp_table_foreach(struct dp_table *table,
static int insert_flow(struct sw_flow *flow, void *new_table_)
{
struct dp_table *new_table = new_table_;
- struct sw_flow **buckets[2];
- int i;
-
- find_buckets(new_table, &flow->key, buckets);
- for (i = 0; i < 2; i++) {
- if (!*buckets[i]) {
- rcu_assign_pointer(*buckets[i], flow);
- return 0;
- }
- }
- WARN_ON_ONCE(1);
- return 0;
+ return dp_table_insert(new_table, flow);
}
static void dp_free_table_rcu(struct rcu_head *rcu)
@@ -182,16 +226,34 @@ static void dp_free_table_rcu(struct rcu_head *rcu)
dp_table_destroy(table, 0);
}
+/**
+ * dp_table_expand - replace datapath's flow table by one with more buckets
+ * @dp: datapath to expand
+ *
+ * Replaces @dp's flow table by one that has twice as many buckets. All of the
+ * flows in @dp's flow table are moved to the new flow table. Returns 0 if
+ * successful, otherwise a negative error.
+ */
int dp_table_expand(struct datapath *dp)
{
struct dp_table *old_table = rcu_dereference(dp->table);
- struct dp_table *new_table = dp_table_create(old_table->n_buckets * 2);
+ struct dp_table *new_table;
+
+ new_table = dp_table_create(old_table->n_buckets * 2);
if (!new_table)
- return -ENOMEM;
- dp_table_foreach(old_table, insert_flow, new_table);
+ goto error;
+
+ if (dp_table_foreach(old_table, insert_flow, new_table))
+ goto error_free_new_table;
+
rcu_assign_pointer(dp->table, new_table);
call_rcu(&old_table->rcu, dp_free_table_rcu);
return 0;
+
+error_free_new_table:
+ dp_table_destroy(new_table, 0);
+error:
+ return -ENOMEM;
}
static void dp_free_table_and_flows_rcu(struct rcu_head *rcu)
@@ -200,6 +262,13 @@ static void dp_free_table_and_flows_rcu(struct rcu_head *rcu)
dp_table_destroy(table, 1);
}
+/**
+ * dp_table_flush - clear datapath's flow table
+ * @dp: datapath to clear
+ *
+ * Replaces @dp's flow table by an empty flow table, destroying all the flows
+ * in the old table (after a suitable RCU grace period).
+ */
int dp_table_flush(struct datapath *dp)
{
struct dp_table *old_table = rcu_dereference(dp->table);
@@ -211,38 +280,88 @@ int dp_table_flush(struct datapath *dp)
return 0;
}
-struct sw_flow **
-dp_table_lookup_for_insert(struct dp_table *table,
- const struct odp_flow_key *target)
+static void dp_free_bucket_rcu(struct rcu_head *rcu)
{
- struct sw_flow **buckets[2];
- struct sw_flow **empty_bucket = NULL;
- int i;
+ struct dp_bucket *bucket = container_of(rcu, struct dp_bucket, rcu);
+ kfree(bucket);
+}
- find_buckets(table, target, buckets);
- for (i = 0; i < 2; i++) {
- struct sw_flow *f = rcu_dereference(*buckets[i]);
- if (f) {
- if (!memcmp(&f->key, target, sizeof(struct odp_flow_key)))
- return buckets[i];
- } else if (!empty_bucket)
- empty_bucket = buckets[i];
- }
- return empty_bucket;
+/**
+ * dp_table_insert - insert flow into table
+ * @table: table in which to insert flow
+ * @target: flow to insert
+ *
+ * The caller must ensure that no flow with key identical to @target->key
+ * already exists in @table. Returns 0 or a negative error (currently just
+ * -ENOMEM).
+ *
+ * The caller is responsible for updating &struct datapath's n_flows member.
+ */
+int dp_table_insert(struct dp_table *table, struct sw_flow *target)
+{
+ u32 hash = flow_hash(table, &target->key);
+ struct dp_bucket **oldp = find_bucket(table, hash);
+ struct dp_bucket *old = *rcu_dereference(oldp);
+ unsigned int n = old ? old->n_flows : 0;
+ struct dp_bucket *new = dp_bucket_alloc(n + 1);
+
+ if (!new)
+ return -ENOMEM;
+
+ new->n_flows = n + 1;
+ if (old)
+ memcpy(new->flows, old->flows, n * sizeof(struct sw_flow*));
+ new->flows[n] = target;
+
+ rcu_assign_pointer(*oldp, new);
+ if (old)
+ call_rcu(&old->rcu, dp_free_bucket_rcu);
+
+ return 0;
}
+/**
+ * dp_table_delete - remove flow from table
+ * @table: table from which to remove flow
+ * @target: flow to remove
+ *
+ * The caller must ensure that @target itself is in @table. (It is not
+ * good enough for @table to contain a different flow with a key equal to
+ * @target's key.)
+ *
+ * Returns 0 or a negative error (currently just -ENOMEM). Yes, it *is*
+ * possible for a flow deletion to fail due to lack of memory.
+ *
+ * The caller is responsible for updating &struct datapath's n_flows member.
+ */
int dp_table_delete(struct dp_table *table, struct sw_flow *target)
{
- struct sw_flow **buckets[2];
- int i;
+ u32 hash = flow_hash(table, &target->key);
+ struct dp_bucket **oldp = find_bucket(table, hash);
+ struct dp_bucket *old = *rcu_dereference(oldp);
+ unsigned int n = old->n_flows;
+ struct dp_bucket *new;
+
+ if (n > 1) {
+ unsigned int i;
- find_buckets(table, &target->key, buckets);
- for (i = 0; i < 2; i++) {
- struct sw_flow *flow = rcu_dereference(*buckets[i]);
- if (flow == target) {
- rcu_assign_pointer(*buckets[i], NULL);
- return 0;
+ new = dp_bucket_alloc(n - 1);
+ if (!new)
+ return -ENOMEM;
+
+ new->n_flows = 0;
+ for (i = 0; i < n; i++) {
+ struct sw_flow *flow = old->flows[i];
+ if (flow != target)
+ new->flows[new->n_flows++] = flow;
}
+ WARN_ON_ONCE(new->n_flows != n - 1);
+ } else {
+ new = NULL;
}
- return -ENOENT;
+
+ rcu_assign_pointer(*oldp, new);
+ call_rcu(&old->rcu, dp_free_bucket_rcu);
+
+ return 0;
}