helper: add a more complete and correct checksum implementation

This patch adds a file called chksum.c which implements complete TCP/UDP checksum generation and verification over either IPv4 or IPv6. In addition, it can deal with any form of packet segmentation, including tiny segments, segments not aligned to a 2 byte boundary, etc. Signed-off-by: Barry Spinney <spinney@mellanox.com> Signed-off-by: Bill Fischofer <bill.fischofer@linaro.org> Signed-off-by: Maxim Uvarov <maxim.uvarov@linaro.org>
author: Barry Spinney <spinney@mellanox.com> 2016-04-28 22:35:41 -0500
committer: Maxim Uvarov <maxim.uvarov@linaro.org> 2016-04-29 11:47:51 +0300
commit: 0e1645087dea2c7744b2384e8c4d790d79e3125e (patch)
tree: 286e7c4bd3112df6e4175c2c242321de5842b641 /helper/chksum.c
parent: 166859a5380cd156f65a2143216d04e17399b6dc (diff)
1 files changed, 342 insertions, 0 deletions
diff --git a/helper/chksum.c b/helper/chksum.c
new file mode 100644
index 000000000..859d1ec96
--- /dev/null
+++ b/helper/chksum.c
@@ -0,0 +1,342 @@
+/* Copyright (c) 2016, Linaro Limited
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier:     BSD-3-Clause
+ */
+
+#include <odp.h>
+#include <odp/helper/ip.h>
+#include <odp/helper/udp.h>
+#include <odp/helper/tcp.h>
+#include <odp/helper/chksum.h>
+
+/* The following union type is used to "view" an ordered set of bytes (either
+ * 2 or 4) as 1 or 2 16-bit quantities - using host endian order.  The bytes
+ * are stored into bytes[] in the order in which they appear in the pkt and
+ * are then read back, as 16-bit quantities, from words16[]. */
+typedef union {
+	uint16_t words16[2]; /* bytes[] viewed as 16-bit quantities */
+	uint8_t  bytes[4];   /* the raw bytes, in pkt order */
+} swap_buf_t;
+
+/* Two zero bytes, used as the copy source when a chksum field that spans
+ * segments has to be cleared in the pkt. */
+static uint8_t ZEROS[2] = { 0, 0 };
+
+/* Returns the sum, taken as host endian 16-bit integers, of the data_len
+ * bytes starting at data8_ptr.  Note that data_len MUST be >= 1 whenever
+ * has_odd_byte_in is set, since the first data byte is then always read.
+ *
+ * has_odd_byte_in - the previous segment ended with an unpaired byte, which
+ *                   is passed in via *odd_byte_in_out and is paired with
+ *                   the first byte of this segment.
+ * is_last         - this is the final segment, so a trailing unpaired byte
+ *                   is padded with a zero byte and added to the sum.  When
+ *                   not set, such a byte is instead saved into
+ *                   *odd_byte_in_out and is NOT included in the sum.
+ *
+ * Every 16-bit value is assembled from single byte loads (instead of
+ * dereferencing a casted uint16_t pointer) so that data8_ptr may have any
+ * alignment, as happens after a segment holding an odd number of bytes. */
+
+static uint32_t data_seg_sum(uint8_t   *data8_ptr,
+			     uint32_t   data_len,   /* length in bytes */
+			     odp_bool_t is_last,
+			     odp_bool_t has_odd_byte_in,
+			     uint8_t   *odd_byte_in_out)
+{
+	swap_buf_t swap_buf;
+	uint32_t   sum, len_in_4_byte_chunks, idx;
+
+	sum = 0;
+	if (has_odd_byte_in) {
+		swap_buf.bytes[0] = *odd_byte_in_out;
+		swap_buf.bytes[1] = *data8_ptr++;
+		sum              += (uint32_t)swap_buf.words16[0];
+		data_len--;
+	}
+
+	/* Sum two 16-bit values per iteration, filling all of swap_buf.  The
+	 * byte order within each 16-bit value is the same as in the pkt,
+	 * exactly as if the value had been loaded directly from memory. */
+	len_in_4_byte_chunks = data_len / 4;
+	for (idx = 0; idx < len_in_4_byte_chunks; idx++) {
+		swap_buf.bytes[0] = *data8_ptr++;
+		swap_buf.bytes[1] = *data8_ptr++;
+		swap_buf.bytes[2] = *data8_ptr++;
+		swap_buf.bytes[3] = *data8_ptr++;
+		sum              += (uint32_t)swap_buf.words16[0];
+		sum              += (uint32_t)swap_buf.words16[1];
+	}
+
+	/* At most 3 bytes remain: possibly one more pair of bytes ... */
+	data_len &= 3;
+	if (data_len >= 2) {
+		swap_buf.bytes[0] = *data8_ptr++;
+		swap_buf.bytes[1] = *data8_ptr++;
+		sum              += (uint32_t)swap_buf.words16[0];
+	}
+
+	/* ... and then possibly a single odd byte. */
+	if ((data_len & 1) == 0)
+		return sum;
+
+	if (is_last) {
+		swap_buf.bytes[0] = *data8_ptr;
+		swap_buf.bytes[1] = 0;
+		sum              += (uint32_t)swap_buf.words16[0];
+	} else {
+		*odd_byte_in_out = *data8_ptr;
+	}
+
+	return sum;
+}
+
+/* Parses the UDP or TCP header found at the l4 offset of odp_pkt.  Returns -1
+ * if the pkt is neither UDP nor TCP, 1 if op is ODPH_CHKSUM_VERIFY on a UDP
+ * pkt whose chksum field is 0 while chksum_ptr is NULL, and 0 otherwise.  The
+ * output arguments are only written when 0 is returned:
+ *   *l4_len_ptr            - length field of the UDP header, or 0 for TCP
+ *   *split_l4_hdr_ptr      - set if the header is not contiguous in memory
+ *   *is_tcp_ptr            - set for TCP, cleared for UDP
+ *   *pkt_chksum_offset_ptr - pkt offset of the chksum field
+ *   *pkt_chksum_ptr_ptr    - address of the chksum field
+ * NOTE: when *split_l4_hdr_ptr is set, *pkt_chksum_ptr_ptr refers to a header
+ * copy local to this function, and so it must not be dereferenced after this
+ * function has returned. */
+static inline int odph_process_l4_hdr(odp_packet_t     odp_pkt,
+				      odph_chksum_op_t op,
+				      uint16_t        *chksum_ptr,
+				      uint32_t        *l4_len_ptr,
+				      odp_bool_t      *split_l4_hdr_ptr,
+				      odp_bool_t      *is_tcp_ptr,
+				      uint32_t        *pkt_chksum_offset_ptr,
+				      uint16_t       **pkt_chksum_ptr_ptr)
+{
+	odph_udphdr_t  *udp_ptr, udp_copy;
+	odph_tcphdr_t  *tcp_ptr, tcp_copy;
+	odp_bool_t      hdr_is_split, tcp_pkt;
+	uint32_t        hdr_offset, udp_len, contig_len, field_offset;
+	uint16_t       *field_ptr;
+	uint8_t        *hdr_ptr;
+
+	hdr_offset = odp_packet_l4_offset(odp_pkt);
+	hdr_ptr    = odp_packet_l4_ptr(odp_pkt, &contig_len);
+	udp_len    = 0;
+
+	/* Classify the pkt first - anything but UDP or TCP is an error. */
+	if (odp_packet_has_udp(odp_pkt))
+		tcp_pkt = false;
+	else if (odp_packet_has_tcp(odp_pkt))
+		tcp_pkt = true;
+	else
+		return -1;
+
+	/* When fewer contiguous bytes than a full header are available at the
+	 * l4 offset, work from a local copy of the header instead. */
+	if (tcp_pkt) {
+		tcp_ptr      = (odph_tcphdr_t *)hdr_ptr;
+		hdr_is_split = contig_len < ODPH_TCPHDR_LEN;
+		if (hdr_is_split) {
+			tcp_ptr = &tcp_copy;
+			odp_packet_copy_to_mem(odp_pkt, hdr_offset,
+					       ODPH_TCPHDR_LEN, tcp_ptr);
+		}
+
+		field_ptr    = &tcp_ptr->cksm;
+		field_offset = hdr_offset + offsetof(odph_tcphdr_t, cksm);
+	} else {
+		udp_ptr      = (odph_udphdr_t *)hdr_ptr;
+		hdr_is_split = contig_len < ODPH_UDPHDR_LEN;
+		if (hdr_is_split) {
+			udp_ptr = &udp_copy;
+			odp_packet_copy_to_mem(odp_pkt, hdr_offset,
+					       ODPH_UDPHDR_LEN, udp_ptr);
+		}
+
+		/* For UDP the l4 length is taken from the header itself,
+		 * while for TCP it is reported as 0 from here. */
+		udp_len      = odp_be_to_cpu_16(udp_ptr->length);
+		field_ptr    = &udp_ptr->chksum;
+		field_offset = hdr_offset + offsetof(odph_udphdr_t, chksum);
+	}
+
+	/* When verifying a UDP pkt whose chksum field is 0, and the caller
+	 * did not ask for the calculated chksum, skip the calculation. */
+	if ((!tcp_pkt) && (chksum_ptr == NULL) &&
+	    (op == ODPH_CHKSUM_VERIFY) && (*field_ptr == 0))
+		return 1;
+
+	/* For a generate op, make sure that the chksum field in the pkt
+	 * holds zeros before the pkt data gets summed. */
+	if ((op == ODPH_CHKSUM_GENERATE) && (*field_ptr != 0)) {
+		if (!hdr_is_split)
+			*field_ptr = 0;
+		else
+			odp_packet_copy_from_mem(odp_pkt, field_offset,
+						 2, ZEROS);
+	}
+
+	*l4_len_ptr            = udp_len;
+	*split_l4_hdr_ptr      = hdr_is_split;
+	*is_tcp_ptr            = tcp_pkt;
+	*pkt_chksum_offset_ptr = field_offset;
+	*pkt_chksum_ptr_ptr    = field_ptr;
+	return 0;
+}
+
+/* odph_process_l3_hdr parses the IPv4 or IPv6 header of odp_pkt and returns,
+ * via sum_ptr, the 16-bit sum of the pseudo header (the source and
+ * destination addresses, the l4 length and the l4 protocol).  On input
+ * *l4_len_ptr holds the length taken from the UDP header (it is ignored for
+ * TCP); on output it holds the l4 length that is to be checksummed.  Returns
+ * 0 on success, or -1 (leaving the outputs untouched) if the pkt is neither
+ * IPv4 nor IPv6, or if the l3 length of a TCP pkt is smaller than the length
+ * of its l3 headers. */
+
+static inline int odph_process_l3_hdr(odp_packet_t odp_pkt,
+				      odp_bool_t   is_tcp,
+				      uint32_t    *l4_len_ptr,
+				      uint32_t    *sum_ptr)
+{
+	odph_ipv4hdr_t *ipv4_hdr_ptr, ipv4_hdr;
+	odph_ipv6hdr_t *ipv6_hdr_ptr, ipv6_hdr;
+	odp_bool_t      split_l3_hdr;
+	swap_buf_t      swap_buf;
+	uint32_t        l3_offset, l4_offset, l3_hdrs_len, hdr_len, addrs_len;
+	uint32_t        protocol, l3_len, l4_len, idx, ipv6_payload_len, sum;
+	uint8_t        *addrs_ptr;
+
+	/* The following computation using the l3 and l4 offsets handles both
+	 * the case of IPv4 options and IPv6 extension headers uniformly. */
+	l3_offset   = odp_packet_l3_offset(odp_pkt);
+	l4_offset   = odp_packet_l4_offset(odp_pkt);
+	l3_hdrs_len = l4_offset - l3_offset;
+
+	/* Parse the IPv4/IPv6 header, working from a local copy whenever the
+	 * fixed part of the header is not contiguous in memory. */
+	split_l3_hdr = false;
+	if (odp_packet_has_ipv4(odp_pkt)) {
+		ipv4_hdr_ptr = odp_packet_l3_ptr(odp_pkt, &hdr_len);
+		split_l3_hdr = hdr_len < ODPH_IPV4HDR_LEN;
+		if (split_l3_hdr) {
+			odp_packet_copy_to_mem(odp_pkt, l3_offset,
+					       ODPH_IPV4HDR_LEN, &ipv4_hdr);
+			ipv4_hdr_ptr = &ipv4_hdr;
+		}
+
+		addrs_ptr = (uint8_t *)&ipv4_hdr_ptr->src_addr;
+		addrs_len = 2 * ODPH_IPV4ADDR_LEN;
+		l3_len    = odp_be_to_cpu_16(ipv4_hdr_ptr->tot_len);
+	} else if (odp_packet_has_ipv6(odp_pkt)) {
+		ipv6_hdr_ptr = odp_packet_l3_ptr(odp_pkt, &hdr_len);
+		split_l3_hdr = hdr_len < ODPH_IPV6HDR_LEN;
+		if (split_l3_hdr) {
+			odp_packet_copy_to_mem(odp_pkt, l3_offset,
+					       ODPH_IPV6HDR_LEN, &ipv6_hdr);
+			ipv6_hdr_ptr = &ipv6_hdr;
+		}
+
+		addrs_ptr        = (uint8_t *)&ipv6_hdr_ptr->src_addr;
+		addrs_len        = 2 * ODPH_IPV6ADDR_LEN;
+		ipv6_payload_len = odp_be_to_cpu_16(ipv6_hdr_ptr->payload_len);
+		l3_len           = ipv6_payload_len + ODPH_IPV6HDR_LEN;
+	} else {
+		return -1;
+	}
+
+	/* For UDP pkts, must use the incoming l4_len taken from the udp header.
+	 * For tcp pkts the l4_len is derived from the l3_len and l3_hdrs_len
+	 * calculated above.  A malformed pkt can claim an l3_len smaller than
+	 * its l3 headers, which must not be allowed to wrap around into a
+	 * huge unsigned l4_len. */
+	if (is_tcp && (l3_len < l3_hdrs_len))
+		return -1;
+
+	l4_len = is_tcp ? (l3_len - l3_hdrs_len) : *l4_len_ptr;
+
+	/* The pseudo header carries the number of the l4 protocol that is
+	 * being checksummed (the IANA protocol numbers are 6 for TCP and 17
+	 * for UDP).  This is not read from the l3 header because, when IPv6
+	 * extension headers are present, the next_hdr field of the IPv6
+	 * header names the first extension header and not the l4 protocol. */
+	protocol = is_tcp ? 6 : 17;
+
+	/* Do a one's complement addition over the IP pseudo-header.
+	 * Note that the pseudo-header is different for IPv4 and IPv6.  The
+	 * addresses are summed as 16-bit values built from single byte loads,
+	 * so that the header may have any alignment. */
+	sum = 0;
+	for (idx = 0; idx < addrs_len; idx += 2) {
+		swap_buf.bytes[0] = addrs_ptr[idx];
+		swap_buf.bytes[1] = addrs_ptr[idx + 1];
+		sum              += (uint32_t)swap_buf.words16[0];
+	}
+
+	/* Need to convert l4_len and protocol into endian independent form */
+	swap_buf.bytes[0] = (l4_len >> 8) & 0xFF;
+	swap_buf.bytes[1] = (l4_len >> 0) & 0xFF;
+	swap_buf.bytes[2] = 0;
+	swap_buf.bytes[3] = protocol;
+
+	sum += (uint32_t)swap_buf.words16[0] + (uint32_t)swap_buf.words16[1];
+
+	*l4_len_ptr = l4_len;
+	*sum_ptr    = sum;
+	return 0;
+}
+
+/* Generates (ODPH_CHKSUM_GENERATE) or verifies (ODPH_CHKSUM_VERIFY) the UDP or
+ * TCP chksum of odp_pkt, whose l4 data may be split among any number of
+ * segments.  If chksum_ptr is not NULL, the calculated chksum is also
+ * returned through it.  Returns -1 if the pkt is not UDP/TCP over IPv4/IPv6
+ * or if it holds less data than its l4 length, 0 after a generate or a
+ * successful verify, 1 when verifying a UDP pkt whose chksum field is 0, and
+ * 2 when the verify fails.
+ *
+ * Note that this implementation does not include any code or conditionally
+ * modified code that is endian specific, yet it works equally well on BIG or
+ * LITTLE endian machines.  The reason that this works is primarily because
+ * a 16-bit one's complement sum happens to be "endian-agnostic".  Specifically
+ * if one does a sum of 16-bit pkt values on a big endian machine and then on
+ * a little endian machine, they will not agree.  But after turning it into
+ * a one's complement sum by adding the carry bits in and truncating to
+ * 16-bits (which may need to be done more than once), the final 16-bit results
+ * will be byte-swapped versions of the other.  Then after storing the result
+ * back into the pkt (as a 16-bit value), the final byte pattern will be
+ * identical for both machines. */
+
+int odph_udp_tcp_chksum(odp_packet_t     odp_pkt,
+			odph_chksum_op_t op,
+			uint16_t        *chksum_ptr)
+{
+	odp_bool_t split_l4_hdr, is_tcp, is_last;
+	odp_bool_t has_odd_byte_in;
+	uint32_t   l4_len, sum, ones_compl_sum, remaining_seg_len, data_len;
+	uint32_t   pkt_chksum_offset, offset;
+	uint16_t  *pkt_chksum_ptr, pkt_chksum, chksum;
+	uint8_t   *data_ptr, odd_byte_in_out;
+	int        rc, ret_code;
+
+	/* First parse and process the l4 header */
+	rc = odph_process_l4_hdr(odp_pkt, op, chksum_ptr, &l4_len,
+				 &split_l4_hdr, &is_tcp, &pkt_chksum_offset,
+				 &pkt_chksum_ptr);
+	if (rc != 0)
+		return rc;
+
+	/* Note that in addition to parsing the l3 header, this function
+	 * does the sum of the pseudo header. */
+	rc = odph_process_l3_hdr(odp_pkt, is_tcp, &l4_len, &sum);
+	if (rc != 0)
+		return rc;
+
+	/* The following code handles all of the different cases where the
+	 * data to be checksummed might be split among an arbitrary number of
+	 * segments, each of an arbitrary length (include odd alignments!). */
+	data_ptr        = odp_packet_l4_ptr(odp_pkt, &remaining_seg_len);
+	offset          = odp_packet_l4_offset(odp_pkt);
+	has_odd_byte_in = false;
+	odd_byte_in_out = 0;
+
+	while (l4_len != 0) {
+		/* Fail, rather than loop forever or dereference NULL, when
+		 * the l4 length exceeds the data actually held by the pkt. */
+		if ((data_ptr == NULL) || (remaining_seg_len == 0))
+			return -1;
+
+		/* The segment holding the final l4 byte is the last one to be
+		 * summed, even if it also holds bytes beyond the l4 length.
+		 * Otherwise a final odd byte would be left out of the sum. */
+		is_last  = l4_len <= remaining_seg_len;
+		data_len = is_last ? l4_len : remaining_seg_len;
+
+		sum += data_seg_sum(data_ptr, data_len, is_last,
+				    has_odd_byte_in, &odd_byte_in_out);
+		l4_len  -= data_len;
+		if (l4_len == 0)
+			break;
+
+		if (data_len & 1)
+			has_odd_byte_in = !has_odd_byte_in;
+
+		offset  += data_len;
+		data_ptr = odp_packet_offset(odp_pkt, offset,
+					     &remaining_seg_len, NULL);
+	}
+
+	/* Now do the one's complement "carry" algorithm.  Up until now this
+	 * has just been regular two's complement addition.  Note that it is
+	 * important that this regular sum of 16-bit quantities be done with
+	 * at least 32-bit arithmetic to prevent the loss of the carries.
+	 * Note that it can be proven that only two rounds of the carry
+	 * wrap around logic are necessary (assuming 32-bit arithmetic and
+	 * a data length of < 64K). */
+	ones_compl_sum = (sum              & 0xFFFF) + (sum            >> 16);
+	ones_compl_sum = (ones_compl_sum   & 0xFFFF) + (ones_compl_sum >> 16);
+	chksum         = (~ones_compl_sum) & 0xFFFF;
+	ret_code       = 0;
+
+	/* Now based upon the given op, the calculated chksum and the incoming
+	 * chksum value complete the operation. */
+	if (op == ODPH_CHKSUM_GENERATE) {
+		/* A UDP chksum field of zero is treated as "no chksum" when
+		 * verifying, so a calculated UDP chksum of zero is stored as
+		 * all ones - which is the same value in one's complement. */
+		if ((chksum == 0) && (!is_tcp))
+			chksum = 0xFFFF;
+
+		if (split_l4_hdr)
+			odp_packet_copy_from_mem(odp_pkt, pkt_chksum_offset,
+						 2, &chksum);
+		else
+			*pkt_chksum_ptr = chksum;
+	} else if (op == ODPH_CHKSUM_VERIFY) {
+		/* With a split l4 header, pkt_chksum_ptr refers to a header
+		 * copy that was local to odph_process_l4_hdr and no longer
+		 * exists, so the field is read from the pkt itself. */
+		pkt_chksum = 0;
+		if (split_l4_hdr)
+			odp_packet_copy_to_mem(odp_pkt, pkt_chksum_offset,
+					       2, &pkt_chksum);
+		else
+			pkt_chksum = *pkt_chksum_ptr;
+
+		if ((pkt_chksum == 0) && (!is_tcp))
+			ret_code = 1;
+		else
+			ret_code = (chksum == 0) ? 0 : 2;
+	}
+
+	if (chksum_ptr != NULL)
+		*chksum_ptr = chksum;
+
+	return ret_code;
+}
author	Barry Spinney <spinney@mellanox.com>	2016-04-28 22:35:41 -0500
committer	Maxim Uvarov <maxim.uvarov@linaro.org>	2016-04-29 11:47:51 +0300
commit	0e1645087dea2c7744b2384e8c4d790d79e3125e (patch)
tree	286e7c4bd3112df6e4175c2c242321de5842b641 /helper/chksum.c
parent	166859a5380cd156f65a2143216d04e17399b6dc (diff)