diff options
author | Jere Leppänen <jere.leppanen@nokia.com> | 2020-12-22 16:29:49 +0200 |
---|---|---|
committer | Petri Savolainen <petri.savolainen@nokia.com> | 2021-02-12 08:43:57 +0200 |
commit | 6709708360dc430277bf1f37b0f2203c5181438d (patch) | |
tree | 26ab5711eacc78aef66042fef45281d032497e39 /platform/linux-generic/include/odp_chksum_internal.h | |
parent | 596cec45ded4a9b9f0749292c30b96aef5b2ea7a (diff) |
linux-gen: improve software checksum implementation
Improve software checksum implementation and put it in a separate
header file so that it may be easily called from multiple places.
Signed-off-by: Jere Leppänen <jere.leppanen@nokia.com>
Reviewed-by: Petri Savolainen <petri.savolainen@nokia.com>
Diffstat (limited to 'platform/linux-generic/include/odp_chksum_internal.h')
-rw-r--r-- | platform/linux-generic/include/odp_chksum_internal.h | 189 |
1 files changed, 189 insertions, 0 deletions
diff --git a/platform/linux-generic/include/odp_chksum_internal.h b/platform/linux-generic/include/odp_chksum_internal.h new file mode 100644 index 000000000..5a134ae2d --- /dev/null +++ b/platform/linux-generic/include/odp_chksum_internal.h @@ -0,0 +1,189 @@ +/* Copyright (c) 2020, Nokia + * All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#ifndef ODP_CHKSUM_INTERNAL_H_ +#define ODP_CHKSUM_INTERNAL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <odp/api/hints.h> +#include <odp/api/byteorder.h> +#include <odp_cpu.h> +#include <stdint.h> + +/* + * Compute the final Internet checksum (RFC 1071) based on a partial + * sum. A partial sum can be obtained e.g. by calling + * chksum_partial(). + */ +static inline uint16_t chksum_finalize(uint64_t sum) +{ + sum = (sum >> 32) + (sum & 0xffffffff); + sum = (sum >> 16) + (sum & 0xffff); + /* + * The final & 0xffff is intentionally omitted, the extra bits + * are discarded by the implicit cast to the return type. + */ + return (sum >> 16) + sum; +} + +/* + * Compute a partial checksum. Several partial checksums may be summed + * together. The final checksum may be obtained by calling + * chksum_finalize(). Parameter offset is the offset of this segment + * of data from the start of IP header. + * + * This implementation + * + * - Accepts unaligned data. + * + * - Accepts data at any byte offset from the start of IP header, + * including odd offsets. + * + * - Uses unaligned memory access only if available. + * + * - Is optimized (for skylake, cn96, a53) by trial and error. + * + * The following did not improve performance (in synthetic tests): + * + * - 2 or 4 sub-sums in the main loop (to break dependency chains). + * + * - Aligning to 8 bytes instead of 4 (for ldp instruction). This + * makes the main loop faster on a53 (only), but the extra + * conditional branch has its cost. + * + * - __builtin_assume_aligned(). + */ +static uint64_t chksum_partial(const void *addr, uint32_t len, uint32_t offset) +{ + const uint8_t *b; + const uint16_t *w; + const uint32_t *d; + uint64_t sum = 0; + + /* + * Offset is either even or odd, the rest of it doesn't + * matter. + */ + offset &= 1; + + if (_ODP_UNALIGNED) { + /* + * We have efficient unaligned access. Just read + * dwords starting at the given address. + */ + d = (const uint32_t *)addr; + } else { + /* + * We must avoid unaligned access, so align to 4 bytes + * by summing up the first up to 3 bytes. + */ + b = (const uint8_t *)addr; + + if (odp_unlikely((uintptr_t)b & 1) && len >= 1) { + /* + * Align to 2 bytes by handling an odd + * byte. Since addr is unaligned, the first + * byte goes into the second byte of the sum. + */ + sum += odp_cpu_to_be_16(*b++); + len -= 1; + + /* An odd byte negates the effect of offset. */ + offset ^= 1; + } + + /* + * This cast increases alignment, but it's OK, since + * we've made sure that the pointer value is aligned. + */ + w = (const uint16_t *)(uintptr_t)b; + + if ((uintptr_t)w & 2 && len >= 2) { + /* Align bytes by handling an odd word. */ + sum += *w++; + len -= 2; + } + + /* Increases alignment. */ + d = (const uint32_t *)(uintptr_t)w; + } + + while (len >= 32) { + /* 8 dwords or 32 bytes per round. */ + + sum += *d++; + sum += *d++; + sum += *d++; + sum += *d++; + + sum += *d++; + sum += *d++; + sum += *d++; + sum += *d++; + + len -= 32; + } + + /* Last up to 7 dwords. */ + switch (len >> 2) { + case 7: + sum += *d++; + /* FALLTHROUGH */ + case 6: + sum += *d++; + /* FALLTHROUGH */ + case 5: + sum += *d++; + /* FALLTHROUGH */ + case 4: + sum += *d++; + /* FALLTHROUGH */ + case 3: + sum += *d++; + /* FALLTHROUGH */ + case 2: + sum += *d++; + /* FALLTHROUGH */ + case 1: + sum += *d++; + /* FALLTHROUGH */ + default: + break; + } + + len &= 3; + + w = (const uint16_t *)d; + if (len > 1) { + /* Last word. */ + sum += *w++; + len -= 2; + } + + if (len) { + /* Last byte. */ + b = (const uint8_t *)w; + sum += odp_cpu_to_be_16((uint16_t)*b << 8); + } + + /* + * If offset is odd, our sum is byte-flipped and we need to + * flip odd and even bytes. + */ + if (odp_unlikely(offset)) + sum = ((sum & 0xff00ff00ff00ff) << 8) | ((sum & 0xff00ff00ff00ff00) >> 8); + + return sum; +} + +#ifdef __cplusplus +} +#endif + +#endif |