aboutsummaryrefslogtreecommitdiff
path: root/platform/linux-generic/include/odp_chksum_internal.h
blob: 5a134ae2d8408562637ccc4d13a0006b87529b7a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* Copyright (c) 2020, Nokia
 * All rights reserved.
 *
 * SPDX-License-Identifier:	BSD-3-Clause
 */

#ifndef ODP_CHKSUM_INTERNAL_H_
#define ODP_CHKSUM_INTERNAL_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <odp/api/hints.h>
#include <odp/api/byteorder.h>
#include <odp_cpu.h>
#include <stdint.h>

/*
 * Fold a 64-bit partial sum into a 16-bit Internet checksum sum
 * (RFC 1071). A partial sum can be obtained e.g. by calling
 * chksum_partial(), and several partial sums may be added together
 * before they are folded here.
 *
 * The returned value is the folded ones' complement sum as such; it is
 * not inverted by this function.
 *
 * Each step adds the upper half of the value to the lower half, i.e.
 * carries are wrapped around (end-around carry). The result is
 * congruent to the input modulo 0xffff for every possible 64-bit
 * input, and it is zero only when the input is zero.
 */
static inline uint16_t chksum_finalize(uint64_t sum)
{
	/* 64 bits -> at most 33 bits (max 0x1fffffffe). */
	sum = (sum >> 32) + (sum & 0xffffffff);

	/*
	 * 33 bits -> 32 bits. Without this extra step the carry in bit 32
	 * could meet a low half that folds to 0xffff, which would make the
	 * last addition below overflow 16 bits and silently drop a carry
	 * (e.g. 0x00010000ffffffff would yield 0 instead of 1).
	 */
	sum = (sum >> 32) + (sum & 0xffffffff);

	/* 32 bits -> at most 17 bits (max 0x1fffe). */
	sum = (sum >> 16) + (sum & 0xffff);

	/*
	 * Add the last possible carry. Since sum <= 0x1fffe here, the low
	 * 16 bits cannot overflow. Bit 16 and above are discarded by the
	 * conversion to the return type, so no masking is needed.
	 */
	return (uint16_t)((sum >> 16) + sum);
}

/*
 * Compute a partial Internet checksum sum over len bytes starting at
 * addr. Several partial sums may be added together. The final checksum
 * sum may be obtained by calling chksum_finalize().
 *
 * Parameters:
 *
 * - addr:   Start of the data to be summed. Any alignment is accepted.
 *
 * - len:    Number of bytes to sum. May be zero, in which case no data
 *           is read and the result is zero.
 *
 * - offset: Offset of this segment of data from the start of IP
 *           header. Only the lowest bit (even/odd) is used.
 *
 * Returns the non-folded 64-bit sum. The data is added up as 32-bit
 * (and at the edges 16-bit and 8-bit) quantities loaded in native
 * byte order, without swapping the bytes of the loaded values.
 *
 * This implementation
 *
 * - Accepts unaligned data.
 *
 * - Accepts data at any byte offset from the start of IP header,
 *   including odd offsets.
 *
 * - Uses unaligned memory access only if available.
 *
 * - Is optimized (for skylake, cn96, a53) by trial and error.
 *
 * The following did not improve performance (in synthetic tests):
 *
 * - 2 or 4 sub-sums in the main loop (to break dependency chains).
 *
 * - Aligning to 8 bytes instead of 4 (for ldp instruction). This
 *   makes the main loop faster on a53 (only), but the extra
 *   conditional branch has its cost.
 *
 * - __builtin_assume_aligned().
 *
 * NOTE(review): when _ODP_UNALIGNED is set, addr is dereferenced
 * through plain uint32_t/uint16_t pointers regardless of its
 * alignment. ISO C does not define misaligned access through such
 * pointers, so this relies on the compiler not generating
 * instructions that require natural alignment - confirm for every
 * target that sets _ODP_UNALIGNED.
 */
static uint64_t chksum_partial(const void *addr, uint32_t len, uint32_t offset)
{
	const uint8_t *b;
	const uint16_t *w;
	const uint32_t *d;
	uint64_t sum = 0;

	/*
	 * Offset is either even or odd, the rest of it doesn't
	 * matter. From here on offset is a flag telling whether the
	 * bytes of the sum must be swapped at the end.
	 */
	offset &= 1;

	if (_ODP_UNALIGNED) {
		/*
		 * We have efficient unaligned access. Just read
		 * dwords starting at the given address.
		 */
		d = (const uint32_t *)addr;
	} else {
		/*
		 * We must avoid unaligned access, so align to 4 bytes
		 * by summing up the first up to 3 bytes.
		 */
		b = (const uint8_t *)addr;

		if (odp_unlikely((uintptr_t)b & 1) && len >= 1) {
			/*
			 * Align to 2 bytes by handling an odd
			 * byte. Since addr is unaligned, the first
			 * byte goes into the second byte of the sum,
			 * i.e. into the same byte lane where the aligned
			 * 16-bit loads below place every second byte.
			 */
			sum += odp_cpu_to_be_16(*b++);
			len -= 1;

			/*
			 * An odd byte negates the effect of offset: all
			 * the remaining data has now moved to the other
			 * byte lane.
			 */
			offset ^= 1;
		}

		/*
		 * This cast increases alignment, but it's OK, since
		 * we've made sure that the pointer value is aligned
		 * (or, if len was zero, that it is never dereferenced).
		 * Going through uintptr_t is presumably there to avoid
		 * a cast alignment warning.
		 */
		w = (const uint16_t *)(uintptr_t)b;

		if ((uintptr_t)w & 2 && len >= 2) {
			/* Align to 4 bytes by handling an odd word. */
			sum += *w++;
			len -= 2;
		}

		/*
		 * Increases alignment. The pointer is 4 byte aligned
		 * whenever at least 4 bytes remain, so every dword read
		 * below is aligned.
		 */
		d = (const uint32_t *)(uintptr_t)w;
	}

	/* Main loop, manually unrolled. */
	while (len >= 32)  {
		/* 8 dwords or 32 bytes per round. */

		sum += *d++;
		sum += *d++;
		sum += *d++;
		sum += *d++;

		sum += *d++;
		sum += *d++;
		sum += *d++;
		sum += *d++;

		len -= 32;
	}

	/*
	 * Last up to 7 dwords. Jump into the chain of additions at the
	 * point that leaves exactly (len >> 2) of them to be executed.
	 */
	switch (len >> 2) {
	case 7:
		sum += *d++;
		/* FALLTHROUGH */
	case 6:
		sum += *d++;
		/* FALLTHROUGH */
	case 5:
		sum += *d++;
		/* FALLTHROUGH */
	case 4:
		sum += *d++;
		/* FALLTHROUGH */
	case 3:
		sum += *d++;
		/* FALLTHROUGH */
	case 2:
		sum += *d++;
		/* FALLTHROUGH */
	case 1:
		sum += *d++;
		/* FALLTHROUGH */
	default:
		break;
	}

	/* Up to 3 bytes remain after the dwords. */
	len &= 3;

	w = (const uint16_t *)d;
	if (len > 1)  {
		/* Last word. */
		sum += *w++;
		len -= 2;
	}

	if (len) {
		/*
		 * Last byte. It is the first byte of a 16-bit word whose
		 * second byte is missing, so it is placed in the upper
		 * byte of a 16-bit value before the conversion.
		 */
		b = (const uint8_t *)w;
		sum += odp_cpu_to_be_16((uint16_t)*b << 8);
	}

	/*
	 * If offset is odd, our sum is byte-flipped and we need to
	 * flip odd and even bytes. This swaps the two bytes within each
	 * of the four 16-bit lanes of the 64-bit sum.
	 */
	if (odp_unlikely(offset))
		sum = ((sum & 0xff00ff00ff00ff) << 8) | ((sum & 0xff00ff00ff00ff00) >> 8);

	return sum;
}

#ifdef __cplusplus
}
#endif

#endif