aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db>2015-01-10 12:09:11 +0000
committerdcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db>2015-01-10 12:09:11 +0000
commite38d89aca2cbbfae7f5271e871cdbaa746264040 (patch)
treec739b7b367d7cd5505cd47111d5887fa9224ada6
parent95b6497fc89cfa46e97093bcf7e9cede71949659 (diff)
Simplify the code somewhat. It actually wasn't necessary to have a "fast path" and a "medium path"-- they perform the same.
git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1486 632fc199-4ca6-4c93-a231-07263d6284db
-rw-r--r--simd/jccolext-altivec.c119
-rw-r--r--simd/jcgryext-altivec.c119
2 files changed, 84 insertions, 154 deletions
diff --git a/simd/jccolext-altivec.c b/simd/jccolext-altivec.c
index e3a97b3..0455ca2 100644
--- a/simd/jccolext-altivec.c
+++ b/simd/jccolext-altivec.c
@@ -70,101 +70,66 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
__vector unsigned char unaligned_shift_index;
int bytes = num_cols + offset;
- if (bytes >= (RGB_PIXELSIZE + 1) * 16) {
- /* Fast path -- we have enough buffer space to load all vectors.
- * Even if we don't need them all, this is faster than narrowing
- * down which ones we need.
+ if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+ /* Slow path to prevent buffer overread. Since there is no way to
+ * read a partial AltiVec register, overread would occur on the last
+ * chunk of the last image row if the right edge is not on a 16-byte
+ * boundary. It could also occur on other rows if the bytes
+ * per row is low enough. Since we can't determine whether we're on
+ * the last image row, we have to assume every row is the last.
*/
- rgb0 = vec_ld(0, inptr);
- rgb1 = vec_ld(16, inptr);
- rgb2 = vec_ld(32, inptr);
- rgb3 = vec_ld(48, inptr);
+ memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+ rgb0 = vec_ld(0, tmpbuf);
+ rgb1 = vec_ld(16, tmpbuf);
+ rgb2 = vec_ld(32, tmpbuf);
#if RGB_PIXELSIZE == 4
- rgb4 = vec_ld(64, inptr);
+ rgb3 = vec_ld(48, tmpbuf);
#endif
} else {
- if (bytes & 15) {
- /* Slow path to prevent buffer overread. Since there is no way to
- * read a partial AltiVec register, overread would occur on the
- * last chunk of the last image row if the right edge is not on a
- * 16-byte boundary. It could also occur on other rows if the
- * bytes per row is low enough. Since we can't determine whether
- * we're on the last image row, we have to assume every row is the
- * last.
- */
- memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
- rgb0 = vec_ld(0, tmpbuf);
- rgb1 = vec_ld(16, tmpbuf);
- rgb2 = vec_ld(32, tmpbuf);
+ /* Fast path */
+ rgb0 = vec_ld(0, inptr);
+ if (bytes > 16)
+ rgb1 = vec_ld(16, inptr);
+ if (bytes > 32)
+ rgb2 = vec_ld(32, inptr);
+ if (bytes > 48)
+ rgb3 = vec_ld(48, inptr);
#if RGB_PIXELSIZE == 4
- rgb3 = vec_ld(48, tmpbuf);
+ if (bytes > 64)
+ rgb4 = vec_ld(64, inptr);
#endif
- goto start; /* Skip permutation */
- } else {
- /* Medium path -- if the right edge is vector-aligned, then we can
- * read full vectors (but with a lot of branches.)
- */
- rgb0 = vec_ld(0, inptr);
- if (bytes > 16) {
- rgb1 = vec_ld(16, inptr);
- if (bytes > 32) {
- rgb2 = vec_ld(32, inptr);
- if (bytes > 48) {
- rgb3 = vec_ld(48, inptr);
+ unaligned_shift_index = vec_lvsl(0, inptr);
+ rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+ rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+ rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
- if (bytes > 64)
- rgb4 = vec_ld(64, inptr);
+ rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
#endif
- }
- }
- }
- }
}
-
- unaligned_shift_index = vec_lvsl(0, inptr);
- rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
- rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
- rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
-#if RGB_PIXELSIZE == 4
- rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
-#endif
} else {
- if (num_cols >= RGB_PIXELSIZE * 16) {
- /* Fast path */
- rgb0 = vec_ld(0, inptr);
- rgb1 = vec_ld(16, inptr);
- rgb2 = vec_ld(32, inptr);
+ if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+ /* Slow path */
+ memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+ rgb0 = vec_ld(0, tmpbuf);
+ rgb1 = vec_ld(16, tmpbuf);
+ rgb2 = vec_ld(32, tmpbuf);
#if RGB_PIXELSIZE == 4
- rgb3 = vec_ld(48, inptr);
+ rgb3 = vec_ld(48, tmpbuf);
#endif
} else {
- if (num_cols & 15) {
- /* Slow path */
- memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
- rgb0 = vec_ld(0, tmpbuf);
- rgb1 = vec_ld(16, tmpbuf);
- rgb2 = vec_ld(32, tmpbuf);
-#if RGB_PIXELSIZE == 4
- rgb3 = vec_ld(48, tmpbuf);
-#endif
- } else {
- /* Medium path */
- rgb0 = vec_ld(0, inptr);
- if (num_cols > 16) {
- rgb1 = vec_ld(16, inptr);
- if (num_cols > 32) {
- rgb2 = vec_ld(32, inptr);
+ /* Fast path */
+ rgb0 = vec_ld(0, inptr);
+ if (num_cols > 16)
+ rgb1 = vec_ld(16, inptr);
+ if (num_cols > 32)
+ rgb2 = vec_ld(32, inptr);
#if RGB_PIXELSIZE == 4
- if (num_cols > 48)
- rgb3 = vec_ld(48, inptr);
+ if (num_cols > 48)
+ rgb3 = vec_ld(48, inptr);
#endif
- }
- }
- }
}
}
-start:
#if RGB_PIXELSIZE == 3
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
* rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
diff --git a/simd/jcgryext-altivec.c b/simd/jcgryext-altivec.c
index 9337744..1578b8f 100644
--- a/simd/jcgryext-altivec.c
+++ b/simd/jcgryext-altivec.c
@@ -65,101 +65,66 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
__vector unsigned char unaligned_shift_index;
int bytes = num_cols + offset;
- if (bytes >= (RGB_PIXELSIZE + 1) * 16) {
- /* Fast path -- we have enough buffer space to load all vectors.
- * Even if we don't need them all, this is faster than narrowing
- * down which ones we need.
+ if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+ /* Slow path to prevent buffer overread. Since there is no way to
+ * read a partial AltiVec register, overread would occur on the last
+ * chunk of the last image row if the right edge is not on a 16-byte
+ * boundary. It could also occur on other rows if the bytes
+ * per row is low enough. Since we can't determine whether we're on
+ * the last image row, we have to assume every row is the last.
*/
- rgb0 = vec_ld(0, inptr);
- rgb1 = vec_ld(16, inptr);
- rgb2 = vec_ld(32, inptr);
- rgb3 = vec_ld(48, inptr);
+ memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+ rgb0 = vec_ld(0, tmpbuf);
+ rgb1 = vec_ld(16, tmpbuf);
+ rgb2 = vec_ld(32, tmpbuf);
#if RGB_PIXELSIZE == 4
- rgb4 = vec_ld(64, inptr);
+ rgb3 = vec_ld(48, tmpbuf);
#endif
} else {
- if (bytes & 15) {
- /* Slow path to prevent buffer overread. Since there is no way to
- * read a partial AltiVec register, overread would occur on the
- * last chunk of the last image row if the right edge is not on a
- * 16-byte boundary. It could also occur on other rows if the
- * bytes per row is low enough. Since we can't determine whether
- * we're on the last image row, we have to assume every row is the
- * last.
- */
- memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
- rgb0 = vec_ld(0, tmpbuf);
- rgb1 = vec_ld(16, tmpbuf);
- rgb2 = vec_ld(32, tmpbuf);
+ /* Fast path */
+ rgb0 = vec_ld(0, inptr);
+ if (bytes > 16)
+ rgb1 = vec_ld(16, inptr);
+ if (bytes > 32)
+ rgb2 = vec_ld(32, inptr);
+ if (bytes > 48)
+ rgb3 = vec_ld(48, inptr);
#if RGB_PIXELSIZE == 4
- rgb3 = vec_ld(48, tmpbuf);
+ if (bytes > 64)
+ rgb4 = vec_ld(64, inptr);
#endif
- goto start; /* Skip permutation */
- } else {
- /* Medium path -- if the right edge is vector-aligned, then we can
- * read full vectors (but with a lot of branches.)
- */
- rgb0 = vec_ld(0, inptr);
- if (bytes > 16) {
- rgb1 = vec_ld(16, inptr);
- if (bytes > 32) {
- rgb2 = vec_ld(32, inptr);
- if (bytes > 48) {
- rgb3 = vec_ld(48, inptr);
+ unaligned_shift_index = vec_lvsl(0, inptr);
+ rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+ rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+ rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
- if (bytes > 64)
- rgb4 = vec_ld(64, inptr);
+ rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
#endif
- }
- }
- }
- }
}
-
- unaligned_shift_index = vec_lvsl(0, inptr);
- rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
- rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
- rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
-#if RGB_PIXELSIZE == 4
- rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
-#endif
} else {
- if (num_cols >= RGB_PIXELSIZE * 16) {
- /* Fast path */
- rgb0 = vec_ld(0, inptr);
- rgb1 = vec_ld(16, inptr);
- rgb2 = vec_ld(32, inptr);
+ if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+ /* Slow path */
+ memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+ rgb0 = vec_ld(0, tmpbuf);
+ rgb1 = vec_ld(16, tmpbuf);
+ rgb2 = vec_ld(32, tmpbuf);
#if RGB_PIXELSIZE == 4
- rgb3 = vec_ld(48, inptr);
+ rgb3 = vec_ld(48, tmpbuf);
#endif
} else {
- if (num_cols & 15) {
- /* Slow path */
- memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
- rgb0 = vec_ld(0, tmpbuf);
- rgb1 = vec_ld(16, tmpbuf);
- rgb2 = vec_ld(32, tmpbuf);
-#if RGB_PIXELSIZE == 4
- rgb3 = vec_ld(48, tmpbuf);
-#endif
- } else {
- /* Medium path */
- rgb0 = vec_ld(0, inptr);
- if (num_cols > 16) {
- rgb1 = vec_ld(16, inptr);
- if (num_cols > 32) {
- rgb2 = vec_ld(32, inptr);
+ /* Fast path */
+ rgb0 = vec_ld(0, inptr);
+ if (num_cols > 16)
+ rgb1 = vec_ld(16, inptr);
+ if (num_cols > 32)
+ rgb2 = vec_ld(32, inptr);
#if RGB_PIXELSIZE == 4
- if (num_cols > 48)
- rgb3 = vec_ld(48, inptr);
+ if (num_cols > 48)
+ rgb3 = vec_ld(48, inptr);
#endif
- }
- }
- }
}
}
-start:
#if RGB_PIXELSIZE == 3
/* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
* rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga