From e38d89aca2cbbfae7f5271e871cdbaa746264040 Mon Sep 17 00:00:00 2001
From: dcommander
Date: Sat, 10 Jan 2015 12:09:11 +0000
Subject: Simplify the code somewhat.  It actually wasn't necessary to have a
 "fast path" and a "medium path"-- they perform the same.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1486 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/jccolext-altivec.c | 119 +++++++++++++++++-------------------------------
 simd/jcgryext-altivec.c | 119 +++++++++++++++++-------------------------------
 2 files changed, 84 insertions(+), 154 deletions(-)

diff --git a/simd/jccolext-altivec.c b/simd/jccolext-altivec.c
index e3a97b3..0455ca2 100644
--- a/simd/jccolext-altivec.c
+++ b/simd/jccolext-altivec.c
@@ -70,101 +70,66 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
       __vector unsigned char unaligned_shift_index;
       int bytes = num_cols + offset;
 
-      if (bytes >= (RGB_PIXELSIZE + 1) * 16) {
-        /* Fast path -- we have enough buffer space to load all vectors.
-         * Even if we don't need them all, this is faster than narrowing
-         * down which ones we need.
+      if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+        /* Slow path to prevent buffer overread.  Since there is no way to
+         * read a partial AltiVec register, overread would occur on the last
+         * chunk of the last image row if the right edge is not on a 16-byte
+         * boundary.  It could also occur on other rows if the bytes per row
+         * is low enough.  Since we can't determine whether we're on the last
+         * image row, we have to assume every row is the last.
          */
-        rgb0 = vec_ld(0, inptr);
-        rgb1 = vec_ld(16, inptr);
-        rgb2 = vec_ld(32, inptr);
-        rgb3 = vec_ld(48, inptr);
+        memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+        rgb0 = vec_ld(0, tmpbuf);
+        rgb1 = vec_ld(16, tmpbuf);
+        rgb2 = vec_ld(32, tmpbuf);
 #if RGB_PIXELSIZE == 4
-        rgb4 = vec_ld(64, inptr);
+        rgb3 = vec_ld(48, tmpbuf);
 #endif
       } else {
-        if (bytes & 15) {
-          /* Slow path to prevent buffer overread.  Since there is no way to
-           * read a partial AltiVec register, overread would occur on the
-           * last chunk of the last image row if the right edge is not on a
-           * 16-byte boundary.  It could also occur on other rows if the
-           * bytes per row is low enough.  Since we can't determine whether
-           * we're on the last image row, we have to assume every row is the
-           * last.
-           */
-          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
-          rgb0 = vec_ld(0, tmpbuf);
-          rgb1 = vec_ld(16, tmpbuf);
-          rgb2 = vec_ld(32, tmpbuf);
+        /* Fast path */
+        rgb0 = vec_ld(0, inptr);
+        if (bytes > 16)
+          rgb1 = vec_ld(16, inptr);
+        if (bytes > 32)
+          rgb2 = vec_ld(32, inptr);
+        if (bytes > 48)
+          rgb3 = vec_ld(48, inptr);
 #if RGB_PIXELSIZE == 4
-          rgb3 = vec_ld(48, tmpbuf);
+        if (bytes > 64)
+          rgb4 = vec_ld(64, inptr);
 #endif
-          goto start;  /* Skip permutation */
-        } else {
-          /* Medium path -- if the right edge is vector-aligned, then we can
-           * read full vectors (but with a lot of branches.)
-           */
-          rgb0 = vec_ld(0, inptr);
-          if (bytes > 16) {
-            rgb1 = vec_ld(16, inptr);
-            if (bytes > 32) {
-              rgb2 = vec_ld(32, inptr);
-              if (bytes > 48) {
-                rgb3 = vec_ld(48, inptr);
+        unaligned_shift_index = vec_lvsl(0, inptr);
+        rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+        rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+        rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
 #if RGB_PIXELSIZE == 4
-                if (bytes > 64)
-                  rgb4 = vec_ld(64, inptr);
+        rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
 #endif
-              }
-            }
-          }
-        }
       }
-
-      unaligned_shift_index = vec_lvsl(0, inptr);
-      rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
-      rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
-      rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
-#if RGB_PIXELSIZE == 4
-      rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
-#endif
     } else {
-      if (num_cols >= RGB_PIXELSIZE * 16) {
-        /* Fast path */
-        rgb0 = vec_ld(0, inptr);
-        rgb1 = vec_ld(16, inptr);
-        rgb2 = vec_ld(32, inptr);
+      if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+        /* Slow path */
+        memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+        rgb0 = vec_ld(0, tmpbuf);
+        rgb1 = vec_ld(16, tmpbuf);
+        rgb2 = vec_ld(32, tmpbuf);
 #if RGB_PIXELSIZE == 4
-        rgb3 = vec_ld(48, inptr);
+        rgb3 = vec_ld(48, tmpbuf);
 #endif
       } else {
-        if (num_cols & 15) {
-          /* Slow path */
-          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
-          rgb0 = vec_ld(0, tmpbuf);
-          rgb1 = vec_ld(16, tmpbuf);
-          rgb2 = vec_ld(32, tmpbuf);
-#if RGB_PIXELSIZE == 4
-          rgb3 = vec_ld(48, tmpbuf);
-#endif
-        } else {
-          /* Medium path */
-          rgb0 = vec_ld(0, inptr);
-          if (num_cols > 16) {
-            rgb1 = vec_ld(16, inptr);
-            if (num_cols > 32) {
-              rgb2 = vec_ld(32, inptr);
+        /* Fast path */
+        rgb0 = vec_ld(0, inptr);
+        if (num_cols > 16)
+          rgb1 = vec_ld(16, inptr);
+        if (num_cols > 32)
+          rgb2 = vec_ld(32, inptr);
 #if RGB_PIXELSIZE == 4
-              if (num_cols > 48)
-                rgb3 = vec_ld(48, inptr);
+        if (num_cols > 48)
+          rgb3 = vec_ld(48, inptr);
 #endif
-            }
-          }
-        }
       }
     }
 
-start:
 #if RGB_PIXELSIZE == 3
   /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
    * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
diff --git a/simd/jcgryext-altivec.c b/simd/jcgryext-altivec.c
index 9337744..1578b8f 100644
--- a/simd/jcgryext-altivec.c
+++ b/simd/jcgryext-altivec.c
@@ -65,101 +65,66 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
       __vector unsigned char unaligned_shift_index;
       int bytes = num_cols + offset;
 
-      if (bytes >= (RGB_PIXELSIZE + 1) * 16) {
-        /* Fast path -- we have enough buffer space to load all vectors.
-         * Even if we don't need them all, this is faster than narrowing
-         * down which ones we need.
+      if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+        /* Slow path to prevent buffer overread.  Since there is no way to
+         * read a partial AltiVec register, overread would occur on the last
+         * chunk of the last image row if the right edge is not on a 16-byte
+         * boundary.  It could also occur on other rows if the bytes per row
+         * is low enough.  Since we can't determine whether we're on the last
+         * image row, we have to assume every row is the last.
          */
-        rgb0 = vec_ld(0, inptr);
-        rgb1 = vec_ld(16, inptr);
-        rgb2 = vec_ld(32, inptr);
-        rgb3 = vec_ld(48, inptr);
+        memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+        rgb0 = vec_ld(0, tmpbuf);
+        rgb1 = vec_ld(16, tmpbuf);
+        rgb2 = vec_ld(32, tmpbuf);
 #if RGB_PIXELSIZE == 4
-        rgb4 = vec_ld(64, inptr);
+        rgb3 = vec_ld(48, tmpbuf);
 #endif
       } else {
-        if (bytes & 15) {
-          /* Slow path to prevent buffer overread.  Since there is no way to
-           * read a partial AltiVec register, overread would occur on the
-           * last chunk of the last image row if the right edge is not on a
-           * 16-byte boundary.  It could also occur on other rows if the
-           * bytes per row is low enough.  Since we can't determine whether
-           * we're on the last image row, we have to assume every row is the
-           * last.
-           */
-          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
-          rgb0 = vec_ld(0, tmpbuf);
-          rgb1 = vec_ld(16, tmpbuf);
-          rgb2 = vec_ld(32, tmpbuf);
+        /* Fast path */
+        rgb0 = vec_ld(0, inptr);
+        if (bytes > 16)
+          rgb1 = vec_ld(16, inptr);
+        if (bytes > 32)
+          rgb2 = vec_ld(32, inptr);
+        if (bytes > 48)
+          rgb3 = vec_ld(48, inptr);
 #if RGB_PIXELSIZE == 4
-          rgb3 = vec_ld(48, tmpbuf);
+        if (bytes > 64)
+          rgb4 = vec_ld(64, inptr);
 #endif
-          goto start;  /* Skip permutation */
-        } else {
-          /* Medium path -- if the right edge is vector-aligned, then we can
-           * read full vectors (but with a lot of branches.)
-           */
-          rgb0 = vec_ld(0, inptr);
-          if (bytes > 16) {
-            rgb1 = vec_ld(16, inptr);
-            if (bytes > 32) {
-              rgb2 = vec_ld(32, inptr);
-              if (bytes > 48) {
-                rgb3 = vec_ld(48, inptr);
+        unaligned_shift_index = vec_lvsl(0, inptr);
+        rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+        rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+        rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
 #if RGB_PIXELSIZE == 4
-                if (bytes > 64)
-                  rgb4 = vec_ld(64, inptr);
+        rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
 #endif
-              }
-            }
-          }
-        }
       }
-
-      unaligned_shift_index = vec_lvsl(0, inptr);
-      rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
-      rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
-      rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
-#if RGB_PIXELSIZE == 4
-      rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
-#endif
     } else {
-      if (num_cols >= RGB_PIXELSIZE * 16) {
-        /* Fast path */
-        rgb0 = vec_ld(0, inptr);
-        rgb1 = vec_ld(16, inptr);
-        rgb2 = vec_ld(32, inptr);
+      if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+        /* Slow path */
+        memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+        rgb0 = vec_ld(0, tmpbuf);
+        rgb1 = vec_ld(16, tmpbuf);
+        rgb2 = vec_ld(32, tmpbuf);
 #if RGB_PIXELSIZE == 4
-        rgb3 = vec_ld(48, inptr);
+        rgb3 = vec_ld(48, tmpbuf);
 #endif
       } else {
-        if (num_cols & 15) {
-          /* Slow path */
-          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
-          rgb0 = vec_ld(0, tmpbuf);
-          rgb1 = vec_ld(16, tmpbuf);
-          rgb2 = vec_ld(32, tmpbuf);
-#if RGB_PIXELSIZE == 4
-          rgb3 = vec_ld(48, tmpbuf);
-#endif
-        } else {
-          /* Medium path */
-          rgb0 = vec_ld(0, inptr);
-          if (num_cols > 16) {
-            rgb1 = vec_ld(16, inptr);
-            if (num_cols > 32) {
-              rgb2 = vec_ld(32, inptr);
+        /* Fast path */
+        rgb0 = vec_ld(0, inptr);
+        if (num_cols > 16)
+          rgb1 = vec_ld(16, inptr);
+        if (num_cols > 32)
+          rgb2 = vec_ld(32, inptr);
 #if RGB_PIXELSIZE == 4
-              if (num_cols > 48)
-                rgb3 = vec_ld(48, inptr);
+        if (num_cols > 48)
+          rgb3 = vec_ld(48, inptr);
 #endif
-            }
-          }
-        }
       }
     }
 
-start:
 #if RGB_PIXELSIZE == 3
   /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
    * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
--
cgit v1.2.3
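The fast path retained by this patch relies on the classic AltiVec idiom for
reading 16 unaligned bytes: two aligned vec_ld() loads straddling the address
are merged with vec_perm() using the shift index returned by vec_lvsl().
Below is a minimal standalone sketch of that idiom, not code from the patch;
the helper name load_unaligned is hypothetical.  Build with -maltivec.

#include <altivec.h>

/* Illustrative sketch of the vec_lvsl()/vec_perm() unaligned-load idiom;
 * load_unaligned is a hypothetical helper, not part of libjpeg-turbo. */
static inline __vector unsigned char
load_unaligned (const unsigned char *ptr)
{
  /* vec_ld() ignores the low four bits of the effective address, so these
   * two loads fetch the aligned 16-byte blocks that straddle ptr. */
  __vector unsigned char lo = vec_ld(0, ptr);
  __vector unsigned char hi = vec_ld(16, ptr);
  /* vec_lvsl() encodes ptr's misalignment as a permute control; vec_perm()
   * then splices out the 16 bytes that start at ptr. */
  return vec_perm(lo, hi, vec_lvsl(0, ptr));
}

Note that the second load can touch up to 16 bytes beyond the requested data.
That is precisely the overread the slow path guards against: near the right
edge of the last image row, the code above could read past the end of the
buffer, so the patch instead memcpy()s the remaining pixels into the aligned
tmpbuf and loads from there.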