aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db>2015-01-14 10:45:31 +0000
committerdcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db>2015-01-14 10:45:31 +0000
commit3c20e84d6f8f56b5b9d52d2a3678c5b0e064ea2c (patch)
tree3bde13d30fbf9ccd1aa4c77df54a9012725e2e26
parentb4c654c6277cdeee2a77394549e22812bea708c0 (diff)
Fix bugs in the AltiVec fancy upsampling routines uncovered during additional testing with small image sizes. Since the input width is half the output width, the upsampler should only write a second 16-byte chunk if there are more than 8 input columns left. Additionally, if the width is < 16, then we need to insert a dummy sample (the SSE2 code does this as well, but I neglected to port that portion of the code for some reason).
git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1501 632fc199-4ca6-4c93-a231-07263d6284db
-rw-r--r--simd/jdsample-altivec.c101
1 files changed, 62 insertions, 39 deletions
diff --git a/simd/jdsample-altivec.c b/simd/jdsample-altivec.c
index 6b8cecf..f49e403 100644
--- a/simd/jdsample-altivec.c
+++ b/simd/jdsample-altivec.c
@@ -33,7 +33,7 @@ jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr;
- int inrow, col;
+ int inrow, incol;
__vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
out;
@@ -53,19 +53,22 @@ jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
inptr = input_data[inrow];
outptr = output_data[inrow];
+ if (downsampled_width & 15)
+ inptr[downsampled_width] = inptr[downsampled_width - 1];
+
this0 = vec_ld(0, inptr);
p_last0 = vec_perm(this0, this0, last_index_col0);
last0 = this0;
- for (col = 0; col < downsampled_width;
- col += 16, inptr += 16, outptr += 32) {
+ for (incol = downsampled_width; incol > 0;
+ incol -= 16, inptr += 16, outptr += 32) {
- if (col > 0) {
+ if (downsampled_width - incol > 0) {
p_last0 = vec_perm(last0, this0, last_index);
last0 = this0;
}
- if (downsampled_width - col <= 16)
+ if (incol <= 16)
p_next0 = vec_perm(this0, this0, next_index_lastcol);
else {
next0 = vec_ld(16, inptr);
@@ -80,28 +83,33 @@ jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
last0l = (__vector short)vec_mergeh(pb_zero, p_last0);
last0h = (__vector short)vec_mergel(pb_zero, p_last0);
last0l = vec_add(last0l, pw_one);
- last0h = vec_add(last0h, pw_one);
next0l = (__vector short)vec_mergeh(pb_zero, p_next0);
next0h = (__vector short)vec_mergel(pb_zero, p_next0);
next0l = vec_add(next0l, pw_two);
- next0h = vec_add(next0h, pw_two);
outle = vec_add(this0l, last0l);
- outhe = vec_add(this0h, last0h);
outlo = vec_add(this0l, next0l);
- outho = vec_add(this0h, next0h);
outle = vec_sr(outle, (__vector unsigned short)pw_two);
- outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
- outho = vec_sr(outho, (__vector unsigned short)pw_two);
out = vec_perm((__vector unsigned char)outle,
(__vector unsigned char)outlo, merge_pack_index);
vec_st(out, 0, outptr);
- out = vec_perm((__vector unsigned char)outhe,
- (__vector unsigned char)outho, merge_pack_index);
- vec_st(out, 16, outptr);
+
+ if (incol > 8) {
+ last0h = vec_add(last0h, pw_one);
+ next0h = vec_add(next0h, pw_two);
+
+ outhe = vec_add(this0h, last0h);
+ outho = vec_add(this0h, next0h);
+ outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
+ outho = vec_sr(outho, (__vector unsigned short)pw_two);
+
+ out = vec_perm((__vector unsigned char)outhe,
+ (__vector unsigned char)outho, merge_pack_index);
+ vec_st(out, 16, outptr);
+ }
this0 = next0;
}
@@ -117,7 +125,7 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
- int inrow, outrow, col;
+ int inrow, outrow, incol;
__vector unsigned char this_1, this0, this1, out;
__vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
@@ -148,6 +156,12 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
outptr0 = output_data[outrow++];
outptr1 = output_data[outrow++];
+ if (downsampled_width & 15) {
+ inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
+ inptr0[downsampled_width] = inptr0[downsampled_width - 1];
+ inptr1[downsampled_width] = inptr1[downsampled_width - 1];
+ }
+
this0 = vec_ld(0, inptr0);
this0l = (__vector short)vec_mergeh(pb_zero, this0);
this0h = (__vector short)vec_mergel(pb_zero, this0);
@@ -172,11 +186,11 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
- for (col = 0; col < downsampled_width;
- col += 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
+ for (incol = downsampled_width; incol > 0;
+ incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
outptr0 += 32, outptr1 += 32) {
- if (col > 0) {
+ if (downsampled_width - incol > 0) {
p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
@@ -184,7 +198,7 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h;
}
- if (downsampled_width - col <= 16) {
+ if (incol <= 16) {
p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
next_index_lastcol);
@@ -218,53 +232,62 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
/* Process the upper row */
tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
- tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
outle = vec_add(tmpl, p_lastcolsum_1l);
- outhe = vec_add(tmph, p_lastcolsum_1h);
outle = vec_add(outle, pw_eight);
- outhe = vec_add(outhe, pw_eight);
outle = vec_sr(outle, pw_four);
- outhe = vec_sr(outhe, pw_four);
outlo = vec_add(tmpl, p_nextcolsum_1l);
- outho = vec_add(tmph, p_nextcolsum_1h);
outlo = vec_add(outlo, pw_seven);
- outho = vec_add(outho, pw_seven);
outlo = vec_sr(outlo, pw_four);
- outho = vec_sr(outho, pw_four);
out = vec_perm((__vector unsigned char)outle,
(__vector unsigned char)outlo, merge_pack_index);
vec_st(out, 0, outptr0);
- out = vec_perm((__vector unsigned char)outhe,
- (__vector unsigned char)outho, merge_pack_index);
- vec_st(out, 16, outptr0);
+ if (incol > 8) {
+ tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
+ outhe = vec_add(tmph, p_lastcolsum_1h);
+ outhe = vec_add(outhe, pw_eight);
+ outhe = vec_sr(outhe, pw_four);
+
+ outho = vec_add(tmph, p_nextcolsum_1h);
+ outho = vec_add(outho, pw_seven);
+ outho = vec_sr(outho, pw_four);
+
+ out = vec_perm((__vector unsigned char)outhe,
+ (__vector unsigned char)outho, merge_pack_index);
+ vec_st(out, 16, outptr0);
+ }
/* Process the lower row */
tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
- tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
outle = vec_add(tmpl, p_lastcolsum1l);
- outhe = vec_add(tmph, p_lastcolsum1h);
outle = vec_add(outle, pw_eight);
- outhe = vec_add(outhe, pw_eight);
outle = vec_sr(outle, pw_four);
- outhe = vec_sr(outhe, pw_four);
outlo = vec_add(tmpl, p_nextcolsum1l);
- outho = vec_add(tmph, p_nextcolsum1h);
outlo = vec_add(outlo, pw_seven);
- outho = vec_add(outho, pw_seven);
outlo = vec_sr(outlo, pw_four);
- outho = vec_sr(outho, pw_four);
out = vec_perm((__vector unsigned char)outle,
(__vector unsigned char)outlo, merge_pack_index);
vec_st(out, 0, outptr1);
- out = vec_perm((__vector unsigned char)outhe,
- (__vector unsigned char)outho, merge_pack_index);
- vec_st(out, 16, outptr1);
+
+ if (incol > 8) {
+ tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
+ outhe = vec_add(tmph, p_lastcolsum1h);
+ outhe = vec_add(outhe, pw_eight);
+ outhe = vec_sr(outhe, pw_four);
+
+ outho = vec_add(tmph, p_nextcolsum1h);
+ outho = vec_add(outho, pw_seven);
+ outho = vec_sr(outho, pw_four);
+
+ out = vec_perm((__vector unsigned char)outhe,
+ (__vector unsigned char)outho, merge_pack_index);
+ vec_st(out, 16, outptr1);
+ }
thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h;
thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h;