diff options
author | dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db> | 2014-12-22 16:04:17 +0000 |
---|---|---|
committer | dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db> | 2014-12-22 16:04:17 +0000 |
commit | 18c3256ba0149ef24e1900ebcf7fe291a19d3aca (patch) | |
tree | ceb27cd74e3b4a6261efb6c7c8a80686e18f30e4 /simd | |
parent | 377ac0e6c2521d04d6aafa779d45a690b5ef9d5d (diff) |
Use intrinsics for loading/storing data in the DCT/IDCT functions. This has no effect on the performance of the aligned loads/stores, but it makes it more obvious what that code is doing. Using intrinsics for the unaligned stores in the inverse DCT functions increases overall decompression performance by 1-2%.
git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1472 632fc199-4ca6-4c93-a231-07263d6284db
Diffstat (limited to 'simd')
-rw-r--r-- | simd/jfdctfst-altivec.c | 32 |
-rw-r--r-- | simd/jfdctint-altivec.c | 32 |
-rw-r--r-- | simd/jidctfst-altivec.c | 78 |
-rw-r--r-- | simd/jidctint-altivec.c | 62 |
4 files changed, 122 insertions, 82 deletions
diff --git a/simd/jfdctfst-altivec.c b/simd/jfdctfst-altivec.c index 3556ab5..16a52df 100644 --- a/simd/jfdctfst-altivec.c +++ b/simd/jfdctfst-altivec.c @@ -108,14 +108,14 @@ jsimd_fdct_ifast_altivec (DCTELEM *data) /* Pass 1: process rows */ - row0 = *(__vector short *)&data[0]; - row1 = *(__vector short *)&data[8]; - row2 = *(__vector short *)&data[16]; - row3 = *(__vector short *)&data[24]; - row4 = *(__vector short *)&data[32]; - row5 = *(__vector short *)&data[40]; - row6 = *(__vector short *)&data[48]; - row7 = *(__vector short *)&data[56]; + row0 = vec_ld(0, data); + row1 = vec_ld(16, data); + row2 = vec_ld(32, data); + row3 = vec_ld(48, data); + row4 = vec_ld(64, data); + row5 = vec_ld(80, data); + row6 = vec_ld(96, data); + row7 = vec_ld(112, data); TRANSPOSE(row, col); @@ -145,12 +145,12 @@ jsimd_fdct_ifast_altivec (DCTELEM *data) DO_FDCT(); - *(__vector short *)&data[0] = out0; - *(__vector short *)&data[8] = out1; - *(__vector short *)&data[16] = out2; - *(__vector short *)&data[24] = out3; - *(__vector short *)&data[32] = out4; - *(__vector short *)&data[40] = out5; - *(__vector short *)&data[48] = out6; - *(__vector short *)&data[56] = out7; + vec_st(out0, 0, data); + vec_st(out1, 16, data); + vec_st(out2, 32, data); + vec_st(out3, 48, data); + vec_st(out4, 64, data); + vec_st(out5, 80, data); + vec_st(out6, 96, data); + vec_st(out7, 112, data); } diff --git a/simd/jfdctint-altivec.c b/simd/jfdctint-altivec.c index 7699b29..548ab96 100644 --- a/simd/jfdctint-altivec.c +++ b/simd/jfdctint-altivec.c @@ -177,14 +177,14 @@ jsimd_fdct_islow_altivec (DCTELEM *data) /* Pass 1: process rows */ - row0 = *(__vector short *)&data[0]; - row1 = *(__vector short *)&data[8]; - row2 = *(__vector short *)&data[16]; - row3 = *(__vector short *)&data[24]; - row4 = *(__vector short *)&data[32]; - row5 = *(__vector short *)&data[40]; - row6 = *(__vector short *)&data[48]; - row7 = *(__vector short *)&data[56]; + row0 = vec_ld(0, data); + row1 = vec_ld(16, data); + row2 = 
vec_ld(32, data); + row3 = vec_ld(48, data); + row4 = vec_ld(64, data); + row5 = vec_ld(80, data); + row6 = vec_ld(96, data); + row7 = vec_ld(112, data); TRANSPOSE(row, col); @@ -214,12 +214,12 @@ jsimd_fdct_islow_altivec (DCTELEM *data) DO_FDCT_COLS(); - *(__vector short *)&data[0] = out0; - *(__vector short *)&data[8] = out1; - *(__vector short *)&data[16] = out2; - *(__vector short *)&data[24] = out3; - *(__vector short *)&data[32] = out4; - *(__vector short *)&data[40] = out5; - *(__vector short *)&data[48] = out6; - *(__vector short *)&data[56] = out7; + vec_st(out0, 0, data); + vec_st(out1, 16, data); + vec_st(out2, 32, data); + vec_st(out3, 48, data); + vec_st(out4, 64, data); + vec_st(out5, 80, data); + vec_st(out6, 96, data); + vec_st(out7, 112, data); } diff --git a/simd/jidctfst-altivec.c b/simd/jidctfst-altivec.c index 33acb83..37a2f4e 100644 --- a/simd/jidctfst-altivec.c +++ b/simd/jidctfst-altivec.c @@ -112,7 +112,7 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block, z5, z10, z10s, z11, z12s, z13, out0, out1, out2, out3, out4, out5, out6, out7; __vector signed char outb; - long long *outptr, *outbptr = (long long *)(&outb); + int *outptr; /* Constants */ __vector short zero = { __8X(0) }, @@ -127,14 +127,14 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block, /* Pass 1: process columns */ - col0 = *(__vector short *)&coef_block[0]; - col1 = *(__vector short *)&coef_block[8]; - col2 = *(__vector short *)&coef_block[16]; - col3 = *(__vector short *)&coef_block[24]; - col4 = *(__vector short *)&coef_block[32]; - col5 = *(__vector short *)&coef_block[40]; - col6 = *(__vector short *)&coef_block[48]; - col7 = *(__vector short *)&coef_block[56]; + col0 = vec_ld(0, coef_block); + col1 = vec_ld(16, coef_block); + col2 = vec_ld(32, coef_block); + col3 = vec_ld(48, coef_block); + col4 = vec_ld(64, coef_block); + col5 = vec_ld(80, coef_block); + col6 = vec_ld(96, coef_block); + col7 = vec_ld(112, coef_block); tmp1 = vec_or(col1, 
col2); tmp2 = vec_or(col3, col4); @@ -196,31 +196,51 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block, TRANSPOSE(out, col); - outb = vec_packs(col0, col1); + outb = vec_packs(col0, col0); outb = vec_add(outb, pb_centerjsamp); - outptr = (long long *)(output_buf[0] + output_col); - *outptr = outbptr[0]; - outptr = (long long *)(output_buf[1] + output_col); - *outptr = outbptr[1]; + outptr = (int *)(output_buf[0] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); - outb = vec_packs(col2, col3); + outb = vec_packs(col1, col1); outb = vec_add(outb, pb_centerjsamp); - outptr = (long long *)(output_buf[2] + output_col); - *outptr = outbptr[0]; - outptr = (long long *)(output_buf[3] + output_col); - *outptr = outbptr[1]; + outptr = (int *)(output_buf[1] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); - outb = vec_packs(col4, col5); + outb = vec_packs(col2, col2); outb = vec_add(outb, pb_centerjsamp); - outptr = (long long *)(output_buf[4] + output_col); - *outptr = outbptr[0]; - outptr = (long long *)(output_buf[5] + output_col); - *outptr = outbptr[1]; + outptr = (int *)(output_buf[2] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); - outb = vec_packs(col6, col7); + outb = vec_packs(col3, col3); outb = vec_add(outb, pb_centerjsamp); - outptr = (long long *)(output_buf[6] + output_col); - *outptr = outbptr[0]; - outptr = (long long *)(output_buf[7] + output_col); - *outptr = outbptr[1]; + outptr = (int *)(output_buf[3] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col4, col4); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[4] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col5, col5); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int 
*)(output_buf[5] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col6, col6); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[6] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col7, col7); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[7] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); } diff --git a/simd/jidctint-altivec.c b/simd/jidctint-altivec.c index d54479b..a354fcc 100644 --- a/simd/jidctint-altivec.c +++ b/simd/jidctint-altivec.c @@ -186,7 +186,7 @@ jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block, out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; __vector signed char outb; - long long *outptr, *outbptr = (long long *)(&outb); + int *outptr; /* Constants */ __vector short zero16 = { __8X(0) }, @@ -271,31 +271,51 @@ jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block, TRANSPOSE(out, col); - outb = vec_packs(col0, col1); + outb = vec_packs(col0, col0); outb = vec_add(outb, pb_centerjsamp); - outptr = (long long *)(output_buf[0] + output_col); - *outptr = outbptr[0]; - outptr = (long long *)(output_buf[1] + output_col); - *outptr = outbptr[1]; + outptr = (int *)(output_buf[0] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); - outb = vec_packs(col2, col3); + outb = vec_packs(col1, col1); outb = vec_add(outb, pb_centerjsamp); - outptr = (long long *)(output_buf[2] + output_col); - *outptr = outbptr[0]; - outptr = (long long *)(output_buf[3] + output_col); - *outptr = outbptr[1]; + outptr = (int *)(output_buf[1] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); - outb = vec_packs(col4, col5); + outb = vec_packs(col2, col2); outb = 
vec_add(outb, pb_centerjsamp); - outptr = (long long *)(output_buf[4] + output_col); - *outptr = outbptr[0]; - outptr = (long long *)(output_buf[5] + output_col); - *outptr = outbptr[1]; + outptr = (int *)(output_buf[2] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); - outb = vec_packs(col6, col7); + outb = vec_packs(col3, col3); outb = vec_add(outb, pb_centerjsamp); - outptr = (long long *)(output_buf[6] + output_col); - *outptr = outbptr[0]; - outptr = (long long *)(output_buf[7] + output_col); - *outptr = outbptr[1]; + outptr = (int *)(output_buf[3] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col4, col4); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[4] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col5, col5); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[5] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col6, col6); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[6] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); + + outb = vec_packs(col7, col7); + outb = vec_add(outb, pb_centerjsamp); + outptr = (int *)(output_buf[7] + output_col); + vec_ste((__vector int)outb, 0, outptr); + vec_ste((__vector int)outb, 4, outptr); } |