diff options
author | dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519> | 2011-06-17 21:12:58 +0000 |
---|---|---|
committer | dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519> | 2011-06-17 21:12:58 +0000 |
commit | 121e12173aed78a85622794f1a6834cc4e50de90 (patch) | |
tree | 48b843678858c0528580cf0f5d67e7523fc2292b /trunk/simd | |
parent | 73eeb525fbaba9f21f1c71c1a6a54a1495809ff1 (diff) |
NEON-optimized 2x2 and 4x4 scaled iDCTs
git-svn-id: https://libjpeg-turbo.svn.sourceforge.net/svnroot/libjpeg-turbo@662 3789f03b-4d11-0410-bbf8-ca57d06f2519
Diffstat (limited to 'trunk/simd')
-rw-r--r-- | trunk/simd/jsimd.h | 9 | ||||
-rw-r--r-- | trunk/simd/jsimd_arm.c | 34 | ||||
-rw-r--r-- | trunk/simd/jsimd_arm_neon.S | 371 |
3 files changed, 414 insertions, 0 deletions
diff --git a/trunk/simd/jsimd.h b/trunk/simd/jsimd.h index 3b801f4..39a0867 100644 --- a/trunk/simd/jsimd.h +++ b/trunk/simd/jsimd.h @@ -569,6 +569,15 @@ EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jsimd_idct_2x2_neon JPP((void * dct_table, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, + JDIMENSION output_col)); +EXTERN(void) jsimd_idct_4x4_neon JPP((void * dct_table, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, + JDIMENSION output_col)); + /* SIMD Inverse DCT */ EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table, JCOEFPTR coef_block, diff --git a/trunk/simd/jsimd_arm.c b/trunk/simd/jsimd_arm.c index b4d094d..1a5cdd3 100644 --- a/trunk/simd/jsimd_arm.c +++ b/trunk/simd/jsimd_arm.c @@ -440,6 +440,21 @@ jsimd_can_idct_2x2 (void) { init_simd(); + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_ARM_NEON)) + return 1; + return 0; } @@ -448,6 +463,21 @@ jsimd_can_idct_4x4 (void) { init_simd(); + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_ARM_NEON)) + return 1; + return 0; } @@ -456,6 +486,8 @@ jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if ((simd_support & JSIMD_ARM_NEON)) + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col); } GLOBAL(void) @@ -463,6 +495,8 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if ((simd_support & JSIMD_ARM_NEON)) + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col); } GLOBAL(int) diff --git a/trunk/simd/jsimd_arm_neon.S b/trunk/simd/jsimd_arm_neon.S index 6f4de8d..dc94ee5 100644 --- a/trunk/simd/jsimd_arm_neon.S +++ b/trunk/simd/jsimd_arm_neon.S @@ -32,6 +32,9 @@ .object_arch armv4 .arm + +#define RESPECT_STRICT_ALIGNMENT 1 + /*****************************************************************************/ /* Supplementary macro for setting function attributes */ @@ -247,6 +250,374 @@ asm_function jsimd_idct_ifast_neon /*****************************************************************************/ /* + * jsimd_idct_4x4_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + * + * TODO: a bit better instructions scheduling can be achieved by expanding + * idct_helper/transpose_4x4 macros and reordering instructions, + * but readability will suffer somewhat. + */ + +#define CONST_BITS 13 + +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ + +.balign 16 +jsimd_idct_4x4_neon_consts: + .short FIX_1_847759065 /* d0[0] */ + .short -FIX_0_765366865 /* d0[1] */ + .short -FIX_0_211164243 /* d0[2] */ + .short FIX_1_451774981 /* d0[3] */ + .short -FIX_2_172734803 /* d1[0] */ + .short FIX_1_061594337 /* d1[1] */ + .short -FIX_0_509795579 /* d1[2] */ + .short -FIX_0_601344887 /* d1[3] */ + .short FIX_0_899976223 /* d2[0] */ + .short FIX_2_562915447 /* d2[1] */ + .short 1 << (CONST_BITS+1) /* d2[2] */ + .short 0 /* d2[3] */ + +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 + vmull.s16 q14, \x4, d2[2] + vmlal.s16 q14, \x8, d0[0] + vmlal.s16 q14, \x14, d0[1] + + vmull.s16 q13, \x16, d1[2] + vmlal.s16 q13, \x12, d1[3] + vmlal.s16 q13, \x10, d2[0] + vmlal.s16 q13, \x6, d2[1] + + vmull.s16 q15, \x4, d2[2] + vmlsl.s16 q15, \x8, d0[0] + vmlsl.s16 q15, \x14, d0[1] + + vmull.s16 q12, \x16, d0[2] + vmlal.s16 q12, \x12, d0[3] + vmlal.s16 q12, \x10, d1[0] + vmlal.s16 q12, \x6, d1[1] + + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + +.if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q14, q14, #\shift + vmovn.s32 \y26, q10 + vmovn.s32 \y29, q14 +.else + vrshrn.s32 \y26, q10, #\shift + vrshrn.s32 \y29, q14, #\shift +.endif + + vadd.s32 q10, q15, q12 + vsub.s32 q15, q15, q12 + +.if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q15, q15, #\shift + vmovn.s32 \y27, q10 + vmovn.s32 \y28, q15 +.else + vrshrn.s32 \y27, q10, #\shift + vrshrn.s32 \y28, q15, #\shift +.endif + +.endm + +asm_function jsimd_idct_4x4_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + vpush {d8-d15} + + /* Load constants (d3 is just used for padding) */ + adr TMP4, jsimd_idct_4x4_neon_consts + vld1.16 {d0, d1, d2, d3}, [TMP4, :128] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | d8 | d9 + * 3 | d10 | d11 + * 4 | - | - + * 5 | d12 | d13 + * 6 | d14 | d15 + * 7 | d16 | d17 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! + vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]! + vld1.16 {d16, d17}, [COEF_BLOCK, :128]! + /* dequantize */ + vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! + vmul.s16 q2, q2, q9 + vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]! + vmul.s16 q3, q3, q10 + vmul.s16 q4, q4, q11 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]! + vmul.s16 q5, q5, q12 + vmul.s16 q6, q6, q13 + vld1.16 {d30, d31}, [DCT_TABLE, :128]! + vmul.s16 q7, q7, q14 + vmul.s16 q8, q8, q15 + + /* Pass 1 */ + idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10 + transpose_4x4 d4, d6, d8, d10 + idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11 + transpose_4x4 d5, d7, d9, d11 + + /* Pass 2 */ + idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29 + transpose_4x4 d26, d27, d28, d29 + + /* Range limit */ + vmov.u16 q15, #0x80 + vadd.s16 q13, q13, q15 + vadd.s16 q14, q14, q15 + vqmovun.s16 d26, q13 + vqmovun.s16 d27, q14 + + /* Store results to the output buffer */ + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT + /* We can use much less instructions on little endian systems if the + * OS kernel is not configured to trap unaligned memory accesses + */ + vst1.32 {d26[0]}, [TMP1]! + vst1.32 {d27[0]}, [TMP3]! + vst1.32 {d26[1]}, [TMP2]! + vst1.32 {d27[1]}, [TMP4]! +#else + vst1.8 {d26[0]}, [TMP1]! + vst1.8 {d27[0]}, [TMP3]! + vst1.8 {d26[1]}, [TMP1]! + vst1.8 {d27[1]}, [TMP3]! + vst1.8 {d26[2]}, [TMP1]! + vst1.8 {d27[2]}, [TMP3]! + vst1.8 {d26[3]}, [TMP1]! + vst1.8 {d27[3]}, [TMP3]! + + vst1.8 {d26[4]}, [TMP2]! + vst1.8 {d27[4]}, [TMP4]! + vst1.8 {d26[5]}, [TMP2]! + vst1.8 {d27[5]}, [TMP4]! + vst1.8 {d26[6]}, [TMP2]! + vst1.8 {d27[6]}, [TMP4]! + vst1.8 {d26[7]}, [TMP2]! + vst1.8 {d27[7]}, [TMP4]! +#endif + + vpop {d8-d15} + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 +.endfunc + +.purgem idct_helper + +/*****************************************************************************/ + +/* + * jsimd_idct_2x2_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + */ + +.balign 8 +jsimd_idct_2x2_neon_consts: + .short -FIX_0_720959822 /* d0[0] */ + .short FIX_0_850430095 /* d0[1] */ + .short -FIX_1_272758580 /* d0[2] */ + .short FIX_3_624509785 /* d0[3] */ + +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 + vshll.s16 q14, \x4, #15 + vmull.s16 q13, \x6, d0[3] + vmlal.s16 q13, \x10, d0[2] + vmlal.s16 q13, \x12, d0[1] + vmlal.s16 q13, \x16, d0[0] + + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + +.if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q14, q14, #\shift + vmovn.s32 \y26, q10 + vmovn.s32 \y27, q14 +.else + vrshrn.s32 \y26, q10, #\shift + vrshrn.s32 \y27, q14, #\shift +.endif + +.endm + +asm_function jsimd_idct_2x2_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req ip + + vpush {d8-d15} + + /* Load constants */ + adr TMP2, jsimd_idct_2x2_neon_consts + vld1.16 {d0}, [TMP2, :64] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | - | - + * 3 | d10 | d11 + * 4 | - | - + * 5 | d12 | d13 + * 6 | - | - + * 7 | d16 | d17 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d10, d11}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d12, d13}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d16, d17}, [COEF_BLOCK, :128]! + /* Dequantize */ + vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! + vmul.s16 q2, q2, q9 + vmul.s16 q3, q3, q10 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d24, d25}, [DCT_TABLE, :128]! + vmul.s16 q5, q5, q12 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d26, d27}, [DCT_TABLE, :128]! + vmul.s16 q6, q6, q13 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d30, d31}, [DCT_TABLE, :128]! + vmul.s16 q8, q8, q15 + + /* Pass 1 */ +#if 0 + idct_helper d4, d6, d10, d12, d16, 13, d4, d6 + transpose_4x4 d4, d6, d8, d10 + idct_helper d5, d7, d11, d13, d17, 13, d5, d7 + transpose_4x4 d5, d7, d9, d11 +#else + vmull.s16 q13, d6, d0[3] + vmlal.s16 q13, d10, d0[2] + vmlal.s16 q13, d12, d0[1] + vmlal.s16 q13, d16, d0[0] + vmull.s16 q12, d7, d0[3] + vmlal.s16 q12, d11, d0[2] + vmlal.s16 q12, d13, d0[1] + vmlal.s16 q12, d17, d0[0] + vshll.s16 q14, d4, #15 + vshll.s16 q15, d5, #15 + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + vrshrn.s32 d4, q10, #13 + vrshrn.s32 d6, q14, #13 + vadd.s32 q10, q15, q12 + vsub.s32 q14, q15, q12 + vrshrn.s32 d5, q10, #13 + vrshrn.s32 d7, q14, #13 + vtrn.16 q2, q3 + vtrn.32 q3, q5 +#endif + + /* Pass 2 */ + idct_helper d4, d6, d10, d7, d11, 20, d26, d27 + + /* Range limit */ + vmov.u16 q15, #0x80 + vadd.s16 q13, q13, q15 + vqmovun.s16 d26, q13 + vqmovun.s16 d27, q13 + + /* Store results to the output buffer */ + ldmia OUTPUT_BUF, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + + vst1.8 {d26[0]}, [TMP1]! + vst1.8 {d27[4]}, [TMP1]! + vst1.8 {d26[1]}, [TMP2]! + vst1.8 {d27[5]}, [TMP2]! + + vpop {d8-d15} + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 +.endfunc + +.purgem idct_helper + +/*****************************************************************************/ + +/* * jsimd_ycc_extrgb_convert_neon * jsimd_ycc_extbgr_convert_neon * jsimd_ycc_extrgbx_convert_neon |