diff options
author | dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519> | 2011-08-17 21:00:59 +0000 |
---|---|---|
committer | dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519> | 2011-08-17 21:00:59 +0000 |
commit | 4bc87ce1693a14038b34dbcd5f34b4b3d357e227 (patch) | |
tree | e1c971caedcad48bc33a3d8522302cf7f50a35aa /trunk/simd | |
parent | a9dddf79f5060155c527e021c729267207f82ca1 (diff) |
NEON-accelerated quantization
git-svn-id: https://libjpeg-turbo.svn.sourceforge.net/svnroot/libjpeg-turbo@689 3789f03b-4d11-0410-bbf8-ca57d06f2519
Diffstat (limited to 'trunk/simd')
-rw-r--r-- | trunk/simd/jsimd.h | 4 | ||||
-rw-r--r-- | trunk/simd/jsimd_arm.c | 13 | ||||
-rw-r--r-- | trunk/simd/jsimd_arm_neon.S | 99 |
3 files changed, 116 insertions, 0 deletions
diff --git a/trunk/simd/jsimd.h b/trunk/simd/jsimd.h index 07ccfff..38d2e53 100644 --- a/trunk/simd/jsimd.h +++ b/trunk/simd/jsimd.h @@ -572,6 +572,10 @@ EXTERN(void) jsimd_quantize_sse2 JPP((JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)); +EXTERN(void) jsimd_quantize_neon JPP((JCOEFPTR coef_block, + DCTELEM * divisors, + DCTELEM * workspace)); + EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)); diff --git a/trunk/simd/jsimd_arm.c b/trunk/simd/jsimd_arm.c index 05e0ca2..7d1aa54 100644 --- a/trunk/simd/jsimd_arm.c +++ b/trunk/simd/jsimd_arm.c @@ -479,6 +479,17 @@ jsimd_can_quantize (void) { init_simd(); + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + return 0; } @@ -494,6 +505,8 @@ GLOBAL(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace) { + if (simd_support & JSIMD_ARM_NEON) + jsimd_quantize_neon(coef_block, divisors, workspace); } GLOBAL(void) diff --git a/trunk/simd/jsimd_arm_neon.S b/trunk/simd/jsimd_arm_neon.S index c942352..0a72854 100644 --- a/trunk/simd/jsimd_arm_neon.S +++ b/trunk/simd/jsimd_arm_neon.S @@ -1406,3 +1406,102 @@ asm_function jsimd_fdct_ifast_neon .endfunc /*****************************************************************************/ + +/* + * GLOBAL(void) + * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, + * DCTELEM * workspace); + * + * Note: the code uses 2 stage pipelining in order to improve instructions + * scheduling and eliminate stalls (this provides ~15% better + * performance for this function on both ARM Cortex-A8 and + * ARM Cortex-A9 when compared to the non-pipelined variant). + * The instructions which belong to the second stage use different + * indentation for better readiability. + */ +asm_function jsimd_quantize_neon + + COEF_BLOCK .req r0 + DIVISORS .req r1 + WORKSPACE .req r2 + + RECIPROCAL .req DIVISORS + CORRECTION .req r3 + SHIFT .req ip + LOOP_COUNT .req r4 + + vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! + vabs.s16 q12, q0 + add CORRECTION, DIVISORS, #(64 * 2) + add SHIFT, DIVISORS, #(64 * 6) + vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! + vabs.s16 q13, q1 + vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! + vadd.u16 q12, q12, q10 /* add correction */ + vadd.u16 q13, q13, q11 + vmull.u16 q10, d24, d16 /* multiply by reciprocal */ + vmull.u16 q11, d25, d17 + vmull.u16 q8, d26, d18 + vmull.u16 q9, d27, d19 + vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! + vshrn.u32 d20, q10, #16 + vshrn.u32 d21, q11, #16 + vshrn.u32 d22, q8, #16 + vshrn.u32 d23, q9, #16 + vneg.s16 q12, q12 + vneg.s16 q13, q13 + vshr.s16 q2, q0, #15 /* extract sign */ + vshr.s16 q3, q1, #15 + vshl.u16 q14, q10, q12 /* shift */ + vshl.u16 q15, q11, q13 + + push {r4, r5} + mov LOOP_COUNT, #3 +1: + vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! + veor.u16 q14, q14, q2 /* restore sign */ + vabs.s16 q12, q0 + vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! + vabs.s16 q13, q1 + veor.u16 q15, q15, q3 + vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! + vadd.u16 q12, q12, q10 /* add correction */ + vadd.u16 q13, q13, q11 + vmull.u16 q10, d24, d16 /* multiply by reciprocal */ + vmull.u16 q11, d25, d17 + vmull.u16 q8, d26, d18 + vmull.u16 q9, d27, d19 + vsub.u16 q14, q14, q2 + vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! + vsub.u16 q15, q15, q3 + vshrn.u32 d20, q10, #16 + vshrn.u32 d21, q11, #16 + vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! + vshrn.u32 d22, q8, #16 + vshrn.u32 d23, q9, #16 + vneg.s16 q12, q12 + vneg.s16 q13, q13 + vshr.s16 q2, q0, #15 /* extract sign */ + vshr.s16 q3, q1, #15 + vshl.u16 q14, q10, q12 /* shift */ + vshl.u16 q15, q11, q13 + subs LOOP_COUNT, LOOP_COUNT, #1 + bne 1b + pop {r4, r5} + + veor.u16 q14, q14, q2 /* restore sign */ + veor.u16 q15, q15, q3 + vsub.u16 q14, q14, q2 + vsub.u16 q15, q15, q3 + vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! + + bx lr /* return */ + + .unreq COEF_BLOCK + .unreq DIVISORS + .unreq WORKSPACE + .unreq RECIPROCAL + .unreq CORRECTION + .unreq SHIFT + .unreq LOOP_COUNT +.endfunc |