author    dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519>  2011-08-17 21:00:59 +0000
committer dcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519>  2011-08-17 21:00:59 +0000
commit    4bc87ce1693a14038b34dbcd5f34b4b3d357e227
tree      e1c971caedcad48bc33a3d8522302cf7f50a35aa /trunk/simd
parent    a9dddf79f5060155c527e021c729267207f82ca1

NEON-accelerated quantization

git-svn-id: https://libjpeg-turbo.svn.sourceforge.net/svnroot/libjpeg-turbo@689 3789f03b-4d11-0410-bbf8-ca57d06f2519
Diffstat (limited to 'trunk/simd')
 -rw-r--r--  trunk/simd/jsimd.h           |  4
 -rw-r--r--  trunk/simd/jsimd_arm.c       | 13
 -rw-r--r--  trunk/simd/jsimd_arm_neon.S  | 99
 3 files changed, 116 insertions, 0 deletions
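For context, the quantization step that this commit accelerates takes the 64 DCT coefficients of an 8x8 block and divides each one by its quantization value, rounding the magnitude to nearest. A minimal scalar sketch of that step follows; the function and parameter names are illustrative and not copied from the library's portable path, which the SIMD back ends replace with a precomputed reciprocal multiply (see the NEON routine below).

#include <stdint.h>

/* Illustrative scalar quantization of one 8x8 block: divide each DCT
 * coefficient by its quantization value, adding half the divisor first so
 * the magnitude rounds to nearest. */
static void quantize_block_sketch(int16_t coef_block[64],
                                  const uint16_t qvals[64],
                                  const int16_t workspace[64])
{
  for (int i = 0; i < 64; i++) {
    int temp = workspace[i];
    int qval = qvals[i];

    if (temp < 0) {
      temp = -temp;
      temp += qval >> 1;                       /* round magnitude to nearest */
      coef_block[i] = (int16_t) (-(temp / qval));
    } else {
      temp += qval >> 1;
      coef_block[i] = (int16_t) (temp / qval);
    }
  }
}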
diff --git a/trunk/simd/jsimd.h b/trunk/simd/jsimd.h
index 07ccfff..38d2e53 100644
--- a/trunk/simd/jsimd.h
+++ b/trunk/simd/jsimd.h
@@ -572,6 +572,10 @@ EXTERN(void) jsimd_quantize_sse2 JPP((JCOEFPTR coef_block,
DCTELEM * divisors,
DCTELEM * workspace));
+EXTERN(void) jsimd_quantize_neon JPP((JCOEFPTR coef_block,
+ DCTELEM * divisors,
+ DCTELEM * workspace));
+
EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
FAST_FLOAT * divisors,
FAST_FLOAT * workspace));
diff --git a/trunk/simd/jsimd_arm.c b/trunk/simd/jsimd_arm.c
index 05e0ca2..7d1aa54 100644
--- a/trunk/simd/jsimd_arm.c
+++ b/trunk/simd/jsimd_arm.c
@@ -479,6 +479,17 @@ jsimd_can_quantize (void)
{
init_simd();
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ARM_NEON)
+ return 1;
+
return 0;
}
@@ -494,6 +505,8 @@ GLOBAL(void)
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
DCTELEM * workspace)
{
+ if (simd_support & JSIMD_ARM_NEON)
+ jsimd_quantize_neon(coef_block, divisors, workspace);
}
GLOBAL(void)
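For reference, the can/do pair above is typically consulted once when the compressor sets up its forward-DCT manager: if jsimd_can_quantize() reports support, block quantization is routed through jsimd_quantize(), which now dispatches to the NEON routine. A hedged sketch of that selection is below; the typedef and the names select_quantize and quantize_c are hypothetical, the real hookup lives in the core library outside this diff, and the header names follow the libjpeg-turbo tree.

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"
#include "jsimd.h"

/* Illustrative only: choose between the SIMD entry point and a portable
 * fallback supplied by the caller. */
typedef void (*quantize_fn) (JCOEFPTR coef_block, DCTELEM * divisors,
                             DCTELEM * workspace);

static quantize_fn
select_quantize (quantize_fn quantize_c)
{
  if (jsimd_can_quantize())
    return jsimd_quantize;     /* NEON-accelerated path on capable CPUs */
  return quantize_c;           /* portable C fallback */
}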
diff --git a/trunk/simd/jsimd_arm_neon.S b/trunk/simd/jsimd_arm_neon.S
index c942352..0a72854 100644
--- a/trunk/simd/jsimd_arm_neon.S
+++ b/trunk/simd/jsimd_arm_neon.S
@@ -1406,3 +1406,102 @@ asm_function jsimd_fdct_ifast_neon
.endfunc
/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
+ * DCTELEM * workspace);
+ *
+ * Note: the code uses 2-stage pipelining in order to improve instruction
+ * scheduling and eliminate stalls (this provides ~15% better
+ * performance for this function on both ARM Cortex-A8 and
+ * ARM Cortex-A9 when compared to the non-pipelined variant).
+ * The instructions which belong to the second stage use different
+ * indentation for better readability.
+ */
+asm_function jsimd_quantize_neon
+
+ COEF_BLOCK .req r0
+ DIVISORS .req r1
+ WORKSPACE .req r2
+
+ RECIPROCAL .req DIVISORS
+ CORRECTION .req r3
+ SHIFT .req ip
+ LOOP_COUNT .req r4
+
+ vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
+ vabs.s16 q12, q0
+ add CORRECTION, DIVISORS, #(64 * 2)
+ add SHIFT, DIVISORS, #(64 * 6)
+ vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
+ vabs.s16 q13, q1
+ vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
+ vadd.u16 q12, q12, q10 /* add correction */
+ vadd.u16 q13, q13, q11
+ vmull.u16 q10, d24, d16 /* multiply by reciprocal */
+ vmull.u16 q11, d25, d17
+ vmull.u16 q8, d26, d18
+ vmull.u16 q9, d27, d19
+ vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
+ vshrn.u32 d20, q10, #16
+ vshrn.u32 d21, q11, #16
+ vshrn.u32 d22, q8, #16
+ vshrn.u32 d23, q9, #16
+ vneg.s16 q12, q12
+ vneg.s16 q13, q13
+ vshr.s16 q2, q0, #15 /* extract sign */
+ vshr.s16 q3, q1, #15
+ vshl.u16 q14, q10, q12 /* shift */
+ vshl.u16 q15, q11, q13
+
+ push {r4, r5}
+ mov LOOP_COUNT, #3
+1:
+ vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
+ veor.u16 q14, q14, q2 /* restore sign */
+ vabs.s16 q12, q0
+ vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
+ vabs.s16 q13, q1
+ veor.u16 q15, q15, q3
+ vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
+ vadd.u16 q12, q12, q10 /* add correction */
+ vadd.u16 q13, q13, q11
+ vmull.u16 q10, d24, d16 /* multiply by reciprocal */
+ vmull.u16 q11, d25, d17
+ vmull.u16 q8, d26, d18
+ vmull.u16 q9, d27, d19
+ vsub.u16 q14, q14, q2
+ vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
+ vsub.u16 q15, q15, q3
+ vshrn.u32 d20, q10, #16
+ vshrn.u32 d21, q11, #16
+ vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
+ vshrn.u32 d22, q8, #16
+ vshrn.u32 d23, q9, #16
+ vneg.s16 q12, q12
+ vneg.s16 q13, q13
+ vshr.s16 q2, q0, #15 /* extract sign */
+ vshr.s16 q3, q1, #15
+ vshl.u16 q14, q10, q12 /* shift */
+ vshl.u16 q15, q11, q13
+ subs LOOP_COUNT, LOOP_COUNT, #1
+ bne 1b
+ pop {r4, r5}
+
+ veor.u16 q14, q14, q2 /* restore sign */
+ veor.u16 q15, q15, q3
+ vsub.u16 q14, q14, q2
+ vsub.u16 q15, q15, q3
+ vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
+
+ bx lr /* return */
+
+ .unreq COEF_BLOCK
+ .unreq DIVISORS
+ .unreq WORKSPACE
+ .unreq RECIPROCAL
+ .unreq CORRECTION
+ .unreq SHIFT
+ .unreq LOOP_COUNT
+.endfunc
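Reading the assembly back into scalar terms: each coefficient is stripped of its sign, a rounding correction is added, the magnitude is multiplied by a 16-bit reciprocal keeping the top 16 bits of the product, a per-coefficient right shift finishes the scaling, and the sign is restored with an XOR/subtract. The three tables live inside the divisors block at the byte offsets used for RECIPROCAL, CORRECTION and SHIFT above. A minimal C sketch of that per-block computation, assuming 16-bit DCTELEM/JCOEF as enforced by jsimd_can_quantize() (the function and pointer names are illustrative):

#include <stdint.h>

/* Scalar sketch of what jsimd_quantize_neon computes for one 8x8 block.
 * DCTELEM and JCOEF are both 16-bit here, so int16_t/uint16_t stand in for
 * them.  The table pointers follow the address arithmetic in the assembly:
 * reciprocal at +0, correction at +64*2 bytes, shift at +64*6 bytes. */
static void quantize_neon_scalar_sketch(int16_t coef_block[64],
                                        const int16_t *divisors,
                                        const int16_t workspace[64])
{
  const uint16_t *recip = (const uint16_t *) divisors;
  const uint16_t *corr  = recip + 64;
  const uint16_t *shift = recip + 192;

  for (int i = 0; i < 64; i++) {
    int16_t  x    = workspace[i];
    int16_t  sign = (int16_t) (x >> 15);              /* 0 or all ones; an  */
                                                      /* arithmetic shift,  */
                                                      /* as in the NEON code */
    uint16_t a    = (uint16_t) ((x ^ sign) - sign);   /* |x|                */
    uint16_t q;

    a = (uint16_t) (a + corr[i]);                     /* add correction     */
    q = (uint16_t) (((uint32_t) a * recip[i]) >> 16); /* * reciprocal, >>16 */
    q = (uint16_t) (q >> shift[i]);                   /* final right shift  */
    coef_block[i] = (int16_t) (((int16_t) q ^ sign) - sign); /* restore sign */
  }
}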