author     Tom Gall <tom.gall@linaro.org>    2012-01-09 22:22:46 +0000
committer  Tom Gall <tom.gall@linaro.org>    2012-01-09 22:22:46 +0000
commit     4171e24e7e9451847571afc9977ec066d9b9f804 (patch)
tree       3385ef75d146170f5a5b561488562865f25c7363
parent     746c51b314d8f0d0e033fd62314fd458a7d7b133 (diff)
692: Make the ARM ISLOW iDCT faster for typical cases, and eliminate the possibility of 16-bit overflows when handling arbitrary coefficients.
693: Improve performance of the IFAST iDCT by changing the order of the transpose and descale steps.
694: Update Nokia contact info.
-rw-r--r--    simd/jsimd_arm_neon.S    370
1 file changed, 248 insertions, 122 deletions
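
How the ISLOW speed-up works: while the NEON pipeline runs pass 1 on the left 4x8 half of the coefficient block, the ARM integer pipeline ORs together the coefficients of the right 4x8 half (the ldrd/orr instructions interleaved below). If rows 1-7 of that half are all zero, which is the common case at typical JPEG quality settings, pass 1 for the right half is skipped; if row 0 is zero as well, a cheaper "sparse" second pass is taken too. The C below is only a sketch of that control flow; the function and variable names are invented for this note, and the real implementation is the NEON assembly in the diff.

/*
 * Illustrative sketch of the new ISLOW control flow (invented names,
 * not libjpeg-turbo code).  coef[] is the dequantized 8x8 block in
 * row-major order; pass 1 operates column-wise on 4x8 halves.
 */
#include <stdint.h>

#define PASS1_BITS 2

static void idct_islow_control_flow_sketch(int16_t coef[64])
{
    int16_t rows1to7_or = 0, row0_or = 0;
    int row, col;

    /* Pass 1 always runs on the left half (columns 0..3).  Interleaved
     * with it, the scalar pipeline ORs the right half (columns 4..7). */
    for (col = 4; col < 8; col++) {
        row0_or |= coef[0 * 8 + col];
        for (row = 1; row < 8; row++)
            rows1to7_or |= coef[row * 8 + col];
    }

    if (rows1to7_or != 0) {
        /* General case: run pass 1 on the right half as well, then the
         * normal second pass (the pre-existing code path). */
        return;
    }

    if (row0_or == 0) {
        /* The whole right half is zero: take the 'sparse' second pass
         * (label 4 in the assembly), which can ignore rows 4..7 of the
         * transposed data. */
        return;
    }

    /* Only row 0 of the right half is non-zero, so the column-wise
     * 1-D IDCT of that half is constant down each column.  Instead of
     * computing pass 1, replicate row 0 (scaled by PASS1_BITS) into the
     * remaining rows (the vdup block) and join the normal second pass
     * at label 1. */
    for (col = 4; col < 8; col++) {
        int16_t dc = (int16_t)(coef[col] << PASS1_BITS);
        for (row = 0; row < 8; row++)
            coef[row * 8 + col] = dc;
    }
}

The other two changes are visible in the descale blocks: the saturating unsigned narrow (vqrshrun.s16), which required a CENTERJSAMPLE bias to be folded into the 16-bit intermediates beforehand, becomes a signed narrow (vqrshrn.s16) followed by an 8-bit unsigned add of CENTERJSAMPLE; the old ISLOW path added a CENTERJSAMPLE << 5 bias with a plain vadd.s16, which is presumably where the 16-bit overflow mentioned in the commit message could occur. The IFAST routine adopts the same descale pattern and additionally moves the transpose after the descale, so the final transpose operates on 8-bit rather than 16-bit samples.
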
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index 9ef6efc..b2f9c2a 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -3,7 +3,7 @@
*
* Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
- * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -263,46 +263,74 @@ asm_function jsimd_idct_islow_neon
vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
vmlal.s16 q6, d5, XFIX_1_175875602
vmull.s16 q7, d4, XFIX_1_175875602
+ /* Check for the zero coefficients in the right 4x8 half */
+ push {r4, r5}
vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
vsubl.s16 q3, ROW0L, ROW4L
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
vmull.s16 q2, ROW2L, XFIX_0_541196100
vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+ orr r0, r4, r5
vmov q4, q6
vmlsl.s16 q6, ROW5L, XFIX_2_562915447
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
vshl.s32 q3, q3, #13
+ orr r0, r0, r4
vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ orr r0, r0, r5
vadd.s32 q1, q3, q2
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
vmov q5, q7
vadd.s32 q1, q1, q6
+ orr r0, r0, r4
vmlsl.s16 q7, ROW7L, XFIX_0_899976223
+ orr r0, r0, r5
vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
vrshrn.s32 ROW1L, q1, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
vsub.s32 q1, q1, q6
vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+ orr r0, r0, r4
vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ orr r0, r0, r5
vsub.s32 q1, q1, q6
vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
vmlal.s16 q6, ROW6L, XFIX_0_541196100
vsub.s32 q3, q3, q2
+ orr r0, r0, r4
vrshrn.s32 ROW6L, q1, #11
+ orr r0, r0, r5
vadd.s32 q1, q3, q5
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
vsub.s32 q3, q3, q5
vaddl.s16 q5, ROW0L, ROW4L
+ orr r0, r0, r4
vrshrn.s32 ROW2L, q1, #11
+ orr r0, r0, r5
vrshrn.s32 ROW5L, q3, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
vshl.s32 q5, q5, #13
vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+ orr r0, r0, r4
vadd.s32 q2, q5, q6
+ orrs r0, r0, r5
vsub.s32 q1, q5, q6
vadd.s32 q6, q2, q7
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
vsub.s32 q2, q2, q7
vadd.s32 q5, q1, q4
+ orr r0, r4, r5
vsub.s32 q3, q1, q4
+ pop {r4, r5}
vrshrn.s32 ROW7L, q2, #11
vrshrn.s32 ROW3L, q5, #11
vrshrn.s32 ROW0L, q6, #11
vrshrn.s32 ROW4L, q3, #11
+
+ beq 3f /* Go to do some special handling for the sparse right 4x8 half */
+
/* 1-D IDCT, pass 1, right 4x8 half */
vld1.s16 {d2}, [ip, :64] /* reload constants */
vadd.s16 d10, ROW7R, ROW3R
@@ -359,102 +387,101 @@ asm_function jsimd_idct_islow_neon
vrshrn.s32 ROW3R, q5, #11
vrshrn.s32 ROW0R, q6, #11
vrshrn.s32 ROW4R, q3, #11
+ /* Transpose right 4x8 half */
+ vtrn.16 ROW6R, ROW7R
+ vtrn.16 ROW2R, ROW3R
+ vtrn.16 ROW0R, ROW1R
+ vtrn.16 ROW4R, ROW5R
+ vtrn.32 ROW1R, ROW3R
+ vtrn.32 ROW4R, ROW6R
+ vtrn.32 ROW0R, ROW2R
+ vtrn.32 ROW5R, ROW7R
+
+1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
vld1.s16 {d2}, [ip, :64] /* reload constants */
- /* Transpose right 4x8 half */
- vtrn.16 ROW6R, ROW7R
- vtrn.16 ROW2R, ROW3R
- vtrn.16 ROW0R, ROW1R
- vtrn.16 ROW4R, ROW5R
- vmov.s16 q7, #(CENTERJSAMPLE << 5)
- vtrn.32 ROW1R, ROW3R
- vtrn.32 ROW4R, ROW6R
- vtrn.32 ROW0R, ROW2R
- vtrn.32 ROW5R, ROW7R
- /* 1-D IDCT, pass 2, left 4x8 half */
- vswp ROW7L, ROW3R
- vadd.s16 d10, ROW7L, ROW3L
- vswp ROW5L, ROW1R
- vadd.s16 d8, ROW5L, ROW1L
- vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, d8, XFIX_1_175875602
- vswp ROW4L, ROW0R
- vadd.s16 q8, q8, q7
- vmull.s16 q7, d10, XFIX_1_175875602
- vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
- vsubl.s16 q3, ROW0L, ROW4L
- vswp ROW6L, ROW2R
+ vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
vmull.s16 q2, ROW2L, XFIX_0_541196100
- vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+ vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
vmov q4, q6
- vmlsl.s16 q6, ROW5L, XFIX_2_562915447
+ vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
vshl.s32 q3, q3, #13
vmlsl.s16 q4, ROW1L, XFIX_0_899976223
vadd.s32 q1, q3, q2
vmov q5, q7
vadd.s32 q1, q1, q6
- vmlsl.s16 q7, ROW7L, XFIX_0_899976223
+ vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
vshrn.s32 ROW1L, q1, #16
vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+ vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
vmlsl.s16 q5, ROW3L, XFIX_2_562915447
vsub.s32 q1, q1, q6
vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- vmlal.s16 q6, ROW6L, XFIX_0_541196100
+ vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
vsub.s32 q3, q3, q2
- vshrn.s32 ROW6L, q1, #16
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
vadd.s32 q1, q3, q5
vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0L, ROW4L
+ vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
vshrn.s32 ROW2L, q1, #16
- vshrn.s32 ROW5L, q3, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+ vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
vadd.s32 q2, q5, q6
vsub.s32 q1, q5, q6
vadd.s32 q6, q2, q7
vsub.s32 q2, q2, q7
vadd.s32 q5, q1, q4
vsub.s32 q3, q1, q4
- vshrn.s32 ROW7L, q2, #16
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
vshrn.s32 ROW3L, q5, #16
vshrn.s32 ROW0L, q6, #16
- vshrn.s32 ROW4L, q3, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
/* 1-D IDCT, pass 2, right 4x8 half */
vld1.s16 {d2}, [ip, :64] /* reload constants */
- vadd.s16 d10, ROW7R, ROW3R
- vadd.s16 d8, ROW5R, ROW1R
- vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, d8, XFIX_1_175875602
- vmull.s16 q7, d10, XFIX_1_175875602
- vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
- vsubl.s16 q3, ROW0R, ROW4R
- vmull.s16 q2, ROW2R, XFIX_0_541196100
+ vmull.s16 q6, ROW5R, XFIX_1_175875602
+ vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmull.s16 q7, ROW7R, XFIX_1_175875602
+ vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
vmov q4, q6
vmlsl.s16 q6, ROW5R, XFIX_2_562915447
- vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW1R, XFIX_0_899976223
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
vadd.s32 q1, q3, q2
vmov q5, q7
vadd.s32 q1, q1, q6
vmlsl.s16 q7, ROW7R, XFIX_0_899976223
- vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
- vshrn.s32 ROW1R, q1, #16
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
vsub.s32 q1, q1, q6
vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
- vmlsl.s16 q5, ROW3R, XFIX_2_562915447
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
vmlal.s16 q6, ROW6R, XFIX_0_541196100
vsub.s32 q3, q3, q2
vshrn.s32 ROW6R, q1, #16
vadd.s32 q1, q3, q5
vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0R, ROW4R
- vshrn.s32 ROW2R, q1, #16
+ vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
vshrn.s32 ROW5R, q3, #16
vshl.s32 q5, q5, #13
vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
@@ -465,50 +492,157 @@ asm_function jsimd_idct_islow_neon
vadd.s32 q5, q1, q4
vsub.s32 q3, q1, q4
vshrn.s32 ROW7R, q2, #16
- vshrn.s32 ROW3R, q5, #16
- vshrn.s32 ROW0R, q6, #16
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
vshrn.s32 ROW4R, q3, #16
- /* Descale to 8-bit and range limit */
- vqrshrun.s16 d16, q8, #2
- vqrshrun.s16 d17, q9, #2
- vqrshrun.s16 d18, q10, #2
- vqrshrun.s16 d19, q11, #2
+
+2: /* Descale to 8-bit and range limit */
+ vqrshrn.s16 d16, q8, #2
+ vqrshrn.s16 d17, q9, #2
+ vqrshrn.s16 d18, q10, #2
+ vqrshrn.s16 d19, q11, #2
vpop {d8-d15} /* restore NEON registers */
- vqrshrun.s16 d20, q12, #2
- vqrshrun.s16 d21, q13, #2
- vqrshrun.s16 d22, q14, #2
- vqrshrun.s16 d23, q15, #2
- /* Transpose the final 8-bit samples */
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.8 d16, d17
- vtrn.8 d18, d19
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d16}, [TMP1]
- vst1.8 {d17}, [TMP2]
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d18}, [TMP1]
- vtrn.8 d20, d21
- vst1.8 {d19}, [TMP2]
- ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
- vst1.8 {d20}, [TMP1]
- vtrn.8 d22, d23
- vst1.8 {d21}, [TMP2]
- vst1.8 {d22}, [TMP3]
- vst1.8 {d23}, [TMP4]
+ vqrshrn.s16 d20, q12, #2
+ /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+ vtrn.16 q8, q9
+ vqrshrn.s16 d21, q13, #2
+ vqrshrn.s16 d22, q14, #2
+ vmov.u8 q0, #(CENTERJSAMPLE)
+ vqrshrn.s16 d23, q15, #2
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ vadd.u8 q8, q8, q0
+ vadd.u8 q9, q9, q0
+ vtrn.16 q10, q11
+ /* Store results to the output buffer */
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d16}, [TMP1]
+ vtrn.8 d20, d21
+ vst1.8 {d17}, [TMP2]
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d18}, [TMP1]
+ vadd.u8 q10, q10, q0
+ vst1.8 {d19}, [TMP2]
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ add TMP3, TMP3, OUTPUT_COL
+ add TMP4, TMP4, OUTPUT_COL
+ vtrn.8 d22, d23
+ vst1.8 {d20}, [TMP1]
+ vadd.u8 q11, q11, q0
+ vst1.8 {d21}, [TMP2]
+ vst1.8 {d22}, [TMP3]
+ vst1.8 {d23}, [TMP4]
bx lr
+3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+ /* Transpose left 4x8 half */
+ vtrn.16 ROW6L, ROW7L
+ vtrn.16 ROW2L, ROW3L
+ vtrn.16 ROW0L, ROW1L
+ vtrn.16 ROW4L, ROW5L
+ vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
+ vtrn.32 ROW1L, ROW3L
+ vtrn.32 ROW4L, ROW6L
+ vtrn.32 ROW0L, ROW2L
+ vtrn.32 ROW5L, ROW7L
+
+ cmp r0, #0
+ beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+
+ /* Only row 0 is non-zero for the right 4x8 half */
+ vdup.s16 ROW1R, ROW0R[1]
+ vdup.s16 ROW2R, ROW0R[2]
+ vdup.s16 ROW3R, ROW0R[3]
+ vdup.s16 ROW4R, ROW0R[0]
+ vdup.s16 ROW5R, ROW0R[1]
+ vdup.s16 ROW6R, ROW0R[2]
+ vdup.s16 ROW7R, ROW0R[3]
+ vdup.s16 ROW0R, ROW0R[0]
+ b 1b /* Go to 'normal' second pass */
+
+4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vshll.s16 q3, ROW0L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ vshrn.s32 ROW1L, q1, #16
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW0L, #13
+ vshrn.s32 ROW2L, q1, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW3L, q5, #16
+ vshrn.s32 ROW0L, q6, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+ /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW5L, XFIX_1_175875602
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW7L, XFIX_1_175875602
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW6L, XFIX_0_541196100
+ vshll.s16 q3, ROW4L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW6R, q1, #16
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW4L, #13
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+ vshrn.s32 ROW5R, q3, #16
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW7R, q2, #16
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW4R, q3, #16
+ b 2b /* Go to epilogue */
+
.unreq DCT_TABLE
.unreq COEF_BLOCK
.unreq OUTPUT_BUF
@@ -703,56 +837,48 @@ asm_function jsimd_idct_ifast_neon
vsub.s16 q13, q10, q2
vpop {d8-d13} /* restore NEON registers */
vadd.s16 q10, q10, q2
- /* Transpose */
- vtrn.16 q8, q9
vsub.s16 q11, q12, q1
- vtrn.16 q14, q15
vadd.s16 q12, q12, q1
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- /* Descale and range limit */
- vmov.s16 q0, #(0x80 << 5)
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q8, q10
- vtrn.32 q13, q15
- vswp d24, d17
- vswp d26, d19
- vqadd.s16 q8, q8, q0
- vswp d28, d21
- vqadd.s16 q9, q9, q0
- vswp d30, d23
- vqadd.s16 q10, q10, q0
- vqadd.s16 q11, q11, q0
+ /* Descale to 8-bit and range limit */
+ vmov.u8 q0, #0x80
+ vqshrn.s16 d16, q8, #5
+ vqshrn.s16 d17, q9, #5
+ vqshrn.s16 d18, q10, #5
+ vqshrn.s16 d19, q11, #5
+ vqshrn.s16 d20, q12, #5
+ vqshrn.s16 d21, q13, #5
+ vqshrn.s16 d22, q14, #5
+ vqshrn.s16 d23, q15, #5
+ vadd.u8 q8, q8, q0
+ vadd.u8 q9, q9, q0
+ vadd.u8 q10, q10, q0
+ vadd.u8 q11, q11, q0
+ /* Transpose the final 8-bit samples */
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
/* Store results to the output buffer */
ldmia OUTPUT_BUF!, {TMP1, TMP2}
add TMP1, TMP1, OUTPUT_COL
add TMP2, TMP2, OUTPUT_COL
- vqshrun.s16 d16, q8, #5
- vqshrun.s16 d17, q9, #5
- vqshrun.s16 d18, q10, #5
- vqshrun.s16 d19, q11, #5
vst1.8 {d16}, [TMP1]
- vqadd.s16 q12, q12, q0
- vqadd.s16 q13, q13, q0
vst1.8 {d17}, [TMP2]
- vqadd.s16 q14, q14, q0
- vqadd.s16 q15, q15, q0
ldmia OUTPUT_BUF!, {TMP1, TMP2}
add TMP1, TMP1, OUTPUT_COL
add TMP2, TMP2, OUTPUT_COL
vst1.8 {d18}, [TMP1]
- vqshrun.s16 d20, q12, #5
- vqshrun.s16 d21, q13, #5
+ vtrn.8 d20, d21
vst1.8 {d19}, [TMP2]
- vqshrun.s16 d22, q14, #5
ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
add TMP1, TMP1, OUTPUT_COL
add TMP2, TMP2, OUTPUT_COL
add TMP3, TMP3, OUTPUT_COL
add TMP4, TMP4, OUTPUT_COL
vst1.8 {d20}, [TMP1]
- vqshrun.s16 d23, q15, #5
+ vtrn.8 d22, d23
vst1.8 {d21}, [TMP2]
vst1.8 {d22}, [TMP3]
vst1.8 {d23}, [TMP4]