aboutsummaryrefslogtreecommitdiff
path: root/simd/jsimd_arm_neon.S
diff options
context:
space:
mode:
Diffstat (limited to 'simd/jsimd_arm_neon.S')
-rw-r--r--simd/jsimd_arm_neon.S205
1 files changed, 205 insertions, 0 deletions
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index 9ef6efc..3f1cf9e 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -62,6 +62,7 @@ _\fname:
vtrn.32 \x1, \x3
.endm
+<<<<<<< HEAD
#define CENTERJSAMPLE 128
/*****************************************************************************/
@@ -536,6 +537,8 @@ asm_function jsimd_idct_islow_neon
.unreq ROW7R
.endfunc
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
/*****************************************************************************/
/*
@@ -543,6 +546,7 @@ asm_function jsimd_idct_islow_neon
*
* This function contains a fast, not so accurate integer implementation of
* the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+<<<<<<< HEAD
* and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
* function from jidctfst.c
*
@@ -552,6 +556,12 @@ asm_function jsimd_idct_islow_neon
* like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
* which introduces an extra addition. Overall, there are 6 extra additions
* per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
+=======
+ * and produces exactly the same output as IJG's original 'jpeg_idct_fast'
+ * function from jidctfst.c
+ *
+ * TODO: a bit better instructions scheduling is needed.
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
*/
#define XFIX_1_082392200 d0[0]
@@ -566,12 +576,70 @@ jsimd_idct_ifast_neon_consts:
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
.short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+<<<<<<< HEAD
+=======
+/* 1-D IDCT helper macro */
+
+.macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \
+ t10, t11, t12, t13, t14
+
+ vsub.s16 \t10, \x0, \x4
+ vadd.s16 \x4, \x0, \x4
+ vswp.s16 \t10, \x0
+ vsub.s16 \t11, \x2, \x6
+ vadd.s16 \x6, \x2, \x6
+ vswp.s16 \t11, \x2
+ vsub.s16 \t10, \x3, \x5
+ vadd.s16 \x5, \x3, \x5
+ vswp.s16 \t10, \x3
+ vsub.s16 \t11, \x1, \x7
+ vadd.s16 \x7, \x1, \x7
+ vswp.s16 \t11, \x1
+
+ vqdmulh.s16 \t13, \x2, d0[1]
+ vadd.s16 \t12, \x3, \x3
+ vadd.s16 \x2, \x2, \t13
+ vqdmulh.s16 \t13, \x3, d0[3]
+ vsub.s16 \t10, \x1, \x3
+ vadd.s16 \t12, \t12, \t13
+ vqdmulh.s16 \t13, \t10, d0[2]
+ vsub.s16 \t11, \x7, \x5
+ vadd.s16 \t10, \t10, \t13
+ vqdmulh.s16 \t13, \t11, d0[1]
+ vadd.s16 \t11, \t11, \t13
+
+ vqdmulh.s16 \t13, \x1, d0[0]
+ vsub.s16 \x2, \x6, \x2
+ vsub.s16 \t14, \x0, \x2
+ vadd.s16 \x2, \x0, \x2
+ vadd.s16 \x0, \x4, \x6
+ vsub.s16 \x4, \x4, \x6
+ vadd.s16 \x1, \x1, \t13
+ vadd.s16 \t13, \x7, \x5
+ vsub.s16 \t12, \t13, \t12
+ vsub.s16 \t12, \t12, \t10
+ vadd.s16 \t11, \t12, \t11
+ vsub.s16 \t10, \x1, \t10
+ vadd.s16 \t10, \t10, \t11
+
+ vsub.s16 \x7, \x0, \t13
+ vadd.s16 \x0, \x0, \t13
+ vadd.s16 \x6, \t14, \t12
+ vsub.s16 \x1, \t14, \t12
+ vsub.s16 \x5, \x2, \t11
+ vadd.s16 \x2, \x2, \t11
+ vsub.s16 \x3, \x4, \t10
+ vadd.s16 \x4, \x4, \t10
+.endm
+
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
asm_function jsimd_idct_ifast_neon
DCT_TABLE .req r0
COEF_BLOCK .req r1
OUTPUT_BUF .req r2
OUTPUT_COL .req r3
+<<<<<<< HEAD
TMP1 .req r0
TMP2 .req r1
TMP3 .req r2
@@ -756,18 +824,117 @@ asm_function jsimd_idct_ifast_neon
vst1.8 {d21}, [TMP2]
vst1.8 {d22}, [TMP3]
vst1.8 {d23}, [TMP4]
+=======
+ TMP .req ip
+
+ vpush {d8-d15}
+
+ /* Load constants */
+ adr TMP, jsimd_idct_ifast_neon_consts
+ vld1.16 {d0}, [TMP, :64]
+
+ /* Load all COEF_BLOCK into NEON registers with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d4 | d5
+ * 1 | d6 | d7
+ * 2 | d8 | d9
+ * 3 | d10 | d11
+ * 4 | d12 | d13
+ * 5 | d14 | d15
+ * 6 | d16 | d17
+ * 7 | d18 | d19
+ */
+ vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]!
+ vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]!
+ vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]!
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]!
+ /* Dequantize */
+ vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!
+ vmul.s16 q2, q2, q10
+ vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]!
+ vmul.s16 q3, q3, q11
+ vmul.s16 q4, q4, q12
+ vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]!
+ vmul.s16 q5, q5, q13
+ vmul.s16 q6, q6, q14
+ vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!
+ vmul.s16 q7, q7, q15
+ vmul.s16 q8, q8, q10
+ vmul.s16 q9, q9, q11
+
+ /* Pass 1 */
+ idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
+ /* Transpose */
+ transpose_4x4 d4, d6, d8, d10
+ transpose_4x4 d5, d7, d9, d11
+ transpose_4x4 d12, d14, d16, d18
+ transpose_4x4 d13, d15, d17, d19
+ vswp d12, d5
+ vswp d14, d7
+ vswp d16, d9
+ vswp d18, d11
+
+ /* Pass 2 */
+ idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
+ /* Transpose */
+ transpose_4x4 d4, d6, d8, d10
+ transpose_4x4 d5, d7, d9, d11
+ transpose_4x4 d12, d14, d16, d18
+ transpose_4x4 d13, d15, d17, d19
+ vswp d12, d5
+ vswp d14, d7
+ vswp d16, d9
+ vswp d18, d11
+
+ /* Descale and range limit */
+ vmov.s16 q15, #(0x80 << 5)
+ vqadd.s16 q2, q2, q15
+ vqadd.s16 q3, q3, q15
+ vqadd.s16 q4, q4, q15
+ vqadd.s16 q5, q5, q15
+ vqadd.s16 q6, q6, q15
+ vqadd.s16 q7, q7, q15
+ vqadd.s16 q8, q8, q15
+ vqadd.s16 q9, q9, q15
+ vqshrun.s16 d4, q2, #5
+ vqshrun.s16 d6, q3, #5
+ vqshrun.s16 d8, q4, #5
+ vqshrun.s16 d10, q5, #5
+ vqshrun.s16 d12, q6, #5
+ vqshrun.s16 d14, q7, #5
+ vqshrun.s16 d16, q8, #5
+ vqshrun.s16 d18, q9, #5
+
+ /* Store results to the output buffer */
+ .irp x, d4, d6, d8, d10, d12, d14, d16, d18
+ ldr TMP, [OUTPUT_BUF], #4
+ add TMP, TMP, OUTPUT_COL
+ vst1.8 {\x}, [TMP]!
+ .endr
+
+ vpop {d8-d15}
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
bx lr
.unreq DCT_TABLE
.unreq COEF_BLOCK
.unreq OUTPUT_BUF
.unreq OUTPUT_COL
+<<<<<<< HEAD
.unreq TMP1
.unreq TMP2
.unreq TMP3
.unreq TMP4
.endfunc
+=======
+ .unreq TMP
+.endfunc
+
+.purgem idct_helper
+
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
/*****************************************************************************/
/*
@@ -1152,12 +1319,21 @@ asm_function jsimd_idct_2x2_neon
.macro do_load size
.if \size == 8
+<<<<<<< HEAD
vld1.8 {d4}, [U, :64]!
vld1.8 {d5}, [V, :64]!
vld1.8 {d0}, [Y, :64]!
pld [U, #64]
pld [V, #64]
pld [Y, #64]
+=======
+ vld1.8 {d4}, [U]!
+ vld1.8 {d5}, [V]!
+ vld1.8 {d0}, [Y]!
+ pld [Y, #64]
+ pld [U, #64]
+ pld [V, #64]
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
.elseif \size == 4
vld1.8 {d4[0]}, [U]!
vld1.8 {d4[1]}, [U]!
@@ -1227,11 +1403,15 @@ asm_function jsimd_idct_2x2_neon
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+<<<<<<< HEAD
/*
* 2 stage pipelined YCbCr->RGB conversion
*/
.macro do_yuv_to_rgb_stage1
+=======
+.macro do_yuv_to_rgb
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
@@ -1242,9 +1422,12 @@ asm_function jsimd_idct_2x2_neon
vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
+<<<<<<< HEAD
.endm
.macro do_yuv_to_rgb_stage2
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
vrshrn.s32 d20, q10, #15
vrshrn.s32 d21, q11, #15
vrshrn.s32 d24, q12, #14
@@ -1259,6 +1442,7 @@ asm_function jsimd_idct_2x2_neon
vqmovun.s16 d1\b_offs, q14
.endm
+<<<<<<< HEAD
.macro do_yuv_to_rgb_stage2_store_load_stage1
vld1.8 {d4}, [U, :64]!
vrshrn.s32 d20, q10, #15
@@ -1296,6 +1480,8 @@ asm_function jsimd_idct_2x2_neon
do_yuv_to_rgb_stage2
.endm
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
/* Apple gas crashes on adrl, work around that by using adr.
* But this requires a copy of these constants for each function.
*/
@@ -1356,6 +1542,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
/* Inner loop over pixels */
subs N, N, #8
+<<<<<<< HEAD
blt 3f
do_load 8
do_yuv_to_rgb_stage1
@@ -1371,6 +1558,18 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
tst N, #7
beq 8f
3:
+=======
+ blt 2f
+1:
+ do_load 8
+ do_yuv_to_rgb
+ do_store \bpp, 8
+ subs N, N, #8
+ bge 1b
+ tst N, #7
+ beq 8f
+2:
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
tst N, #4
beq 3f
do_load 4
@@ -1418,9 +1617,12 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
.endfunc
.purgem do_yuv_to_rgb
+<<<<<<< HEAD
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
.endm
@@ -1436,6 +1638,7 @@ generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
.purgem do_store
/*****************************************************************************/
+<<<<<<< HEAD
/*
* jsimd_extrgb_ycc_convert_neon
@@ -2031,3 +2234,5 @@ asm_function jsimd_quantize_neon
.unreq SHIFT
.unreq LOOP_COUNT
.endfunc
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8