diff options
Diffstat (limited to 'simd/jsimd_arm_neon.S')
-rw-r--r-- | simd/jsimd_arm_neon.S | 205 |
1 files changed, 205 insertions, 0 deletions
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S index 9ef6efc..3f1cf9e 100644 --- a/simd/jsimd_arm_neon.S +++ b/simd/jsimd_arm_neon.S @@ -62,6 +62,7 @@ _\fname: vtrn.32 \x1, \x3 .endm +<<<<<<< HEAD #define CENTERJSAMPLE 128 /*****************************************************************************/ @@ -536,6 +537,8 @@ asm_function jsimd_idct_islow_neon .unreq ROW7R .endfunc +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 /*****************************************************************************/ /* @@ -543,6 +546,7 @@ asm_function jsimd_idct_islow_neon * * This function contains a fast, not so accurate integer implementation of * the inverse DCT (Discrete Cosine Transform). It uses the same calculations +<<<<<<< HEAD * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' * function from jidctfst.c * @@ -552,6 +556,12 @@ asm_function jsimd_idct_islow_neon * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", * which introduces an extra addition. Overall, there are 6 extra additions * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. +======= + * and produces exactly the same output as IJG's original 'jpeg_idct_fast' + * function from jidctfst.c + * + * TODO: a bit better instructions scheduling is needed. +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 */ #define XFIX_1_082392200 d0[0] @@ -566,12 +576,70 @@ jsimd_idct_ifast_neon_consts: .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ +<<<<<<< HEAD +======= +/* 1-D IDCT helper macro */ + +.macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \ + t10, t11, t12, t13, t14 + + vsub.s16 \t10, \x0, \x4 + vadd.s16 \x4, \x0, \x4 + vswp.s16 \t10, \x0 + vsub.s16 \t11, \x2, \x6 + vadd.s16 \x6, \x2, \x6 + vswp.s16 \t11, \x2 + vsub.s16 \t10, \x3, \x5 + vadd.s16 \x5, \x3, \x5 + vswp.s16 \t10, \x3 + vsub.s16 \t11, \x1, \x7 + vadd.s16 \x7, \x1, \x7 + vswp.s16 \t11, \x1 + + vqdmulh.s16 \t13, \x2, d0[1] + vadd.s16 \t12, \x3, \x3 + vadd.s16 \x2, \x2, \t13 + vqdmulh.s16 \t13, \x3, d0[3] + vsub.s16 \t10, \x1, \x3 + vadd.s16 \t12, \t12, \t13 + vqdmulh.s16 \t13, \t10, d0[2] + vsub.s16 \t11, \x7, \x5 + vadd.s16 \t10, \t10, \t13 + vqdmulh.s16 \t13, \t11, d0[1] + vadd.s16 \t11, \t11, \t13 + + vqdmulh.s16 \t13, \x1, d0[0] + vsub.s16 \x2, \x6, \x2 + vsub.s16 \t14, \x0, \x2 + vadd.s16 \x2, \x0, \x2 + vadd.s16 \x0, \x4, \x6 + vsub.s16 \x4, \x4, \x6 + vadd.s16 \x1, \x1, \t13 + vadd.s16 \t13, \x7, \x5 + vsub.s16 \t12, \t13, \t12 + vsub.s16 \t12, \t12, \t10 + vadd.s16 \t11, \t12, \t11 + vsub.s16 \t10, \x1, \t10 + vadd.s16 \t10, \t10, \t11 + + vsub.s16 \x7, \x0, \t13 + vadd.s16 \x0, \x0, \t13 + vadd.s16 \x6, \t14, \t12 + vsub.s16 \x1, \t14, \t12 + vsub.s16 \x5, \x2, \t11 + vadd.s16 \x2, \x2, \t11 + vsub.s16 \x3, \x4, \t10 + vadd.s16 \x4, \x4, \t10 +.endm + +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 asm_function jsimd_idct_ifast_neon DCT_TABLE .req r0 COEF_BLOCK .req r1 OUTPUT_BUF .req r2 OUTPUT_COL .req r3 +<<<<<<< HEAD TMP1 .req r0 TMP2 .req r1 TMP3 .req r2 @@ -756,18 +824,117 @@ asm_function jsimd_idct_ifast_neon vst1.8 {d21}, [TMP2] vst1.8 {d22}, [TMP3] vst1.8 {d23}, [TMP4] +======= + TMP .req ip + + vpush {d8-d15} + + /* Load constants */ + adr TMP, jsimd_idct_ifast_neon_consts + vld1.16 {d0}, [TMP, :64] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | d8 | d9 + * 3 | d10 | d11 + * 4 | d12 | d13 + * 5 | d14 | d15 + * 6 | d16 | d17 + * 7 | d18 | d19 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]! + vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]! + vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]! + vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]! + /* Dequantize */ + vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! + vmul.s16 q2, q2, q10 + vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]! + vmul.s16 q3, q3, q11 + vmul.s16 q4, q4, q12 + vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]! + vmul.s16 q5, q5, q13 + vmul.s16 q6, q6, q14 + vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! + vmul.s16 q7, q7, q15 + vmul.s16 q8, q8, q10 + vmul.s16 q9, q9, q11 + + /* Pass 1 */ + idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 + /* Transpose */ + transpose_4x4 d4, d6, d8, d10 + transpose_4x4 d5, d7, d9, d11 + transpose_4x4 d12, d14, d16, d18 + transpose_4x4 d13, d15, d17, d19 + vswp d12, d5 + vswp d14, d7 + vswp d16, d9 + vswp d18, d11 + + /* Pass 2 */ + idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 + /* Transpose */ + transpose_4x4 d4, d6, d8, d10 + transpose_4x4 d5, d7, d9, d11 + transpose_4x4 d12, d14, d16, d18 + transpose_4x4 d13, d15, d17, d19 + vswp d12, d5 + vswp d14, d7 + vswp d16, d9 + vswp d18, d11 + + /* Descale and range limit */ + vmov.s16 q15, #(0x80 << 5) + vqadd.s16 q2, q2, q15 + vqadd.s16 q3, q3, q15 + vqadd.s16 q4, q4, q15 + vqadd.s16 q5, q5, q15 + vqadd.s16 q6, q6, q15 + vqadd.s16 q7, q7, q15 + vqadd.s16 q8, q8, q15 + vqadd.s16 q9, q9, q15 + vqshrun.s16 d4, q2, #5 + vqshrun.s16 d6, q3, #5 + vqshrun.s16 d8, q4, #5 + vqshrun.s16 d10, q5, #5 + vqshrun.s16 d12, q6, #5 + vqshrun.s16 d14, q7, #5 + vqshrun.s16 d16, q8, #5 + vqshrun.s16 d18, q9, #5 + + /* Store results to the output buffer */ + .irp x, d4, d6, d8, d10, d12, d14, d16, d18 + ldr TMP, [OUTPUT_BUF], #4 + add TMP, TMP, OUTPUT_COL + vst1.8 {\x}, [TMP]! + .endr + + vpop {d8-d15} +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 bx lr .unreq DCT_TABLE .unreq COEF_BLOCK .unreq OUTPUT_BUF .unreq OUTPUT_COL +<<<<<<< HEAD .unreq TMP1 .unreq TMP2 .unreq TMP3 .unreq TMP4 .endfunc +======= + .unreq TMP +.endfunc + +.purgem idct_helper + +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 /*****************************************************************************/ /* @@ -1152,12 +1319,21 @@ asm_function jsimd_idct_2x2_neon .macro do_load size .if \size == 8 +<<<<<<< HEAD vld1.8 {d4}, [U, :64]! vld1.8 {d5}, [V, :64]! vld1.8 {d0}, [Y, :64]! pld [U, #64] pld [V, #64] pld [Y, #64] +======= + vld1.8 {d4}, [U]! + vld1.8 {d5}, [V]! + vld1.8 {d0}, [Y]! + pld [Y, #64] + pld [U, #64] + pld [V, #64] +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 .elseif \size == 4 vld1.8 {d4[0]}, [U]! vld1.8 {d4[1]}, [U]! @@ -1227,11 +1403,15 @@ asm_function jsimd_idct_2x2_neon .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs +<<<<<<< HEAD /* * 2 stage pipelined YCbCr->RGB conversion */ .macro do_yuv_to_rgb_stage1 +======= +.macro do_yuv_to_rgb +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ @@ -1242,9 +1422,12 @@ asm_function jsimd_idct_2x2_neon vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ +<<<<<<< HEAD .endm .macro do_yuv_to_rgb_stage2 +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 vrshrn.s32 d20, q10, #15 vrshrn.s32 d21, q11, #15 vrshrn.s32 d24, q12, #14 @@ -1259,6 +1442,7 @@ asm_function jsimd_idct_2x2_neon vqmovun.s16 d1\b_offs, q14 .endm +<<<<<<< HEAD .macro do_yuv_to_rgb_stage2_store_load_stage1 vld1.8 {d4}, [U, :64]! vrshrn.s32 d20, q10, #15 @@ -1296,6 +1480,8 @@ asm_function jsimd_idct_2x2_neon do_yuv_to_rgb_stage2 .endm +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 /* Apple gas crashes on adrl, work around that by using adr. * But this requires a copy of these constants for each function. */ @@ -1356,6 +1542,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon /* Inner loop over pixels */ subs N, N, #8 +<<<<<<< HEAD blt 3f do_load 8 do_yuv_to_rgb_stage1 @@ -1371,6 +1558,18 @@ asm_function jsimd_ycc_\colorid\()_convert_neon tst N, #7 beq 8f 3: +======= + blt 2f +1: + do_load 8 + do_yuv_to_rgb + do_store \bpp, 8 + subs N, N, #8 + bge 1b + tst N, #7 + beq 8f +2: +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 tst N, #4 beq 3f do_load 4 @@ -1418,9 +1617,12 @@ asm_function jsimd_ycc_\colorid\()_convert_neon .endfunc .purgem do_yuv_to_rgb +<<<<<<< HEAD .purgem do_yuv_to_rgb_stage1 .purgem do_yuv_to_rgb_stage2 .purgem do_yuv_to_rgb_stage2_store_load_stage1 +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 .endm @@ -1436,6 +1638,7 @@ generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 .purgem do_store /*****************************************************************************/ +<<<<<<< HEAD /* * jsimd_extrgb_ycc_convert_neon @@ -2031,3 +2234,5 @@ asm_function jsimd_quantize_neon .unreq SHIFT .unreq LOOP_COUNT .endfunc +======= +>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8 |