1 files changed, 205 insertions, 0 deletions
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index 9ef6efc..3f1cf9e 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -62,6 +62,7 @@ _\fname:
     vtrn.32 \x1, \x3
 .endm
 
+<<<<<<< HEAD
 #define CENTERJSAMPLE 128
 
 /*****************************************************************************/
@@ -536,6 +537,8 @@ asm_function jsimd_idct_islow_neon
     .unreq          ROW7R
 .endfunc
 
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
 /*****************************************************************************/
 
 /*
@@ -543,6 +546,7 @@ asm_function jsimd_idct_islow_neon
  *
  * This function contains a fast, not so accurate integer implementation of
  * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+<<<<<<< HEAD
  * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
  * function from jidctfst.c
  *
@@ -552,6 +556,12 @@ asm_function jsimd_idct_islow_neon
  * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
  * which introduces an extra addition. Overall, there are 6 extra additions
  * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
+=======
+ * and produces exactly the same output as IJG's original 'jpeg_idct_fast'
+ * function from jidctfst.c
+ *
+ * TODO: a bit better instructions scheduling is needed.
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
  */
 
 #define XFIX_1_082392200 d0[0]
@@ -566,12 +576,70 @@ jsimd_idct_ifast_neon_consts:
     .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
     .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
 
+<<<<<<< HEAD
+=======
+/* 1-D IDCT helper macro */
+
+.macro idct_helper  x0, x1, x2, x3, x4, x5, x6, x7, \
+                    t10, t11, t12, t13, t14
+
+    vsub.s16        \t10, \x0, \x4
+    vadd.s16        \x4,  \x0, \x4
+    vswp.s16        \t10, \x0
+    vsub.s16        \t11, \x2, \x6
+    vadd.s16        \x6,  \x2, \x6
+    vswp.s16        \t11, \x2
+    vsub.s16        \t10, \x3, \x5
+    vadd.s16        \x5,  \x3, \x5
+    vswp.s16        \t10, \x3
+    vsub.s16        \t11, \x1, \x7
+    vadd.s16        \x7,  \x1, \x7
+    vswp.s16        \t11, \x1
+
+    vqdmulh.s16     \t13, \x2,  d0[1]
+    vadd.s16        \t12, \x3,  \x3
+    vadd.s16        \x2,  \x2,  \t13
+    vqdmulh.s16     \t13, \x3,  d0[3]
+    vsub.s16        \t10,  \x1, \x3
+    vadd.s16        \t12, \t12, \t13
+    vqdmulh.s16     \t13, \t10, d0[2]
+    vsub.s16        \t11, \x7,  \x5
+    vadd.s16        \t10, \t10, \t13
+    vqdmulh.s16     \t13, \t11, d0[1]
+    vadd.s16        \t11, \t11, \t13
+
+    vqdmulh.s16     \t13, \x1,  d0[0]
+    vsub.s16        \x2,  \x6,  \x2
+    vsub.s16        \t14, \x0,  \x2
+    vadd.s16        \x2,  \x0,  \x2
+    vadd.s16        \x0,  \x4,  \x6
+    vsub.s16        \x4,  \x4,  \x6
+    vadd.s16        \x1,  \x1,  \t13
+    vadd.s16        \t13, \x7,  \x5
+    vsub.s16        \t12, \t13, \t12
+    vsub.s16        \t12, \t12, \t10
+    vadd.s16        \t11, \t12, \t11
+    vsub.s16        \t10, \x1,  \t10
+    vadd.s16        \t10, \t10, \t11
+
+    vsub.s16        \x7,  \x0,  \t13
+    vadd.s16        \x0,  \x0,  \t13
+    vadd.s16        \x6,  \t14, \t12
+    vsub.s16        \x1,  \t14, \t12
+    vsub.s16        \x5,  \x2,  \t11
+    vadd.s16        \x2,  \x2,  \t11
+    vsub.s16        \x3,  \x4,  \t10
+    vadd.s16        \x4,  \x4,  \t10
+.endm
+
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
 asm_function jsimd_idct_ifast_neon
 
     DCT_TABLE       .req r0
     COEF_BLOCK      .req r1
     OUTPUT_BUF      .req r2
     OUTPUT_COL      .req r3
+<<<<<<< HEAD
     TMP1            .req r0
     TMP2            .req r1
     TMP3            .req r2
@@ -756,18 +824,117 @@ asm_function jsimd_idct_ifast_neon
       vst1.8          {d21}, [TMP2]
       vst1.8          {d22}, [TMP3]
       vst1.8          {d23}, [TMP4]
+=======
+    TMP             .req ip
+
+    vpush           {d8-d15}
+
+    /* Load constants */
+    adr             TMP, jsimd_idct_ifast_neon_consts
+    vld1.16         {d0}, [TMP, :64]
+
+    /* Load all COEF_BLOCK into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d4      | d5
+     *   1 | d6      | d7
+     *   2 | d8      | d9
+     *   3 | d10     | d11
+     *   4 | d12     | d13
+     *   5 | d14     | d15
+     *   6 | d16     | d17
+     *   7 | d18     | d19
+     */
+    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
+    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK]!
+    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK]!
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK]!
+    /* Dequantize */
+    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
+    vmul.s16        q2, q2, q10
+    vld1.16         {d24, d25, d26, d27}, [DCT_TABLE]!
+    vmul.s16        q3, q3, q11
+    vmul.s16        q4, q4, q12
+    vld1.16         {d28, d29, d30, d31}, [DCT_TABLE]!
+    vmul.s16        q5, q5, q13
+    vmul.s16        q6, q6, q14
+    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
+    vmul.s16        q7, q7, q15
+    vmul.s16        q8, q8, q10
+    vmul.s16        q9, q9, q11
+
+    /* Pass 1 */
+    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
+    /* Transpose */
+    transpose_4x4   d4,  d6,  d8,  d10
+    transpose_4x4   d5,  d7,  d9,  d11
+    transpose_4x4   d12, d14, d16, d18
+    transpose_4x4   d13, d15, d17, d19
+    vswp            d12, d5
+    vswp            d14, d7
+    vswp            d16, d9
+    vswp            d18, d11
+
+    /* Pass 2 */
+    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
+    /* Transpose */
+    transpose_4x4   d4,  d6,  d8,  d10
+    transpose_4x4   d5,  d7,  d9,  d11
+    transpose_4x4   d12, d14, d16, d18
+    transpose_4x4   d13, d15, d17, d19
+    vswp            d12, d5
+    vswp            d14, d7
+    vswp            d16, d9
+    vswp            d18, d11
+
+    /* Descale and range limit */
+    vmov.s16        q15, #(0x80 << 5)
+    vqadd.s16       q2, q2, q15
+    vqadd.s16       q3, q3, q15
+    vqadd.s16       q4, q4, q15
+    vqadd.s16       q5, q5, q15
+    vqadd.s16       q6, q6, q15
+    vqadd.s16       q7, q7, q15
+    vqadd.s16       q8, q8, q15
+    vqadd.s16       q9, q9, q15
+    vqshrun.s16     d4, q2, #5
+    vqshrun.s16     d6, q3, #5
+    vqshrun.s16     d8, q4, #5
+    vqshrun.s16     d10, q5, #5
+    vqshrun.s16     d12, q6, #5
+    vqshrun.s16     d14, q7, #5
+    vqshrun.s16     d16, q8, #5
+    vqshrun.s16     d18, q9, #5
+
+    /* Store results to the output buffer */
+    .irp            x, d4, d6, d8, d10, d12, d14, d16, d18
+    ldr             TMP, [OUTPUT_BUF], #4
+    add             TMP, TMP, OUTPUT_COL
+    vst1.8          {\x}, [TMP]!
+    .endr
+
+    vpop            {d8-d15}
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
     bx              lr
 
     .unreq          DCT_TABLE
     .unreq          COEF_BLOCK
     .unreq          OUTPUT_BUF
     .unreq          OUTPUT_COL
+<<<<<<< HEAD
     .unreq          TMP1
     .unreq          TMP2
     .unreq          TMP3
     .unreq          TMP4
 .endfunc
 
+=======
+    .unreq          TMP
+.endfunc
+
+.purgem idct_helper
+
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
 /*****************************************************************************/
 
 /*
@@ -1152,12 +1319,21 @@ asm_function jsimd_idct_2x2_neon
 
 .macro do_load size
     .if \size == 8
+<<<<<<< HEAD
         vld1.8  {d4}, [U, :64]!
         vld1.8  {d5}, [V, :64]!
         vld1.8  {d0}, [Y, :64]!
         pld     [U, #64]
         pld     [V, #64]
         pld     [Y, #64]
+=======
+        vld1.8  {d4}, [U]!
+        vld1.8  {d5}, [V]!
+        vld1.8  {d0}, [Y]!
+        pld     [Y, #64]
+        pld     [U, #64]
+        pld     [V, #64]
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
     .elseif \size == 4
         vld1.8  {d4[0]}, [U]!
         vld1.8  {d4[1]}, [U]!
@@ -1227,11 +1403,15 @@ asm_function jsimd_idct_2x2_neon
 
 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
 
+<<<<<<< HEAD
 /*
  * 2 stage pipelined YCbCr->RGB conversion
  */
 
 .macro do_yuv_to_rgb_stage1
+=======
+.macro do_yuv_to_rgb
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
     vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
     vaddw.u8        q4, q1, d5     /* q2 = v - 128 */
     vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
@@ -1242,9 +1422,12 @@ asm_function jsimd_idct_2x2_neon
     vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
     vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
     vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
+<<<<<<< HEAD
 .endm
 
 .macro do_yuv_to_rgb_stage2
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
     vrshrn.s32      d20, q10, #15
     vrshrn.s32      d21, q11, #15
     vrshrn.s32      d24, q12, #14
@@ -1259,6 +1442,7 @@ asm_function jsimd_idct_2x2_neon
     vqmovun.s16     d1\b_offs, q14
 .endm
 
+<<<<<<< HEAD
 .macro do_yuv_to_rgb_stage2_store_load_stage1
     vld1.8          {d4}, [U, :64]!
       vrshrn.s32      d20, q10, #15
@@ -1296,6 +1480,8 @@ asm_function jsimd_idct_2x2_neon
     do_yuv_to_rgb_stage2
 .endm
 
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
 /* Apple gas crashes on adrl, work around that by using adr.
  * But this requires a copy of these constants for each function.
  */
@@ -1356,6 +1542,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
 
     /* Inner loop over pixels */
     subs            N, N, #8
+<<<<<<< HEAD
     blt             3f
     do_load         8
     do_yuv_to_rgb_stage1
@@ -1371,6 +1558,18 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
     tst             N, #7
     beq             8f
 3:
+=======
+    blt             2f
+1:
+    do_load         8
+    do_yuv_to_rgb
+    do_store        \bpp, 8
+    subs            N, N, #8
+    bge             1b
+    tst             N, #7
+    beq             8f
+2:
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
     tst             N, #4
     beq             3f
     do_load         4
@@ -1418,9 +1617,12 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
 .endfunc
 
 .purgem do_yuv_to_rgb
+<<<<<<< HEAD
 .purgem do_yuv_to_rgb_stage1
 .purgem do_yuv_to_rgb_stage2
 .purgem do_yuv_to_rgb_stage2_store_load_stage1
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8
 
 .endm
 
@@ -1436,6 +1638,7 @@ generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
 .purgem do_store
 
 /*****************************************************************************/
+<<<<<<< HEAD
 
 /*
  * jsimd_extrgb_ycc_convert_neon
@@ -2031,3 +2234,5 @@ asm_function jsimd_quantize_neon
     .unreq          SHIFT
     .unreq          LOOP_COUNT
 .endfunc
+=======
+>>>>>>> 4a72099711359606b1fc10c1744057a6c568d5d8