about summary refs log tree commit diff
path: root/simd
diff options
context:
space:
mode:
author dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db> 2014-07-23 14:14:14 +0000
committer dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db> 2014-07-23 14:14:14 +0000
commit 1d3ec5424e6d4f30480718cdf8d37151ee340462 (patch)
tree 0ac6a19f7442f5822ba946fcda115caf9ab05ca9 /simd
parent da1ea25e29db58d4fe2a8e058cbbd91b87a4cc6c (diff)
Fix performance and other issues uncovered in testing with actual ARM64 hardware; formatting tweaks; remove NEON platform check (NEON is always available with ARMv8)
git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1333 632fc199-4ca6-4c93-a231-07263d6284db
Diffstat (limited to 'simd')
-rw-r--r-- simd/jsimd_arm64.c 83
-rw-r--r-- simd/jsimd_arm_neon_64.S 380
2 files changed, 193 insertions, 270 deletions
diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c
index 988023a..44225aa 100644
--- a/simd/jsimd_arm64.c
+++ b/simd/jsimd_arm64.c
@@ -27,98 +27,29 @@
static unsigned int simd_support = ~0;
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_feature (char *buffer, char *feature)
-{
- char *p;
- if (*feature == 0)
- return 0;
- if (strncmp(buffer, "Features", 8) != 0)
- return 0;
- buffer += 8;
- while (isspace(*buffer))
- buffer++;
-
- /* Check if 'feature' is present in the buffer as a separate word */
- while ((p = strstr(buffer, feature))) {
- if (p > buffer && !isspace(*(p - 1))) {
- buffer++;
- continue;
- }
- p += strlen(feature);
- if (*p != 0 && !isspace(*p)) {
- buffer++;
- continue;
- }
- return 1;
- }
- return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
- char *buffer = (char *)malloc(bufsize);
- FILE *fd;
- simd_support = 0;
-
- if (!buffer)
- return 0;
-
- fd = fopen("/proc/cpuinfo", "r");
- if (fd) {
- while (fgets(buffer, bufsize, fd)) {
- if (!strchr(buffer, '\n') && !feof(fd)) {
- /* "impossible" happened - insufficient size of the buffer! */
- fclose(fd);
- free(buffer);
- return 0;
- }
- if (check_feature(buffer, "neon"))
- simd_support |= JSIMD_ARM_NEON;
- }
- fclose(fd);
- }
- free(buffer);
- return 1;
-}
-
-#endif
-
/*
* Check what SIMD accelerations are supported.
*
* FIXME: This code is racy under a multi-threaded environment.
*/
+
+/*
+ * ARMv8 architectures support NEON extensions by default.
+ * It is no longer optional as it was with ARMv7.
+ */
+
+
LOCAL(void)
init_simd (void)
{
char *env = NULL;
-#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
- int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
if (simd_support != ~0U)
return;
simd_support = 0;
-#if defined(__ARM_NEON__)
simd_support |= JSIMD_ARM_NEON;
-#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
- /* We still have a chance to use NEON regardless of globally used
- * -mcpu/-mfpu options passed to gcc by performing runtime detection via
- * /proc/cpuinfo parsing on linux/android */
- while (!parse_proc_cpuinfo(bufsize)) {
- bufsize *= 2;
- if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
- break;
- }
-#endif
/* Force different settings through environment variables */
env = getenv("JSIMD_FORCENEON");
diff --git a/simd/jsimd_arm_neon_64.S b/simd/jsimd_arm_neon_64.S
index 8806abc..2c3989c 100644
--- a/simd/jsimd_arm_neon_64.S
+++ b/simd/jsimd_arm_neon_64.S
@@ -34,7 +34,6 @@
#define RESPECT_STRICT_ALIGNMENT 1
-#define RTSM_SQSHRN_SIM_ISSUE
/*****************************************************************************/
@@ -257,8 +256,18 @@ asm_function jsimd_idct_islow_neon
ROW6R .req v29
ROW7L .req v30
ROW7R .req v31
-
+ /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
+ sub sp, sp, 272
+ str x15, [sp], 16
adr x15, jsimd_idct_islow_neon_consts
+ st1 {v0.8b - v3.8b}, [sp], 32
+ st1 {v4.8b - v7.8b}, [sp], 32
+ st1 {v8.8b - v11.8b}, [sp], 32
+ st1 {v12.8b - v15.8b}, [sp], 32
+ st1 {v16.8b - v19.8b}, [sp], 32
+ st1 {v20.8b - v23.8b}, [sp], 32
+ st1 {v24.8b - v27.8b}, [sp], 32
+ st1 {v28.8b - v31.8b}, [sp], 32
ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
@@ -277,7 +286,7 @@ asm_function jsimd_idct_islow_neon
mul v22.4h, v22.4h, v6.4h
mul v23.4h, v23.4h, v7.4h
ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
- ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK], 32
+ ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
mul v24.4h, v24.4h, v0.4h
mul v25.4h, v25.4h, v1.4h
ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
@@ -293,80 +302,79 @@ asm_function jsimd_idct_islow_neon
mul v30.4h, v30.4h, v6.4h
mul v31.4h, v31.4h, v7.4h
ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
- sub sp, sp, #32
- st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
- sub sp, sp, #32
- st1 {v12.4h-v15.4h}, [sp]
+ /* Go to the bottom of the stack */
+ sub sp, sp, 352
+ stp x4, x5, [sp], 16
+ st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
+ st1 {v12.4h - v15.4h}, [sp], 32
/* 1-D IDCT, pass 1, left 4x8 half */
add v4.4h, ROW7L.4h, ROW3L.4h
add v5.4h, ROW5L.4h, ROW1L.4h
smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
smlal v12.4s, v5.4h, XFIX_1_175875602
smull v14.4s, v4.4h, XFIX_1_175875602
- /* Check for the zero coefficients in the right 4x8 half */
- /* push {x4, x5} */
- stp x4, x5, [sp, -16]!
- mov x5, #0
+ /* Check for the zero coefficients in the right 4x8 half */
smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
ssubl v6.4s, ROW0L.4h, ROW4L.4h
- ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
smull v4.4s, ROW2L.4h, XFIX_0_541196100
smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
- orr x0, x4, x5
+ orr x0, x4, x5
mov v8.16b, v12.16b
smlsl v12.4s, ROW5L.4h, XFIX_2_562915447
- ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
shl v6.4s, v6.4s, #13
- orr x0, x0, x4
+ orr x0, x0, x4
smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
- orr x0, x0 , x5
+ orr x0, x0 , x5
add v2.4s, v6.4s, v4.4s
- ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
mov v10.16b, v14.16b
add v2.4s, v2.4s, v12.4s
- orr x0, x0, x4
+ orr x0, x0, x4
smlsl v14.4s, ROW7L.4h, XFIX_0_899976223
- orr x0, x0, x5
+ orr x0, x0, x5
smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
rshrn ROW1L.4h, v2.4s, #11
- ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
sub v2.4s, v2.4s, v12.4s
smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
- orr x0, x0, x4
+ orr x0, x0, x4
smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
- orr x0, x0, x5
+ orr x0, x0, x5
sub v2.4s, v2.4s, v12.4s
smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
- ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
smlal v12.4s, ROW6L.4h, XFIX_0_541196100
sub v6.4s, v6.4s, v4.4s
- orr x0, x0, x4
+ orr x0, x0, x4
rshrn ROW6L.4h, v2.4s, #11
- orr x0, x0, x5
+ orr x0, x0, x5
add v2.4s, v6.4s, v10.4s
- ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
sub v6.4s, v6.4s, v10.4s
saddl v10.4s, ROW0L.4h, ROW4L.4h
- orr x0, x0, x4
+ orr x0, x0, x4
rshrn ROW2L.4h, v2.4s, #11
- orr x0, x0, x5
+ orr x0, x0, x5
rshrn ROW5L.4h, v6.4s, #11
- ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
shl v10.4s, v10.4s, #13
smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
- orr x0, x0, x4
+ orr x0, x0, x4
add v4.4s, v10.4s, v12.4s
- orr x0, x0, x5
+ orr x0, x0, x5
sub v2.4s, v10.4s, v12.4s
add v12.4s, v4.4s, v14.4s
- ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+ ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
sub v4.4s, v4.4s, v14.4s
add v10.4s, v2.4s, v8.4s
- orr x0, x4, x5
+ orr x0, x4, x5
sub v6.4s, v2.4s, v8.4s
/* pop {x4, x5} */
- ldp x4, x5, [sp], 16
+ sub sp, sp, 80
+ ldp x4, x5, [sp], 16
rshrn ROW7L.4h, v4.4s, #11
rshrn ROW3L.4h, v10.4s, #11
rshrn ROW0L.4h, v12.4s, #11
@@ -552,48 +560,27 @@ asm_function jsimd_idct_islow_neon
ins v18.2d[1], v19.2d[0]
ins v20.2d[1], v21.2d[0]
ins v22.2d[1], v23.2d[0]
-#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn v16.8b, v16.8h, #2
sqrshrn2 v16.16b, v18.8h, #2
sqrshrn v18.8b, v20.8h, #2
sqrshrn2 v18.16b, v22.8h, #2
-#else
- sqrshrn v16.4h, v16.4s, #2
- sqrshrn2 v16.8h, v18.4s, #2
- sqrshrn v18.4h, v20.4s, #2
- sqrshrn2 v18.8h, v22.4s, #2
-#endif
- /* vpop {v8.4h-d15.4h} */ /* restore NEON registers */
- ld1 {v12.4h-v15.4h}, [sp], 32
- ld1 {v8.4h-v11.4h}, [sp], 32
+ /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */
+ ld1 {v8.4h - v11.4h}, [sp], 32
+ ld1 {v12.4h - v15.4h}, [sp], 32
ins v24.2d[1], v25.2d[0]
-#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn v20.8b, v24.8h, #2
-#else
-
- sqrshrn v20.4h, v24.4s, #2
-#endif
/* Transpose the final 8-bit samples and do signed->unsigned conversion */
/* trn1 v16.8h, v16.8h, v18.8h */
transpose v16, v18, v3, .16b, .8h
ins v26.2d[1], v27.2d[0]
ins v28.2d[1], v29.2d[0]
ins v30.2d[1], v31.2d[0]
-#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn2 v20.16b, v26.8h, #2
sqrshrn v22.8b, v28.8h, #2
-#else
- sqrshrn2 v20.8h, v26.4s, #2
- sqrshrn v22.4h, v28.4s, #2
-#endif
movi v0.16b, #(CENTERJSAMPLE)
-#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn2 v22.16b, v30.8h, #2
-#else
- sqrshrn2 v22.8h, v30.4s, #2
-#endif
transpose_single v16, v17, v3, .2d, .8b
transpose_single v18, v19, v3, .2d, .8b
add v16.8b, v16.8b, v0.8b
@@ -628,6 +615,15 @@ asm_function jsimd_idct_islow_neon
st1 {v21.8b}, [TMP2]
st1 {v22.8b}, [TMP3]
st1 {v23.8b}, [TMP4]
+ ldr x15, [sp], 16
+ ld1 {v0.8b - v3.8b}, [sp], 32
+ ld1 {v4.8b - v7.8b}, [sp], 32
+ ld1 {v8.8b - v11.8b}, [sp], 32
+ ld1 {v12.8b - v15.8b}, [sp], 32
+ ld1 {v16.8b - v19.8b}, [sp], 32
+ ld1 {v20.8b - v23.8b}, [sp], 32
+ ld1 {v24.8b - v27.8b}, [sp], 32
+ ld1 {v28.8b - v31.8b}, [sp], 32
blr x30
3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
@@ -799,7 +795,8 @@ asm_function jsimd_idct_ifast_neon
TMP1 .req x0
TMP2 .req x1
TMP3 .req x2
- TMP4 .req x15
+ TMP4 .req x22
+ TMP5 .req x23
/* Load and dequantize coefficients into NEON registers
* with the following allocation:
@@ -814,7 +811,15 @@ asm_function jsimd_idct_ifast_neon
* 6 | d28 | d29 ( v14.8h )
* 7 | d30 | d31 ( v15.8h )
*/
- adr x15, jsimd_idct_ifast_neon_consts
+ /* Save NEON registers used in fast IDCT */
+ sub sp, sp, #176
+ stp x22, x23, [sp], 16
+ adr x23, jsimd_idct_ifast_neon_consts
+ st1 {v0.8b - v3.8b}, [sp], 32
+ st1 {v4.8b - v7.8b}, [sp], 32
+ st1 {v8.8b - v11.8b}, [sp], 32
+ st1 {v12.8b - v15.8b}, [sp], 32
+ st1 {v16.8b - v19.8b}, [sp], 32
ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
@@ -830,14 +835,9 @@ asm_function jsimd_idct_ifast_neon
ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
mul v14.8h, v14.8h, v2.8h
mul v13.8h, v13.8h, v1.8h
- ld1 {v0.4h}, [x15] /* load constants */
+ ld1 {v0.4h}, [x23] /* load constants */
mul v15.8h, v15.8h, v3.8h
- /* vpush {v4.8h-v6.8h} */ /* save NEON registers */
- sub sp, sp, #32
- st1 {v4.8h-v5.8h}, [sp] /* save NEON registers */
- sub sp, sp, #16
- st1 {v6.8h}, [sp]
/* 1-D IDCT, pass 1 */
sub v2.8h, v10.8h, v14.8h
add v14.8h, v10.8h, v14.8h
@@ -912,25 +912,25 @@ asm_function jsimd_idct_ifast_neon
trn1 v13.4s, v13.4s, v15.4s
trn2 v15.4s, v18.4s, v15.4s
/* vswp v14.4h, v10-MSB.4h */
- umov x10, v14.d[0]
+ umov x22, v14.d[0]
ins v14.2d[0], v10.2d[1]
- ins v10.2d[1], x10
+ ins v10.2d[1], x22
/* vswp v13.4h, v9MSB.4h */
- umov x10, v13.d[0]
+ umov x22, v13.d[0]
ins v13.2d[0], v9.2d[1]
- ins v9.2d[1], x10
+ ins v9.2d[1], x22
/* 1-D IDCT, pass 2 */
sub v2.8h, v10.8h, v14.8h
/* vswp v15.4h, v11MSB.4h */
- umov x10, v15.d[0]
+ umov x22, v15.d[0]
ins v15.2d[0], v11.2d[1]
- ins v11.2d[1], x10
+ ins v11.2d[1], x22
add v14.8h, v10.8h, v14.8h
/* vswp v12.4h, v8-MSB.4h */
- umov x10, v12.d[0]
+ umov x22, v12.d[0]
ins v12.2d[0], v8.2d[1]
- ins v8.2d[1], x10
+ ins v8.2d[1], x22
sub v1.8h, v11.8h, v13.8h
add v13.8h, v11.8h, v13.8h
sub v5.8h, v9.8h, v15.8h
@@ -966,15 +966,11 @@ asm_function jsimd_idct_ifast_neon
add v14.8h, v5.8h, v3.8h
sub v9.8h, v5.8h, v3.8h
sub v13.8h, v10.8h, v2.8h
- /* vpop {v4.8h-v7.4h} */ /* restore NEON registers...not available */
- ld1 {v6.8h}, [sp], 16
- ld1 {v4.8h-v5.8h}, [sp], 32
add v10.8h, v10.8h, v2.8h
sub v11.8h, v12.8h, v1.8h
add v12.8h, v12.8h, v1.8h
/* Descale to 8-bit and range limit */
movi v0.16b, #0x80
-#ifdef RTSM_SQSHRN_SIM_ISSUE
sqshrn v8.8b, v8.8h, #5
sqshrn2 v8.16b, v9.8h, #5
sqshrn v9.8b, v10.8h, #5
@@ -983,16 +979,6 @@ asm_function jsimd_idct_ifast_neon
sqshrn2 v10.16b, v13.8h, #5
sqshrn v11.8b, v14.8h, #5
sqshrn2 v11.16b, v15.8h, #5
-#else
- sqshrn v8.4h, v8.4s, #5
- sqshrn2 v8.8h, v9.4s, #5
- sqshrn v9.4h, v10.4s, #5
- sqshrn2 v9.8h, v11.4s, #5
- sqshrn v10.4h, v12.4s, #5
- sqshrn2 v10.8h, v13.4s, #5
- sqshrn v11.4h, v14.4s, #5
- sqshrn2 v11.8h, v15.4s, #5
-#endif
add v8.16b, v8.16b, v0.16b
add v9.16b, v9.16b, v0.16b
add v10.16b, v10.16b, v0.16b
@@ -1036,26 +1022,33 @@ asm_function jsimd_idct_ifast_neon
add TMP2, TMP2, OUTPUT_COL
st1 {v9.8b}, [TMP1]
/* make copy */
- ins v21.2d[0], v10.2d[1]
+ ins v7.2d[0], v10.2d[1]
mov v18.16b, v10.16b
- trn1 v10.8b, v10.8b, v21.8b
- trn2 v21.8b, v18.8b, v21.8b
+ trn1 v10.8b, v10.8b, v7.8b
+ trn2 v7.8b, v18.8b, v7.8b
st1 {v19.8b}, [TMP2]
ldp TMP1, TMP2, [OUTPUT_BUF], 16
- ldp TMP3, TMP4, [OUTPUT_BUF]
+ ldp TMP4, TMP5, [OUTPUT_BUF], 16
add TMP1, TMP1, OUTPUT_COL
add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
add TMP4, TMP4, OUTPUT_COL
+ add TMP5, TMP5, OUTPUT_COL
st1 {v10.8b}, [TMP1]
/* make copy */
- ins v23.2d[0], v11.2d[1]
+ ins v16.2d[0], v11.2d[1]
mov v18.16b, v11.16b
- trn1 v11.8b, v11.8b, v23.8b
- trn2 v23.8b, v18.8b, v23.8b
- st1 {v21.8b}, [TMP2]
- st1 {v11.8b}, [TMP3]
- st1 {v23.8b}, [TMP4]
+ trn1 v11.8b, v11.8b, v16.8b
+ trn2 v16.8b, v18.8b, v16.8b
+ st1 {v7.8b}, [TMP2]
+ st1 {v11.8b}, [TMP4]
+ st1 {v16.8b}, [TMP5]
+ sub sp, sp, #176
+ ldp x22, x23, [sp], 16
+ ld1 {v0.8b - v3.8b}, [sp], 32
+ ld1 {v4.8b - v7.8b}, [sp], 32
+ ld1 {v8.8b - v11.8b}, [sp], 32
+ ld1 {v12.8b - v15.8b}, [sp], 32
+ ld1 {v16.8b - v19.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
@@ -1179,14 +1172,19 @@ asm_function jsimd_idct_4x4_neon
TMP3 .req x2
TMP4 .req x15
- /* vpush {v8.4h-v15.4h} */
- sub sp, sp, #32
- st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
- sub sp, sp, #32
- st1 {v12.4h-v15.4h}, [sp]
-
+ /* Save all used NEON registers */
+ sub sp, sp, 272
+ str x15, [sp], 16
/* Load constants (v3.4h is just used for padding) */
adr TMP4, jsimd_idct_4x4_neon_consts
+ st1 {v0.8b - v3.8b}, [sp], 32
+ st1 {v4.8b - v7.8b}, [sp], 32
+ st1 {v8.8b - v11.8b}, [sp], 32
+ st1 {v12.8b - v15.8b}, [sp], 32
+ st1 {v16.8b - v19.8b}, [sp], 32
+ st1 {v20.8b - v23.8b}, [sp], 32
+ st1 {v24.8b - v27.8b}, [sp], 32
+ st1 {v28.8b - v31.8b}, [sp], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1290,10 +1288,17 @@ asm_function jsimd_idct_4x4_neon
st1 {v27.b}[7], [TMP4], 1
#endif
- /* vpop {v8.4h-v15.4h} ;not available */
- ld1 {v12.4h-v15.4h}, [sp], 32
- ld1 {v8.4h-v11.4h}, [sp], 32
-
+ /* vpop {v8.4h - v15.4h} ;not available */
+ sub sp, sp, #272
+ ldr x15, [sp], 16
+ ld1 {v0.8b - v3.8b}, [sp], 32
+ ld1 {v4.8b - v7.8b}, [sp], 32
+ ld1 {v8.8b - v11.8b}, [sp], 32
+ ld1 {v12.8b - v15.8b}, [sp], 32
+ ld1 {v16.8b - v19.8b}, [sp], 32
+ ld1 {v20.8b - v23.8b}, [sp], 32
+ ld1 {v24.8b - v27.8b}, [sp], 32
+ ld1 {v28.8b - v31.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
@@ -1333,23 +1338,23 @@ jsimd_idct_2x2_neon_consts:
.short FIX_3_624509785 /* d0[3] */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
- sshll v28.4s, \x4, #15
+ sshll v15.4s, \x4, #15
smull v26.4s, \x6, v0.4h[3]
smlal v26.4s, \x10, v0.4h[2]
smlal v26.4s, \x12, v0.4h[1]
smlal v26.4s, \x16, v0.4h[0]
- add v20.4s, v28.4s, v26.4s
- sub v28.4s, v28.4s, v26.4s
+ add v20.4s, v15.4s, v26.4s
+ sub v15.4s, v15.4s, v26.4s
.if \shift > 16
srshr v20.4s, v20.4s, #\shift
- srshr v28.4s, v28.4s, #\shift
+ srshr v15.4s, v15.4s, #\shift
xtn \y26, v20.4s
- xtn \y27, v28.4s
+ xtn \y27, v15.4s
.else
rshrn \y26, v20.4s, #\shift
- rshrn \y27, v28.4s, #\shift
+ rshrn \y27, v15.4s, #\shift
.endif
.endm
@@ -1363,15 +1368,20 @@ asm_function jsimd_idct_2x2_neon
TMP1 .req x0
TMP2 .req x15
- /* vpush {v8.4h-v15.4h} ; not available */
- sub sp, sp, #32
- st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
- sub sp, sp, #32
- st1 {v12.4h-v15.4h}, [sp]
+ /* vpush {v8.4h - v15.4h} ; not available */
+ sub sp, sp, 208
+ str x15, [sp], 16
/* Load constants */
adr TMP2, jsimd_idct_2x2_neon_consts
- ld1 {v0.4h}, [TMP2]
+ st1 {v4.8b - v7.8b}, [sp], 32
+ st1 {v8.8b - v11.8b}, [sp], 32
+ st1 {v12.8b - v15.8b}, [sp], 32
+ st1 {v16.8b - v19.8b}, [sp], 32
+ st1 {v21.8b - v22.8b}, [sp], 16
+ st1 {v24.8b - v27.8b}, [sp], 32
+ st1 {v30.8b - v31.8b}, [sp], 16
+ ld1 {v14.4h}, [TMP2]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
* 0 1 2 3 | 4 5 6 7
@@ -1423,24 +1433,24 @@ asm_function jsimd_idct_2x2_neon
idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
- smull v26.4s, v6.4h, v0.4h[3]
- smlal v26.4s, v10.4h, v0.4h[2]
- smlal v26.4s, v12.4h, v0.4h[1]
- smlal v26.4s, v16.4h, v0.4h[0]
- smull v24.4s, v7.4h, v0.4h[3]
- smlal v24.4s, v11.4h, v0.4h[2]
- smlal v24.4s, v13.4h, v0.4h[1]
- smlal v24.4s, v17.4h, v0.4h[0]
- sshll v28.4s, v4.4h, #15
+ smull v26.4s, v6.4h, v14.4h[3]
+ smlal v26.4s, v10.4h, v14.4h[2]
+ smlal v26.4s, v12.4h, v14.4h[1]
+ smlal v26.4s, v16.4h, v14.4h[0]
+ smull v24.4s, v7.4h, v14.4h[3]
+ smlal v24.4s, v11.4h, v14.4h[2]
+ smlal v24.4s, v13.4h, v14.4h[1]
+ smlal v24.4s, v17.4h, v14.4h[0]
+ sshll v15.4s, v4.4h, #15
sshll v30.4s, v5.4h, #15
- add v20.4s, v28.4s, v26.4s
- sub v28.4s, v28.4s, v26.4s
+ add v20.4s, v15.4s, v26.4s
+ sub v15.4s, v15.4s, v26.4s
rshrn v4.4h, v20.4s, #13
- rshrn v6.4h, v28.4s, #13
+ rshrn v6.4h, v15.4s, #13
add v20.4s, v30.4s, v24.4s
- sub v28.4s, v30.4s, v24.4s
+ sub v15.4s, v30.4s, v24.4s
rshrn v5.4h, v20.4s, #13
- rshrn v7.4h, v28.4s, #13
+ rshrn v7.4h, v15.4s, #13
transpose v4, v6, v3, .16b, .8h
transpose v6, v10, v3, .16b, .4s
#endif
@@ -1466,11 +1476,15 @@ asm_function jsimd_idct_2x2_neon
st1 {v26.b}[1], [TMP2], 1
st1 {v27.b}[5], [TMP2], 1
- /* vpop {v8.4h-v15.4h} ;not available */
-
- ld1 {v12.4h-v15.4h}, [sp], 32
- ld1 {v8.4h-v11.4h}, [sp], 32
-
+ sub sp, sp, #208
+ ldr x15, [sp], 16
+ ld1 {v4.8b - v7.8b}, [sp], 32
+ ld1 {v8.8b - v11.8b}, [sp], 32
+ ld1 {v12.8b - v15.8b}, [sp], 32
+ ld1 {v16.8b - v19.8b}, [sp], 32
+ ld1 {v21.8b - v22.8b}, [sp], 16
+ ld1 {v24.8b - v27.8b}, [sp], 32
+ ld1 {v30.8b - v31.8b}, [sp], 16
blr x30
.unreq DCT_TABLE
@@ -1572,13 +1586,11 @@ asm_function jsimd_idct_2x2_neon
.error unsupported bpp
.endif
.endm
-#ifdef RTSM_SQSHRN_SIM_ISSUE
+
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
-#else
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize
-#endif
+
/*
- * 2 stage pipelined YCbCr->RGB conversion
+ * 2-stage pipelined YCbCr->RGB conversion
*/
.macro do_yuv_to_rgb_stage1
@@ -1604,16 +1616,10 @@ asm_function jsimd_idct_2x2_neon
uaddw v20.8h, v20.8h, v0.8b
uaddw v24.8h, v24.8h, v0.8b
uaddw v28.8h, v28.8h, v0.8b
-#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\g_offs\defsize, v20.8h
sqxtun v1\r_offs\defsize, v24.8h
sqxtun v1\b_offs\defsize, v28.8h
-#else
- sqxtun v1\g_offs\gsize, v20.4s
- sqxtun v1\r_offs\rsize, v24.4s
- sqxtun v1\b_offs\bsize, v28.4s
-#endif
.endm
.macro do_yuv_to_rgb_stage2_store_load_stage1
@@ -1628,25 +1634,13 @@ asm_function jsimd_idct_2x2_neon
uaddw v20.8h, v20.8h, v0.8b
uaddw v24.8h, v24.8h, v0.8b
uaddw v28.8h, v28.8h, v0.8b
-#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\g_offs\defsize, v20.8h
-#else
- sqxtun v1\g_offs\gsize, v20.4s
-#endif
ld1 {v0.8b}, [Y], 8
-#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\r_offs\defsize, v24.8h
-#else
- sqxtun v1\r_offs\rsize, v24.4s
-#endif
prfm PLDL1KEEP, [U, #64]
prfm PLDL1KEEP, [V, #64]
prfm PLDL1KEEP, [Y, #64]
-#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\b_offs\defsize, v28.8h
-#else
- sqxtun v1\b_offs\gsize, v28.4s
-#endif
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
do_store \bpp, 8
@@ -1693,29 +1687,33 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
V .req x10
N .req x15
+ sub sp, sp, 336
+ str x15, [sp], 16
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
adr x15, jsimd_ycc_\colorid\()_neon_consts
+ /* Save NEON registers */
+ st1 {v0.8b - v3.8b}, [sp], 32
+ st1 {v4.8b - v7.8b}, [sp], 32
+ st1 {v8.8b - v11.8b}, [sp], 32
+ st1 {v12.8b - v15.8b}, [sp], 32
+ st1 {v16.8b - v19.8b}, [sp], 32
+ st1 {v20.8b - v23.8b}, [sp], 32
+ st1 {v24.8b - v27.8b}, [sp], 32
+ st1 {v28.8b - v31.8b}, [sp], 32
ld1 {v0.4h, v1.4h}, [x15], 16
ld1 {v2.8h}, [x15]
/* Save ARM registers and handle input arguments */
/* push {x4, x5, x6, x7, x8, x9, x10, x30} */
- stp x4, x5, [sp,-16]!
- stp x6, x7, [sp,-16]!
- stp x8, x9, [sp,-16]!
- stp x10, x30, [sp,-16]!
+ stp x4, x5, [sp], 16
+ stp x6, x7, [sp], 16
+ stp x8, x9, [sp], 16
+ stp x10, x30, [sp], 16
ldr INPUT_BUF0, [INPUT_BUF]
ldr INPUT_BUF1, [INPUT_BUF, 8]
ldr INPUT_BUF2, [INPUT_BUF, 16]
.unreq INPUT_BUF
- /* Save NEON registers */
- /* vpush {v8.4h-v15.4h} */
- sub sp, sp, #32
- st1 {v8.4h-v11.4h}, [sp]
- sub sp, sp, #32
- st1 {v12.4h-v15.4h}, [sp]
-
/* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
movi v10.16b, #255
movi v12.16b, #255
@@ -1778,14 +1776,21 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
bgt 0b
9:
/* Restore all registers and return */
- /* vpop {v8.4h-v15.4h} */
- ld1 {v12.4h-v15.4h}, [sp], #32
- ld1 {v8.4h-v11.4h}, [sp], #32
+ sub sp, sp, #336
+ ldr x15, [sp], 16
+ ld1 {v0.8b - v3.8b}, [sp], 32
+ ld1 {v4.8b - v7.8b}, [sp], 32
+ ld1 {v8.8b - v11.8b}, [sp], 32
+ ld1 {v12.8b - v15.8b}, [sp], 32
+ ld1 {v16.8b - v19.8b}, [sp], 32
+ ld1 {v20.8b - v23.8b}, [sp], 32
+ ld1 {v24.8b - v27.8b}, [sp], 32
+ ld1 {v28.8b - v31.8b}, [sp], 32
/* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
- ldp x10, x30, [sp], #16
- ldp x8, x9, [sp], #16
- ldp x6, x5, [sp], #16
- ldp x4, x5, [sp], #16
+ ldp x4, x5, [sp], 16
+ ldp x6, x7, [sp], 16
+ ldp x8, x9, [sp], 16
+ ldp x10, x30, [sp], 16
br x30
.unreq OUTPUT_WIDTH
.unreq INPUT_ROW
@@ -1807,10 +1812,6 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
.purgem do_yuv_to_rgb_stage2_store_load_stage1
.endm
-/* RTSM simulator fix integer saturation works on 8b boundry add a new parameter
- * as a workaround for the simulator fix
- */
-#ifdef RTSM_SQSHRN_SIM_ISSUE
/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b
@@ -1818,15 +1819,6 @@ generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b
-#else
-/*--------------------------------- id ----- bpp R rsize G gsize B bsize */
-generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h
-generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h
-generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h
-generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h
-generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h
-generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h
-#endif
.purgem do_load
.purgem do_store