summaryrefslogtreecommitdiff
path: root/trunk/simd
diff options
context:
space:
mode:
authordcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519>2011-05-03 08:47:43 +0000
committerdcommander <dcommander@3789f03b-4d11-0410-bbf8-ca57d06f2519>2011-05-03 08:47:43 +0000
commit7201ac1ca6add98eb12422d8672c855b63c7668b (patch)
tree563e4881a8d9d702482be3769cf1959fb747b016 /trunk/simd
parentf94f43ecb0ed2ecf68bd4a88da6399c88c5d0aae (diff)
ARM NEON support
git-svn-id: https://libjpeg-turbo.svn.sourceforge.net/svnroot/libjpeg-turbo@607 3789f03b-4d11-0410-bbf8-ca57d06f2519
Diffstat (limited to 'trunk/simd')
-rw-r--r--trunk/simd/Makefile.am6
-rw-r--r--trunk/simd/jsimd.h45
-rw-r--r--trunk/simd/jsimd_arm.c524
-rw-r--r--trunk/simd/jsimd_arm_neon.S484
4 files changed, 1054 insertions, 5 deletions
diff --git a/trunk/simd/Makefile.am b/trunk/simd/Makefile.am
index 959e82b..fbba807 100644
--- a/trunk/simd/Makefile.am
+++ b/trunk/simd/Makefile.am
@@ -52,6 +52,12 @@ jdmermmx.lo: jdmrgmmx.asm
jdmerss2.lo: jdmrgss2.asm
endif
+if SIMD_ARM
+
+libsimd_la_SOURCES = jsimd_arm.c jsimd_arm_neon.S
+
+endif
+
AM_CPPFLAGS = -I$(top_srcdir)
.asm.lo:
diff --git a/trunk/simd/jsimd.h b/trunk/simd/jsimd.h
index 60ae0e0..3b801f4 100644
--- a/trunk/simd/jsimd.h
+++ b/trunk/simd/jsimd.h
@@ -12,11 +12,12 @@
/* Bitmask for supported acceleration methods */
-#define JSIMD_NONE 0x00
-#define JSIMD_MMX 0x01
-#define JSIMD_3DNOW 0x02
-#define JSIMD_SSE 0x04
-#define JSIMD_SSE2 0x08
+#define JSIMD_NONE 0x00
+#define JSIMD_MMX 0x01
+#define JSIMD_3DNOW 0x02
+#define JSIMD_SSE 0x04
+#define JSIMD_SSE2 0x08
+#define JSIMD_ARM_NEON 0x10
/* Short forms of external names for systems with brain-damaged linkers. */
@@ -327,6 +328,35 @@ EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
+/*
+ * ARM NEON YCbCr -> RGB colorspace converters, one per output pixel layout.
+ * All share the same argument list as the SSE2 variants declared above.
+ * NOTE(review): jsimd_arm_neon.S in this commit only instantiates the six
+ * ext* variants; no definition of jsimd_ycc_rgb_convert_neon is visible
+ * there, so referencing that symbol would presumably fail at link time.
+ */
+EXTERN(void) jsimd_ycc_rgb_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_neon
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+
/* SIMD Downsample */
EXTERN(void) jsimd_h2v2_downsample_mmx
JPP((JDIMENSION image_width, int max_v_samp_factor,
@@ -560,6 +590,11 @@ EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table,
JSAMPARRAY output_buf,
JDIMENSION output_col));
+/* ARM NEON fast integer inverse DCT; defined in jsimd_arm_neon.S. */
+EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
JCOEFPTR coef_block,
JSAMPARRAY output_buf,
diff --git a/trunk/simd/jsimd_arm.c b/trunk/simd/jsimd_arm.c
new file mode 100644
index 0000000..b70b94e
--- /dev/null
+++ b/trunk/simd/jsimd_arm.c
@@ -0,0 +1,524 @@
+/*
+ * jsimd_arm.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009-2011 D. R. Commander
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on
+ * ARM architecture.
+ *
+ * Based on the stubs from 'jsimd_none.c'
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+/* Bitmask of JSIMD_* capabilities. The all-ones initial value means "not
+ * probed yet"; init_simd() replaces it on its first call. */
+static unsigned int simd_support = ~0;
+
+#ifdef __linux__
+
+/* Largest line buffer (in bytes) init_simd() will try before it gives up
+ * reading /proc/cpuinfo. */
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+/*
+ * Test whether one line of /proc/cpuinfo advertises a given CPU feature.
+ *
+ * buffer:  a NUL-terminated line; only lines that begin with "Features"
+ *          are examined.
+ * feature: the feature name to look for (an empty name never matches).
+ *
+ * Returns 1 if 'feature' occurs after the "Features" tag as a complete,
+ * whitespace-delimited word, otherwise 0.
+ */
+LOCAL(int)
+check_feature (char *buffer, char *feature)
+{
+  char *first, *hit;
+  size_t len = strlen(feature);
+
+  if (len == 0)
+    return 0;
+  if (strncmp(buffer, "Features", 8) != 0)
+    return 0;
+  buffer += 8;
+  /* <ctype.h> functions need an unsigned char value, not a plain char */
+  while (isspace((unsigned char)*buffer))
+    buffer++;
+  first = buffer;
+
+  /*
+   * Walk over every occurrence of 'feature'.  The word-start test is always
+   * made against the fixed start of the list ('first'); the search resumes
+   * just past a rejected hit, so a hit embedded in a longer word (for
+   * example "xneon" when looking for "neon") can never be accepted.
+   */
+  for (hit = strstr(first, feature); hit != NULL;
+       hit = strstr(hit + 1, feature)) {
+    int starts_word = (hit == first) || isspace((unsigned char)hit[-1]);
+    int ends_word = (hit[len] == 0) || isspace((unsigned char)hit[len]);
+
+    if (starts_word && ends_word)
+      return 1;
+  }
+  return 0;
+}
+
+/*
+ * Scan /proc/cpuinfo line by line using a buffer of 'bufsize' bytes and
+ * record the detected features in simd_support (which is reset to 0 first).
+ *
+ * Returns 0 when the buffer could not be allocated or a line did not fit in
+ * it, so that the caller may retry with a larger size; returns 1 otherwise,
+ * including when /proc/cpuinfo cannot be opened.
+ */
+LOCAL(int)
+parse_proc_cpuinfo (int bufsize)
+{
+  int complete = 1;
+  FILE *cpuinfo;
+  char *line = (char *)malloc(bufsize);
+
+  simd_support = 0;
+  if (line == NULL)
+    return 0;
+
+  cpuinfo = fopen("/proc/cpuinfo", "r");
+  if (cpuinfo != NULL) {
+    while (fgets(line, bufsize, cpuinfo) != NULL) {
+      /* No newline and not at end of file: the line was truncated */
+      if (strchr(line, '\n') == NULL && !feof(cpuinfo)) {
+        complete = 0;
+        break;
+      }
+      if (check_feature(line, "neon"))
+        simd_support |= JSIMD_ARM_NEON;
+    }
+    fclose(cpuinfo);
+  }
+
+  free(line);
+  return complete;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * Runs the detection only once: simd_support holds ~0 until the first call.
+ * On Linux, /proc/cpuinfo is parsed with a line buffer that is doubled until
+ * every line fits or the size limit is exceeded.  Two environment variables
+ * override the result when set to "1":
+ *   JSIMD_FORCE_ARM_NEON  enable NEON regardless of what was detected
+ *   JSIMD_FORCE_NO_SIMD   disable all SIMD code (takes precedence)
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+  char *env = NULL;
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+
+  if (simd_support != ~0)
+    return;
+
+  simd_support = 0;
+
+#ifdef __linux__
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
+
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCE_ARM_NEON");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    /* Assign rather than AND: masking could only ever clear the bit, which
+     * made this override a no-op (notably on non-Linux builds, where no
+     * detection runs at all). */
+    simd_support = JSIMD_ARM_NEON;
+  env = getenv("JSIMD_FORCE_NO_SIMD");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+}
+
+/* Always 0: no accelerated rgb_ycc routine here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Always 0: no accelerated rgb_gray routine here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/*
+ * Report whether the NEON YCbCr -> RGB converter may be used.
+ * Returns 1 only if the library was built with the sample and pixel sizes
+ * the assembly code expects and NEON support was detected; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* Build-time configuration the NEON code has been written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  return (simd_support & JSIMD_ARM_NEON) ? 1 : 0;
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_rgb_ycc() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_rgb_gray() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{
+}
+
+/*
+ * Convert num_rows rows of YCbCr samples to the output pixel layout chosen
+ * by cinfo->out_color_space, using the matching NEON routine.  Any color
+ * space without a dedicated case is handled by the extrgb routine.
+ * Does nothing when NEON support is not flagged in simd_support.
+ * NOTE(review): the extrgb fallback writes 3 bytes per pixel; confirm this
+ * is right for plain JCS_RGB when RGB_PIXELSIZE is 4.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  typedef void (*convert_fn)(JDIMENSION, JSAMPIMAGE, JDIMENSION,
+                             JSAMPARRAY, int);
+  /* Start from the fallback and override it for the other layouts */
+  convert_fn convert = jsimd_ycc_extrgb_convert_neon;
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGBX:
+      convert = jsimd_ycc_extrgbx_convert_neon;
+      break;
+    case JCS_EXT_BGR:
+      convert = jsimd_ycc_extbgr_convert_neon;
+      break;
+    case JCS_EXT_BGRX:
+      convert = jsimd_ycc_extbgrx_convert_neon;
+      break;
+    case JCS_EXT_XBGR:
+      convert = jsimd_ycc_extxbgr_convert_neon;
+      break;
+    case JCS_EXT_XRGB:
+      convert = jsimd_ycc_extxrgb_convert_neon;
+      break;
+    default:
+      /* JCS_EXT_RGB and every other value keep the extrgb routine */
+      break;
+  }
+
+  if (!(simd_support & JSIMD_ARM_NEON))
+    return;
+
+  convert(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+/* Always 0: no accelerated h2v2_downsample here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Always 0: no accelerated h2v1_downsample here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_h2v2_downsample() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_h2v1_downsample() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+/* Always 0: no accelerated h2v2_upsample here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Always 0: no accelerated h2v1_upsample here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_h2v2_upsample() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_h2v1_upsample() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+}
+
+/* Always 0: no accelerated h2v2_fancy_upsample here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Always 0: no accelerated h2v1_fancy_upsample here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_h2v2_fancy_upsample() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_h2v1_fancy_upsample() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+}
+
+/* Always 0: no accelerated h2v2_merged_upsample here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Always 0: no accelerated h2v1_merged_upsample here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_h2v2_merged_upsample() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_h2v1_merged_upsample() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+}
+
+/* Always 0: no accelerated convsamp routine here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Always 0: no accelerated convsamp_float here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_convsamp() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM * workspace)
+{
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_convsamp_float() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT * workspace)
+{
+}
+
+/* Always 0: no accelerated fdct_islow routine here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Always 0: no accelerated fdct_ifast routine here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Always 0: no accelerated fdct_float routine here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_fdct_islow() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_fdct_ifast() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_fdct_float() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+}
+
+/* Always 0: no accelerated quantize routine here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Always 0: no accelerated quantize_float here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_quantize() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+ DCTELEM * workspace)
+{
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_quantize_float() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace)
+{
+}
+
+/* Always 0: no accelerated idct_2x2 routine here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Always 0: no accelerated idct_4x4 routine here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_idct_2x2() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_idct_4x4() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+/* Always 0: no accelerated idct_islow routine here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/*
+ * Report whether the NEON fast integer inverse DCT may be used.
+ * Returns 1 only if the library was built with the block, coefficient,
+ * sample and multiplier sizes the assembly code expects and NEON support
+ * was detected; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  init_simd();
+
+  /* Build-time configuration the NEON code has been written for */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2 || IFAST_SCALE_BITS != 2)
+    return 0;
+
+  return (simd_support & JSIMD_ARM_NEON) ? 1 : 0;
+}
+
+/* Always 0: no accelerated idct_float routine here; init_simd() still runs. */
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+ init_simd();
+
+ return 0;
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_idct_islow() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+/*
+ * Run the NEON fast integer inverse DCT on one coefficient block, using the
+ * component's dct_table as the multiplier table.  Does nothing when NEON
+ * support is not flagged in simd_support; cinfo is not used.
+ */
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  if (!(simd_support & JSIMD_ARM_NEON))
+    return;
+
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+/* Intentionally empty: no ARM implementation. Presumably unreachable because
+ * jsimd_can_idct_float() returns 0 -- confirm against callers. */
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
diff --git a/trunk/simd/jsimd_arm_neon.S b/trunk/simd/jsimd_arm_neon.S
new file mode 100644
index 0000000..2d66ab2
--- /dev/null
+++ b/trunk/simd/jsimd_arm_neon.S
@@ -0,0 +1,484 @@
+/*
+ * ARM NEON optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
+ * All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.altmacro
+.arm
+
+/*****************************************************************************/
+
+/*
+ * Supplementary macro for setting function attributes.
+ *
+ * Opens a function called 'fname': emits .func and .global for it, on ELF
+ * targets additionally marks the symbol .hidden and of type %function, and
+ * finally defines the label itself.  The matching .endfunc is written at
+ * the end of each function body.  The argument is referenced without a
+ * leading backslash, which relies on the .altmacro mode selected earlier
+ * in this file.
+ */
+.macro asm_function fname
+ .func fname
+ .global fname
+#ifdef __ELF__
+ .hidden fname
+ .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * Transpose a block of 4x4 coefficients in four 64-bit registers.
+ *
+ * x0..x3 are D registers, each holding one row of four 16-bit values.
+ * The 16-bit vtrn pair followed by the 32-bit vtrn pair transposes the
+ * block in place.
+ */
+.macro transpose_4x4 x0, x1, x2, x3
+ vtrn.16 x0, x1
+ vtrn.16 x2, x3
+ vtrn.32 x0, x2
+ vtrn.32 x1, x3
+.endm
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c
+ *
+ * TODO: better instruction scheduling is needed.
+ */
+
+/*
+ * Lane names for the four multipliers once the table below has been loaded
+ * into d0.  NOTE(review): idct_helper addresses d0[0]..d0[3] directly, so
+ * these names appear to be unused in the code visible here.
+ */
+#define XFIX_1_082392200 d0[0]
+#define XFIX_1_414213562 d0[1]
+#define XFIX_1_847759065 d0[2]
+#define XFIX_2_613125930 d0[3]
+
+/*
+ * Multiplier table.  Each entry is the constant in 1/256 units with its
+ * integer part removed (256 subtracted, or 512 for the last entry) and then
+ * scaled by 128.  idct_helper multiplies by an entry with vqdmulh and adds
+ * the operand back once (twice for the last entry) to restore the removed
+ * integer part.
+ */
+.balign 16
+jsimd_idct_ifast_neon_consts:
+ .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
+ .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
+ .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
+ .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+
+/*
+ * 1-D IDCT helper macro
+ *
+ * x0..x7   registers holding the eight inputs; they are overwritten with
+ *          the eight outputs.
+ * t10..t14 scratch registers.
+ * d0 must already contain the four multipliers from
+ * jsimd_idct_ifast_neon_consts.
+ *
+ * The statement order is hand-scheduled and several registers are reused
+ * for different intermediate values, so instructions must not be reordered
+ * without re-checking every dependency.
+ */
+
+.macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \
+ t10, t11, t12, t13, t14
+
+ /* Butterflies on the pairs (x0,x4), (x2,x6), (x3,x5) and (x1,x7): after
+  * each vsub/vadd/vswp triple the first register of the pair holds the
+  * difference and the second holds the sum. */
+ vsub.s16 \t10, \x0, \x4
+ vadd.s16 \x4, \x0, \x4
+ vswp.s16 \t10, \x0
+ vsub.s16 \t11, \x2, \x6
+ vadd.s16 \x6, \x2, \x6
+ vswp.s16 \t11, \x2
+ vsub.s16 \t10, \x3, \x5
+ vadd.s16 \x5, \x3, \x5
+ vswp.s16 \t10, \x3
+ vsub.s16 \t11, \x1, \x7
+ vadd.s16 \x7, \x1, \x7
+ vswp.s16 \t11, \x1
+
+ /* Constant multiplications: vqdmulh yields the fractional part of the
+  * product, and adding the operand itself (doubled for d0[3], see t12)
+  * supplies the integer part that was removed from the table entry. */
+ vqdmulh.s16 \t13, \x2, d0[1]
+ vadd.s16 \t12, \x3, \x3
+ vadd.s16 \x2, \x2, \t13
+ vqdmulh.s16 \t13, \x3, d0[3]
+ vsub.s16 \t10, \x1, \x3
+ vadd.s16 \t12, \t12, \t13
+ vqdmulh.s16 \t13, \t10, d0[2]
+ vsub.s16 \t11, \x7, \x5
+ vadd.s16 \t10, \t10, \t13
+ vqdmulh.s16 \t13, \t11, d0[1]
+ vadd.s16 \t11, \t11, \t13
+
+ vqdmulh.s16 \t13, \x1, d0[0]
+ vsub.s16 \x2, \x6, \x2
+ vsub.s16 \t14, \x0, \x2
+ vadd.s16 \x2, \x0, \x2
+ vadd.s16 \x0, \x4, \x6
+ vsub.s16 \x4, \x4, \x6
+ vadd.s16 \x1, \x1, \t13
+ vadd.s16 \t13, \x7, \x5
+ vsub.s16 \t12, \t13, \t12
+ vsub.s16 \t12, \t12, \t10
+ vadd.s16 \t11, \t12, \t11
+ vsub.s16 \t10, \x1, \t10
+ vadd.s16 \t10, \t10, \t11
+
+ /* Final butterflies: write the eight results back into x0..x7 */
+ vsub.s16 \x7, \x0, \t13
+ vadd.s16 \x0, \x0, \t13
+ vadd.s16 \x6, \t14, \t12
+ vsub.s16 \x1, \t14, \t12
+ vsub.s16 \x5, \x2, \t11
+ vadd.s16 \x2, \x2, \t11
+ vsub.s16 \x3, \x4, \t10
+ vadd.s16 \x4, \x4, \t10
+.endm
+
+/*
+ * Arguments, as named by the register aliases below:
+ *   r0  DCT_TABLE   multiplier table, 64 16-bit entries
+ *   r1  COEF_BLOCK  64 16-bit coefficients
+ *   r2  OUTPUT_BUF  array of 8 row pointers (read 4 bytes apiece)
+ *   r3  OUTPUT_COL  byte offset added to every row pointer
+ * Eight bytes are written to each of the eight output rows.
+ * r0-r2 are advanced while running and ip is used as a temporary.
+ */
+asm_function jsimd_idct_ifast_neon
+
+ DCT_TABLE .req r0
+ COEF_BLOCK .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_COL .req r3
+ TMP .req ip
+
+ /* d8-d15 are overwritten below, so preserve them for the caller */
+ vpush {d8-d15}
+
+ /* Load constants */
+ adr TMP, jsimd_idct_ifast_neon_consts
+ vld1.16 {d0}, [TMP, :64]
+
+ /* Load all COEF_BLOCK into NEON registers with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d4 | d5
+ * 1 | d6 | d7
+ * 2 | d8 | d9
+ * 3 | d10 | d11
+ * 4 | d12 | d13
+ * 5 | d14 | d15
+ * 6 | d16 | d17
+ * 7 | d18 | d19
+ */
+ vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]!
+ vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]!
+ vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]!
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]!
+ /* Dequantize */
+ /* Table rows are loaded into q10-q15 interleaved with the multiplies;
+  * q10/q11 are reloaded for rows 6-7 once rows 0-1 have consumed them. */
+ vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!
+ vmul.s16 q2, q2, q10
+ vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]!
+ vmul.s16 q3, q3, q11
+ vmul.s16 q4, q4, q12
+ vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]!
+ vmul.s16 q5, q5, q13
+ vmul.s16 q6, q6, q14
+ vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!
+ vmul.s16 q7, q7, q15
+ vmul.s16 q8, q8, q10
+ vmul.s16 q9, q9, q11
+
+ /* Pass 1 */
+ idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
+ /* Transpose */
+ /* 8x8 transpose: transpose each 4x4 quadrant, then exchange the two
+  * off-diagonal quadrants with the vswp instructions. */
+ transpose_4x4 d4, d6, d8, d10
+ transpose_4x4 d5, d7, d9, d11
+ transpose_4x4 d12, d14, d16, d18
+ transpose_4x4 d13, d15, d17, d19
+ vswp d12, d5
+ vswp d14, d7
+ vswp d16, d9
+ vswp d18, d11
+
+ /* Pass 2 */
+ idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
+ /* Transpose */
+ transpose_4x4 d4, d6, d8, d10
+ transpose_4x4 d5, d7, d9, d11
+ transpose_4x4 d12, d14, d16, d18
+ transpose_4x4 d13, d15, d17, d19
+ vswp d12, d5
+ vswp d14, d7
+ vswp d16, d9
+ vswp d18, d11
+
+ /* Descale and range limit */
+ /* Add 0x80 pre-scaled by the 5 fraction bits (saturating), then shift
+  * right by 5 and narrow with unsigned saturation to 8 bits per sample.
+  * Each row's 8 result bytes land in the low half of its q register. */
+ vmov.s16 q15, #(0x80 << 5)
+ vqadd.s16 q2, q2, q15
+ vqadd.s16 q3, q3, q15
+ vqadd.s16 q4, q4, q15
+ vqadd.s16 q5, q5, q15
+ vqadd.s16 q6, q6, q15
+ vqadd.s16 q7, q7, q15
+ vqadd.s16 q8, q8, q15
+ vqadd.s16 q9, q9, q15
+ vqshrun.s16 d4, q2, #5
+ vqshrun.s16 d6, q3, #5
+ vqshrun.s16 d8, q4, #5
+ vqshrun.s16 d10, q5, #5
+ vqshrun.s16 d12, q6, #5
+ vqshrun.s16 d14, q7, #5
+ vqshrun.s16 d16, q8, #5
+ vqshrun.s16 d18, q9, #5
+
+ /* Store results to the output buffer */
+ /* For each row: fetch the next row pointer, add the column offset and
+  * store the row's 8 bytes. */
+ .irp x, d4, d6, d8, d10, d12, d14, d16, d18
+ ldr TMP, [OUTPUT_BUF], #4
+ add TMP, TMP, OUTPUT_COL
+ vst1.8 {x}, [TMP]!
+ .endr
+
+ vpop {d8-d15}
+ bx lr
+
+ .unreq DCT_TABLE
+ .unreq COEF_BLOCK
+ .unreq OUTPUT_BUF
+ .unreq OUTPUT_COL
+ .unreq TMP
+.endfunc
+
+.purgem idct_helper
+
+/*****************************************************************************/
+
+/*
+ * jsimd_ycc_extrgb_convert_neon
+ * jsimd_ycc_extbgr_convert_neon
+ * jsimd_ycc_extrgbx_convert_neon
+ * jsimd_ycc_extbgrx_convert_neon
+ * jsimd_ycc_extxbgr_convert_neon
+ * jsimd_ycc_extxrgb_convert_neon
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+/*
+ * Constants for the YCbCr -> RGB conversion; the function loads these four
+ * rows into d0-d3 in one go:
+ *   d0     padding only (later overwritten by the Y samples)
+ *   d1     the four multipliers used by do_yuv_to_rgb:
+ *            d1[0] = 22971   applied to (V - 128), result shifted by 14
+ *            d1[1] = -11277  applied to (U - 128), result shifted by 15
+ *            d1[2] = -23401  applied to (V - 128), result shifted by 15
+ *            d1[3] = 29033   applied to (U - 128), result shifted by 14
+ *   d2/d3  eight copies of -128 (q1), added to U and V to centre them
+ */
+.balign 16
+jsimd_ycc_rgb_neon_consts:
+ .short 0, 0, 0, 0
+ .short 22971, -11277, -23401, 29033
+ .short -128, -128, -128, -128
+ .short -128, -128, -128, -128
+
+/*
+ * Load 'size' samples from each of the U, V and Y pointers (post-incrementing
+ * them) into d4, d5 and d0 respectively.
+ *
+ * size == 8 fills the whole registers and prefetches 64 bytes ahead.
+ * The partial sizes fill disjoint lanes (4 -> lanes 0-3, 2 -> lanes 4-5,
+ * 1 -> lane 6) so that they can be combined to load any count from 1 to 7.
+ * Any other size is rejected at assembly time.
+ */
+.macro do_load size
+ .if size == 8
+ vld1.8 {d4}, [U]!
+ vld1.8 {d5}, [V]!
+ vld1.8 {d0}, [Y]!
+ pld [Y, #64]
+ pld [U, #64]
+ pld [V, #64]
+ .elseif size == 4
+ vld1.8 {d4[0]}, [U]!
+ vld1.8 {d4[1]}, [U]!
+ vld1.8 {d4[2]}, [U]!
+ vld1.8 {d4[3]}, [U]!
+ vld1.8 {d5[0]}, [V]!
+ vld1.8 {d5[1]}, [V]!
+ vld1.8 {d5[2]}, [V]!
+ vld1.8 {d5[3]}, [V]!
+ vld1.8 {d0[0]}, [Y]!
+ vld1.8 {d0[1]}, [Y]!
+ vld1.8 {d0[2]}, [Y]!
+ vld1.8 {d0[3]}, [Y]!
+ .elseif size == 2
+ vld1.8 {d4[4]}, [U]!
+ vld1.8 {d4[5]}, [U]!
+ vld1.8 {d5[4]}, [V]!
+ vld1.8 {d5[5]}, [V]!
+ vld1.8 {d0[4]}, [Y]!
+ vld1.8 {d0[5]}, [Y]!
+ .elseif size == 1
+ vld1.8 {d4[6]}, [U]!
+ vld1.8 {d5[6]}, [V]!
+ vld1.8 {d0[6]}, [Y]!
+ .else
+ .error unsupported macroblock size
+ .endif
+.endm
+
+/*
+ * Store 'size' converted pixels to the RGB pointer (post-incrementing it).
+ *
+ * bpp == 24 interleaves d10, d11, d12 (3 bytes per pixel);
+ * bpp == 32 interleaves d10, d11, d12, d13 (4 bytes per pixel).
+ * The partial sizes use the same lanes as do_load (4 -> lanes 0-3,
+ * 2 -> lanes 4-5, 1 -> lane 6).  Any other bpp or size is rejected at
+ * assembly time.
+ */
+.macro do_store bpp, size
+ .if bpp == 24
+ .if size == 8
+ vst3.8 {d10, d11, d12}, [RGB]!
+ .elseif size == 4
+ vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
+ vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
+ vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
+ vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
+ .elseif size == 2
+ vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
+ vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
+ .elseif size == 1
+ vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
+ .else
+ .error unsupported macroblock size
+ .endif
+ .elseif bpp == 32
+ .if size == 8
+ vst4.8 {d10, d11, d12, d13}, [RGB]!
+ .elseif size == 4
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+ .elseif size == 2
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+ .elseif size == 1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+ .else
+ .error unsupported macroblock size
+ .endif
+ .else
+ .error unsupported bpp
+ .endif
+.endm
+
+/*
+ * Emit one function jsimd_ycc_<colorid>_convert_neon.
+ *
+ *   colorid  name fragment inserted into the function name
+ *   bpp      output bits per pixel, passed on to do_store (24 or 32)
+ *   r_offs, g_offs, b_offs
+ *            digit appended to "d1" to choose the register (d10..d13) that
+ *            receives the red, green and blue bytes; do_store writes
+ *            d10, d11, d12 (and d13 for 32 bpp) in that order
+ *
+ * The generated function takes (out_width, input_buf, input_row,
+ * output_buf) in r0-r3 and num_rows on the stack.
+ */
+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+
+/* Convert the samples in d0 (Y), d4 (U) and d5 (V) to bytes in d10-d13.
+ * Redefined for every instantiation because it embeds the *_offs values. */
+.macro do_yuv_to_rgb
+ vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
+ vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
+ vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
+ vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
+ vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
+ vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
+ vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
+ vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
+ vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
+ vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
+ /* Rounding shift back to 16 bits: q10 = green, q12 = red, q14 = blue */
+ vrshrn.s32 d20, q10, #15
+ vrshrn.s32 d21, q11, #15
+ vrshrn.s32 d24, q12, #14
+ vrshrn.s32 d25, q13, #14
+ vrshrn.s32 d28, q14, #14
+ vrshrn.s32 d29, q15, #14
+ /* Add the luma to every channel */
+ vaddw.u8 q10, q10, d0
+ vaddw.u8 q12, q12, d0
+ vaddw.u8 q14, q14, d0
+ /* Saturate to unsigned 8 bits; "d1&offs" pastes to d10..d13 */
+ vqmovun.s16 d1&g_offs, q10
+ vqmovun.s16 d1&r_offs, q12
+ vqmovun.s16 d1&b_offs, q14
+.endm
+
+asm_function jsimd_ycc_&colorid&_convert_neon
+ OUTPUT_WIDTH .req r0
+ INPUT_BUF .req r1
+ INPUT_ROW .req r2
+ OUTPUT_BUF .req r3
+ NUM_ROWS .req r4
+
+ INPUT_BUF0 .req r5
+ INPUT_BUF1 .req r6
+ INPUT_BUF2 .req INPUT_BUF
+
+ RGB .req r7
+ Y .req r8
+ U .req r9
+ V .req r10
+ N .req ip
+
+ /* Load constants to d1, d2, d3 (d0 is just used for padding) */
+ adrl ip, jsimd_ycc_rgb_neon_consts
+ vld1.16 {d0, d1, d2, d3}, [ip, :128]
+
+ /* Save ARM registers and handle input arguments */
+ /* NUM_ROWS is the fifth argument: it sits on the stack just above the
+  * eight registers pushed here. */
+ push {r4, r5, r6, r7, r8, r9, r10, lr}
+ ldr NUM_ROWS, [sp, #(4 * 8)]
+ ldr INPUT_BUF0, [INPUT_BUF]
+ ldr INPUT_BUF1, [INPUT_BUF, #4]
+ ldr INPUT_BUF2, [INPUT_BUF, #8]
+ .unreq INPUT_BUF
+
+ /* Save NEON registers */
+ vpush {d8-d15}
+
+ /* Initially set d10, d11, d12, d13 to 0xFF */
+ /* For 32 bpp the register not written by do_yuv_to_rgb keeps this 0xFF
+  * and is stored as the fourth byte of every pixel. */
+ vmov.u8 q5, #255
+ vmov.u8 q6, #255
+
+ /* Outer loop over scanlines */
+ cmp NUM_ROWS, #1
+ blt 9f
+0:
+ ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
+ ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
+ mov N, OUTPUT_WIDTH
+ ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
+ add INPUT_ROW, INPUT_ROW, #1
+ ldr RGB, [OUTPUT_BUF], #4
+
+ /* Inner loop over pixels */
+ subs N, N, #8
+ blt 2f
+1:
+ do_load 8
+ do_yuv_to_rgb
+ do_store bpp, 8
+ subs N, N, #8
+ bge 1b
+ /* N is now negative; its low three bits equal the number of pixels
+  * (0-7) still to be converted on this row. */
+ tst N, #7
+ beq 8f
+2:
+ /* Tail: gather the remaining 4/2/1 pixels into separate lanes, convert
+  * them in one go, then store the same lane groups. */
+ tst N, #4
+ beq 3f
+ do_load 4
+3:
+ tst N, #2
+ beq 4f
+ do_load 2
+4:
+ tst N, #1
+ beq 5f
+ do_load 1
+5:
+ do_yuv_to_rgb
+ tst N, #4
+ beq 6f
+ do_store bpp, 4
+6:
+ tst N, #2
+ beq 7f
+ do_store bpp, 2
+7:
+ tst N, #1
+ beq 8f
+ do_store bpp, 1
+8:
+ subs NUM_ROWS, NUM_ROWS, #1
+ bgt 0b
+9:
+ /* Restore all registers and return */
+ vpop {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, pc}
+
+ .unreq OUTPUT_WIDTH
+ .unreq INPUT_ROW
+ .unreq OUTPUT_BUF
+ .unreq NUM_ROWS
+ .unreq INPUT_BUF0
+ .unreq INPUT_BUF1
+ .unreq INPUT_BUF2
+ .unreq RGB
+ .unreq Y
+ .unreq U
+ .unreq V
+ .unreq N
+.endfunc
+
+.purgem do_yuv_to_rgb
+
+.endm
+
+/*
+ * Instantiate one converter per output pixel layout.  The R/G/B columns give
+ * the byte position of each channel within a pixel; for the 32 bpp layouts
+ * the position left unused is filled with 0xFF.
+ */
+/*--------------------------------- id ----- bpp R G B */
+generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
+generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
+
+.purgem do_load
+.purgem do_store
+
+/*****************************************************************************/