From c14f6455230cbff4948fde981eed116298f5abb6 Mon Sep 17 00:00:00 2001 From: Tom Gall Date: Mon, 29 Aug 2011 09:28:27 -0500 Subject: sync to 1.1.90 upstream, change 691 --- simd/Makefile.in | 540 ------------- simd/jcgrammx.asm | 116 +++ simd/jcgrass2-64.asm | 113 +++ simd/jcgrass2.asm | 113 +++ simd/jcgrymmx.asm | 357 +++++++++ simd/jcgryss2-64.asm | 364 +++++++++ simd/jcgryss2.asm | 383 ++++++++++ simd/jsimd_arm.c | 662 ++++++++++++++++ simd/jsimd_arm_neon.S | 2033 +++++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 4141 insertions(+), 540 deletions(-) delete mode 100644 simd/Makefile.in create mode 100644 simd/jcgrammx.asm create mode 100644 simd/jcgrass2-64.asm create mode 100644 simd/jcgrass2.asm create mode 100644 simd/jcgrymmx.asm create mode 100644 simd/jcgryss2-64.asm create mode 100644 simd/jcgryss2.asm create mode 100644 simd/jsimd_arm.c create mode 100644 simd/jsimd_arm_neon.S (limited to 'simd') diff --git a/simd/Makefile.in b/simd/Makefile.in deleted file mode 100644 index bf14b58..0000000 --- a/simd/Makefile.in +++ /dev/null @@ -1,540 +0,0 @@ -# Makefile.in generated by automake 1.9.2 from Makefile.am. -# @configure_input@ - -# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004 Free Software Foundation, Inc. -# This Makefile.in is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -@SET_MAKE@ - -SOURCES = $(libsimd_la_SOURCES) - -srcdir = @srcdir@ -top_srcdir = @top_srcdir@ -VPATH = @srcdir@ -pkgdatadir = $(datadir)/@PACKAGE@ -pkglibdir = $(libdir)/@PACKAGE@ -pkgincludedir = $(includedir)/@PACKAGE@ -top_builddir = .. -am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -INSTALL = @INSTALL@ -install_sh_DATA = $(install_sh) -c -m 644 -install_sh_PROGRAM = $(install_sh) -c -install_sh_SCRIPT = $(install_sh) -c -INSTALL_HEADER = $(INSTALL_DATA) -transform = $(program_transform_name) -NORMAL_INSTALL = : -PRE_INSTALL = : -POST_INSTALL = : -NORMAL_UNINSTALL = : -PRE_UNINSTALL = : -POST_UNINSTALL = : -build_triplet = @build@ -host_triplet = @host@ -subdir = simd -DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in -ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \ - $(top_srcdir)/configure.ac -am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ - $(ACLOCAL_M4) -mkinstalldirs = $(install_sh) -d -CONFIG_HEADER = $(top_builddir)/config.h $(top_builddir)/jconfig.h -CONFIG_CLEAN_FILES = -LTLIBRARIES = $(noinst_LTLIBRARIES) -libsimd_la_LIBADD = -am__libsimd_la_SOURCES_DIST = jsimd_i386.c jsimd.h jsimdcfg.inc.h \ - jsimdext.inc jcolsamp.inc jdct.inc jsimdcpu.asm jccolmmx.asm \ - jdcolmmx.asm jcsammmx.asm jdsammmx.asm jdmermmx.asm \ - jcqntmmx.asm jfmmxfst.asm jfmmxint.asm jimmxred.asm \ - jimmxint.asm jimmxfst.asm jcqnt3dn.asm jf3dnflt.asm \ - ji3dnflt.asm jcqntsse.asm jfsseflt.asm jisseflt.asm \ - jccolss2.asm jdcolss2.asm jcsamss2.asm jdsamss2.asm \ - jdmerss2.asm jcqnts2i.asm jfss2fst.asm jfss2int.asm \ - jiss2red.asm jiss2int.asm jiss2fst.asm jcqnts2f.asm \ - jiss2flt.asm jsimd_x86_64.c jfsseflt-64.asm jccolss2-64.asm \ - jdcolss2-64.asm jcsamss2-64.asm jdsamss2-64.asm \ - jdmerss2-64.asm jcqnts2i-64.asm jfss2fst-64.asm \ - jfss2int-64.asm jiss2red-64.asm jiss2int-64.asm \ - jiss2fst-64.asm jcqnts2f-64.asm jiss2flt-64.asm -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@am_libsimd_la_OBJECTS = \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jsimd_x86_64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jfsseflt-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jccolss2-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jdcolss2-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jcsamss2-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jdsamss2-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jdmerss2-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jcqnts2i-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jfss2fst-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jfss2int-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jiss2red-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jiss2int-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jiss2fst-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jcqnts2f-64.lo \ -@SIMD_I386_FALSE@@SIMD_X86_64_TRUE@ jiss2flt-64.lo -@SIMD_I386_TRUE@am_libsimd_la_OBJECTS = jsimd_i386.lo jsimdcpu.lo \ -@SIMD_I386_TRUE@ jccolmmx.lo jdcolmmx.lo jcsammmx.lo \ -@SIMD_I386_TRUE@ jdsammmx.lo jdmermmx.lo jcqntmmx.lo \ -@SIMD_I386_TRUE@ jfmmxfst.lo jfmmxint.lo jimmxred.lo \ -@SIMD_I386_TRUE@ jimmxint.lo jimmxfst.lo jcqnt3dn.lo \ -@SIMD_I386_TRUE@ jf3dnflt.lo ji3dnflt.lo jcqntsse.lo \ -@SIMD_I386_TRUE@ jfsseflt.lo jisseflt.lo jccolss2.lo \ -@SIMD_I386_TRUE@ jdcolss2.lo jcsamss2.lo jdsamss2.lo \ -@SIMD_I386_TRUE@ jdmerss2.lo jcqnts2i.lo jfss2fst.lo \ -@SIMD_I386_TRUE@ jfss2int.lo jiss2red.lo jiss2int.lo \ -@SIMD_I386_TRUE@ jiss2fst.lo jcqnts2f.lo jiss2flt.lo -libsimd_la_OBJECTS = $(am_libsimd_la_OBJECTS) -DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir) -I$(top_builddir) -depcomp = $(SHELL) $(top_srcdir)/depcomp -am__depfiles_maybe = depfiles -COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ - $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -LTCOMPILE = $(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) \ - $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ - $(AM_CFLAGS) $(CFLAGS) -CCLD = $(CC) -LINK = $(LIBTOOL) --mode=link --tag=CC $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ - $(AM_LDFLAGS) $(LDFLAGS) -o $@ -SOURCES = $(libsimd_la_SOURCES) -DIST_SOURCES = $(am__libsimd_la_SOURCES_DIST) -ETAGS = etags -CTAGS = ctags -DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) -ACLOCAL = @ACLOCAL@ -AMDEP_FALSE = @AMDEP_FALSE@ -AMDEP_TRUE = @AMDEP_TRUE@ -AMTAR = @AMTAR@ -AR = @AR@ -AUTOCONF = @AUTOCONF@ -AUTOHEADER = @AUTOHEADER@ -AUTOMAKE = @AUTOMAKE@ -AWK = @AWK@ -BUILD = @BUILD@ -CC = @CC@ -CCDEPMODE = @CCDEPMODE@ -CFLAGS = @CFLAGS@ -CPP = @CPP@ -CPPFLAGS = @CPPFLAGS@ -CXX = @CXX@ -CXXCPP = @CXXCPP@ -CXXDEPMODE = @CXXDEPMODE@ -CXXFLAGS = @CXXFLAGS@ -CYGPATH_W = @CYGPATH_W@ -DEBARCH = @DEBARCH@ -DEFS = @DEFS@ -DEPDIR = @DEPDIR@ -ECHO = @ECHO@ -ECHO_C = @ECHO_C@ -ECHO_N = @ECHO_N@ -ECHO_T = @ECHO_T@ -EGREP = @EGREP@ -EXEEXT = @EXEEXT@ -F77 = @F77@ -FFLAGS = @FFLAGS@ -INSTALL_DATA = @INSTALL_DATA@ -INSTALL_PROGRAM = @INSTALL_PROGRAM@ -INSTALL_SCRIPT = @INSTALL_SCRIPT@ -INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -JPEG_LIB_VERSION = @JPEG_LIB_VERSION@ -JPEG_LIB_VERSION_DECIMAL = @JPEG_LIB_VERSION_DECIMAL@ -LDFLAGS = @LDFLAGS@ -LIBOBJS = @LIBOBJS@ -LIBS = @LIBS@ -LIBTOOL = @LIBTOOL@ -LN_S = @LN_S@ -LTLIBOBJS = @LTLIBOBJS@ -MAKEINFO = @MAKEINFO@ -NAFLAGS = @NAFLAGS@ -NASM = @NASM@ -OBJEXT = @OBJEXT@ -PACKAGE = @PACKAGE@ -PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ -PACKAGE_NAME = @PACKAGE_NAME@ -PACKAGE_STRING = @PACKAGE_STRING@ -PACKAGE_TARNAME = @PACKAGE_TARNAME@ -PACKAGE_VERSION = @PACKAGE_VERSION@ -PATH_SEPARATOR = @PATH_SEPARATOR@ -RANLIB = @RANLIB@ -RPMARCH = @RPMARCH@ -SET_MAKE = @SET_MAKE@ -SHELL = @SHELL@ -SIMD_I386_FALSE = @SIMD_I386_FALSE@ -SIMD_I386_TRUE = @SIMD_I386_TRUE@ -SIMD_X86_64_FALSE = @SIMD_X86_64_FALSE@ -SIMD_X86_64_TRUE = @SIMD_X86_64_TRUE@ -SO_MAJOR_VERSION = @SO_MAJOR_VERSION@ -SO_MINOR_VERSION = @SO_MINOR_VERSION@ -STRIP = @STRIP@ -VERSION = @VERSION@ -VERSION_SCRIPT_FALSE = @VERSION_SCRIPT_FALSE@ -VERSION_SCRIPT_FLAG = @VERSION_SCRIPT_FLAG@ -VERSION_SCRIPT_TRUE = @VERSION_SCRIPT_TRUE@ -WITH_ARITH_DEC_FALSE = @WITH_ARITH_DEC_FALSE@ -WITH_ARITH_DEC_TRUE = @WITH_ARITH_DEC_TRUE@ -WITH_ARITH_ENC_FALSE = @WITH_ARITH_ENC_FALSE@ -WITH_ARITH_ENC_TRUE = @WITH_ARITH_ENC_TRUE@ -WITH_ARITH_FALSE = @WITH_ARITH_FALSE@ -WITH_ARITH_TRUE = @WITH_ARITH_TRUE@ -WITH_SIMD_FALSE = @WITH_SIMD_FALSE@ -WITH_SIMD_TRUE = @WITH_SIMD_TRUE@ -X86_64_FALSE = @X86_64_FALSE@ -X86_64_TRUE = @X86_64_TRUE@ -ac_ct_AR = @ac_ct_AR@ -ac_ct_CC = @ac_ct_CC@ -ac_ct_CXX = @ac_ct_CXX@ -ac_ct_F77 = @ac_ct_F77@ -ac_ct_RANLIB = @ac_ct_RANLIB@ -ac_ct_STRIP = @ac_ct_STRIP@ -am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ -am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ -am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ -am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ -am__include = @am__include@ -am__leading_dot = @am__leading_dot@ -am__quote = @am__quote@ -am__tar = @am__tar@ -am__untar = @am__untar@ -bindir = @bindir@ -build = @build@ -build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_vendor = @build_vendor@ -datadir = @datadir@ -exec_prefix = @exec_prefix@ -host = @host@ -host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ -includedir = @includedir@ -infodir = @infodir@ -install_sh = @install_sh@ -libdir = @libdir@ -libexecdir = @libexecdir@ -localstatedir = @localstatedir@ -mandir = @mandir@ -mkdir_p = @mkdir_p@ -oldincludedir = @oldincludedir@ -prefix = @prefix@ -program_transform_name = @program_transform_name@ -sbindir = @sbindir@ -sharedstatedir = @sharedstatedir@ -sysconfdir = @sysconfdir@ -target_alias = @target_alias@ -noinst_LTLIBRARIES = libsimd.la -BUILT_SOURCES = jsimdcfg.inc -EXTRA_DIST = nasm_lt.sh jcclrmmx.asm jcclrss2.asm jdclrmmx.asm jdclrss2.asm \ - jdmrgmmx.asm jdmrgss2.asm jcclrss2-64.asm jdclrss2-64.asm \ - jdmrgss2-64.asm CMakeLists.txt - -@SIMD_I386_TRUE@libsimd_la_SOURCES = jsimd_i386.c \ -@SIMD_I386_TRUE@ jsimd.h jsimdcfg.inc.h \ -@SIMD_I386_TRUE@ jsimdext.inc jcolsamp.inc jdct.inc \ -@SIMD_I386_TRUE@ jsimdcpu.asm \ -@SIMD_I386_TRUE@ jccolmmx.asm jdcolmmx.asm \ -@SIMD_I386_TRUE@ jcsammmx.asm jdsammmx.asm jdmermmx.asm \ -@SIMD_I386_TRUE@ jcqntmmx.asm jfmmxfst.asm jfmmxint.asm \ -@SIMD_I386_TRUE@ jimmxred.asm jimmxint.asm jimmxfst.asm \ -@SIMD_I386_TRUE@ jcqnt3dn.asm jf3dnflt.asm ji3dnflt.asm \ -@SIMD_I386_TRUE@ jcqntsse.asm jfsseflt.asm jisseflt.asm \ -@SIMD_I386_TRUE@ jccolss2.asm jdcolss2.asm \ -@SIMD_I386_TRUE@ jcsamss2.asm jdsamss2.asm jdmerss2.asm \ -@SIMD_I386_TRUE@ jcqnts2i.asm jfss2fst.asm jfss2int.asm \ -@SIMD_I386_TRUE@ jiss2red.asm jiss2int.asm jiss2fst.asm \ -@SIMD_I386_TRUE@ jcqnts2f.asm jiss2flt.asm - -@SIMD_X86_64_TRUE@libsimd_la_SOURCES = jsimd_x86_64.c \ -@SIMD_X86_64_TRUE@ jsimd.h jsimdcfg.inc.h \ -@SIMD_X86_64_TRUE@ jsimdext.inc jcolsamp.inc jdct.inc \ -@SIMD_X86_64_TRUE@ jfsseflt-64.asm \ -@SIMD_X86_64_TRUE@ jccolss2-64.asm jdcolss2-64.asm \ -@SIMD_X86_64_TRUE@ jcsamss2-64.asm jdsamss2-64.asm jdmerss2-64.asm \ -@SIMD_X86_64_TRUE@ jcqnts2i-64.asm jfss2fst-64.asm jfss2int-64.asm \ -@SIMD_X86_64_TRUE@ jiss2red-64.asm jiss2int-64.asm jiss2fst-64.asm \ -@SIMD_X86_64_TRUE@ jcqnts2f-64.asm jiss2flt-64.asm - -AM_CPPFLAGS = -I$(top_srcdir) -all: $(BUILT_SOURCES) - $(MAKE) $(AM_MAKEFLAGS) all-am - -.SUFFIXES: -.SUFFIXES: .asm .c .lo .o .obj -$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) - @for dep in $?; do \ - case '$(am__configure_deps)' in \ - *$$dep*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \ - && exit 0; \ - exit 1;; \ - esac; \ - done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign simd/Makefile'; \ - cd $(top_srcdir) && \ - $(AUTOMAKE) --foreign simd/Makefile -.PRECIOUS: Makefile -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - @case '$?' in \ - *config.status*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ - *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ - esac; - -$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -$(top_srcdir)/configure: $(am__configure_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(ACLOCAL_M4): $(am__aclocal_m4_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -clean-noinstLTLIBRARIES: - -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) - @list='$(noinst_LTLIBRARIES)'; for p in $$list; do \ - dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ - test "$$dir" != "$$p" || dir=.; \ - echo "rm -f \"$${dir}/so_locations\""; \ - rm -f "$${dir}/so_locations"; \ - done -libsimd.la: $(libsimd_la_OBJECTS) $(libsimd_la_DEPENDENCIES) - $(LINK) $(libsimd_la_LDFLAGS) $(libsimd_la_OBJECTS) $(libsimd_la_LIBADD) $(LIBS) - -mostlyclean-compile: - -rm -f *.$(OBJEXT) - -distclean-compile: - -rm -f *.tab.c - -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jsimd_i386.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jsimd_x86_64.Plo@am__quote@ - -.c.o: -@am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ -@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(COMPILE) -c $< - -.c.obj: -@am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ `$(CYGPATH_W) '$<'`; \ -@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` - -.c.lo: -@am__fastdepCC_TRUE@ if $(LTCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ -@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Plo"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< - -mostlyclean-libtool: - -rm -f *.lo - -clean-libtool: - -rm -rf .libs _libs - -distclean-libtool: - -rm -f libtool -uninstall-info-am: - -ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - mkid -fID $$unique -tags: TAGS - -TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - tags=; \ - here=`pwd`; \ - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ - test -n "$$unique" || unique=$$empty_fix; \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - $$tags $$unique; \ - fi -ctags: CTAGS -CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - tags=; \ - here=`pwd`; \ - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - test -z "$(CTAGS_ARGS)$$tags$$unique" \ - || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ - $$tags $$unique - -GTAGS: - here=`$(am__cd) $(top_builddir) && pwd` \ - && cd $(top_srcdir) \ - && gtags -i $(GTAGS_ARGS) $$here - -distclean-tags: - -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags - -distdir: $(DISTFILES) - @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ - list='$(DISTFILES)'; for file in $$list; do \ - case $$file in \ - $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ - $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ - esac; \ - if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ - dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ - if test "$$dir" != "$$file" && test "$$dir" != "."; then \ - dir="/$$dir"; \ - $(mkdir_p) "$(distdir)$$dir"; \ - else \ - dir=''; \ - fi; \ - if test -d $$d/$$file; then \ - if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ - cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ - fi; \ - cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ - else \ - test -f $(distdir)/$$file \ - || cp -p $$d/$$file $(distdir)/$$file \ - || exit 1; \ - fi; \ - done -check-am: all-am -check: $(BUILT_SOURCES) - $(MAKE) $(AM_MAKEFLAGS) check-am -all-am: Makefile $(LTLIBRARIES) -installdirs: -install: $(BUILT_SOURCES) - $(MAKE) $(AM_MAKEFLAGS) install-am -install-exec: install-exec-am -install-data: install-data-am -uninstall: uninstall-am - -install-am: all-am - @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am - -installcheck: installcheck-am -install-strip: - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - `test -z '$(STRIP)' || \ - echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install -mostlyclean-generic: - -clean-generic: - -distclean-generic: - -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) - -maintainer-clean-generic: - @echo "This command is intended for maintainers to use" - @echo "it deletes files that may require special tools to rebuild." - -test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES) -clean: clean-am - -clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ - mostlyclean-am - -distclean: distclean-am - -rm -rf ./$(DEPDIR) - -rm -f Makefile -distclean-am: clean-am distclean-compile distclean-generic \ - distclean-libtool distclean-tags - -dvi: dvi-am - -dvi-am: - -html: html-am - -info: info-am - -info-am: - -install-data-am: - -install-exec-am: - -install-info: install-info-am - -install-man: - -installcheck-am: - -maintainer-clean: maintainer-clean-am - -rm -rf ./$(DEPDIR) - -rm -f Makefile -maintainer-clean-am: distclean-am maintainer-clean-generic - -mostlyclean: mostlyclean-am - -mostlyclean-am: mostlyclean-compile mostlyclean-generic \ - mostlyclean-libtool - -pdf: pdf-am - -pdf-am: - -ps: ps-am - -ps-am: - -uninstall-am: uninstall-info-am - -.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ - clean-libtool clean-noinstLTLIBRARIES ctags distclean \ - distclean-compile distclean-generic distclean-libtool \ - distclean-tags distdir dvi dvi-am html html-am info info-am \ - install install-am install-data install-data-am install-exec \ - install-exec-am install-info install-info-am install-man \ - install-strip installcheck installcheck-am installdirs \ - maintainer-clean maintainer-clean-generic mostlyclean \ - mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ - pdf pdf-am ps ps-am tags uninstall uninstall-am \ - uninstall-info-am - - -@SIMD_X86_64_TRUE@jccolss2-64.lo: jcclrss2-64.asm -@SIMD_X86_64_TRUE@jdcolss2-64.lo: jdclrss2-64.asm -@SIMD_X86_64_TRUE@jdmerss2-64.lo: jdmrgss2-64.asm - -@SIMD_I386_TRUE@jccolmmx.lo: jcclrmmx.asm -@SIMD_I386_TRUE@jccolss2.lo: jcclrss2.asm -@SIMD_I386_TRUE@jdcolmmx.lo: jdclrmmx.asm -@SIMD_I386_TRUE@jdcolss2.lo: jdclrss2.asm -@SIMD_I386_TRUE@jdmermmx.lo: jdmrgmmx.asm -@SIMD_I386_TRUE@jdmerss2.lo: jdmrgss2.asm - -.asm.lo: - $(LIBTOOL) --mode=compile --tag NASM $(srcdir)/nasm_lt.sh $(NASM) $(NAFLAGS) -I$(srcdir) $< -o $@ - -jsimdcfg.inc: $(srcdir)/jsimdcfg.inc.h ../jpeglib.h ../jconfig.h ../jmorecfg.h - $(CPP) -I$(top_builddir) -I$(top_builddir)/simd $(srcdir)/jsimdcfg.inc.h | $(EGREP) "^[\;%]|^\ %" | sed 's%_cpp_protection_%%' | sed 's@% define@%define@g' > $@ -# Tell versions [3.59,3.63) of GNU make to not export all variables. -# Otherwise a system limit (for SysV at least) may be exceeded. -.NOEXPORT: diff --git a/simd/jcgrammx.asm b/simd/jcgrammx.asm new file mode 100644 index 0000000..8c96824 --- /dev/null +++ b/simd/jcgrammx.asm @@ -0,0 +1,116 @@ +; +; jcgrammx.asm - grayscale colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright 2011 D. R. Commander +; +; Based on +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_gray_convert_mmx) + +EXTN(jconst_rgb_gray_convert_mmx): + +PW_F0299_F0337 times 2 dw F_0_299, F_0_337 +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 +PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 3 +%define RGB_GREEN 2 +%define RGB_BLUE 1 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx +%include "jcgrymmx.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 1 +%define RGB_GREEN 2 +%define RGB_BLUE 3 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx +%include "jcgrymmx.asm" diff --git a/simd/jcgrass2-64.asm b/simd/jcgrass2-64.asm new file mode 100644 index 0000000..232f329 --- /dev/null +++ b/simd/jcgrass2-64.asm @@ -0,0 +1,113 @@ +; +; jcgrass2-64.asm - grayscale colorspace conversion (64-bit SSE2) +; +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; Copyright (C) 2011, D. R. Commander. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) + +EXTN(jconst_rgb_gray_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 64 + +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 3 +%define RGB_GREEN 2 +%define RGB_BLUE 1 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 +%include "jcgryss2-64.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 1 +%define RGB_GREEN 2 +%define RGB_BLUE 3 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 +%include "jcgryss2-64.asm" diff --git a/simd/jcgrass2.asm b/simd/jcgrass2.asm new file mode 100644 index 0000000..45ea33a --- /dev/null +++ b/simd/jcgrass2.asm @@ -0,0 +1,113 @@ +; +; jcgrass2.asm - grayscale colorspace conversion (SSE2) +; +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; Copyright (C) 2011, D. R. Commander. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jsimdext.inc" + +; -------------------------------------------------------------------------- + +%define SCALEBITS 16 + +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + +; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) + +EXTN(jconst_rgb_gray_convert_sse2): + +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) + + alignz 16 + +; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2 +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 0 +%define RGB_GREEN 1 +%define RGB_BLUE 2 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2 +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 3 +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2 +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 2 +%define RGB_GREEN 1 +%define RGB_BLUE 0 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2 +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 3 +%define RGB_GREEN 2 +%define RGB_BLUE 1 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2 +%include "jcgryss2.asm" + +%undef RGB_RED +%undef RGB_GREEN +%undef RGB_BLUE +%undef RGB_PIXELSIZE +%define RGB_RED 1 +%define RGB_GREEN 2 +%define RGB_BLUE 3 +%define RGB_PIXELSIZE 4 +%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2 +%include "jcgryss2.asm" diff --git a/simd/jcgrymmx.asm b/simd/jcgrymmx.asm new file mode 100644 index 0000000..bbeea09 --- /dev/null +++ b/simd/jcgrymmx.asm @@ -0,0 +1,357 @@ +; +; jcgrymmx.asm - grayscale colorspace conversion (MMX) +; +; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright 2011 D. R. Commander +; +; Based on +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + global EXTN(jsimd_rgb_gray_convert_mmx) + +EXTN(jsimd_rgb_gray_convert_mmx): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_MMWORD + jae short .columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax,eax + mov al, BYTE [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx,edx + mov dx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd mmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd mmG, DWORD [esi+ecx] + psllq mmA, DWORD_BIT + por mmA,mmG +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_gray_cnv +.column_ld16: + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmF,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) + + movq mmD,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + + punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + + punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) + punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) + + movq mmE,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + + punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + + punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) + + pxor mmH,mmH + + movq mmC,mmA + punpcklbw mmA,mmH ; mmA=(00 02 04 06) + punpckhbw mmC,mmH ; mmC=(10 12 14 16) + + movq mmB,mmE + punpcklbw mmE,mmH ; mmE=(20 22 24 26) + punpckhbw mmB,mmH ; mmB=(01 03 05 07) + + movq mmF,mmD + punpcklbw mmD,mmH ; mmD=(11 13 15 17) + punpckhbw mmF,mmH ; mmF=(21 23 25 27) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF,mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld4: + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmD,mmA + movq mmC,mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + +.rgb_gray_cnv: + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) + + movq mmB,mmA + punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) + + movq mmG,mmD + punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) + + movq mmE,mmA + punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) + + movq mmH,mmB + punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) + + pxor mmF,mmF + + movq mmC,mmA + punpcklbw mmA,mmF ; mmA=(00 02 04 06) + punpckhbw mmC,mmF ; mmC=(10 12 14 16) + + movq mmD,mmB + punpcklbw mmB,mmF ; mmB=(01 03 05 07) + punpckhbw mmD,mmF ; mmD=(11 13 15 17) + + movq mmG,mmE + punpcklbw mmE,mmF ; mmE=(20 22 24 26) + punpckhbw mmG,mmF ; mmG=(30 32 34 36) + + punpcklbw mmF,mmH + punpckhbw mmH,mmH + psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movq mm6,mm1 + punpcklwd mm1,mm3 + punpckhwd mm6,mm3 + pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm6,mm0 + punpcklwd mm0,mm2 + punpckhwd mm6,mm2 + pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movq mm0, mm5 ; mm0=BO + movq mm6, mm4 ; mm6=BE + + movq mm4,mm0 + punpcklwd mm0,mm3 + punpckhwd mm4,mm3 + pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, mm1 + paddd mm4, mm7 + paddd mm0,mm3 + paddd mm4,mm3 + psrld mm0,SCALEBITS ; mm0=YOL + psrld mm4,SCALEBITS ; mm4=YOH + packssdw mm0,mm4 ; mm0=YO + + movq mm4,mm6 + punpcklwd mm6,mm2 + punpckhwd mm4,mm2 + pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(0)] + paddd mm4, MMWORD [wk(1)] + paddd mm6,mm2 + paddd mm4,mm2 + psrld mm6,SCALEBITS ; mm6=YEL + psrld mm4,SCALEBITS ; mm4=YEH + packssdw mm6,mm4 ; mm6=YE + + psllw mm0,BYTE_BIT + por mm6,mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/simd/jcgryss2-64.asm b/simd/jcgryss2-64.asm new file mode 100644 index 0000000..23ae8af --- /dev/null +++ b/simd/jcgryss2-64.asm @@ -0,0 +1,364 @@ +; +; jcgryss2-64.asm - grayscale colorspace conversion (64-bit SSE2) +; +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; Copyright (C) 2011, D. R. Commander. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +; r10 = JDIMENSION img_width +; r11 = JSAMPARRAY input_buf +; r12 = JSAMPIMAGE output_buf +; r13 = JDIMENSION output_row +; r14 = int num_rows + +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 + + align 16 + + global EXTN(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov rcx, r10 + test rcx,rcx + jz near .return + + push rcx + + mov rsi, r12 + mov rcx, r13 + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax,rax + jle near .return +.rowloop: + push rdi + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push rax + push rdx + lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, BYTE [rsi+rcx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, WORD [rsi+rcx] + shl rax, WORD_BIT + or rax,rdx +.column_ld4: + movd xmmA,eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + +.columnloop: + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx,rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop + +.return: + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/simd/jcgryss2.asm b/simd/jcgryss2.asm new file mode 100644 index 0000000..c294287 --- /dev/null +++ b/simd/jcgryss2.asm @@ -0,0 +1,383 @@ +; +; jcgryss2.asm - grayscale colorspace conversion (SSE2) +; +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. +; Copyright (C) 2011, D. R. Commander. +; For conditions of distribution and use, see copyright notice in jsimdext.inc +; +; This file should be assembled with NASM (Netwide Assembler), +; can *not* be assembled with Microsoft's MASM or any compatible +; assembler (including Borland's Turbo Assembler). +; NASM is available from http://nasm.sourceforge.net/ or +; http://sourceforge.net/project/showfiles.php?group_id=6208 +; +; [TAB8] + +%include "jcolsamp.inc" + +; -------------------------------------------------------------------------- +; +; Convert some rows of samples to the output colorspace. +; +; GLOBAL(void) +; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width, +; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, +; JDIMENSION output_row, int num_rows); +; + +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows + +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 + + global EXTN(jsimd_rgb_gray_convert_sse2) + +EXTN(jsimd_rgb_gray_convert_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 +.rowloop: + pushpic eax + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16,7 + +%if RGB_PIXELSIZE == 3 ; --------------- + +.column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, BYTE [esi+ecx] +.column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx +.column_ld4: + movd xmmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF +.column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB +.column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv +.column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + +%else ; RGB_PIXELSIZE == 4 ; ----------- + +.column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE +.column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] +.column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 + +.columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + +.rgb_gray_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + +.return: + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + +; For some reason, the OS X linker does not honor the request to align the +; segment unless we do this. + align 16 diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c new file mode 100644 index 0000000..a9d920c --- /dev/null +++ b/simd/jsimd_arm.c @@ -0,0 +1,662 @@ +/* + * jsimd_arm.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * Copyright 2009-2011 D. R. Commander + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on + * ARM architecture. + * + * Based on the stubs from 'jsimd_none.c' + */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" + +#include +#include +#include + +static unsigned int simd_support = ~0; + +#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + +#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) + +LOCAL(int) +check_feature (char *buffer, char *feature) +{ + char *p; + if (*feature == 0) + return 0; + if (strncmp(buffer, "Features", 8) != 0) + return 0; + buffer += 8; + while (isspace(*buffer)) + buffer++; + + /* Check if 'feature' is present in the buffer as a separate word */ + while ((p = strstr(buffer, feature))) { + if (p > buffer && !isspace(*(p - 1))) { + buffer++; + continue; + } + p += strlen(feature); + if (*p != 0 && !isspace(*p)) { + buffer++; + continue; + } + return 1; + } + return 0; +} + +LOCAL(int) +parse_proc_cpuinfo (int bufsize) +{ + char *buffer = (char *)malloc(bufsize); + FILE *fd; + simd_support = 0; + + if (!buffer) + return 0; + + fd = fopen("/proc/cpuinfo", "r"); + if (fd) { + while (fgets(buffer, bufsize, fd)) { + if (!strchr(buffer, '\n') && !feof(fd)) { + /* "impossible" happened - insufficient size of the buffer! */ + fclose(fd); + free(buffer); + return 0; + } + if (check_feature(buffer, "neon")) + simd_support |= JSIMD_ARM_NEON; + } + fclose(fd); + } + free(buffer); + return 1; +} + +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd (void) +{ + char *env = NULL; +#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + int bufsize = 1024; /* an initial guess for the line buffer size limit */ +#endif + + if (simd_support != ~0) + return; + + simd_support = 0; + +#if defined(__ARM_NEON__) + simd_support |= JSIMD_ARM_NEON; +#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) + /* We still have a chance to use NEON regardless of globally used + * -mcpu/-mfpu options passed to gcc by performing runtime detection via + * /proc/cpuinfo parsing on linux/android */ + while (!parse_proc_cpuinfo(bufsize)) { + bufsize *= 2; + if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) + break; + } +#endif + + /* Force different settings through environment variables */ + env = getenv("JSIMD_FORCE_ARM_NEON"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_ARM_NEON; + env = getenv("JSIMD_FORCE_NO_SIMD"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support = 0; +} + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) + { + case JCS_EXT_RGB: + neonfct=jsimd_extrgb_ycc_convert_neon; + break; + case JCS_EXT_RGBX: + neonfct=jsimd_extrgbx_ycc_convert_neon; + break; + case JCS_EXT_BGR: + neonfct=jsimd_extbgr_ycc_convert_neon; + break; + case JCS_EXT_BGRX: + neonfct=jsimd_extbgrx_ycc_convert_neon; + break; + case JCS_EXT_XBGR: + neonfct=jsimd_extxbgr_ycc_convert_neon; + break; + case JCS_EXT_XRGB: + neonfct=jsimd_extxrgb_ycc_convert_neon; + break; + default: + neonfct=jsimd_extrgb_ycc_convert_neon; + break; + } + + if (simd_support & JSIMD_ARM_NEON) + neonfct(cinfo->image_width, input_buf, + output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) + { + case JCS_EXT_RGB: + neonfct=jsimd_ycc_extrgb_convert_neon; + break; + case JCS_EXT_RGBX: + neonfct=jsimd_ycc_extrgbx_convert_neon; + break; + case JCS_EXT_BGR: + neonfct=jsimd_ycc_extbgr_convert_neon; + break; + case JCS_EXT_BGRX: + neonfct=jsimd_ycc_extbgrx_convert_neon; + break; + case JCS_EXT_XBGR: + neonfct=jsimd_ycc_extxbgr_convert_neon; + break; + case JCS_EXT_XRGB: + neonfct=jsimd_ycc_extxrgb_convert_neon; + break; + default: + neonfct=jsimd_ycc_extrgb_convert_neon; + break; + } + + if (simd_support & JSIMD_ARM_NEON) + neonfct(cinfo->output_width, input_buf, + input_row, output_buf, num_rows); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM * workspace) +{ + if (simd_support & JSIMD_ARM_NEON) + jsimd_convsamp_neon(sample_data, start_col, workspace); +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT * workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM * data) +{ +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM * data) +{ + if (simd_support & JSIMD_ARM_NEON) + jsimd_fdct_ifast_neon(data); +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT * data) +{ +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, + DCTELEM * workspace) +{ + if (simd_support & JSIMD_ARM_NEON) + jsimd_quantize_neon(coef_block, divisors, workspace); +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, + FAST_FLOAT * workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_ARM_NEON)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_ARM_NEON)) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_ARM_NEON)) + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_ARM_NEON)) + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + + if ((simd_support & JSIMD_ARM_NEON)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + init_simd(); + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_ARM_NEON)) + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_ARM_NEON)) + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S new file mode 100644 index 0000000..9ef6efc --- /dev/null +++ b/simd/jsimd_arm_neon.S @@ -0,0 +1,2033 @@ +/* + * ARM NEON optimizations for libjpeg-turbo + * + * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). + * All rights reserved. + * Contact: Alexander Bokovoy + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ +#endif + +.text +.fpu neon +.arch armv7a +.object_arch armv4 +.arm + + +#define RESPECT_STRICT_ALIGNMENT 1 + +/*****************************************************************************/ + +/* Supplementary macro for setting function attributes */ +.macro asm_function fname +#ifdef __APPLE__ + .func _\fname + .globl _\fname +_\fname: +#else + .func \fname + .global \fname +#ifdef __ELF__ + .hidden \fname + .type \fname, %function +#endif +\fname: +#endif +.endm + +/* Transpose a block of 4x4 coefficients in four 64-bit registers */ +.macro transpose_4x4 x0, x1, x2, x3 + vtrn.16 \x0, \x1 + vtrn.16 \x2, \x3 + vtrn.32 \x0, \x2 + vtrn.32 \x1, \x3 +.endm + +#define CENTERJSAMPLE 128 + +/*****************************************************************************/ + +/* + * Perform dequantization and inverse DCT on one block of coefficients. + * + * GLOBAL(void) + * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, + * JSAMPARRAY output_buf, JDIMENSION output_col) + */ + +#define FIX_0_298631336 (2446) +#define FIX_0_390180644 (3196) +#define FIX_0_541196100 (4433) +#define FIX_0_765366865 (6270) +#define FIX_0_899976223 (7373) +#define FIX_1_175875602 (9633) +#define FIX_1_501321110 (12299) +#define FIX_1_847759065 (15137) +#define FIX_1_961570560 (16069) +#define FIX_2_053119869 (16819) +#define FIX_2_562915447 (20995) +#define FIX_3_072711026 (25172) + +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) +#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) + +/* + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. + * Uses some ideas from the comments in 'simd/jiss2int-64.asm' + */ +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ +{ \ + DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ + INT32 q1, q2, q3, q4, q5, q6, q7; \ + INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ + \ + /* 1-D iDCT input data */ \ + row0 = xrow0; \ + row1 = xrow1; \ + row2 = xrow2; \ + row3 = xrow3; \ + row4 = xrow4; \ + row5 = xrow5; \ + row6 = xrow6; \ + row7 = xrow7; \ + \ + q5 = row7 + row3; \ + q4 = row5 + row1; \ + q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ + MULTIPLY(q4, FIX_1_175875602); \ + q7 = MULTIPLY(q5, FIX_1_175875602) + \ + MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ + q2 = MULTIPLY(row2, FIX_0_541196100) + \ + MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ + q4 = q6; \ + q3 = ((INT32) row0 - (INT32) row4) << 13; \ + q6 += MULTIPLY(row5, -FIX_2_562915447) + \ + MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ + /* now we can use q1 (reloadable constants have been used up) */ \ + q1 = q3 + q2; \ + q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ + MULTIPLY(row1, -FIX_0_899976223); \ + q5 = q7; \ + q1 = q1 + q6; \ + q7 += MULTIPLY(row7, -FIX_0_899976223) + \ + MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ + \ + /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ + tmp11_plus_tmp2 = q1; \ + row1 = 0; \ + \ + q1 = q1 - q6; \ + q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ + MULTIPLY(row3, -FIX_2_562915447); \ + q1 = q1 - q6; \ + q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ + MULTIPLY(row6, FIX_0_541196100); \ + q3 = q3 - q2; \ + \ + /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ + tmp11_minus_tmp2 = q1; \ + \ + q1 = ((INT32) row0 + (INT32) row4) << 13; \ + q2 = q1 + q6; \ + q1 = q1 - q6; \ + \ + /* pick up the results */ \ + tmp0 = q4; \ + tmp1 = q5; \ + tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ + tmp3 = q7; \ + tmp10 = q2; \ + tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ + tmp12 = q3; \ + tmp13 = q1; \ +} + +#define XFIX_0_899976223 d0[0] +#define XFIX_0_541196100 d0[1] +#define XFIX_2_562915447 d0[2] +#define XFIX_0_298631336_MINUS_0_899976223 d0[3] +#define XFIX_1_501321110_MINUS_0_899976223 d1[0] +#define XFIX_2_053119869_MINUS_2_562915447 d1[1] +#define XFIX_0_541196100_PLUS_0_765366865 d1[2] +#define XFIX_1_175875602 d1[3] +#define XFIX_1_175875602_MINUS_0_390180644 d2[0] +#define XFIX_0_541196100_MINUS_1_847759065 d2[1] +#define XFIX_3_072711026_MINUS_2_562915447 d2[2] +#define XFIX_1_175875602_MINUS_1_961570560 d2[3] + +.balign 16 +jsimd_idct_islow_neon_consts: + .short FIX_0_899976223 /* d0[0] */ + .short FIX_0_541196100 /* d0[1] */ + .short FIX_2_562915447 /* d0[2] */ + .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ + .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ + .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ + .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ + .short FIX_1_175875602 /* d1[3] */ + /* reloadable constants */ + .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ + .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ + .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ + .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ + +asm_function jsimd_idct_islow_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + ROW0L .req d16 + ROW0R .req d17 + ROW1L .req d18 + ROW1R .req d19 + ROW2L .req d20 + ROW2R .req d21 + ROW3L .req d22 + ROW3R .req d23 + ROW4L .req d24 + ROW4R .req d25 + ROW5L .req d26 + ROW5R .req d27 + ROW6L .req d28 + ROW6R .req d29 + ROW7L .req d30 + ROW7R .req d31 + + /* Load and dequantize coefficients into NEON registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( q8 ) + * 1 | d18 | d19 ( q9 ) + * 2 | d20 | d21 ( q10 ) + * 3 | d22 | d23 ( q11 ) + * 4 | d24 | d25 ( q12 ) + * 5 | d26 | d27 ( q13 ) + * 6 | d28 | d29 ( q14 ) + * 7 | d30 | d31 ( q15 ) + */ + adr ip, jsimd_idct_islow_neon_consts + vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! + vmul.s16 q8, q8, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q9, q9, q1 + vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! + vmul.s16 q10, q10, q2 + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vmul.s16 q11, q11, q3 + vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] + vmul.s16 q12, q12, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q14, q14, q2 + vmul.s16 q13, q13, q1 + vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ + add ip, ip, #16 + vmul.s16 q15, q15, q3 + vpush {d8-d15} /* save NEON registers */ + /* 1-D IDCT, pass 1, left 4x8 half */ + vadd.s16 d4, ROW7L, ROW3L + vadd.s16 d5, ROW5L, ROW1L + vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, d5, XFIX_1_175875602 + vmull.s16 q7, d4, XFIX_1_175875602 + vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 + vsubl.s16 q3, ROW0L, ROW4L + vmull.s16 q2, ROW2L, XFIX_0_541196100 + vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 + vmov q4, q6 + vmlsl.s16 q6, ROW5L, XFIX_2_562915447 + vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW1L, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vmlsl.s16 q7, ROW7L, XFIX_0_899976223 + vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 + vrshrn.s32 ROW1L, q1, #11 + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 + vmlsl.s16 q5, ROW3L, XFIX_2_562915447 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 + vmlal.s16 q6, ROW6L, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + vrshrn.s32 ROW6L, q1, #11 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0L, ROW4L + vrshrn.s32 ROW2L, q1, #11 + vrshrn.s32 ROW5L, q3, #11 + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vrshrn.s32 ROW7L, q2, #11 + vrshrn.s32 ROW3L, q5, #11 + vrshrn.s32 ROW0L, q6, #11 + vrshrn.s32 ROW4L, q3, #11 + /* 1-D IDCT, pass 1, right 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vadd.s16 d10, ROW7R, ROW3R + vadd.s16 d8, ROW5R, ROW1R + /* Transpose left 4x8 half */ + vtrn.16 ROW6L, ROW7L + vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, d8, XFIX_1_175875602 + vtrn.16 ROW2L, ROW3L + vmull.s16 q7, d10, XFIX_1_175875602 + vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 + vtrn.16 ROW0L, ROW1L + vsubl.s16 q3, ROW0R, ROW4R + vmull.s16 q2, ROW2R, XFIX_0_541196100 + vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 + vtrn.16 ROW4L, ROW5L + vmov q4, q6 + vmlsl.s16 q6, ROW5R, XFIX_2_562915447 + vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 + vtrn.32 ROW1L, ROW3L + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW1R, XFIX_0_899976223 + vtrn.32 ROW4L, ROW6L + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vtrn.32 ROW0L, ROW2L + vmlsl.s16 q7, ROW7R, XFIX_0_899976223 + vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 + vrshrn.s32 ROW1R, q1, #11 + vtrn.32 ROW5L, ROW7L + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 + vmlsl.s16 q5, ROW3R, XFIX_2_562915447 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 + vmlal.s16 q6, ROW6R, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + vrshrn.s32 ROW6R, q1, #11 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0R, ROW4R + vrshrn.s32 ROW2R, q1, #11 + vrshrn.s32 ROW5R, q3, #11 + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vrshrn.s32 ROW7R, q2, #11 + vrshrn.s32 ROW3R, q5, #11 + vrshrn.s32 ROW0R, q6, #11 + vrshrn.s32 ROW4R, q3, #11 + vld1.s16 {d2}, [ip, :64] /* reload constants */ + /* Transpose right 4x8 half */ + vtrn.16 ROW6R, ROW7R + vtrn.16 ROW2R, ROW3R + vtrn.16 ROW0R, ROW1R + vtrn.16 ROW4R, ROW5R + vmov.s16 q7, #(CENTERJSAMPLE << 5) + vtrn.32 ROW1R, ROW3R + vtrn.32 ROW4R, ROW6R + vtrn.32 ROW0R, ROW2R + vtrn.32 ROW5R, ROW7R + /* 1-D IDCT, pass 2, left 4x8 half */ + vswp ROW7L, ROW3R + vadd.s16 d10, ROW7L, ROW3L + vswp ROW5L, ROW1R + vadd.s16 d8, ROW5L, ROW1L + vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, d8, XFIX_1_175875602 + vswp ROW4L, ROW0R + vadd.s16 q8, q8, q7 + vmull.s16 q7, d10, XFIX_1_175875602 + vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 + vsubl.s16 q3, ROW0L, ROW4L + vswp ROW6L, ROW2R + vmull.s16 q2, ROW2L, XFIX_0_541196100 + vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 + vmov q4, q6 + vmlsl.s16 q6, ROW5L, XFIX_2_562915447 + vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW1L, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vmlsl.s16 q7, ROW7L, XFIX_0_899976223 + vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 + vshrn.s32 ROW1L, q1, #16 + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 + vmlsl.s16 q5, ROW3L, XFIX_2_562915447 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 + vmlal.s16 q6, ROW6L, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + vshrn.s32 ROW6L, q1, #16 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0L, ROW4L + vshrn.s32 ROW2L, q1, #16 + vshrn.s32 ROW5L, q3, #16 + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW7L, q2, #16 + vshrn.s32 ROW3L, q5, #16 + vshrn.s32 ROW0L, q6, #16 + vshrn.s32 ROW4L, q3, #16 + /* 1-D IDCT, pass 2, right 4x8 half */ + vld1.s16 {d2}, [ip, :64] /* reload constants */ + vadd.s16 d10, ROW7R, ROW3R + vadd.s16 d8, ROW5R, ROW1R + vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 + vmlal.s16 q6, d8, XFIX_1_175875602 + vmull.s16 q7, d10, XFIX_1_175875602 + vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 + vsubl.s16 q3, ROW0R, ROW4R + vmull.s16 q2, ROW2R, XFIX_0_541196100 + vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 + vmov q4, q6 + vmlsl.s16 q6, ROW5R, XFIX_2_562915447 + vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 + vshl.s32 q3, q3, #13 + vmlsl.s16 q4, ROW1R, XFIX_0_899976223 + vadd.s32 q1, q3, q2 + vmov q5, q7 + vadd.s32 q1, q1, q6 + vmlsl.s16 q7, ROW7R, XFIX_0_899976223 + vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 + vshrn.s32 ROW1R, q1, #16 + vsub.s32 q1, q1, q6 + vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 + vmlsl.s16 q5, ROW3R, XFIX_2_562915447 + vsub.s32 q1, q1, q6 + vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 + vmlal.s16 q6, ROW6R, XFIX_0_541196100 + vsub.s32 q3, q3, q2 + vshrn.s32 ROW6R, q1, #16 + vadd.s32 q1, q3, q5 + vsub.s32 q3, q3, q5 + vaddl.s16 q5, ROW0R, ROW4R + vshrn.s32 ROW2R, q1, #16 + vshrn.s32 ROW5R, q3, #16 + vshl.s32 q5, q5, #13 + vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 + vadd.s32 q2, q5, q6 + vsub.s32 q1, q5, q6 + vadd.s32 q6, q2, q7 + vsub.s32 q2, q2, q7 + vadd.s32 q5, q1, q4 + vsub.s32 q3, q1, q4 + vshrn.s32 ROW7R, q2, #16 + vshrn.s32 ROW3R, q5, #16 + vshrn.s32 ROW0R, q6, #16 + vshrn.s32 ROW4R, q3, #16 + /* Descale to 8-bit and range limit */ + vqrshrun.s16 d16, q8, #2 + vqrshrun.s16 d17, q9, #2 + vqrshrun.s16 d18, q10, #2 + vqrshrun.s16 d19, q11, #2 + vpop {d8-d15} /* restore NEON registers */ + vqrshrun.s16 d20, q12, #2 + vqrshrun.s16 d21, q13, #2 + vqrshrun.s16 d22, q14, #2 + vqrshrun.s16 d23, q15, #2 + /* Transpose the final 8-bit samples */ + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.8 d16, d17 + vtrn.8 d18, d19 + /* Store results to the output buffer */ + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d16}, [TMP1] + vst1.8 {d17}, [TMP2] + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d18}, [TMP1] + vtrn.8 d20, d21 + vst1.8 {d19}, [TMP2] + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + vst1.8 {d20}, [TMP1] + vtrn.8 d22, d23 + vst1.8 {d21}, [TMP2] + vst1.8 {d22}, [TMP3] + vst1.8 {d23}, [TMP4] + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + .unreq ROW0L + .unreq ROW0R + .unreq ROW1L + .unreq ROW1R + .unreq ROW2L + .unreq ROW2R + .unreq ROW3L + .unreq ROW3R + .unreq ROW4L + .unreq ROW4R + .unreq ROW5L + .unreq ROW5R + .unreq ROW6L + .unreq ROW6R + .unreq ROW7L + .unreq ROW7R +.endfunc + +/*****************************************************************************/ + +/* + * jsimd_idct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' + * function from jidctfst.c + * + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. + * But in ARM NEON case some extra additions are required because VQDMULH + * instruction can't handle the constants larger than 1. So the expressions + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", + * which introduces an extra addition. Overall, there are 6 extra additions + * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. + */ + +#define XFIX_1_082392200 d0[0] +#define XFIX_1_414213562 d0[1] +#define XFIX_1_847759065 d0[2] +#define XFIX_2_613125930 d0[3] + +.balign 16 +jsimd_idct_ifast_neon_consts: + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ + +asm_function jsimd_idct_ifast_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + /* Load and dequantize coefficients into NEON registers + * with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 ( q8 ) + * 1 | d18 | d19 ( q9 ) + * 2 | d20 | d21 ( q10 ) + * 3 | d22 | d23 ( q11 ) + * 4 | d24 | d25 ( q12 ) + * 5 | d26 | d27 ( q13 ) + * 6 | d28 | d29 ( q14 ) + * 7 | d30 | d31 ( q15 ) + */ + adr ip, jsimd_idct_ifast_neon_consts + vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! + vmul.s16 q8, q8, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q9, q9, q1 + vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! + vmul.s16 q10, q10, q2 + vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! + vmul.s16 q11, q11, q3 + vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] + vmul.s16 q12, q12, q0 + vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! + vmul.s16 q14, q14, q2 + vmul.s16 q13, q13, q1 + vld1.16 {d0}, [ip, :64] /* load constants */ + vmul.s16 q15, q15, q3 + vpush {d8-d13} /* save NEON registers */ + /* 1-D IDCT, pass 1 */ + vsub.s16 q2, q10, q14 + vadd.s16 q14, q10, q14 + vsub.s16 q1, q11, q13 + vadd.s16 q13, q11, q13 + vsub.s16 q5, q9, q15 + vadd.s16 q15, q9, q15 + vqdmulh.s16 q4, q2, XFIX_1_414213562 + vqdmulh.s16 q6, q1, XFIX_2_613125930 + vadd.s16 q3, q1, q1 + vsub.s16 q1, q5, q1 + vadd.s16 q10, q2, q4 + vqdmulh.s16 q4, q1, XFIX_1_847759065 + vsub.s16 q2, q15, q13 + vadd.s16 q3, q3, q6 + vqdmulh.s16 q6, q2, XFIX_1_414213562 + vadd.s16 q1, q1, q4 + vqdmulh.s16 q4, q5, XFIX_1_082392200 + vsub.s16 q10, q10, q14 + vadd.s16 q2, q2, q6 + vsub.s16 q6, q8, q12 + vadd.s16 q12, q8, q12 + vadd.s16 q9, q5, q4 + vadd.s16 q5, q6, q10 + vsub.s16 q10, q6, q10 + vadd.s16 q6, q15, q13 + vadd.s16 q8, q12, q14 + vsub.s16 q3, q6, q3 + vsub.s16 q12, q12, q14 + vsub.s16 q3, q3, q1 + vsub.s16 q1, q9, q1 + vadd.s16 q2, q3, q2 + vsub.s16 q15, q8, q6 + vadd.s16 q1, q1, q2 + vadd.s16 q8, q8, q6 + vadd.s16 q14, q5, q3 + vsub.s16 q9, q5, q3 + vsub.s16 q13, q10, q2 + vadd.s16 q10, q10, q2 + /* Transpose */ + vtrn.16 q8, q9 + vsub.s16 q11, q12, q1 + vtrn.16 q14, q15 + vadd.s16 q12, q12, q1 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q8, q10 + vtrn.32 q13, q15 + vswp d28, d21 + vswp d26, d19 + /* 1-D IDCT, pass 2 */ + vsub.s16 q2, q10, q14 + vswp d30, d23 + vadd.s16 q14, q10, q14 + vswp d24, d17 + vsub.s16 q1, q11, q13 + vadd.s16 q13, q11, q13 + vsub.s16 q5, q9, q15 + vadd.s16 q15, q9, q15 + vqdmulh.s16 q4, q2, XFIX_1_414213562 + vqdmulh.s16 q6, q1, XFIX_2_613125930 + vadd.s16 q3, q1, q1 + vsub.s16 q1, q5, q1 + vadd.s16 q10, q2, q4 + vqdmulh.s16 q4, q1, XFIX_1_847759065 + vsub.s16 q2, q15, q13 + vadd.s16 q3, q3, q6 + vqdmulh.s16 q6, q2, XFIX_1_414213562 + vadd.s16 q1, q1, q4 + vqdmulh.s16 q4, q5, XFIX_1_082392200 + vsub.s16 q10, q10, q14 + vadd.s16 q2, q2, q6 + vsub.s16 q6, q8, q12 + vadd.s16 q12, q8, q12 + vadd.s16 q9, q5, q4 + vadd.s16 q5, q6, q10 + vsub.s16 q10, q6, q10 + vadd.s16 q6, q15, q13 + vadd.s16 q8, q12, q14 + vsub.s16 q3, q6, q3 + vsub.s16 q12, q12, q14 + vsub.s16 q3, q3, q1 + vsub.s16 q1, q9, q1 + vadd.s16 q2, q3, q2 + vsub.s16 q15, q8, q6 + vadd.s16 q1, q1, q2 + vadd.s16 q8, q8, q6 + vadd.s16 q14, q5, q3 + vsub.s16 q9, q5, q3 + vsub.s16 q13, q10, q2 + vpop {d8-d13} /* restore NEON registers */ + vadd.s16 q10, q10, q2 + /* Transpose */ + vtrn.16 q8, q9 + vsub.s16 q11, q12, q1 + vtrn.16 q14, q15 + vadd.s16 q12, q12, q1 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + /* Descale and range limit */ + vmov.s16 q0, #(0x80 << 5) + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q8, q10 + vtrn.32 q13, q15 + vswp d24, d17 + vswp d26, d19 + vqadd.s16 q8, q8, q0 + vswp d28, d21 + vqadd.s16 q9, q9, q0 + vswp d30, d23 + vqadd.s16 q10, q10, q0 + vqadd.s16 q11, q11, q0 + /* Store results to the output buffer */ + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vqshrun.s16 d16, q8, #5 + vqshrun.s16 d17, q9, #5 + vqshrun.s16 d18, q10, #5 + vqshrun.s16 d19, q11, #5 + vst1.8 {d16}, [TMP1] + vqadd.s16 q12, q12, q0 + vqadd.s16 q13, q13, q0 + vst1.8 {d17}, [TMP2] + vqadd.s16 q14, q14, q0 + vqadd.s16 q15, q15, q0 + ldmia OUTPUT_BUF!, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + vst1.8 {d18}, [TMP1] + vqshrun.s16 d20, q12, #5 + vqshrun.s16 d21, q13, #5 + vst1.8 {d19}, [TMP2] + vqshrun.s16 d22, q14, #5 + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + vst1.8 {d20}, [TMP1] + vqshrun.s16 d23, q15, #5 + vst1.8 {d21}, [TMP2] + vst1.8 {d22}, [TMP3] + vst1.8 {d23}, [TMP4] + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 +.endfunc + +/*****************************************************************************/ + +/* + * jsimd_idct_4x4_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + * + * TODO: a bit better instructions scheduling can be achieved by expanding + * idct_helper/transpose_4x4 macros and reordering instructions, + * but readability will suffer somewhat. + */ + +#define CONST_BITS 13 + +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ + +.balign 16 +jsimd_idct_4x4_neon_consts: + .short FIX_1_847759065 /* d0[0] */ + .short -FIX_0_765366865 /* d0[1] */ + .short -FIX_0_211164243 /* d0[2] */ + .short FIX_1_451774981 /* d0[3] */ + .short -FIX_2_172734803 /* d1[0] */ + .short FIX_1_061594337 /* d1[1] */ + .short -FIX_0_509795579 /* d1[2] */ + .short -FIX_0_601344887 /* d1[3] */ + .short FIX_0_899976223 /* d2[0] */ + .short FIX_2_562915447 /* d2[1] */ + .short 1 << (CONST_BITS+1) /* d2[2] */ + .short 0 /* d2[3] */ + +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 + vmull.s16 q14, \x4, d2[2] + vmlal.s16 q14, \x8, d0[0] + vmlal.s16 q14, \x14, d0[1] + + vmull.s16 q13, \x16, d1[2] + vmlal.s16 q13, \x12, d1[3] + vmlal.s16 q13, \x10, d2[0] + vmlal.s16 q13, \x6, d2[1] + + vmull.s16 q15, \x4, d2[2] + vmlsl.s16 q15, \x8, d0[0] + vmlsl.s16 q15, \x14, d0[1] + + vmull.s16 q12, \x16, d0[2] + vmlal.s16 q12, \x12, d0[3] + vmlal.s16 q12, \x10, d1[0] + vmlal.s16 q12, \x6, d1[1] + + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + +.if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q14, q14, #\shift + vmovn.s32 \y26, q10 + vmovn.s32 \y29, q14 +.else + vrshrn.s32 \y26, q10, #\shift + vrshrn.s32 \y29, q14, #\shift +.endif + + vadd.s32 q10, q15, q12 + vsub.s32 q15, q15, q12 + +.if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q15, q15, #\shift + vmovn.s32 \y27, q10 + vmovn.s32 \y28, q15 +.else + vrshrn.s32 \y27, q10, #\shift + vrshrn.s32 \y28, q15, #\shift +.endif + +.endm + +asm_function jsimd_idct_4x4_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req r1 + TMP3 .req r2 + TMP4 .req ip + + vpush {d8-d15} + + /* Load constants (d3 is just used for padding) */ + adr TMP4, jsimd_idct_4x4_neon_consts + vld1.16 {d0, d1, d2, d3}, [TMP4, :128] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | d8 | d9 + * 3 | d10 | d11 + * 4 | - | - + * 5 | d12 | d13 + * 6 | d14 | d15 + * 7 | d16 | d17 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! + vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]! + vld1.16 {d16, d17}, [COEF_BLOCK, :128]! + /* dequantize */ + vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! + vmul.s16 q2, q2, q9 + vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]! + vmul.s16 q3, q3, q10 + vmul.s16 q4, q4, q11 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]! + vmul.s16 q5, q5, q12 + vmul.s16 q6, q6, q13 + vld1.16 {d30, d31}, [DCT_TABLE, :128]! + vmul.s16 q7, q7, q14 + vmul.s16 q8, q8, q15 + + /* Pass 1 */ + idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10 + transpose_4x4 d4, d6, d8, d10 + idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11 + transpose_4x4 d5, d7, d9, d11 + + /* Pass 2 */ + idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29 + transpose_4x4 d26, d27, d28, d29 + + /* Range limit */ + vmov.u16 q15, #0x80 + vadd.s16 q13, q13, q15 + vadd.s16 q14, q14, q15 + vqmovun.s16 d26, q13 + vqmovun.s16 d27, q14 + + /* Store results to the output buffer */ + ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT + /* We can use much less instructions on little endian systems if the + * OS kernel is not configured to trap unaligned memory accesses + */ + vst1.32 {d26[0]}, [TMP1]! + vst1.32 {d27[0]}, [TMP3]! + vst1.32 {d26[1]}, [TMP2]! + vst1.32 {d27[1]}, [TMP4]! +#else + vst1.8 {d26[0]}, [TMP1]! + vst1.8 {d27[0]}, [TMP3]! + vst1.8 {d26[1]}, [TMP1]! + vst1.8 {d27[1]}, [TMP3]! + vst1.8 {d26[2]}, [TMP1]! + vst1.8 {d27[2]}, [TMP3]! + vst1.8 {d26[3]}, [TMP1]! + vst1.8 {d27[3]}, [TMP3]! + + vst1.8 {d26[4]}, [TMP2]! + vst1.8 {d27[4]}, [TMP4]! + vst1.8 {d26[5]}, [TMP2]! + vst1.8 {d27[5]}, [TMP4]! + vst1.8 {d26[6]}, [TMP2]! + vst1.8 {d27[6]}, [TMP4]! + vst1.8 {d26[7]}, [TMP2]! + vst1.8 {d27[7]}, [TMP4]! +#endif + + vpop {d8-d15} + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 +.endfunc + +.purgem idct_helper + +/*****************************************************************************/ + +/* + * jsimd_idct_2x2_neon + * + * This function contains inverse-DCT code for getting reduced-size + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' + * function from jpeg-6b (jidctred.c). + * + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which + * requires much less arithmetic operations and hence should be faster. + * The primary purpose of this particular NEON optimized function is + * bit exact compatibility with jpeg-6b. + */ + +.balign 8 +jsimd_idct_2x2_neon_consts: + .short -FIX_0_720959822 /* d0[0] */ + .short FIX_0_850430095 /* d0[1] */ + .short -FIX_1_272758580 /* d0[2] */ + .short FIX_3_624509785 /* d0[3] */ + +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 + vshll.s16 q14, \x4, #15 + vmull.s16 q13, \x6, d0[3] + vmlal.s16 q13, \x10, d0[2] + vmlal.s16 q13, \x12, d0[1] + vmlal.s16 q13, \x16, d0[0] + + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + +.if \shift > 16 + vrshr.s32 q10, q10, #\shift + vrshr.s32 q14, q14, #\shift + vmovn.s32 \y26, q10 + vmovn.s32 \y27, q14 +.else + vrshrn.s32 \y26, q10, #\shift + vrshrn.s32 \y27, q14, #\shift +.endif + +.endm + +asm_function jsimd_idct_2x2_neon + + DCT_TABLE .req r0 + COEF_BLOCK .req r1 + OUTPUT_BUF .req r2 + OUTPUT_COL .req r3 + TMP1 .req r0 + TMP2 .req ip + + vpush {d8-d15} + + /* Load constants */ + adr TMP2, jsimd_idct_2x2_neon_consts + vld1.16 {d0}, [TMP2, :64] + + /* Load all COEF_BLOCK into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d4 | d5 + * 1 | d6 | d7 + * 2 | - | - + * 3 | d10 | d11 + * 4 | - | - + * 5 | d12 | d13 + * 6 | - | - + * 7 | d16 | d17 + */ + vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d10, d11}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d12, d13}, [COEF_BLOCK, :128]! + add COEF_BLOCK, COEF_BLOCK, #16 + vld1.16 {d16, d17}, [COEF_BLOCK, :128]! + /* Dequantize */ + vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! + vmul.s16 q2, q2, q9 + vmul.s16 q3, q3, q10 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d24, d25}, [DCT_TABLE, :128]! + vmul.s16 q5, q5, q12 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d26, d27}, [DCT_TABLE, :128]! + vmul.s16 q6, q6, q13 + add DCT_TABLE, DCT_TABLE, #16 + vld1.16 {d30, d31}, [DCT_TABLE, :128]! + vmul.s16 q8, q8, q15 + + /* Pass 1 */ +#if 0 + idct_helper d4, d6, d10, d12, d16, 13, d4, d6 + transpose_4x4 d4, d6, d8, d10 + idct_helper d5, d7, d11, d13, d17, 13, d5, d7 + transpose_4x4 d5, d7, d9, d11 +#else + vmull.s16 q13, d6, d0[3] + vmlal.s16 q13, d10, d0[2] + vmlal.s16 q13, d12, d0[1] + vmlal.s16 q13, d16, d0[0] + vmull.s16 q12, d7, d0[3] + vmlal.s16 q12, d11, d0[2] + vmlal.s16 q12, d13, d0[1] + vmlal.s16 q12, d17, d0[0] + vshll.s16 q14, d4, #15 + vshll.s16 q15, d5, #15 + vadd.s32 q10, q14, q13 + vsub.s32 q14, q14, q13 + vrshrn.s32 d4, q10, #13 + vrshrn.s32 d6, q14, #13 + vadd.s32 q10, q15, q12 + vsub.s32 q14, q15, q12 + vrshrn.s32 d5, q10, #13 + vrshrn.s32 d7, q14, #13 + vtrn.16 q2, q3 + vtrn.32 q3, q5 +#endif + + /* Pass 2 */ + idct_helper d4, d6, d10, d7, d11, 20, d26, d27 + + /* Range limit */ + vmov.u16 q15, #0x80 + vadd.s16 q13, q13, q15 + vqmovun.s16 d26, q13 + vqmovun.s16 d27, q13 + + /* Store results to the output buffer */ + ldmia OUTPUT_BUF, {TMP1, TMP2} + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + + vst1.8 {d26[0]}, [TMP1]! + vst1.8 {d27[4]}, [TMP1]! + vst1.8 {d26[1]}, [TMP2]! + vst1.8 {d27[5]}, [TMP2]! + + vpop {d8-d15} + bx lr + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 +.endfunc + +.purgem idct_helper + +/*****************************************************************************/ + +/* + * jsimd_ycc_extrgb_convert_neon + * jsimd_ycc_extbgr_convert_neon + * jsimd_ycc_extrgbx_convert_neon + * jsimd_ycc_extbgrx_convert_neon + * jsimd_ycc_extxbgr_convert_neon + * jsimd_ycc_extxrgb_convert_neon + * + * Colorspace conversion YCbCr -> RGB + */ + + +.macro do_load size + .if \size == 8 + vld1.8 {d4}, [U, :64]! + vld1.8 {d5}, [V, :64]! + vld1.8 {d0}, [Y, :64]! + pld [U, #64] + pld [V, #64] + pld [Y, #64] + .elseif \size == 4 + vld1.8 {d4[0]}, [U]! + vld1.8 {d4[1]}, [U]! + vld1.8 {d4[2]}, [U]! + vld1.8 {d4[3]}, [U]! + vld1.8 {d5[0]}, [V]! + vld1.8 {d5[1]}, [V]! + vld1.8 {d5[2]}, [V]! + vld1.8 {d5[3]}, [V]! + vld1.8 {d0[0]}, [Y]! + vld1.8 {d0[1]}, [Y]! + vld1.8 {d0[2]}, [Y]! + vld1.8 {d0[3]}, [Y]! + .elseif \size == 2 + vld1.8 {d4[4]}, [U]! + vld1.8 {d4[5]}, [U]! + vld1.8 {d5[4]}, [V]! + vld1.8 {d5[5]}, [V]! + vld1.8 {d0[4]}, [Y]! + vld1.8 {d0[5]}, [Y]! + .elseif \size == 1 + vld1.8 {d4[6]}, [U]! + vld1.8 {d5[6]}, [V]! + vld1.8 {d0[6]}, [Y]! + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_store bpp, size + .if \bpp == 24 + .if \size == 8 + vst3.8 {d10, d11, d12}, [RGB]! + .elseif \size == 4 + vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! + vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! + vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! + vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! + .elseif \size == 2 + vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! + vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! + .elseif \size == 1 + vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + vst4.8 {d10, d11, d12, d13}, [RGB]! + .elseif \size == 4 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! + .elseif \size == 2 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! + .elseif \size == 1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs + +/* + * 2 stage pipelined YCbCr->RGB conversion + */ + +.macro do_yuv_to_rgb_stage1 + vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ + vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ + vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ + vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ + vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ + vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ + vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ + vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ + vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ + vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb_stage2 + vrshrn.s32 d20, q10, #15 + vrshrn.s32 d21, q11, #15 + vrshrn.s32 d24, q12, #14 + vrshrn.s32 d25, q13, #14 + vrshrn.s32 d28, q14, #14 + vrshrn.s32 d29, q15, #14 + vaddw.u8 q10, q10, d0 + vaddw.u8 q12, q12, d0 + vaddw.u8 q14, q14, d0 + vqmovun.s16 d1\g_offs, q10 + vqmovun.s16 d1\r_offs, q12 + vqmovun.s16 d1\b_offs, q14 +.endm + +.macro do_yuv_to_rgb_stage2_store_load_stage1 + vld1.8 {d4}, [U, :64]! + vrshrn.s32 d20, q10, #15 + vrshrn.s32 d21, q11, #15 + vrshrn.s32 d24, q12, #14 + vrshrn.s32 d25, q13, #14 + vrshrn.s32 d28, q14, #14 + vld1.8 {d5}, [V, :64]! + vrshrn.s32 d29, q15, #14 + vaddw.u8 q10, q10, d0 + vaddw.u8 q12, q12, d0 + vaddw.u8 q14, q14, d0 + vqmovun.s16 d1\g_offs, q10 + vld1.8 {d0}, [Y, :64]! + vqmovun.s16 d1\r_offs, q12 + pld [U, #64] + pld [V, #64] + pld [Y, #64] + vqmovun.s16 d1\b_offs, q14 + vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ + vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ + do_store \bpp, 8 + vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ + vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ + vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ + vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ + vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ + vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ + vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ + vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ +.endm + +.macro do_yuv_to_rgb + do_yuv_to_rgb_stage1 + do_yuv_to_rgb_stage2 +.endm + +/* Apple gas crashes on adrl, work around that by using adr. + * But this requires a copy of these constants for each function. + */ + +.balign 16 +jsimd_ycc_\colorid\()_neon_consts: + .short 0, 0, 0, 0 + .short 22971, -11277, -23401, 29033 + .short -128, -128, -128, -128 + .short -128, -128, -128, -128 + +asm_function jsimd_ycc_\colorid\()_convert_neon + OUTPUT_WIDTH .req r0 + INPUT_BUF .req r1 + INPUT_ROW .req r2 + OUTPUT_BUF .req r3 + NUM_ROWS .req r4 + + INPUT_BUF0 .req r5 + INPUT_BUF1 .req r6 + INPUT_BUF2 .req INPUT_BUF + + RGB .req r7 + Y .req r8 + U .req r9 + V .req r10 + N .req ip + + /* Load constants to d1, d2, d3 (d0 is just used for padding) */ + adr ip, jsimd_ycc_\colorid\()_neon_consts + vld1.16 {d0, d1, d2, d3}, [ip, :128] + + /* Save ARM registers and handle input arguments */ + push {r4, r5, r6, r7, r8, r9, r10, lr} + ldr NUM_ROWS, [sp, #(4 * 8)] + ldr INPUT_BUF0, [INPUT_BUF] + ldr INPUT_BUF1, [INPUT_BUF, #4] + ldr INPUT_BUF2, [INPUT_BUF, #8] + .unreq INPUT_BUF + + /* Save NEON registers */ + vpush {d8-d15} + + /* Initially set d10, d11, d12, d13 to 0xFF */ + vmov.u8 q5, #255 + vmov.u8 q6, #255 + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + blt 9f +0: + ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] + ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] + mov N, OUTPUT_WIDTH + ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] + add INPUT_ROW, INPUT_ROW, #1 + ldr RGB, [OUTPUT_BUF], #4 + + /* Inner loop over pixels */ + subs N, N, #8 + blt 3f + do_load 8 + do_yuv_to_rgb_stage1 + subs N, N, #8 + blt 2f +1: + do_yuv_to_rgb_stage2_store_load_stage1 + subs N, N, #8 + bge 1b +2: + do_yuv_to_rgb_stage2 + do_store \bpp, 8 + tst N, #7 + beq 8f +3: + tst N, #4 + beq 3f + do_load 4 +3: + tst N, #2 + beq 4f + do_load 2 +4: + tst N, #1 + beq 5f + do_load 1 +5: + do_yuv_to_rgb + tst N, #4 + beq 6f + do_store \bpp, 4 +6: + tst N, #2 + beq 7f + do_store \bpp, 2 +7: + tst N, #1 + beq 8f + do_store \bpp, 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + bgt 0b +9: + /* Restore all registers and return */ + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, pc} + + .unreq OUTPUT_WIDTH + .unreq INPUT_ROW + .unreq OUTPUT_BUF + .unreq NUM_ROWS + .unreq INPUT_BUF0 + .unreq INPUT_BUF1 + .unreq INPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N +.endfunc + +.purgem do_yuv_to_rgb +.purgem do_yuv_to_rgb_stage1 +.purgem do_yuv_to_rgb_stage2 +.purgem do_yuv_to_rgb_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B */ +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 + +.purgem do_load +.purgem do_store + +/*****************************************************************************/ + +/* + * jsimd_extrgb_ycc_convert_neon + * jsimd_extbgr_ycc_convert_neon + * jsimd_extrgbx_ycc_convert_neon + * jsimd_extbgrx_ycc_convert_neon + * jsimd_extxbgr_ycc_convert_neon + * jsimd_extxrgb_ycc_convert_neon + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro do_store size + .if \size == 8 + vst1.8 {d20}, [Y]! + vst1.8 {d21}, [U]! + vst1.8 {d22}, [V]! + .elseif \size == 4 + vst1.8 {d20[0]}, [Y]! + vst1.8 {d20[1]}, [Y]! + vst1.8 {d20[2]}, [Y]! + vst1.8 {d20[3]}, [Y]! + vst1.8 {d21[0]}, [U]! + vst1.8 {d21[1]}, [U]! + vst1.8 {d21[2]}, [U]! + vst1.8 {d21[3]}, [U]! + vst1.8 {d22[0]}, [V]! + vst1.8 {d22[1]}, [V]! + vst1.8 {d22[2]}, [V]! + vst1.8 {d22[3]}, [V]! + .elseif \size == 2 + vst1.8 {d20[4]}, [Y]! + vst1.8 {d20[5]}, [Y]! + vst1.8 {d21[4]}, [U]! + vst1.8 {d21[5]}, [U]! + vst1.8 {d22[4]}, [V]! + vst1.8 {d22[5]}, [V]! + .elseif \size == 1 + vst1.8 {d20[6]}, [Y]! + vst1.8 {d21[6]}, [U]! + vst1.8 {d22[6]}, [V]! + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_load bpp, size + .if \bpp == 24 + .if \size == 8 + vld3.8 {d10, d11, d12}, [RGB]! + pld [RGB, #128] + .elseif \size == 4 + vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! + vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! + vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! + vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! + .elseif \size == 2 + vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! + vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! + .elseif \size == 1 + vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + vld4.8 {d10, d11, d12, d13}, [RGB]! + pld [RGB, #128] + .elseif \size == 4 + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! + .elseif \size == 2 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! + .elseif \size == 1 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs + +/* + * 2 stage pipelined RGB->YCbCr conversion + */ + +.macro do_rgb_to_yuv_stage1 + vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ + vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ + vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ + vmull.u16 q7, d4, d0[0] + vmlal.u16 q7, d6, d0[1] + vmlal.u16 q7, d8, d0[2] + vmull.u16 q8, d5, d0[0] + vmlal.u16 q8, d7, d0[1] + vmlal.u16 q8, d9, d0[2] + vrev64.32 q9, q1 + vrev64.32 q13, q1 + vmlsl.u16 q9, d4, d0[3] + vmlsl.u16 q9, d6, d1[0] + vmlal.u16 q9, d8, d1[1] + vmlsl.u16 q13, d5, d0[3] + vmlsl.u16 q13, d7, d1[0] + vmlal.u16 q13, d9, d1[1] + vrev64.32 q14, q1 + vrev64.32 q15, q1 + vmlal.u16 q14, d4, d1[1] + vmlsl.u16 q14, d6, d1[2] + vmlsl.u16 q14, d8, d1[3] + vmlal.u16 q15, d5, d1[1] + vmlsl.u16 q15, d7, d1[2] + vmlsl.u16 q15, d9, d1[3] +.endm + +.macro do_rgb_to_yuv_stage2 + vrshrn.u32 d20, q7, #16 + vrshrn.u32 d21, q8, #16 + vshrn.u32 d22, q9, #16 + vshrn.u32 d23, q13, #16 + vshrn.u32 d24, q14, #16 + vshrn.u32 d25, q15, #16 + vmovn.u16 d20, q10 /* d20 = y */ + vmovn.u16 d21, q11 /* d21 = u */ + vmovn.u16 d22, q12 /* d22 = v */ +.endm + +.macro do_rgb_to_yuv + do_rgb_to_yuv_stage1 + do_rgb_to_yuv_stage2 +.endm + +.macro do_rgb_to_yuv_stage2_store_load_stage1 + vrshrn.u32 d20, q7, #16 + vrshrn.u32 d21, q8, #16 + vshrn.u32 d22, q9, #16 + vrev64.32 q9, q1 + vshrn.u32 d23, q13, #16 + vrev64.32 q13, q1 + vshrn.u32 d24, q14, #16 + vshrn.u32 d25, q15, #16 + do_load \bpp, 8 + vmovn.u16 d20, q10 /* d20 = y */ + vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ + vmovn.u16 d21, q11 /* d21 = u */ + vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ + vmovn.u16 d22, q12 /* d22 = v */ + vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ + vmull.u16 q7, d4, d0[0] + vmlal.u16 q7, d6, d0[1] + vmlal.u16 q7, d8, d0[2] + vst1.8 {d20}, [Y]! + vmull.u16 q8, d5, d0[0] + vmlal.u16 q8, d7, d0[1] + vmlal.u16 q8, d9, d0[2] + vmlsl.u16 q9, d4, d0[3] + vmlsl.u16 q9, d6, d1[0] + vmlal.u16 q9, d8, d1[1] + vst1.8 {d21}, [U]! + vmlsl.u16 q13, d5, d0[3] + vmlsl.u16 q13, d7, d1[0] + vmlal.u16 q13, d9, d1[1] + vrev64.32 q14, q1 + vrev64.32 q15, q1 + vmlal.u16 q14, d4, d1[1] + vmlsl.u16 q14, d6, d1[2] + vmlsl.u16 q14, d8, d1[3] + vst1.8 {d22}, [V]! + vmlal.u16 q15, d5, d1[1] + vmlsl.u16 q15, d7, d1[2] + vmlsl.u16 q15, d9, d1[3] +.endm + +.balign 16 +jsimd_\colorid\()_ycc_neon_consts: + .short 19595, 38470, 7471, 11059 + .short 21709, 32768, 27439, 5329 + .short 32767, 128, 32767, 128 + .short 32767, 128, 32767, 128 + +asm_function jsimd_\colorid\()_ycc_convert_neon + OUTPUT_WIDTH .req r0 + INPUT_BUF .req r1 + OUTPUT_BUF .req r2 + OUTPUT_ROW .req r3 + NUM_ROWS .req r4 + + OUTPUT_BUF0 .req r5 + OUTPUT_BUF1 .req r6 + OUTPUT_BUF2 .req OUTPUT_BUF + + RGB .req r7 + Y .req r8 + U .req r9 + V .req r10 + N .req ip + + /* Load constants to d0, d1, d2, d3 */ + adr ip, jsimd_\colorid\()_ycc_neon_consts + vld1.16 {d0, d1, d2, d3}, [ip, :128] + + /* Save ARM registers and handle input arguments */ + push {r4, r5, r6, r7, r8, r9, r10, lr} + ldr NUM_ROWS, [sp, #(4 * 8)] + ldr OUTPUT_BUF0, [OUTPUT_BUF] + ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] + ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] + .unreq OUTPUT_BUF + + /* Save NEON registers */ + vpush {d8-d15} + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + blt 9f +0: + ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] + ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] + mov N, OUTPUT_WIDTH + ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] + add OUTPUT_ROW, OUTPUT_ROW, #1 + ldr RGB, [INPUT_BUF], #4 + + /* Inner loop over pixels */ + subs N, N, #8 + blt 3f + do_load \bpp, 8 + do_rgb_to_yuv_stage1 + subs N, N, #8 + blt 2f +1: + do_rgb_to_yuv_stage2_store_load_stage1 + subs N, N, #8 + bge 1b +2: + do_rgb_to_yuv_stage2 + do_store 8 + tst N, #7 + beq 8f +3: + tst N, #4 + beq 3f + do_load \bpp, 4 +3: + tst N, #2 + beq 4f + do_load \bpp, 2 +4: + tst N, #1 + beq 5f + do_load \bpp, 1 +5: + do_rgb_to_yuv + tst N, #4 + beq 6f + do_store 4 +6: + tst N, #2 + beq 7f + do_store 2 +7: + tst N, #1 + beq 8f + do_store 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + bgt 0b +9: + /* Restore all registers and return */ + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, pc} + + .unreq OUTPUT_WIDTH + .unreq OUTPUT_ROW + .unreq INPUT_BUF + .unreq NUM_ROWS + .unreq OUTPUT_BUF0 + .unreq OUTPUT_BUF1 + .unreq OUTPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N +.endfunc + +.purgem do_rgb_to_yuv +.purgem do_rgb_to_yuv_stage1 +.purgem do_rgb_to_yuv_stage2 +.purgem do_rgb_to_yuv_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B */ +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 + +.purgem do_load +.purgem do_store + +/*****************************************************************************/ + +/* + * Load data into workspace, applying unsigned->signed conversion + * + * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get + * rid of VST1.16 instructions + */ + +asm_function jsimd_convsamp_neon + SAMPLE_DATA .req r0 + START_COL .req r1 + WORKSPACE .req r2 + TMP1 .req r3 + TMP2 .req r4 + TMP3 .req r5 + TMP4 .req ip + + push {r4, r5} + vmov.u8 d0, #128 + + ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + vld1.8 {d16}, [TMP1] + vsubl.u8 q8, d16, d0 + vld1.8 {d18}, [TMP2] + vsubl.u8 q9, d18, d0 + vld1.8 {d20}, [TMP3] + vsubl.u8 q10, d20, d0 + vld1.8 {d22}, [TMP4] + ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} + vsubl.u8 q11, d22, d0 + vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]! + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! + add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + vld1.8 {d24}, [TMP1] + vsubl.u8 q12, d24, d0 + vld1.8 {d26}, [TMP2] + vsubl.u8 q13, d26, d0 + vld1.8 {d28}, [TMP3] + vsubl.u8 q14, d28, d0 + vld1.8 {d30}, [TMP4] + vsubl.u8 q15, d30, d0 + vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! + vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! + pop {r4, r5} + bx lr + + .unreq SAMPLE_DATA + .unreq START_COL + .unreq WORKSPACE + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 +.endfunc + +/*****************************************************************************/ + +/* + * jsimd_fdct_ifast_neon + * + * This function contains a fast, not so accurate integer implementation of + * the forward DCT (Discrete Cosine Transform). It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' + * function from jfdctfst.c + * + * TODO: can be combined with 'jsimd_convsamp_neon' to get + * rid of a bunch of VLD1.16 instructions + */ + +#define XFIX_0_382683433 d0[0] +#define XFIX_0_541196100 d0[1] +#define XFIX_0_707106781 d0[2] +#define XFIX_1_306562965 d0[3] + +.balign 16 +jsimd_fdct_ifast_neon_consts: + .short (98 * 128) /* XFIX_0_382683433 */ + .short (139 * 128) /* XFIX_0_541196100 */ + .short (181 * 128) /* XFIX_0_707106781 */ + .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ + +asm_function jsimd_fdct_ifast_neon + + DATA .req r0 + TMP .req ip + + vpush {d8-d15} + + /* Load constants */ + adr TMP, jsimd_fdct_ifast_neon_consts + vld1.16 {d0}, [TMP, :64] + + /* Load all DATA into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 | q8 + * 1 | d18 | d19 | q9 + * 2 | d20 | d21 | q10 + * 3 | d22 | d23 | q11 + * 4 | d24 | d25 | q12 + * 5 | d26 | d27 | q13 + * 6 | d28 | d29 | q14 + * 7 | d30 | d31 | q15 + */ + + vld1.16 {d16, d17, d18, d19}, [DATA, :128]! + vld1.16 {d20, d21, d22, d23}, [DATA, :128]! + vld1.16 {d24, d25, d26, d27}, [DATA, :128]! + vld1.16 {d28, d29, d30, d31}, [DATA, :128] + sub DATA, DATA, #(128 - 32) + + mov TMP, #2 +1: + /* Transpose */ + vtrn.16 q12, q13 + vtrn.16 q10, q11 + vtrn.16 q8, q9 + vtrn.16 q14, q15 + vtrn.32 q9, q11 + vtrn.32 q13, q15 + vtrn.32 q8, q10 + vtrn.32 q12, q14 + vswp d30, d23 + vswp d24, d17 + vswp d26, d19 + /* 1-D FDCT */ + vadd.s16 q2, q11, q12 + vswp d28, d21 + vsub.s16 q12, q11, q12 + vsub.s16 q6, q10, q13 + vadd.s16 q10, q10, q13 + vsub.s16 q7, q9, q14 + vadd.s16 q9, q9, q14 + vsub.s16 q1, q8, q15 + vadd.s16 q8, q8, q15 + vsub.s16 q4, q9, q10 + vsub.s16 q5, q8, q2 + vadd.s16 q3, q9, q10 + vadd.s16 q4, q4, q5 + vadd.s16 q2, q8, q2 + vqdmulh.s16 q4, q4, XFIX_0_707106781 + vadd.s16 q11, q12, q6 + vadd.s16 q8, q2, q3 + vsub.s16 q12, q2, q3 + vadd.s16 q3, q6, q7 + vadd.s16 q7, q7, q1 + vqdmulh.s16 q3, q3, XFIX_0_707106781 + vsub.s16 q6, q11, q7 + vadd.s16 q10, q5, q4 + vqdmulh.s16 q6, q6, XFIX_0_382683433 + vsub.s16 q14, q5, q4 + vqdmulh.s16 q11, q11, XFIX_0_541196100 + vqdmulh.s16 q5, q7, XFIX_1_306562965 + vadd.s16 q4, q1, q3 + vsub.s16 q3, q1, q3 + vadd.s16 q7, q7, q6 + vadd.s16 q11, q11, q6 + vadd.s16 q7, q7, q5 + vadd.s16 q13, q3, q11 + vsub.s16 q11, q3, q11 + vadd.s16 q9, q4, q7 + vsub.s16 q15, q4, q7 + subs TMP, TMP, #1 + bne 1b + + /* store results */ + vst1.16 {d16, d17, d18, d19}, [DATA, :128]! + vst1.16 {d20, d21, d22, d23}, [DATA, :128]! + vst1.16 {d24, d25, d26, d27}, [DATA, :128]! + vst1.16 {d28, d29, d30, d31}, [DATA, :128] + + vpop {d8-d15} + bx lr + + .unreq DATA + .unreq TMP +.endfunc + +/*****************************************************************************/ + +/* + * GLOBAL(void) + * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, + * DCTELEM * workspace); + * + * Note: the code uses 2 stage pipelining in order to improve instructions + * scheduling and eliminate stalls (this provides ~15% better + * performance for this function on both ARM Cortex-A8 and + * ARM Cortex-A9 when compared to the non-pipelined variant). + * The instructions which belong to the second stage use different + * indentation for better readiability. + */ +asm_function jsimd_quantize_neon + + COEF_BLOCK .req r0 + DIVISORS .req r1 + WORKSPACE .req r2 + + RECIPROCAL .req DIVISORS + CORRECTION .req r3 + SHIFT .req ip + LOOP_COUNT .req r4 + + vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! + vabs.s16 q12, q0 + add CORRECTION, DIVISORS, #(64 * 2) + add SHIFT, DIVISORS, #(64 * 6) + vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! + vabs.s16 q13, q1 + vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! + vadd.u16 q12, q12, q10 /* add correction */ + vadd.u16 q13, q13, q11 + vmull.u16 q10, d24, d16 /* multiply by reciprocal */ + vmull.u16 q11, d25, d17 + vmull.u16 q8, d26, d18 + vmull.u16 q9, d27, d19 + vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! + vshrn.u32 d20, q10, #16 + vshrn.u32 d21, q11, #16 + vshrn.u32 d22, q8, #16 + vshrn.u32 d23, q9, #16 + vneg.s16 q12, q12 + vneg.s16 q13, q13 + vshr.s16 q2, q0, #15 /* extract sign */ + vshr.s16 q3, q1, #15 + vshl.u16 q14, q10, q12 /* shift */ + vshl.u16 q15, q11, q13 + + push {r4, r5} + mov LOOP_COUNT, #3 +1: + vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! + veor.u16 q14, q14, q2 /* restore sign */ + vabs.s16 q12, q0 + vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! + vabs.s16 q13, q1 + veor.u16 q15, q15, q3 + vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! + vadd.u16 q12, q12, q10 /* add correction */ + vadd.u16 q13, q13, q11 + vmull.u16 q10, d24, d16 /* multiply by reciprocal */ + vmull.u16 q11, d25, d17 + vmull.u16 q8, d26, d18 + vmull.u16 q9, d27, d19 + vsub.u16 q14, q14, q2 + vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! + vsub.u16 q15, q15, q3 + vshrn.u32 d20, q10, #16 + vshrn.u32 d21, q11, #16 + vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! + vshrn.u32 d22, q8, #16 + vshrn.u32 d23, q9, #16 + vneg.s16 q12, q12 + vneg.s16 q13, q13 + vshr.s16 q2, q0, #15 /* extract sign */ + vshr.s16 q3, q1, #15 + vshl.u16 q14, q10, q12 /* shift */ + vshl.u16 q15, q11, q13 + subs LOOP_COUNT, LOOP_COUNT, #1 + bne 1b + pop {r4, r5} + + veor.u16 q14, q14, q2 /* restore sign */ + veor.u16 q15, q15, q3 + vsub.u16 q14, q14, q2 + vsub.u16 q15, q15, q3 + vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! + + bx lr /* return */ + + .unreq COEF_BLOCK + .unreq DIVISORS + .unreq WORKSPACE + .unreq RECIPROCAL + .unreq CORRECTION + .unreq SHIFT + .unreq LOOP_COUNT +.endfunc -- cgit v1.2.3