aboutsummaryrefslogtreecommitdiff
path: root/simd
diff options
context:
space:
mode:
authordcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db>2013-07-24 21:50:20 +0000
committerdcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db>2013-07-24 21:50:20 +0000
commitf6f573d99ce2ad07b7368f2d16849e7867be36d2 (patch)
treee9f76bc75e886c7d29f8fff622a7d8437827d6b4 /simd
parent7575a17036e85e89584b2fdb6867362b767def7e (diff)
SIMD support for performing color conversion using MIPS DSPr2 instructions
git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@993 632fc199-4ca6-4c93-a231-07263d6284db
Diffstat (limited to 'simd')
-rw-r--r--simd/Makefile.am6
-rw-r--r--simd/jsimd.h60
-rw-r--r--simd/jsimd_mips.c465
-rw-r--r--simd/jsimd_mips_dspr2.S250
-rw-r--r--simd/jsimd_mips_dspr2_asm.h252
5 files changed, 1033 insertions, 0 deletions
diff --git a/simd/Makefile.am b/simd/Makefile.am
index a12ff6e..272c2a4 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -58,6 +58,12 @@ libsimd_la_SOURCES = jsimd_arm.c jsimd_arm_neon.S
endif
+if SIMD_MIPSEL
+
+libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2.S
+
+endif
+
AM_CPPFLAGS = -I$(top_srcdir)
.asm.lo:
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 3d4751f..de1f495 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -3,6 +3,7 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright 2011 D. R. Commander
+ * Copyright (C) 2013, MIPS Technologies, Inc., California
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -18,6 +19,7 @@
#define JSIMD_SSE 0x04
#define JSIMD_SSE2 0x08
#define JSIMD_ARM_NEON 0x10
+#define JSIMD_MIPS_DSPR2 0x20
/* Short forms of external names for systems with brain-damaged linkers. */
@@ -386,6 +388,64 @@ EXTERN(void) jsimd_ycc_extxrgb_convert_neon
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+
+EXTERN (void) jsimd_ycc_rgb_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2
+ JPP((JDIMENSION img_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+
/* SIMD Downsample */
EXTERN(void) jsimd_h2v2_downsample_mmx
JPP((JDIMENSION image_width, int max_v_samp_factor,
diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
new file mode 100644
index 0000000..f74d908
--- /dev/null
+++ b/simd/jsimd_mips.c
@@ -0,0 +1,465 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009-2011 D. R. Commander
+ * Copyright (C) 2013, MIPS Technologies, Inc., California
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on
+ * MIPS architecture.
+ *
+ * Based on the stubs from 'jsimd_none.c'
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+LOCAL(int)
+parse_proc_cpuinfo(const char* search_string)
+{
+ const char* file_name = "/proc/cpuinfo";
+ char cpuinfo_line[256];
+ FILE* f = NULL;
+ simd_support = 0;
+
+ if ((f = fopen(file_name, "r")) != NULL) {
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+ if (strstr(cpuinfo_line, search_string) != NULL) {
+ fclose(f);
+ simd_support |= JSIMD_MIPS_DSPR2;
+ return 1;
+ }
+ }
+ fclose(f);
+ }
+ /* Did not find string in the proc file, or not Linux ELF. */
+ return 0;
+}
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+ simd_support |= JSIMD_MIPS_DSPR2;
+#elif defined(__linux__)
+ /* We still have a chance to use MIPS DSPR2 regardless of globally used
+ * -mdspr2 options passed to gcc by performing runtime detection via
+ * /proc/cpuinfo parsing on linux */
+ if (!parse_proc_cpuinfo("MIPS 74K"))
+ return;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+ if (simd_support & JSIMD_MIPS_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+ if (simd_support & JSIMD_MIPS_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{
+ void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ switch(cinfo->in_color_space)
+ {
+ case JCS_EXT_RGB:
+ mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2;
+
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2;
+ break;
+ default:
+ mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
+ break;
+ }
+
+ if (simd_support & JSIMD_MIPS_DSPR2)
+ mipsdspr2fct(cinfo->image_width, input_buf,
+ output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{
+ void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch(cinfo->out_color_space)
+ {
+ case JCS_EXT_RGB:
+ mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2;
+ break;
+ default:
+ mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
+ break;
+ }
+
+ if (simd_support & JSIMD_MIPS_DSPR2)
+ mipsdspr2fct(cinfo->output_width, input_buf,
+ input_row, output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+ DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S
new file mode 100644
index 0000000..b2ab04f
--- /dev/null
+++ b/simd/jsimd_mips_dspr2.S
@@ -0,0 +1,250 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * All rights reserved.
+ * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
+ * Darko Laus (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_mips_dspr2_asm.h"
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_mips_dspr2
+ * jsimd_extbgr_ycc_convert_mips_dspr2
+ * jsimd_extrgbx_ycc_convert_mips_dspr2
+ * jsimd_extbgrx_ycc_convert_mips_dspr2
+ * jsimd_extxbgr_ycc_convert_mips_dspr2
+ * jsimd_extxrgb_ycc_convert_mips_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_YCC r, \
+ g, \
+ b, \
+ inptr
+ lbu \r, \r_offs(\inptr)
+ lbu \g, \g_offs(\inptr)
+ lbu \b, \b_offs(\inptr)
+ addiu \inptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
+/*
+ * a0 - cinfo->image_width
+ * a1 - input_buf
+ * a2 - output_buf
+ * a3 - output_row
+ * 16(sp) - num_rows
+ */
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw t7, 48(sp) // t7 = num_rows
+ li s0, 0x4c8b // FIX(0.29900)
+ li s1, 0x9646 // FIX(0.58700)
+ li s2, 0x1d2f // FIX(0.11400)
+ li s3, 0xffffd4cd // -FIX(0.16874)
+ li s4, 0xffffab33 // -FIX(0.33126)
+ li s5, 0x8000 // FIX(0.50000)
+ li s6, 0xffff94d1 // -FIX(0.41869)
+ li s7, 0xffffeb2f // -FIX(0.08131)
+ li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
+
+0:
+ addiu t7, -1 // --num_rows
+ lw t6, 0(a1) // t6 = input_buf[0]
+ lw t0, 0(a2)
+ lw t1, 4(a2)
+ lw t2, 8(a2)
+ sll t3, a3, 2
+ lwx t0, t3(t0) // t0 = output_buf[0][output_row]
+ lwx t1, t3(t1) // t1 = output_buf[1][output_row]
+ lwx t2, t3(t2) // t2 = output_buf[2][output_row]
+
+ addu t9, t2, a0 // t9 = end address
+ addiu a3, 1
+
+1:
+ DO_RGB_TO_YCC t3, t4, t5, t6
+
+ mtlo s5, $ac0
+ mtlo t8, $ac1
+ mtlo t8, $ac2
+ maddu $ac0, s2, t5
+ maddu $ac1, s5, t5
+ maddu $ac2, s5, t3
+ maddu $ac0, s0, t3
+ maddu $ac1, s3, t3
+ maddu $ac2, s6, t4
+ maddu $ac0, s1, t4
+ maddu $ac1, s4, t4
+ maddu $ac2, s7, t5
+ extr.w t3, $ac0, 16
+ extr.w t4, $ac1, 16
+ extr.w t5, $ac2, 16
+ sb t3, 0(t0)
+ sb t4, 0(t1)
+ sb t5, 0(t2)
+ addiu t0, 1
+ addiu t2, 1
+ bne t2, t9, 1b
+ addiu t1, 1
+ bgtz t7, 0b
+ addiu a1, 4
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*------------------------------------------id -- pix R G B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_mips_dspr2
+ * jsimd_ycc_extbgr_convert_mips_dspr2
+ * jsimd_ycc_extrgbx_convert_mips_dspr2
+ * jsimd_ycc_extbgrx_convert_mips_dspr2
+ * jsimd_ycc_extxbgr_convert_mips_dspr2
+ * jsimd_ycc_extxrgb_convert_mips_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
+
+.macro STORE_YCC_TO_RGB scratch0 \
+ scratch1 \
+ scratch2 \
+ outptr
+ sb \scratch0, \r_offs(\outptr)
+ sb \scratch1, \g_offs(\outptr)
+ sb \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+ li t0, 0xFF
+ sb t0, \a_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
+/*
+ * a0 - cinfo->image_width
+ * a1 - input_buf
+ * a2 - input_row
+ * a3 - output_buf
+ * 16(sp) - num_rows
+ */
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw s1, 48(sp)
+ li t3, 0x8000
+ li t4, 0x166e9 // FIX(1.40200)
+ li t5, 0x1c5a2 // FIX(1.77200)
+ li t6, 0xffff492e // -FIX(0.71414)
+ li t7, 0xffffa7e6 // -FIX(0.34414)
+ repl.ph t8, 128
+
+0:
+ lw s0, 0(a3)
+ lw t0, 0(a1)
+ lw t1, 4(a1)
+ lw t2, 8(a1)
+ sll s5, a2, 2
+ addiu s1, -1
+ lwx s2, s5(t0)
+ lwx s3, s5(t1)
+ lwx s4, s5(t2)
+ addu t9, s2, a0
+ addiu a2, 1
+
+1:
+ lbu s7, 0(s4) // cr
+ lbu s6, 0(s3) // cb
+ lbu s5, 0(s2) // y
+ addiu s2, 1
+ addiu s4, 1
+ addiu s7, -128
+ addiu s6, -128
+ mul t2, t7, s6
+ mul t0, t6, s7 // Crgtab[cr]
+ sll s7, 15
+ mulq_rs.w t1, t4, s7 // Crrtab[cr]
+ sll s6, 15
+ addu t2, t3 // Cbgtab[cb]
+ addu t2, t0
+
+ mulq_rs.w t0, t5, s6 // Cbbtab[cb]
+ sra t2, 16
+ addu t1, s5
+ addu t2, s5 // add y
+ ins t2, t1, 16, 16
+ subu.ph t2, t2, t8
+ addu t0, s5
+ shll_s.ph t2, t2, 8
+ subu t0, 128
+ shra.ph t2, t2, 8
+ shll_s.w t0, t0, 24
+ addu.ph t2, t2, t8 // clip & store
+ sra t0, t0, 24
+ sra t1, t2, 16
+ addiu t0, 128
+
+ STORE_YCC_TO_RGB t1, t2, t0, s0
+
+ bne s2, t9, 1b
+ addiu s3, 1
+ bgtz s1, 0b
+ addiu a3, 4
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*------------------------------------------id -- pix R G B A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
+
+/*****************************************************************************/
diff --git a/simd/jsimd_mips_dspr2_asm.h b/simd/jsimd_mips_dspr2_asm.h
new file mode 100644
index 0000000..53cf2bc
--- /dev/null
+++ b/simd/jsimd_mips_dspr2_asm.h
@@ -0,0 +1,252 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * All rights reserved.
+ * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
+ * Darko Laus (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero $0
+#define AT $1
+#define v0 $2
+#define v1 $3
+#define a0 $4
+#define a1 $5
+#define a2 $6
+#define a3 $7
+#define t0 $8
+#define t1 $9
+#define t2 $10
+#define t3 $11
+#define t4 $12
+#define t5 $13
+#define t6 $14
+#define t7 $15
+#define s0 $16
+#define s1 $17
+#define s2 $18
+#define s3 $19
+#define s4 $20
+#define s5 $21
+#define s6 $22
+#define s7 $23
+#define t8 $24
+#define t9 $25
+#define k0 $26
+#define k1 $27
+#define gp $28
+#define sp $29
+#define fp $30
+#define s8 $30
+#define ra $31
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
+ */
+#define LEAF_MIPS32R2(symbol) \
+ .globl symbol; \
+ .align 2; \
+ .type symbol, @function; \
+ .ent symbol, 0; \
+symbol: .frame sp, 0, ra; \
+ .set push; \
+ .set arch=mips32r2; \
+ .set noreorder; \
+ .set noat;
+
+/*
+ * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2
+ */
+#define LEAF_MIPS_DSPR2(symbol) \
+LEAF_MIPS32R2(symbol) \
+ .set dspr2;
+
+/*
+ * END - mark end of function
+ */
+#define END(function) \
+ .set pop; \
+ .end function; \
+ .size function,.-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * number of register to/from stack. Stack offset must be greater than
+ * or equal to the number of bytes needed for storing registers (regs_num*4).
+ * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
+ * preserved for input arguments of the functions, already stored in a0-a3),
+ * stack size can be further optimized by utilizing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+ .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+ .error "Stack offset must be pozitive and multiple of 4."
+ .endif
+ .if \stack_offset != 0
+ addiu sp, sp, -\stack_offset
+ .endif
+ sw \r1, 0(sp)
+ .if \r2 != 0
+ sw \r2, 4(sp)
+ .endif
+ .if \r3 != 0
+ sw \r3, 8(sp)
+ .endif
+ .if \r4 != 0
+ sw \r4, 12(sp)
+ .endif
+ .if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset
+ sw \r5, 16(sp)
+ .endif
+ .if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ sw \r6, 20(sp)
+ .endif
+ .if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ sw \r7, 24(sp)
+ .endif
+ .if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ sw \r8, 28(sp)
+ .endif
+ .if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ sw \r9, 32(sp)
+ .endif
+ .if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ sw \r10, 36(sp)
+ .endif
+ .if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ sw \r11, 40(sp)
+ .endif
+ .if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ sw \r12, 44(sp)
+ .endif
+ .if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ sw \r13, 48(sp)
+ .endif
+ .if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ sw \r14, 52(sp)
+ .endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with RESTORE_REGS_FROM_STACK macro.
+ * Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+ .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
+ .error "Stack offset must be pozitive and multiple of 4."
+ .endif
+ lw \r1, 0(sp)
+ .if \r2 != 0
+ lw \r2, 4(sp)
+ .endif
+ .if \r3 != 0
+ lw \r3, 8(sp)
+ .endif
+ .if \r4 != 0
+ lw \r4, 12(sp)
+ .endif
+ .if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset
+ lw \r5, 16(sp)
+ .endif
+ .if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ lw \r6, 20(sp)
+ .endif
+ .if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ lw \r7, 24(sp)
+ .endif
+ .if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ lw \r8, 28(sp)
+ .endif
+ .if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ lw \r9, 32(sp)
+ .endif
+ .if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ lw \r10, 36(sp)
+ .endif
+ .if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ lw \r11, 40(sp)
+ .endif
+ .if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ lw \r12, 44(sp)
+ .endif
+ .if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ lw \r13, 48(sp)
+ .endif
+ .if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ lw \r14, 52(sp)
+ .endif
+ .if \stack_offset != 0
+ addiu sp, sp, \stack_offset
+ .endif
+.endm
+
+