diff options
author | dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db> | 2013-07-24 21:50:20 +0000 |
---|---|---|
committer | dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db> | 2013-07-24 21:50:20 +0000 |
commit | f6f573d99ce2ad07b7368f2d16849e7867be36d2 (patch) | |
tree | e9f76bc75e886c7d29f8fff622a7d8437827d6b4 /simd | |
parent | 7575a17036e85e89584b2fdb6867362b767def7e (diff) |
SIMD support for performing color conversion using MIPS DSPr2 instructions
git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@993 632fc199-4ca6-4c93-a231-07263d6284db
Diffstat (limited to 'simd')
-rw-r--r-- | simd/Makefile.am | 6 | ||||
-rw-r--r-- | simd/jsimd.h | 60 | ||||
-rw-r--r-- | simd/jsimd_mips.c | 465 | ||||
-rw-r--r-- | simd/jsimd_mips_dspr2.S | 250 | ||||
-rw-r--r-- | simd/jsimd_mips_dspr2_asm.h | 252 |
5 files changed, 1033 insertions, 0 deletions
diff --git a/simd/Makefile.am b/simd/Makefile.am index a12ff6e..272c2a4 100644 --- a/simd/Makefile.am +++ b/simd/Makefile.am @@ -58,6 +58,12 @@ libsimd_la_SOURCES = jsimd_arm.c jsimd_arm_neon.S endif +if SIMD_MIPSEL + +libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2.S + +endif + AM_CPPFLAGS = -I$(top_srcdir) .asm.lo: diff --git a/simd/jsimd.h b/simd/jsimd.h index 3d4751f..de1f495 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -3,6 +3,7 @@ * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB * Copyright 2011 D. R. Commander + * Copyright (C) 2013, MIPS Technologies, Inc., California * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -18,6 +19,7 @@ #define JSIMD_SSE 0x04 #define JSIMD_SSE2 0x08 #define JSIMD_ARM_NEON 0x10 +#define JSIMD_MIPS_DSPR2 0x20 /* Short forms of external names for systems with brain-damaged linkers. */ @@ -386,6 +388,64 @@ EXTERN(void) jsimd_ycc_extxrgb_convert_neon JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); +EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); + +EXTERN (void) jsimd_ycc_rgb_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); +EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2 + JPP((JDIMENSION img_width, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); + /* SIMD Downsample */ EXTERN(void) jsimd_h2v2_downsample_mmx JPP((JDIMENSION image_width, int max_v_samp_factor, diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c new file mode 100644 index 0000000..f74d908 --- /dev/null +++ b/simd/jsimd_mips.c @@ -0,0 +1,465 @@ +/* + * jsimd_mips.c + * + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * Copyright 2009-2011 D. R. Commander + * Copyright (C) 2013, MIPS Technologies, Inc., California + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * For conditions of distribution and use, see copyright notice in jsimdext.inc + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations when running on + * MIPS architecture. + * + * Based on the stubs from 'jsimd_none.c' + */ + +#define JPEG_INTERNALS +#include "../jinclude.h" +#include "../jpeglib.h" +#include "../jsimd.h" +#include "../jdct.h" +#include "../jsimddct.h" +#include "jsimd.h" + +#include <stdio.h> +#include <string.h> +#include <ctype.h> + +static unsigned int simd_support = ~0; + +#if defined(__linux__) + +LOCAL(int) +parse_proc_cpuinfo(const char* search_string) +{ + const char* file_name = "/proc/cpuinfo"; + char cpuinfo_line[256]; + FILE* f = NULL; + simd_support = 0; + + if ((f = fopen(file_name, "r")) != NULL) { + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) { + if (strstr(cpuinfo_line, search_string) != NULL) { + fclose(f); + simd_support |= JSIMD_MIPS_DSPR2; + return 1; + } + } + fclose(f); + } + /* Did not find string in the proc file, or not Linux ELF. */ + return 0; +} +#endif + +/* + * Check what SIMD accelerations are supported. + * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd (void) +{ + if (simd_support != ~0U) + return; + + simd_support = 0; + +#if defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) + simd_support |= JSIMD_MIPS_DSPR2; +#elif defined(__linux__) + /* We still have a chance to use MIPS DSPR2 regardless of globally used + * -mdspr2 options passed to gcc by performing runtime detection via + * /proc/cpuinfo parsing on linux */ + if (!parse_proc_cpuinfo("MIPS 74K")) + return; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_rgb_gray (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + switch(cinfo->in_color_space) + { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2; + + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2; + break; + } + + if (simd_support & JSIMD_MIPS_DSPR2) + mipsdspr2fct(cinfo->image_width, input_buf, + output_buf, output_row, num_rows); +} + +GLOBAL(void) +jsimd_rgb_gray_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + + switch(cinfo->out_color_space) + { + case JCS_EXT_RGB: + mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2; + break; + case JCS_EXT_BGR: + mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2; + break; + default: + mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2; + break; + } + + if (simd_support & JSIMD_MIPS_DSPR2) + mipsdspr2fct(cinfo->output_width, input_buf, + input_row, output_buf, num_rows); +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_upsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_upsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_upsample (j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_upsample (j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_fancy_upsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_fancy_upsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(void) +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JSAMPARRAY input_data, + JSAMPARRAY * output_data_ptr) +{ +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM * workspace) +{ +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT * workspace) +{ +} + +GLOBAL(int) +jsimd_can_fdct_islow (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_ifast (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_fdct_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_fdct_islow (DCTELEM * data) +{ +} + +GLOBAL(void) +jsimd_fdct_ifast (DCTELEM * data) +{ +} + +GLOBAL(void) +jsimd_fdct_float (FAST_FLOAT * data) +{ +} + +GLOBAL(int) +jsimd_can_quantize (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_quantize_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, + DCTELEM * workspace) +{ +} + +GLOBAL(void) +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, + FAST_FLOAT * workspace) +{ +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S new file mode 100644 index 0000000..b2ab04f --- /dev/null +++ b/simd/jsimd_mips_dspr2.S @@ -0,0 +1,250 @@ +/* + * MIPS DSPr2 optimizations for libjpeg-turbo + * + * Copyright (C) 2013, MIPS Technologies, Inc., California. + * All rights reserved. + * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com) + * Darko Laus (darko.laus@imgtec.com) + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#include "jsimd_mips_dspr2_asm.h" + +/*****************************************************************************/ +/* + * jsimd_extrgb_ycc_convert_mips_dspr2 + * jsimd_extbgr_ycc_convert_mips_dspr2 + * jsimd_extrgbx_ycc_convert_mips_dspr2 + * jsimd_extbgrx_ycc_convert_mips_dspr2 + * jsimd_extxbgr_ycc_convert_mips_dspr2 + * jsimd_extxrgb_ycc_convert_mips_dspr2 + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs + +.macro DO_RGB_TO_YCC r, \ + g, \ + b, \ + inptr + lbu \r, \r_offs(\inptr) + lbu \g, \g_offs(\inptr) + lbu \b, \b_offs(\inptr) + addiu \inptr, \pixel_size +.endm + +LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - input_buf + * a2 - output_buf + * a3 - output_row + * 16(sp) - num_rows + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw t7, 48(sp) // t7 = num_rows + li s0, 0x4c8b // FIX(0.29900) + li s1, 0x9646 // FIX(0.58700) + li s2, 0x1d2f // FIX(0.11400) + li s3, 0xffffd4cd // -FIX(0.16874) + li s4, 0xffffab33 // -FIX(0.33126) + li s5, 0x8000 // FIX(0.50000) + li s6, 0xffff94d1 // -FIX(0.41869) + li s7, 0xffffeb2f // -FIX(0.08131) + li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1 + +0: + addiu t7, -1 // --num_rows + lw t6, 0(a1) // t6 = input_buf[0] + lw t0, 0(a2) + lw t1, 4(a2) + lw t2, 8(a2) + sll t3, a3, 2 + lwx t0, t3(t0) // t0 = output_buf[0][output_row] + lwx t1, t3(t1) // t1 = output_buf[1][output_row] + lwx t2, t3(t2) // t2 = output_buf[2][output_row] + + addu t9, t2, a0 // t9 = end address + addiu a3, 1 + +1: + DO_RGB_TO_YCC t3, t4, t5, t6 + + mtlo s5, $ac0 + mtlo t8, $ac1 + mtlo t8, $ac2 + maddu $ac0, s2, t5 + maddu $ac1, s5, t5 + maddu $ac2, s5, t3 + maddu $ac0, s0, t3 + maddu $ac1, s3, t3 + maddu $ac2, s6, t4 + maddu $ac0, s1, t4 + maddu $ac1, s4, t4 + maddu $ac2, s7, t5 + extr.w t3, $ac0, 16 + extr.w t4, $ac1, 16 + extr.w t5, $ac2, 16 + sb t3, 0(t0) + sb t4, 0(t1) + sb t5, 0(t2) + addiu t0, 1 + addiu t2, 1 + bne t2, t9, 1b + addiu t1, 1 + bgtz t7, 0b + addiu a1, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_\colorid\()_ycc_convert_mips_dspr2) + +.purgem DO_RGB_TO_YCC + +.endm + +/*------------------------------------------id -- pix R G B */ +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1 +GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3 + +/*****************************************************************************/ +/* + * jsimd_ycc_extrgb_convert_mips_dspr2 + * jsimd_ycc_extbgr_convert_mips_dspr2 + * jsimd_ycc_extrgbx_convert_mips_dspr2 + * jsimd_ycc_extbgrx_convert_mips_dspr2 + * jsimd_ycc_extxbgr_convert_mips_dspr2 + * jsimd_ycc_extxrgb_convert_mips_dspr2 + * + * Colorspace conversion YCbCr -> RGB + */ + +.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs + +.macro STORE_YCC_TO_RGB scratch0 \ + scratch1 \ + scratch2 \ + outptr + sb \scratch0, \r_offs(\outptr) + sb \scratch1, \g_offs(\outptr) + sb \scratch2, \b_offs(\outptr) +.if (\pixel_size == 4) + li t0, 0xFF + sb t0, \a_offs(\outptr) +.endif + addiu \outptr, \pixel_size +.endm + +LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2) +/* + * a0 - cinfo->image_width + * a1 - input_buf + * a2 - input_row + * a3 - output_buf + * 16(sp) - num_rows + */ + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + lw s1, 48(sp) + li t3, 0x8000 + li t4, 0x166e9 // FIX(1.40200) + li t5, 0x1c5a2 // FIX(1.77200) + li t6, 0xffff492e // -FIX(0.71414) + li t7, 0xffffa7e6 // -FIX(0.34414) + repl.ph t8, 128 + +0: + lw s0, 0(a3) + lw t0, 0(a1) + lw t1, 4(a1) + lw t2, 8(a1) + sll s5, a2, 2 + addiu s1, -1 + lwx s2, s5(t0) + lwx s3, s5(t1) + lwx s4, s5(t2) + addu t9, s2, a0 + addiu a2, 1 + +1: + lbu s7, 0(s4) // cr + lbu s6, 0(s3) // cb + lbu s5, 0(s2) // y + addiu s2, 1 + addiu s4, 1 + addiu s7, -128 + addiu s6, -128 + mul t2, t7, s6 + mul t0, t6, s7 // Crgtab[cr] + sll s7, 15 + mulq_rs.w t1, t4, s7 // Crrtab[cr] + sll s6, 15 + addu t2, t3 // Cbgtab[cb] + addu t2, t0 + + mulq_rs.w t0, t5, s6 // Cbbtab[cb] + sra t2, 16 + addu t1, s5 + addu t2, s5 // add y + ins t2, t1, 16, 16 + subu.ph t2, t2, t8 + addu t0, s5 + shll_s.ph t2, t2, 8 + subu t0, 128 + shra.ph t2, t2, 8 + shll_s.w t0, t0, 24 + addu.ph t2, t2, t8 // clip & store + sra t0, t0, 24 + sra t1, t2, 16 + addiu t0, 128 + + STORE_YCC_TO_RGB t1, t2, t0, s0 + + bne s2, t9, 1b + addiu s3, 1 + bgtz s1, 0b + addiu a3, 4 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop +END(jsimd_ycc_\colorid\()_convert_mips_dspr2) + +.purgem STORE_YCC_TO_RGB + +.endm + +/*------------------------------------------id -- pix R G B A */ +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0 +GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0 + +/*****************************************************************************/ diff --git a/simd/jsimd_mips_dspr2_asm.h b/simd/jsimd_mips_dspr2_asm.h new file mode 100644 index 0000000..53cf2bc --- /dev/null +++ b/simd/jsimd_mips_dspr2_asm.h @@ -0,0 +1,252 @@ +/* + * MIPS DSPr2 optimizations for libjpeg-turbo + * + * Copyright (C) 2013, MIPS Technologies, Inc., California. + * All rights reserved. + * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com) + * Darko Laus (darko.laus@imgtec.com) + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define zero $0 +#define AT $1 +#define v0 $2 +#define v1 $3 +#define a0 $4 +#define a1 $5 +#define a2 $6 +#define a3 $7 +#define t0 $8 +#define t1 $9 +#define t2 $10 +#define t3 $11 +#define t4 $12 +#define t5 $13 +#define t6 $14 +#define t7 $15 +#define s0 $16 +#define s1 $17 +#define s2 $18 +#define s3 $19 +#define s4 $20 +#define s5 $21 +#define s6 $22 +#define s7 $23 +#define t8 $24 +#define t9 $25 +#define k0 $26 +#define k1 $27 +#define gp $28 +#define sp $29 +#define fp $30 +#define s8 $30 +#define ra $31 + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2 + */ +#define LEAF_MIPS32R2(symbol) \ + .globl symbol; \ + .align 2; \ + .type symbol, @function; \ + .ent symbol, 0; \ +symbol: .frame sp, 0, ra; \ + .set push; \ + .set arch=mips32r2; \ + .set noreorder; \ + .set noat; + +/* + * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2 + */ +#define LEAF_MIPS_DSPR2(symbol) \ +LEAF_MIPS32R2(symbol) \ + .set dspr2; + +/* + * END - mark end of function + */ +#define END(function) \ + .set pop; \ + .end function; \ + .size function,.-function + +/* + * Checks if stack offset is big enough for storing/restoring regs_num + * number of register to/from stack. Stack offset must be greater than + * or equal to the number of bytes needed for storing registers (regs_num*4). + * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is + * preserved for input arguments of the functions, already stored in a0-a3), + * stack size can be further optimized by utilizing this space. + */ +.macro CHECK_STACK_OFFSET regs_num, stack_offset +.if \stack_offset < \regs_num * 4 - 16 +.error "Stack offset too small." +.endif +.endm + +/* + * Saves set of registers on stack. Maximum number of registers that + * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are added to stack pointer (sp) + * before registers are pushed in order to provide enough space on stack + * (offset must be multiple of 4, and must be big enough, as described by + * CHECK_STACK_OFFSET macro). This macro is intended to be used in + * combination with RESTORE_REGS_FROM_STACK macro. Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 + .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4) + .error "Stack offset must be pozitive and multiple of 4." + .endif + .if \stack_offset != 0 + addiu sp, sp, -\stack_offset + .endif + sw \r1, 0(sp) + .if \r2 != 0 + sw \r2, 4(sp) + .endif + .if \r3 != 0 + sw \r3, 8(sp) + .endif + .if \r4 != 0 + sw \r4, 12(sp) + .endif + .if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + sw \r5, 16(sp) + .endif + .if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + sw \r6, 20(sp) + .endif + .if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + sw \r7, 24(sp) + .endif + .if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + sw \r8, 28(sp) + .endif + .if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + sw \r9, 32(sp) + .endif + .if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + sw \r10, 36(sp) + .endif + .if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + sw \r11, 40(sp) + .endif + .if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + sw \r12, 44(sp) + .endif + .if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + sw \r13, 48(sp) + .endif + .if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + sw \r14, 52(sp) + .endif +.endm + +/* + * Restores set of registers from stack. Maximum number of registers that + * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7). + * Stack offset is number of bytes that are added to stack pointer (sp) + * after registers are restored (offset must be multiple of 4, and must + * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is + * intended to be used in combination with RESTORE_REGS_FROM_STACK macro. + * Example: + * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1 + * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1 + */ +.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \ + r2 = 0, r3 = 0, r4 = 0, \ + r5 = 0, r6 = 0, r7 = 0, \ + r8 = 0, r9 = 0, r10 = 0, \ + r11 = 0, r12 = 0, r13 = 0, \ + r14 = 0 + .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4) + .error "Stack offset must be pozitive and multiple of 4." + .endif + lw \r1, 0(sp) + .if \r2 != 0 + lw \r2, 4(sp) + .endif + .if \r3 != 0 + lw \r3, 8(sp) + .endif + .if \r4 != 0 + lw \r4, 12(sp) + .endif + .if \r5 != 0 + CHECK_STACK_OFFSET 5, \stack_offset + lw \r5, 16(sp) + .endif + .if \r6 != 0 + CHECK_STACK_OFFSET 6, \stack_offset + lw \r6, 20(sp) + .endif + .if \r7 != 0 + CHECK_STACK_OFFSET 7, \stack_offset + lw \r7, 24(sp) + .endif + .if \r8 != 0 + CHECK_STACK_OFFSET 8, \stack_offset + lw \r8, 28(sp) + .endif + .if \r9 != 0 + CHECK_STACK_OFFSET 9, \stack_offset + lw \r9, 32(sp) + .endif + .if \r10 != 0 + CHECK_STACK_OFFSET 10, \stack_offset + lw \r10, 36(sp) + .endif + .if \r11 != 0 + CHECK_STACK_OFFSET 11, \stack_offset + lw \r11, 40(sp) + .endif + .if \r12 != 0 + CHECK_STACK_OFFSET 12, \stack_offset + lw \r12, 44(sp) + .endif + .if \r13 != 0 + CHECK_STACK_OFFSET 13, \stack_offset + lw \r13, 48(sp) + .endif + .if \r14 != 0 + CHECK_STACK_OFFSET 14, \stack_offset + lw \r14, 52(sp) + .endif + .if \stack_offset != 0 + addiu sp, sp, \stack_offset + .endif +.endm + + |