author     dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db>  2013-10-09 18:39:44 +0000
committer  dcommander <dcommander@632fc199-4ca6-4c93-a231-07263d6284db>  2013-10-09 18:39:44 +0000
commit     57346ead7f083755eb2bd13eeed4f79f97b4685c (patch)
tree       aa400afda6304977695e298121f9e6e7fc252137 /simd
parent     79b101d78c777df7968cb4773919c421298ebae3 (diff)
SIMD-accelerated floating point quantize and convsamp routines for MIPS DSPr2
git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1058 632fc199-4ca6-4c93-a231-07263d6284db
Diffstat (limited to 'simd')
-rw-r--r--  simd/jsimd.h                  |   8
-rw-r--r--  simd/jsimd_mips.c             |  38
-rw-r--r--  simd/jsimd_mips_dspr2.S       | 440
-rw-r--r--  simd/jsimd_mips_dspr2_asm.h   |  33
4 files changed, 519 insertions(+), 0 deletions(-)
diff --git a/simd/jsimd.h b/simd/jsimd.h
index ec32a8f..e5c5d44 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -663,6 +663,10 @@ EXTERN(void) jsimd_convsamp_float_sse2 JPP((JSAMPARRAY sample_data,
JDIMENSION start_col,
FAST_FLOAT * workspace));
+EXTERN(void) jsimd_convsamp_float_mips_dspr2 JPP((JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ FAST_FLOAT * workspace));
+
/* SIMD Forward DCT */
EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
@@ -711,6 +715,10 @@ EXTERN(void) jsimd_quantize_float_sse2 JPP((JCOEFPTR coef_block,
FAST_FLOAT * divisors,
FAST_FLOAT * workspace));
+EXTERN(void) jsimd_quantize_float_mips_dspr2 JPP((JCOEFPTR coef_block,
+ FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace));
+
/* SIMD Reduced Inverse DCT */
EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
JCOEFPTR coef_block,
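
The two prototypes added above follow the same shape as the existing MMX/SSE2 entry points: the convsamp routine takes an array of row pointers plus a starting column and fills a DCTSIZE2-element FAST_FLOAT workspace, and the quantize routine reads that workspace, scales it by per-coefficient divisors, and writes 16-bit coefficients. The sketch below only illustrates the buffer shapes involved; it assumes an in-tree build for a DSPr2-capable MIPS target, the include set mirrors jsimd_mips.c, and the setup is illustrative rather than taken from the library (normally only the jsimd_mips.c dispatcher calls these symbols directly).

/* Sketch only: buffer shapes expected by the new DSPr2 entry points.
 * Assumes an 8-bit-sample, in-tree build; include paths follow the
 * pattern used by simd/jsimd_mips.c and may need adjusting. */
#define JPEG_INTERNALS
#include "../jinclude.h"
#include "../jpeglib.h"
#include "../jdct.h"     /* FAST_FLOAT */
#include "jsimd.h"       /* the prototypes added above */

static void
one_block_example (JSAMPARRAY sample_data, JDIMENSION start_col,
                   FAST_FLOAT * divisors)
{
  FAST_FLOAT workspace[DCTSIZE2];   /* 64 floats, filled row by row */
  JCOEF coef_block[DCTSIZE2];       /* 64 signed 16-bit coefficients */

  /* Level-shift the 8x8 block starting at start_col into the workspace. */
  jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace);

  /* ...a float forward DCT would transform the workspace here... */

  /* Scale by the reciprocal divisors and round to JCOEFs. */
  jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace);
}
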
diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
index f3b5afe..d8d6b19 100644
--- a/simd/jsimd_mips.c
+++ b/simd/jsimd_mips.c
@@ -459,6 +459,23 @@ jsimd_can_convsamp (void)
GLOBAL(int)
jsimd_can_convsamp_float (void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_MIPS_DSPR2))
+ return 1;
+
return 0;
}
@@ -472,6 +489,8 @@ GLOBAL(void)
jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
FAST_FLOAT * workspace)
{
+ if ((simd_support & JSIMD_MIPS_DSPR2))
+ jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace);
}
GLOBAL(int)
@@ -555,6 +574,23 @@ jsimd_can_quantize (void)
GLOBAL(int)
jsimd_can_quantize_float (void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_MIPS_DSPR2))
+ return 1;
+
return 0;
}
@@ -570,6 +606,8 @@ GLOBAL(void)
jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
FAST_FLOAT * workspace)
{
+ if ((simd_support & JSIMD_MIPS_DSPR2))
+ jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace);
}
GLOBAL(int)
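
The jsimd_can_convsamp_float() and jsimd_can_quantize_float() additions above gate the new code on the layout the assembly assumes (8x8 DCT, 16-bit JCOEF, 8-bit samples, 32-bit JDIMENSION) plus the JSIMD_MIPS_DSPR2 runtime flag, and the corresponding jsimd_* wrappers simply forward to the DSPr2 routines. A caller is expected to test the capability and otherwise fall back to plain C. The sketch below shows that pattern for the quantization step; the scalar fallback reproduces the arithmetic the assembly performs (multiply by the divisor, add 16384.5, truncate, subtract 16384) but is written here for illustration, not copied verbatim from the library, and in a real encoder the capability check would be done once at setup rather than per block.

/* Sketch: capability-checked dispatch for float quantization.
 * The fallback loop mirrors what jsimd_quantize_float_mips_dspr2
 * computes; it is illustrative, not library code. */
#define JPEG_INTERNALS
#include "../jinclude.h"
#include "../jpeglib.h"
#include "../jdct.h"
#include "../jsimddct.h"   /* jsimd_can_quantize_float(), jsimd_quantize_float() */

static void
quantize_block_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
                      FAST_FLOAT * workspace)
{
  if (jsimd_can_quantize_float()) {
    jsimd_quantize_float(coef_block, divisors, workspace);
  } else {
    int i;
    for (i = 0; i < DCTSIZE2; i++) {
      FAST_FLOAT temp = workspace[i] * divisors[i];
      /* Bias, truncate, un-bias: rounds to the nearest integer. */
      coef_block[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
    }
  }
}
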
diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S
index bfedae7..d478a6d 100644
--- a/simd/jsimd_mips_dspr2.S
+++ b/simd/jsimd_mips_dspr2.S
@@ -1666,6 +1666,86 @@ LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
END(jsimd_quantize_mips_dspr2)
/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
+/*
+ * a0 - coef_block
+ * a1 - divisors
+ * a2 - workspace
+ */
+
+ .set at
+
+ li t1, 0x46800100 //integer representation 16384.5
+ mtc1 t1, f0
+ li t0, 63
+0:
+ lwc1 f1, 0(a2)
+ lwc1 f5, 0(a1)
+ lwc1 f2, 4(a2)
+ lwc1 f6, 4(a1)
+ lwc1 f3, 8(a2)
+ lwc1 f7, 8(a1)
+ lwc1 f4, 12(a2)
+ lwc1 f8, 12(a1)
+ madd.s f1, f0, f1, f5
+ madd.s f2, f0, f2, f6
+ madd.s f3, f0, f3, f7
+ madd.s f4, f0, f4, f8
+ lwc1 f5, 16(a1)
+ lwc1 f6, 20(a1)
+ trunc.w.s f1, f1
+ trunc.w.s f2, f2
+ trunc.w.s f3, f3
+ trunc.w.s f4, f4
+ lwc1 f7, 24(a1)
+ lwc1 f8, 28(a1)
+ mfc1 t1, f1
+ mfc1 t2, f2
+ mfc1 t3, f3
+ mfc1 t4, f4
+ lwc1 f1, 16(a2)
+ lwc1 f2, 20(a2)
+ lwc1 f3, 24(a2)
+ lwc1 f4, 28(a2)
+ madd.s f1, f0, f1, f5
+ madd.s f2, f0, f2, f6
+ madd.s f3, f0, f3, f7
+ madd.s f4, f0, f4, f8
+ addiu t1, t1, -16384
+ addiu t2, t2, -16384
+ addiu t3, t3, -16384
+ addiu t4, t4, -16384
+ trunc.w.s f1, f1
+ trunc.w.s f2, f2
+ trunc.w.s f3, f3
+ trunc.w.s f4, f4
+ sh t1, 0(a0)
+ sh t2, 2(a0)
+ sh t3, 4(a0)
+ sh t4, 6(a0)
+ mfc1 t1, f1
+ mfc1 t2, f2
+ mfc1 t3, f3
+ mfc1 t4, f4
+ addiu t0, t0, -8
+ addiu a2, a2, 32
+ addiu a1, a1, 32
+ addiu t1, t1, -16384
+ addiu t2, t2, -16384
+ addiu t3, t3, -16384
+ addiu t4, t4, -16384
+ sh t1, 8(a0)
+ sh t2, 10(a0)
+ sh t3, 12(a0)
+ sh t4, 14(a0)
+ bgez t0, 0b
+ addiu a0, a0, 16
+
+ j ra
+ nop
+
+END(jsimd_quantize_float_mips_dspr2)
+/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
/*
* a0 - compptr->dct_table
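
A note on the constant loaded at the top of jsimd_quantize_float_mips_dspr2 above: 0x46800100 is the IEEE-754 single-precision bit pattern of 16384.5. The DCT output scaled by the divisors is expected to stay well inside (-16384, +16384), so madd.s adds the 16384.5 bias to make the value positive, trunc.w.s truncates it (which now rounds to the nearest integer, ties upward), and the later addiu ..., -16384 removes the bias again, leaving a correctly rounded signed coefficient. The small host-side check below only verifies that bit pattern and the rounding behaviour; it is not part of the library.

/* Illustrative sanity check of the 16384.5 bias trick (plain C99). */
#include <assert.h>
#include <stdint.h>
#include <string.h>

int main (void)
{
  float bias = 16384.5f;
  uint32_t bits;

  memcpy(&bits, &bias, sizeof(bits));
  assert(bits == 0x46800100u);            /* the constant in the assembly */

  /* trunc(x + 16384.5) - 16384 rounds x to the nearest integer: */
  assert((int) ( 2.3f + bias) - 16384 ==  2);   /* 16386.8 -> 16386 */
  assert((int) (-2.7f + bias) - 16384 == -3);   /* 16381.8 -> 16381 */
  return 0;
}
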
@@ -2733,3 +2813,363 @@ LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
END(jsimd_idct_12x12_pass2_mips_dspr2)
/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
+/*
+ * a0 - sample_data
+ * a1 - start_col
+ * a2 - workspace
+ */
+
+ .set at
+
+ lw t0, 0(a0)
+ addu t0, t0, a1
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f1
+ mtc1 t2, f2
+ mtc1 t3, f3
+ mtc1 t4, f4
+ mtc1 t5, f5
+ mtc1 t6, f6
+ mtc1 t7, f7
+ mtc1 t8, f8
+ cvt.s.w f1, f1
+ cvt.s.w f2, f2
+ cvt.s.w f3, f3
+ cvt.s.w f4, f4
+ cvt.s.w f5, f5
+ cvt.s.w f6, f6
+ cvt.s.w f7, f7
+ cvt.s.w f8, f8
+ lw t0, 4(a0)
+ swc1 f1, 0(a2)
+ swc1 f2, 4(a2)
+ swc1 f3, 8(a2)
+ addu t0, t0, a1
+ swc1 f4, 12(a2)
+ swc1 f5, 16(a2)
+ swc1 f6, 20(a2)
+ swc1 f7, 24(a2)
+ swc1 f8, 28(a2)
+ //elemr 1
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f1
+ mtc1 t2, f2
+ mtc1 t3, f3
+ mtc1 t4, f4
+ mtc1 t5, f5
+ mtc1 t6, f6
+ mtc1 t7, f7
+ mtc1 t8, f8
+ cvt.s.w f1, f1
+ cvt.s.w f2, f2
+ cvt.s.w f3, f3
+ cvt.s.w f4, f4
+ cvt.s.w f5, f5
+ cvt.s.w f6, f6
+ cvt.s.w f7, f7
+ cvt.s.w f8, f8
+ lw t0, 8(a0)
+ swc1 f1, 32(a2)
+ swc1 f2, 36(a2)
+ swc1 f3, 40(a2)
+ addu t0, t0, a1
+ swc1 f4, 44(a2)
+ swc1 f5, 48(a2)
+ swc1 f6, 52(a2)
+ swc1 f7, 56(a2)
+ swc1 f8, 60(a2)
+ //elemr 2
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f1
+ mtc1 t2, f2
+ mtc1 t3, f3
+ mtc1 t4, f4
+ mtc1 t5, f5
+ mtc1 t6, f6
+ mtc1 t7, f7
+ mtc1 t8, f8
+ cvt.s.w f1, f1
+ cvt.s.w f2, f2
+ cvt.s.w f3, f3
+ cvt.s.w f4, f4
+ cvt.s.w f5, f5
+ cvt.s.w f6, f6
+ cvt.s.w f7, f7
+ cvt.s.w f8, f8
+ lw t0, 12(a0)
+ swc1 f1, 64(a2)
+ swc1 f2, 68(a2)
+ swc1 f3, 72(a2)
+ addu t0, t0, a1
+ swc1 f4, 76(a2)
+ swc1 f5, 80(a2)
+ swc1 f6, 84(a2)
+ swc1 f7, 88(a2)
+ swc1 f8, 92(a2)
+ //elemr 3
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f1
+ mtc1 t2, f2
+ mtc1 t3, f3
+ mtc1 t4, f4
+ mtc1 t5, f5
+ mtc1 t6, f6
+ mtc1 t7, f7
+ mtc1 t8, f8
+ cvt.s.w f1, f1
+ cvt.s.w f2, f2
+ cvt.s.w f3, f3
+ cvt.s.w f4, f4
+ cvt.s.w f5, f5
+ cvt.s.w f6, f6
+ cvt.s.w f7, f7
+ cvt.s.w f8, f8
+ lw t0, 16(a0)
+ swc1 f1, 96(a2)
+ swc1 f2, 100(a2)
+ swc1 f3, 104(a2)
+ addu t0, t0, a1
+ swc1 f4, 108(a2)
+ swc1 f5, 112(a2)
+ swc1 f6, 116(a2)
+ swc1 f7, 120(a2)
+ swc1 f8, 124(a2)
+ //elemr 4
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f1
+ mtc1 t2, f2
+ mtc1 t3, f3
+ mtc1 t4, f4
+ mtc1 t5, f5
+ mtc1 t6, f6
+ mtc1 t7, f7
+ mtc1 t8, f8
+ cvt.s.w f1, f1
+ cvt.s.w f2, f2
+ cvt.s.w f3, f3
+ cvt.s.w f4, f4
+ cvt.s.w f5, f5
+ cvt.s.w f6, f6
+ cvt.s.w f7, f7
+ cvt.s.w f8, f8
+ lw t0, 20(a0)
+ swc1 f1, 128(a2)
+ swc1 f2, 132(a2)
+ swc1 f3, 136(a2)
+ addu t0, t0, a1
+ swc1 f4, 140(a2)
+ swc1 f5, 144(a2)
+ swc1 f6, 148(a2)
+ swc1 f7, 152(a2)
+ swc1 f8, 156(a2)
+ //elemr 5
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f1
+ mtc1 t2, f2
+ mtc1 t3, f3
+ mtc1 t4, f4
+ mtc1 t5, f5
+ mtc1 t6, f6
+ mtc1 t7, f7
+ mtc1 t8, f8
+ cvt.s.w f1, f1
+ cvt.s.w f2, f2
+ cvt.s.w f3, f3
+ cvt.s.w f4, f4
+ cvt.s.w f5, f5
+ cvt.s.w f6, f6
+ cvt.s.w f7, f7
+ cvt.s.w f8, f8
+ lw t0, 24(a0)
+ swc1 f1, 160(a2)
+ swc1 f2, 164(a2)
+ swc1 f3, 168(a2)
+ addu t0, t0, a1
+ swc1 f4, 172(a2)
+ swc1 f5, 176(a2)
+ swc1 f6, 180(a2)
+ swc1 f7, 184(a2)
+ swc1 f8, 188(a2)
+ //elemr 6
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f1
+ mtc1 t2, f2
+ mtc1 t3, f3
+ mtc1 t4, f4
+ mtc1 t5, f5
+ mtc1 t6, f6
+ mtc1 t7, f7
+ mtc1 t8, f8
+ cvt.s.w f1, f1
+ cvt.s.w f2, f2
+ cvt.s.w f3, f3
+ cvt.s.w f4, f4
+ cvt.s.w f5, f5
+ cvt.s.w f6, f6
+ cvt.s.w f7, f7
+ cvt.s.w f8, f8
+ lw t0, 28(a0)
+ swc1 f1, 192(a2)
+ swc1 f2, 196(a2)
+ swc1 f3, 200(a2)
+ addu t0, t0, a1
+ swc1 f4, 204(a2)
+ swc1 f5, 208(a2)
+ swc1 f6, 212(a2)
+ swc1 f7, 216(a2)
+ swc1 f8, 220(a2)
+ //elemr 7
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f1
+ mtc1 t2, f2
+ mtc1 t3, f3
+ mtc1 t4, f4
+ mtc1 t5, f5
+ mtc1 t6, f6
+ mtc1 t7, f7
+ mtc1 t8, f8
+ cvt.s.w f1, f1
+ cvt.s.w f2, f2
+ cvt.s.w f3, f3
+ cvt.s.w f4, f4
+ cvt.s.w f5, f5
+ cvt.s.w f6, f6
+ cvt.s.w f7, f7
+ cvt.s.w f8, f8
+ swc1 f1, 224(a2)
+ swc1 f2, 228(a2)
+ swc1 f3, 232(a2)
+ swc1 f4, 236(a2)
+ swc1 f5, 240(a2)
+ swc1 f6, 244(a2)
+ swc1 f7, 248(a2)
+ swc1 f8, 252(a2)
+
+ j ra
+ nop
+
+END(jsimd_convsamp_float_mips_dspr2)
+
+/*****************************************************************************/
+
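jsimd_convsamp_float_mips_dspr2 above is a fully unrolled version of the sample-conversion step: for each of the eight rows it adds start_col to the row pointer, loads eight unsigned bytes, subtracts the 128 level shift (CENTERJSAMPLE for 8-bit samples), converts each value to single precision, and stores it into the next eight workspace slots. A compact scalar equivalent, for reference only (not the library's C implementation verbatim), looks like this:

/* Scalar reference for the unrolled assembly above (illustrative). */
#define JPEG_INTERNALS
#include "../jinclude.h"
#include "../jpeglib.h"
#include "../jdct.h"

static void
convsamp_float_ref (JSAMPARRAY sample_data, JDIMENSION start_col,
                    FAST_FLOAT * workspace)
{
  FAST_FLOAT *wsptr = workspace;
  int row, col;

  for (row = 0; row < DCTSIZE; row++) {
    JSAMPROW elemptr = sample_data[row] + start_col;   /* lw + addu */
    for (col = 0; col < DCTSIZE; col++)                /* lbu x8    */
      /* addiu -128, then mtc1/cvt.s.w, then swc1 in the assembly */
      *wsptr++ = (FAST_FLOAT) (GETJSAMPLE(elemptr[col]) - CENTERJSAMPLE);
  }
}
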
diff --git a/simd/jsimd_mips_dspr2_asm.h b/simd/jsimd_mips_dspr2_asm.h
index 53cf2bc..50ec31b 100644
--- a/simd/jsimd_mips_dspr2_asm.h
+++ b/simd/jsimd_mips_dspr2_asm.h
@@ -56,6 +56,39 @@
#define s8 $30
#define ra $31
+#define f0 $f0
+#define f1 $f1
+#define f2 $f2
+#define f3 $f3
+#define f4 $f4
+#define f5 $f5
+#define f6 $f6
+#define f7 $f7
+#define f8 $f8
+#define f9 $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
/*
* LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
*/