aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--jcdctmgr.c174
-rw-r--r--jdct.h7
2 files changed, 149 insertions, 32 deletions
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 75d48e0..539ccc4 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -2,6 +2,7 @@
* jcdctmgr.c
*
* Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
@@ -65,6 +66,128 @@ typedef my_fdct_controller * my_fdct_ptr;
/*
+ * Find the highest bit in an integer through binary search.
+ */
+LOCAL(int)
+fls (UINT16 val)
+{
+ int bit;
+
+ bit = 16;
+
+ if (!val)
+ return 0;
+
+ if (!(val & 0xff00)) {
+ bit -= 8;
+ val <<= 8;
+ }
+ if (!(val & 0xf000)) {
+ bit -= 4;
+ val <<= 4;
+ }
+ if (!(val & 0xc000)) {
+ bit -= 2;
+ val <<= 2;
+ }
+ if (!(val & 0x8000)) {
+ bit -= 1;
+ val <<= 1;
+ }
+
+ return bit;
+}
+
+/*
+ * Compute values to do a division using reciprocal.
+ *
+ * This implementation is based on an algorithm described in
+ * "How to optimize for the Pentium family of microprocessors"
+ * (http://www.agner.org/assem/).
+ * More information about the basic algorithm can be found in
+ * the paper "Integer Division Using Reciprocals" by Robert Alverson.
+ *
+ * The basic idea is to replace x/d by x * d^-1. In order to store
+ * d^-1 with enough precision we shift it left a few places. It turns
+ * out that this algoright gives just enough precision, and also fits
+ * into DCTELEM:
+ *
+ * b = (the number of significant bits in divisor) - 1
+ * r = (word size) + b
+ * f = 2^r / divisor
+ *
+ * f will not be an integer for most cases, so we need to compensate
+ * for the rounding error introduced:
+ *
+ * no fractional part:
+ *
+ * result = input >> r
+ *
+ * fractional part of f < 0.5:
+ *
+ * round f down to nearest integer
+ * result = ((input + 1) * f) >> r
+ *
+ * fractional part of f > 0.5:
+ *
+ * round f up to nearest integer
+ * result = (input * f) >> r
+ *
+ * This is the original algorithm that gives truncated results. But we
+ * want properly rounded results, so we replace "input" with
+ * "input + divisor/2".
+ *
+ * In order to allow SIMD implementations we also tweak the values to
+ * allow the same calculation to be made at all times:
+ *
+ * dctbl[0] = f rounded to nearest integer
+ * dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
+ * dctbl[2] = 1 << ((word size) * 2 - r)
+ * dctbl[3] = r - (word size)
+ *
+ * dctbl[2] is for stupid instruction sets where the shift operation
+ * isn't member wise (e.g. MMX).
+ *
+ * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
+ * is that most SIMD implementations have a "multiply and store top
+ * half" operation.
+ *
+ * Lastly, we store each of the values in their own table instead
+ * of in a consecutive manner, yet again in order to allow SIMD
+ * routines.
+ */
+LOCAL(void)
+compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
+{
+ UDCTELEM2 fq, fr;
+ UDCTELEM c;
+ int b, r;
+
+ b = fls(divisor) - 1;
+ r = sizeof(DCTELEM) * 8 + b;
+
+ fq = ((UDCTELEM2)1 << r) / divisor;
+ fr = ((UDCTELEM2)1 << r) % divisor;
+
+ c = divisor / 2; /* for rounding */
+
+ if (fr == 0) { /* divisor is power of two */
+ /* fq will be one bit too large to fit in DCTELEM, so adjust */
+ fq >>= 1;
+ r--;
+ } else if (fr <= (divisor / 2)) { /* fractional part is < 0.5 */
+ c++;
+ } else { /* fractional part is > 0.5 */
+ fq++;
+ }
+
+ dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */
+ dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
+ dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */
+ dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
+}
+
+/*
* Initialize for a processing pass.
* Verify that all referenced Q-tables are present, and set up
* the divisor table for each one.
@@ -101,11 +224,11 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
if (fdct->divisors[qtblno] == NULL) {
fdct->divisors[qtblno] = (DCTELEM *)
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- DCTSIZE2 * SIZEOF(DCTELEM));
+ (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
}
dtbl = fdct->divisors[qtblno];
for (i = 0; i < DCTSIZE2; i++) {
- dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+ compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
}
break;
#endif
@@ -135,14 +258,14 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
if (fdct->divisors[qtblno] == NULL) {
fdct->divisors[qtblno] = (DCTELEM *)
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- DCTSIZE2 * SIZEOF(DCTELEM));
+ (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
}
dtbl = fdct->divisors[qtblno];
for (i = 0; i < DCTSIZE2; i++) {
- dtbl[i] = (DCTELEM)
+ compute_reciprocal(
DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
(INT32) aanscales[i]),
- CONST_BITS-3);
+ CONST_BITS-3), &dtbl[i]);
}
}
break;
@@ -233,41 +356,30 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
METHODDEF(void)
quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
{
- register DCTELEM temp, qval;
- register int i;
- register JCOEFPTR output_ptr = coef_block;
+ int i;
+ DCTELEM temp;
+ UDCTELEM recip, corr, shift;
+ UDCTELEM2 product;
+ JCOEFPTR output_ptr = coef_block;
for (i = 0; i < DCTSIZE2; i++) {
- qval = divisors[i];
temp = workspace[i];
-
- /* Divide the coefficient value by qval, ensuring proper rounding.
- * Since C does not specify the direction of rounding for negative
- * quotients, we have to force the dividend positive for portability.
- *
- * In most files, at least half of the output values will be zero
- * (at default quantization settings, more like three-quarters...)
- * so we should ensure that this case is fast. On many machines,
- * a comparison is enough cheaper than a divide to make a special test
- * a win. Since both inputs will be nonnegative, we need only test
- * for a < b to discover whether a/b is 0.
- * If your machine's division is fast enough, define FAST_DIVIDE.
- */
-#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b) a /= b
-#else
-#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0
-#endif
+ recip = divisors[i + DCTSIZE2 * 0];
+ corr = divisors[i + DCTSIZE2 * 1];
+ shift = divisors[i + DCTSIZE2 * 3];
if (temp < 0) {
temp = -temp;
- temp += qval>>1; /* for rounding */
- DIVIDE_BY(temp, qval);
+ product = (UDCTELEM2)(temp + corr) * recip;
+ product >>= shift + sizeof(DCTELEM)*8;
+ temp = product;
temp = -temp;
} else {
- temp += qval>>1; /* for rounding */
- DIVIDE_BY(temp, qval);
+ product = (UDCTELEM2)(temp + corr) * recip;
+ product >>= shift + sizeof(DCTELEM)*8;
+ temp = product;
}
+
output_ptr[i] = (JCOEF) temp;
}
}
diff --git a/jdct.h b/jdct.h
index c3650ea..c834410 100644
--- a/jdct.h
+++ b/jdct.h
@@ -23,13 +23,18 @@
* have a range of +-8K for 8-bit data, +-128K for 12-bit data. This
* convention improves accuracy in integer implementations and saves some
* work in floating-point ones.
- * Quantization of the output coefficients is done by jcdctmgr.c.
+ * Quantization of the output coefficients is done by jcdctmgr.c. This
+ * step requires an unsigned type and also one with twice the bits.
*/
#if BITS_IN_JSAMPLE == 8
typedef int DCTELEM; /* 16 or 32 bits is fine */
+typedef unsigned int UDCTELEM;
+typedef unsigned long long UDCTELEM2;
#else
typedef INT32 DCTELEM; /* must have 32 bits */
+typedef UINT32 UDCTELEM;
+typedef unsigned long long UDCTELEM2;
#endif