2 files changed, 149 insertions, 32 deletions
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 75d48e0..539ccc4 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -2,6 +2,7 @@
  * jcdctmgr.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
@@ -65,6 +66,128 @@ typedef my_fdct_controller * my_fdct_ptr;
 
 
 /*
+ * Find the highest bit in an integer through binary search.
+ */
+LOCAL(int)
+fls (UINT16 val)
+{
+  int bit;
+
+  bit = 16;
+
+  if (!val)
+    return 0;
+
+  if (!(val & 0xff00)) {
+    bit -= 8;
+    val <<= 8;
+  }
+  if (!(val & 0xf000)) {
+    bit -= 4;
+    val <<= 4;
+  }
+  if (!(val & 0xc000)) {
+    bit -= 2;
+    val <<= 2;
+  }
+  if (!(val & 0x8000)) {
+    bit -= 1;
+    val <<= 1;
+  }
+
+  return bit;
+}
+
+/*
+ * Compute values to do a division using reciprocal.
+ *
+ * This implementation is based on an algorithm described in
+ *   "How to optimize for the Pentium family of microprocessors"
+ *   (http://www.agner.org/assem/).
+ * More information about the basic algorithm can be found in
+ * the paper "Integer Division Using Reciprocals" by Robert Alverson.
+ *
+ * The basic idea is to replace x/d by x * d^-1. In order to store
+ * d^-1 with enough precision we shift it left a few places. It turns
+ * out that this algoright gives just enough precision, and also fits
+ * into DCTELEM:
+ *
+ *   b = (the number of significant bits in divisor) - 1
+ *   r = (word size) + b
+ *   f = 2^r / divisor
+ *
+ * f will not be an integer for most cases, so we need to compensate
+ * for the rounding error introduced:
+ *
+ *   no fractional part:
+ *
+ *       result = input >> r
+ *
+ *   fractional part of f < 0.5:
+ *
+ *       round f down to nearest integer
+ *       result = ((input + 1) * f) >> r
+ *
+ *   fractional part of f > 0.5:
+ *
+ *       round f up to nearest integer
+ *       result = (input * f) >> r
+ *
+ * This is the original algorithm that gives truncated results. But we
+ * want properly rounded results, so we replace "input" with
+ * "input + divisor/2".
+ *
+ * In order to allow SIMD implementations we also tweak the values to
+ * allow the same calculation to be made at all times:
+ * 
+ *   dctbl[0] = f rounded to nearest integer
+ *   dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
+ *   dctbl[2] = 1 << ((word size) * 2 - r)
+ *   dctbl[3] = r - (word size)
+ *
+ * dctbl[2] is for stupid instruction sets where the shift operation
+ * isn't member wise (e.g. MMX).
+ *
+ * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
+ * is that most SIMD implementations have a "multiply and store top
+ * half" operation.
+ *
+ * Lastly, we store each of the values in their own table instead
+ * of in a consecutive manner, yet again in order to allow SIMD
+ * routines.
+ */
+LOCAL(void)
+compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
+{
+  UDCTELEM2 fq, fr;
+  UDCTELEM c;
+  int b, r;
+
+  b = fls(divisor) - 1;
+  r  = sizeof(DCTELEM) * 8 + b;
+
+  fq = ((UDCTELEM2)1 << r) / divisor;
+  fr = ((UDCTELEM2)1 << r) % divisor;
+
+  c = divisor / 2; /* for rounding */
+
+  if (fr == 0) { /* divisor is power of two */
+    /* fq will be one bit too large to fit in DCTELEM, so adjust */
+    fq >>= 1;
+    r--;
+  } else if (fr <= (divisor / 2)) { /* fractional part is < 0.5 */
+    c++;
+  } else { /* fractional part is > 0.5 */
+    fq++;
+  }
+
+  dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;      /* reciprocal */
+  dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
+  dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
+  dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
+}
+
+/*
  * Initialize for a processing pass.
  * Verify that all referenced Q-tables are present, and set up
  * the divisor table for each one.
@@ -101,11 +224,11 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
       if (fdct->divisors[qtblno] == NULL) {
 	fdct->divisors[qtblno] = (DCTELEM *)
 	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      DCTSIZE2 * SIZEOF(DCTELEM));
+				      (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
       }
       dtbl = fdct->divisors[qtblno];
       for (i = 0; i < DCTSIZE2; i++) {
-	dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+	compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
       }
       break;
 #endif
@@ -135,14 +258,14 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 	if (fdct->divisors[qtblno] == NULL) {
 	  fdct->divisors[qtblno] = (DCTELEM *)
 	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-					DCTSIZE2 * SIZEOF(DCTELEM));
+					(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
 	}
 	dtbl = fdct->divisors[qtblno];
 	for (i = 0; i < DCTSIZE2; i++) {
-	  dtbl[i] = (DCTELEM)
+	  compute_reciprocal(
 	    DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
 				  (INT32) aanscales[i]),
-		    CONST_BITS-3);
+		    CONST_BITS-3), &dtbl[i]);
 	}
       }
       break;
@@ -233,41 +356,30 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
 METHODDEF(void)
 quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
 {
-  register DCTELEM temp, qval;
-  register int i;
-  register JCOEFPTR output_ptr = coef_block;
+  int i;
+  DCTELEM temp;
+  UDCTELEM recip, corr, shift;
+  UDCTELEM2 product;
+  JCOEFPTR output_ptr = coef_block;
 
   for (i = 0; i < DCTSIZE2; i++) {
-    qval = divisors[i];
     temp = workspace[i];
-
-    /* Divide the coefficient value by qval, ensuring proper rounding.
-     * Since C does not specify the direction of rounding for negative
-     * quotients, we have to force the dividend positive for portability.
-     *
-     * In most files, at least half of the output values will be zero
-     * (at default quantization settings, more like three-quarters...)
-     * so we should ensure that this case is fast.  On many machines,
-     * a comparison is enough cheaper than a divide to make a special test
-     * a win.  Since both inputs will be nonnegative, we need only test
-     * for a < b to discover whether a/b is 0.
-     * If your machine's division is fast enough, define FAST_DIVIDE.
-     */
-#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b)	a /= b
-#else
-#define DIVIDE_BY(a,b)	if (a >= b) a /= b; else a = 0
-#endif
+    recip = divisors[i + DCTSIZE2 * 0];
+    corr =  divisors[i + DCTSIZE2 * 1];
+    shift = divisors[i + DCTSIZE2 * 3];
 
     if (temp < 0) {
       temp = -temp;
-      temp += qval>>1;	/* for rounding */
-      DIVIDE_BY(temp, qval);
+      product = (UDCTELEM2)(temp + corr) * recip;
+      product >>= shift + sizeof(DCTELEM)*8;
+      temp = product;
       temp = -temp;
     } else {
-      temp += qval>>1;	/* for rounding */
-      DIVIDE_BY(temp, qval);
+      product = (UDCTELEM2)(temp + corr) * recip;
+      product >>= shift + sizeof(DCTELEM)*8;
+      temp = product;
     }
+
     output_ptr[i] = (JCOEF) temp;
   }
 }
diff --git a/jdct.h b/jdct.h
index c3650ea..c834410 100644
--- a/jdct.h
+++ b/jdct.h
@@ -23,13 +23,18 @@
  * have a range of +-8K for 8-bit data, +-128K for 12-bit data.  This
  * convention improves accuracy in integer implementations and saves some
  * work in floating-point ones.
- * Quantization of the output coefficients is done by jcdctmgr.c.
+ * Quantization of the output coefficients is done by jcdctmgr.c. This
+ * step requires an unsigned type and also one with twice the bits.
  */
 
 #if BITS_IN_JSAMPLE == 8
 typedef int DCTELEM;		/* 16 or 32 bits is fine */
+typedef unsigned int UDCTELEM;
+typedef unsigned long long UDCTELEM2;
 #else
 typedef INT32 DCTELEM;		/* must have 32 bits */
+typedef UINT32 UDCTELEM;
+typedef unsigned long long UDCTELEM2;
 #endif