From 5e5540254137b18be88653a7681d776f26eceb53 Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Wed, 10 Nov 2010 04:03:36 +0200 Subject: ARM assembly optimizations for 'encode_one_block' Almost 2x faster than original C variant. --- jchuff.c | 362 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 362 insertions(+) diff --git a/jchuff.c b/jchuff.c index b05c8e7..5fbc53d 100644 --- a/jchuff.c +++ b/jchuff.c @@ -2,6 +2,11 @@ * jchuff.c * * Copyright (C) 1991-1997, Thomas G. Lane. + * + * ARM optimizations + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. + * Contact: Alexander Bokovoy + * * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -505,6 +510,349 @@ flush_bits (working_state * state) return TRUE; } +/*********************************/ + +#ifdef __arm__ + +/* +#!/usr/bin/env ruby + +require 'generator' + +$jpeg_natural_order = Generator.new( +[ + 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63 +]) + +$ldr_reg = Generator.new {|g| + while true do + ["%[tmp1]", "%[tmp2]", "%[tmp3]"].each {|x| g.yield x} + end +} + +$cmp_reg = Generator.new {|g| + while true do + ["%[tmp1]", "%[tmp2]", "%[tmp3]"].each {|x| g.yield x} + end +} + +$idx = Generator.new(1.upto(1000)) + +def load + printf("\t\t\"ldrh %s, [%%[blk], #%d]\\n\"\n", + $ldr_reg.next, $jpeg_natural_order.next * 2) +end + +def store + r = $cmp_reg.next + i = $idx.next + printf("\t\t\"cmp %s, #0\\n\"\n", r) + printf("\t\t\"strh %s, [%%[out], #%d]\\n\"\n", r, i * 2) + printf("\t\t\"movne %%[n], #%d\\n\"\n", i) +end + +load +load +while $jpeg_natural_order.next? 
do + load + store +end +store +store +*/ + +/* + * Copy AC coefficients 1..63 of 'block' into 'out' in zigzag scan order, i.e. out[i] = block[jpeg_natural_order[i]] (out[0] is never written), and return the zigzag index of the last nonzero coefficient, or 0 when all of them are zero. + * tmp1..tmp3 rotate so that each ldrh is issued ahead of the cmp/strh/movne that consumes the previously loaded value; the original author scheduled the sequence for ARM Cortex-A8 dual-issue. ldrh zero-extends, which is harmless here because the value is only compared with zero and its low 16 bits stored back. The instruction stream matches what the Ruby script in the preceding comment prints. + * NOTE(review): movne is conditionally executed without an IT instruction, so this presumably assembles only in ARM state (or with implicit IT enabled in the assembler); confirm before building for Thumb. + */ +LOCAL(int) +find_last_nonzero_index (JCOEFPTR block, JCOEFPTR out) +{ + int tmp1, tmp2, tmp3, n = 0; + asm volatile ( + "ldrh %[tmp1], [%[blk], #2]\n" + "ldrh %[tmp2], [%[blk], #16]\n" + "ldrh %[tmp3], [%[blk], #32]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #2]\n" + "movne %[n], #1\n" + "ldrh %[tmp1], [%[blk], #18]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #4]\n" + "movne %[n], #2\n" + "ldrh %[tmp2], [%[blk], #4]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #6]\n" + "movne %[n], #3\n" + "ldrh %[tmp3], [%[blk], #6]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #8]\n" + "movne %[n], #4\n" + "ldrh %[tmp1], [%[blk], #20]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #10]\n" + "movne %[n], #5\n" + "ldrh %[tmp2], [%[blk], #34]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #12]\n" + "movne %[n], #6\n" + "ldrh %[tmp3], [%[blk], #48]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #14]\n" + "movne %[n], #7\n" + "ldrh %[tmp1], [%[blk], #64]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #16]\n" + "movne %[n], #8\n" + "ldrh %[tmp2], [%[blk], #50]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #18]\n" + "movne %[n], #9\n" + "ldrh %[tmp3], [%[blk], #36]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #20]\n" + "movne %[n], #10\n" + "ldrh %[tmp1], [%[blk], #22]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #22]\n" + "movne %[n], #11\n" + "ldrh %[tmp2], [%[blk], #8]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #24]\n" + "movne %[n], #12\n" + "ldrh %[tmp3], [%[blk], #10]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #26]\n" + "movne %[n], #13\n" + "ldrh %[tmp1], [%[blk], #24]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #28]\n" + "movne %[n], #14\n" + "ldrh %[tmp2], [%[blk], #38]\n" + "cmp %[tmp3],
#0\n" + "strh %[tmp3], [%[out], #30]\n" + "movne %[n], #15\n" + "ldrh %[tmp3], [%[blk], #52]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #32]\n" + "movne %[n], #16\n" + "ldrh %[tmp1], [%[blk], #66]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #34]\n" + "movne %[n], #17\n" + "ldrh %[tmp2], [%[blk], #80]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #36]\n" + "movne %[n], #18\n" + "ldrh %[tmp3], [%[blk], #96]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #38]\n" + "movne %[n], #19\n" + "ldrh %[tmp1], [%[blk], #82]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #40]\n" + "movne %[n], #20\n" + "ldrh %[tmp2], [%[blk], #68]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #42]\n" + "movne %[n], #21\n" + "ldrh %[tmp3], [%[blk], #54]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #44]\n" + "movne %[n], #22\n" + "ldrh %[tmp1], [%[blk], #40]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #46]\n" + "movne %[n], #23\n" + "ldrh %[tmp2], [%[blk], #26]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #48]\n" + "movne %[n], #24\n" + "ldrh %[tmp3], [%[blk], #12]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #50]\n" + "movne %[n], #25\n" + "ldrh %[tmp1], [%[blk], #14]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #52]\n" + "movne %[n], #26\n" + "ldrh %[tmp2], [%[blk], #28]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #54]\n" + "movne %[n], #27\n" + "ldrh %[tmp3], [%[blk], #42]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #56]\n" + "movne %[n], #28\n" + "ldrh %[tmp1], [%[blk], #56]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #58]\n" + "movne %[n], #29\n" + "ldrh %[tmp2], [%[blk], #70]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #60]\n" + "movne %[n], #30\n" + "ldrh %[tmp3], [%[blk], #84]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #62]\n" + "movne %[n], #31\n" + "ldrh %[tmp1], [%[blk], #98]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #64]\n" + "movne %[n], #32\n" + "ldrh %[tmp2], [%[blk],
#112]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #66]\n" + "movne %[n], #33\n" + "ldrh %[tmp3], [%[blk], #114]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #68]\n" + "movne %[n], #34\n" + "ldrh %[tmp1], [%[blk], #100]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #70]\n" + "movne %[n], #35\n" + "ldrh %[tmp2], [%[blk], #86]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #72]\n" + "movne %[n], #36\n" + "ldrh %[tmp3], [%[blk], #72]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #74]\n" + "movne %[n], #37\n" + "ldrh %[tmp1], [%[blk], #58]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #76]\n" + "movne %[n], #38\n" + "ldrh %[tmp2], [%[blk], #44]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #78]\n" + "movne %[n], #39\n" + "ldrh %[tmp3], [%[blk], #30]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #80]\n" + "movne %[n], #40\n" + "ldrh %[tmp1], [%[blk], #46]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #82]\n" + "movne %[n], #41\n" + "ldrh %[tmp2], [%[blk], #60]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #84]\n" + "movne %[n], #42\n" + "ldrh %[tmp3], [%[blk], #74]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #86]\n" + "movne %[n], #43\n" + "ldrh %[tmp1], [%[blk], #88]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #88]\n" + "movne %[n], #44\n" + "ldrh %[tmp2], [%[blk], #102]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #90]\n" + "movne %[n], #45\n" + "ldrh %[tmp3], [%[blk], #116]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #92]\n" + "movne %[n], #46\n" + "ldrh %[tmp1], [%[blk], #118]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #94]\n" + "movne %[n], #47\n" + "ldrh %[tmp2], [%[blk], #104]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #96]\n" + "movne %[n], #48\n" + "ldrh %[tmp3], [%[blk], #90]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #98]\n" + "movne %[n], #49\n" + "ldrh %[tmp1], [%[blk], #76]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #100]\n" + "movne %[n], #50\n" +
"ldrh %[tmp2], [%[blk], #62]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #102]\n" + "movne %[n], #51\n" + "ldrh %[tmp3], [%[blk], #78]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #104]\n" + "movne %[n], #52\n" + "ldrh %[tmp1], [%[blk], #92]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #106]\n" + "movne %[n], #53\n" + "ldrh %[tmp2], [%[blk], #106]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #108]\n" + "movne %[n], #54\n" + "ldrh %[tmp3], [%[blk], #120]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #110]\n" + "movne %[n], #55\n" + "ldrh %[tmp1], [%[blk], #122]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #112]\n" + "movne %[n], #56\n" + "ldrh %[tmp2], [%[blk], #108]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #114]\n" + "movne %[n], #57\n" + "ldrh %[tmp3], [%[blk], #94]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #116]\n" + "movne %[n], #58\n" + "ldrh %[tmp1], [%[blk], #110]\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #118]\n" + "movne %[n], #59\n" + "ldrh %[tmp2], [%[blk], #124]\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #120]\n" + "movne %[n], #60\n" + "ldrh %[tmp3], [%[blk], #126]\n" + "cmp %[tmp1], #0\n" + "strh %[tmp1], [%[out], #122]\n" + "movne %[n], #61\n" + "cmp %[tmp2], #0\n" + "strh %[tmp2], [%[out], #124]\n" + "movne %[n], #62\n" + "cmp %[tmp3], #0\n" + "strh %[tmp3], [%[out], #126]\n" + "movne %[n], #63\n" + : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), /* early-clobber: the temporaries are written while blk/out are still needed */ + [n] "+&r" (n) /* read-write: enters as 0, overwritten by every movne that fires */ + : [blk] "r" (block), [out] "r" (out) + : "memory", "cc"); /* the asm stores through 'out' and cmp changes the condition flags */ + return n; +} + +/* Equivalent portable C loop, kept for reference only (not compiled): +LOCAL(int) +find_last_nonzero_index (JCOEFPTR block, JCOEFPTR out) +{ + int tmp, i, n = 0; + for (i = 1; i < DCTSIZE2; i++) { + if ((tmp = block[jpeg_natural_order[i]]) != 0) + n = i; + out[i] = tmp; + } + return n; +} +*/ + +#endif + /* Encode a single block's worth of coefficients */ LOCAL(boolean) @@ -518,6 +866,10 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, size_t put_buffer; int
put_bits; int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0]; size_t bytes, bytestocopy; int localbuf = 0; +#ifdef __arm__ + int last_nonzero_index, k; + JCOEF workspace[DCTSIZE2]; +#endif put_buffer = state->cur.put_buffer; put_bits = state->cur.put_bits; @@ -552,6 +904,15 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, r = 0; \ }} +#ifdef __arm__ + last_nonzero_index = find_last_nonzero_index(block, workspace) * 2; + block = &workspace[0]; + for (k = 2; k <= last_nonzero_index; k += 2) { + innerloop(k); + } + /* If the last coef(s) were zero, emit an end-of-block code */ + if (k < DCTSIZE2 * 2) DUMP_SINGLE_VALUE(actbl, 0x0) +#else innerloop(2*1); innerloop(2*8); innerloop(2*16); innerloop(2*9); innerloop(2*2); innerloop(2*3); innerloop(2*10); innerloop(2*17); innerloop(2*24); innerloop(2*32); innerloop(2*25); innerloop(2*18); @@ -571,6 +932,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, /* If the last coef(s) were zero, emit an end-of-block code */ if (r > 0) DUMP_SINGLE_VALUE(actbl, 0x0) +#endif state->cur.put_buffer = put_buffer; state->cur.put_bits = put_bits; -- cgit v1.2.3