Copyright (c) 2006 Dean Gaudet All Rights Reserved. This will eventually be part of a larger patch under an appropriate license. Index: gsrc/jidctfst_sse2.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ gsrc/jidctfst_sse2.c 2006-08-26 23:32:55.000000000 -0700 @@ -0,0 +1,287 @@ +/* + * jidctfst_sse2.c + * + * Copyright (C) 2006, Dean Gaudet. + * Copyright (C) 1994-1998, Thomas G. Lane. + * This file is part of the Independent JPEG Group's software. + * For conditions of distribution and use, see the accompanying README file. + * + * This file is an SSE2 reimplementation of jidctfst.c. See that file for + * a more thorough description of the implementation. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdct.h" /* Private declarations for DCT subsystem */ + +#if defined(DCT_IFAST_SUPPORTED) && defined(__SSE2__) + +#include +#include + +/* + * This module is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 +#error "Sorry, this code only copes with 8x8 DCTs." +#endif + +#if BITS_IN_JSAMPLE != 8 +#error "Sorry, this code only supports BITS_IN_JSAMPLE == 8" +#endif + + +/* Scaling decisions are generally the same as in the LL&M algorithm; + * see jidctint.c for more details. However, we choose to descale + * (right shift) multiplication products as soon as they are formed, + * rather than carrying additional fractional bits into subsequent additions. + * This compromises accuracy slightly, but it lets us save a few shifts. + * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples) + * everywhere except in the multiplications proper; this saves a good deal + * of work on 16-bit-int machines. + * + * The dequantized coefficients are not integers because the AA&N scaling + * factors have been incorporated. We represent them scaled up by PASS1_BITS, + * so that the first and second IDCT rounds have the same input scaling. + * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to + * avoid a descaling shift; this compromises accuracy rather drastically + * for small quantization table entries, but it saves a lot of shifts. + * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway, + * so we use a much larger scaling factor to preserve accuracy. + * + * A final compromise is to represent the multiplicative constants to only + * 8 fractional bits, rather than 13. This saves some shifting work on some + * machines, and may also reduce the cost of multiplication (since there + * are fewer one-bits in the constants). + */ + +#define CONST_BITS 8 +#define PASS1_BITS 2 + +// a union of convenience... for creating a few constants. +typedef union { + short s[8]; + __m128i m; +} simd_const_t; +#define X(v) { .s = {v, v, v, v, v, v, v, v}} +static const simd_const_t FIX_1_082392200 = X(FIX(1.082392200)); +static const simd_const_t FIX_1_414213562 = X(FIX(1.414213562)); +static const simd_const_t FIX_1_847759065 = X(FIX(1.847759065)); +static const simd_const_t FIX_n2_613125930 = X(-FIX(2.613125930)); + +static const simd_const_t CENTERJSAMPLE_vec = X(CENTERJSAMPLE); +#undef X + + +// multiply packed shorts against the FIX constants above and normalize the +// result +static inline __m128i MULTIPLY(__m128i var, __m128i c) +{ + __m128i lo = _mm_srli_epi16(_mm_mullo_epi16(var, c), CONST_BITS); + __m128i hi = _mm_slli_epi16(_mm_mulhi_epi16(var, c), 16-CONST_BITS); + return _mm_add_epi16(lo, hi); +} + + +/* Dequantize a coefficient by multiplying it by the multiplier-table + * entry; produce a DCTELEM result. For 8-bit data a 16x16->16 + * multiplication will do. For 12-bit data, the multiplier table is + * declared INT32, so a 32-bit multiply will be used. + */ + +static inline __m128i DEQUANTIZE(const JCOEF *inptr, const short *quantval) +{ + return _mm_mullo_epi16(_mm_loadu_si128((const __m128i *)inptr), + _mm_load_si128((const __m128i *)quantval)); +} + + +// s0..s7 are an input 8x8 matrix of packed shorts... calculate its transpose. +// there's a commented example of a similar transpose when arranging the output +// bytes at the end of jpeg_idct_ifast. +#define transpose8x8(d0, d1, d2, d3, d4, d5, d6, d7, s0, s1, s2, s3, s4, s5, s6, s7) do { \ + __m128i t0 = _mm_unpacklo_epi16(s0, s1); \ + __m128i t1 = _mm_unpacklo_epi16(s2, s3); \ + __m128i t2 = _mm_unpacklo_epi32(t0, t1); \ + __m128i t3 = _mm_unpackhi_epi32(t0, t1); \ + __m128i t4 = _mm_unpacklo_epi16(s4, s5); \ + __m128i t5 = _mm_unpacklo_epi16(s6, s7); \ + __m128i t6 = _mm_unpacklo_epi32(t4, t5); \ + __m128i t7 = _mm_unpackhi_epi32(t4, t5); \ + d0 = _mm_unpacklo_epi64(t2, t6); \ + d1 = _mm_unpackhi_epi64(t2, t6); \ + d2 = _mm_unpacklo_epi64(t3, t7); \ + d3 = _mm_unpackhi_epi64(t3, t7); \ + t0 = _mm_unpackhi_epi16(s0, s1); \ + t1 = _mm_unpackhi_epi16(s2, s3); \ + t2 = _mm_unpacklo_epi32(t0, t1); \ + t3 = _mm_unpackhi_epi32(t0, t1); \ + t4 = _mm_unpackhi_epi16(s4, s5); \ + t5 = _mm_unpackhi_epi16(s6, s7); \ + t6 = _mm_unpacklo_epi32(t4, t5); \ + t7 = _mm_unpackhi_epi32(t4, t5); \ + d4 = _mm_unpacklo_epi64(t2, t6); \ + d5 = _mm_unpackhi_epi64(t2, t6); \ + d6 = _mm_unpacklo_epi64(t3, t7); \ + d7 = _mm_unpackhi_epi64(t3, t7); \ + } while (0) + + +/* + * Perform dequantization and inverse DCT on one block of coefficients. + */ + +GLOBAL(void) +jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp10, tmp11, tmp12, tmp13; + __m128i z5, z10, z11, z12, z13; + const short * quantptr; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i y0, y1, y2, y3, y4, y5, y6, y7; + + /* Pass 1: process columns from input, store into work array. */ + + quantptr = (const short *) compptr->dct_table; + + /* Even part */ + + tmp0 = DEQUANTIZE(&coef_block[DCTSIZE*0], &quantptr[DCTSIZE*0]); + tmp1 = DEQUANTIZE(&coef_block[DCTSIZE*2], &quantptr[DCTSIZE*2]); + tmp2 = DEQUANTIZE(&coef_block[DCTSIZE*4], &quantptr[DCTSIZE*4]); + tmp3 = DEQUANTIZE(&coef_block[DCTSIZE*6], &quantptr[DCTSIZE*6]); + + tmp10 = _mm_add_epi16(tmp0, tmp2); /* phase 3 */ + tmp11 = _mm_sub_epi16(tmp0, tmp2); + + tmp13 = _mm_add_epi16(tmp1, tmp3); /* phases 5-3 */ + tmp12 = _mm_sub_epi16(MULTIPLY(_mm_sub_epi16(tmp1, tmp3), FIX_1_414213562.m), tmp13); /* 2*c4 */ + + tmp0 = _mm_add_epi16(tmp10, tmp13); /* phase 2 */ + tmp3 = _mm_sub_epi16(tmp10, tmp13); + tmp1 = _mm_add_epi16(tmp11, tmp12); + tmp2 = _mm_sub_epi16(tmp11, tmp12); + + /* Odd part */ + + tmp4 = DEQUANTIZE(&coef_block[DCTSIZE*1], &quantptr[DCTSIZE*1]); + tmp5 = DEQUANTIZE(&coef_block[DCTSIZE*3], &quantptr[DCTSIZE*3]); + tmp6 = DEQUANTIZE(&coef_block[DCTSIZE*5], &quantptr[DCTSIZE*5]); + tmp7 = DEQUANTIZE(&coef_block[DCTSIZE*7], &quantptr[DCTSIZE*7]); + + z13 = _mm_add_epi16(tmp6, tmp5); /* phase 6 */ + z10 = _mm_sub_epi16(tmp6, tmp5); + z11 = _mm_add_epi16(tmp4, tmp7); + z12 = _mm_sub_epi16(tmp4, tmp7); + + tmp7 = _mm_add_epi16(z11, z13); /* phase 5 */ + tmp11 = MULTIPLY(_mm_sub_epi16(z11, z13), FIX_1_414213562.m); /* 2*c4 */ + + z5 = MULTIPLY(_mm_add_epi16(z10, z12), FIX_1_847759065.m); /* 2*c2 */ + tmp10 = _mm_sub_epi16(MULTIPLY(z12, FIX_1_082392200.m), z5); /* 2*(c2-c6) */ + tmp12 = _mm_add_epi16(MULTIPLY(z10, FIX_n2_613125930.m), z5); /* -2*(c2+c6) */ + + tmp6 = _mm_sub_epi16(tmp12, tmp7); /* phase 2 */ + tmp5 = _mm_sub_epi16(tmp11, tmp6); + tmp4 = _mm_add_epi16(tmp10, tmp5); + + x0 = _mm_adds_epi16(tmp0, tmp7); + x7 = _mm_subs_epi16(tmp0, tmp7); + x1 = _mm_adds_epi16(tmp1, tmp6); + x6 = _mm_subs_epi16(tmp1, tmp6); + x2 = _mm_adds_epi16(tmp2, tmp5); + x5 = _mm_subs_epi16(tmp2, tmp5); + x4 = _mm_adds_epi16(tmp3, tmp4); + x3 = _mm_subs_epi16(tmp3, tmp4); + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + + transpose8x8(y0, y1, y2, y3, y4, y5, y6, y7, x0, x1, x2, x3, x4, x5, x6, x7); + + /* Even part */ + + tmp10 = _mm_add_epi16(y0, y4); + tmp11 = _mm_sub_epi16(y0, y4); + + tmp13 = _mm_add_epi16(y2, y6); + tmp12 = _mm_sub_epi16(MULTIPLY(_mm_sub_epi16(y2, y6), FIX_1_414213562.m), tmp13); + + tmp0 = _mm_add_epi16(tmp10, tmp13); + tmp3 = _mm_sub_epi16(tmp10, tmp13); + tmp1 = _mm_add_epi16(tmp11, tmp12); + tmp2 = _mm_sub_epi16(tmp11, tmp12); + + /* Odd part */ + + z13 = _mm_add_epi16(y5, y3); + z10 = _mm_sub_epi16(y5, y3); + z11 = _mm_add_epi16(y1, y7); + z12 = _mm_sub_epi16(y1, y7); + + tmp7 = _mm_add_epi16(z11, z13); /* phase 5 */ + tmp11 = MULTIPLY(_mm_sub_epi16(z11, z13), FIX_1_414213562.m); /* 2*c4 */ + + z5 = MULTIPLY(_mm_add_epi16(z10, z12), FIX_1_847759065.m); /* 2*c2 */ + tmp10 = _mm_sub_epi16(MULTIPLY(z12, FIX_1_082392200.m), z5); /* 2*(c2-c6) */ + tmp12 = _mm_add_epi16(MULTIPLY(z10, FIX_n2_613125930.m), z5); /* -2*(c2+c6) */ + + tmp6 = _mm_sub_epi16(tmp12, tmp7); /* phase 2 */ + tmp5 = _mm_sub_epi16(tmp11, tmp6); + tmp4 = _mm_add_epi16(tmp10, tmp5); + + /* Final output stage: scale down by a factor of 8 and range-limit */ + + x0 = _mm_add_epi16(_mm_srai_epi16(_mm_adds_epi16(tmp0, tmp7), PASS1_BITS+3), CENTERJSAMPLE_vec.m); + x7 = _mm_add_epi16(_mm_srai_epi16(_mm_subs_epi16(tmp0, tmp7), PASS1_BITS+3), CENTERJSAMPLE_vec.m); + x1 = _mm_add_epi16(_mm_srai_epi16(_mm_adds_epi16(tmp1, tmp6), PASS1_BITS+3), CENTERJSAMPLE_vec.m); + x6 = _mm_add_epi16(_mm_srai_epi16(_mm_subs_epi16(tmp1, tmp6), PASS1_BITS+3), CENTERJSAMPLE_vec.m); + x2 = _mm_add_epi16(_mm_srai_epi16(_mm_adds_epi16(tmp2, tmp5), PASS1_BITS+3), CENTERJSAMPLE_vec.m); + x5 = _mm_add_epi16(_mm_srai_epi16(_mm_subs_epi16(tmp2, tmp5), PASS1_BITS+3), CENTERJSAMPLE_vec.m); + x4 = _mm_add_epi16(_mm_srai_epi16(_mm_adds_epi16(tmp3, tmp4), PASS1_BITS+3), CENTERJSAMPLE_vec.m); + x3 = _mm_add_epi16(_mm_srai_epi16(_mm_subs_epi16(tmp3, tmp4), PASS1_BITS+3), CENTERJSAMPLE_vec.m); + + // range_limit/saturation step + y0 = _mm_packus_epi16(x0, x4); // 74 64 54 44 34 24 14 04 .. 70 60 50 40 30 20 10 00 + y1 = _mm_packus_epi16(x1, x5); // 75 65 55 45 35 25 15 05 .. 71 61 51 41 31 21 11 01 + y2 = _mm_packus_epi16(x2, x6); // 76 66 56 46 36 26 16 06 .. 72 62 52 42 32 22 12 02 + y3 = _mm_packus_epi16(x3, x7); // 77 67 57 47 37 27 17 07 .. 73 63 53 43 33 23 13 03 + + // we now have to perform another transpose to arrange the output bytes + x0 = _mm_unpacklo_epi8(y0, y1); // 71 70 61 60 51 50 41 40 .. 31 30 21 20 11 10 01 00 + x1 = _mm_unpacklo_epi8(y2, y3); // 73 72 63 62 53 52 43 42 .. 33 32 23 22 13 12 03 02 + x2 = _mm_unpacklo_epi16(x0, x1); // 33 32 31 30 23 22 21 20 .. 13 12 11 10 03 02 01 00 + x3 = _mm_unpackhi_epi16(x0, x1); // 73 72 71 70 63 62 61 60 .. 53 52 51 50 43 42 41 40 + + x4 = _mm_unpackhi_epi8(y0, y1); // 75 74 65 64 55 54 45 44 .. 35 34 25 24 15 14 05 04 + x5 = _mm_unpackhi_epi8(y2, y3); // 77 76 67 66 57 56 47 46 .. 37 36 27 26 17 16 07 06 + x6 = _mm_unpacklo_epi16(x4, x5); // 37 36 35 34 27 26 25 24 .. 17 16 15 14 07 06 05 04 + x7 = _mm_unpackhi_epi16(x4, x5); // 77 76 75 74 67 66 65 64 .. 57 56 55 54 47 46 45 44 + + y0 = _mm_unpacklo_epi32(x2, x6); // 17 16 15 14 13 12 11 10 .. 07 06 05 04 03 02 01 00 + y1 = _mm_unpackhi_epi32(x2, x6); + y2 = _mm_unpacklo_epi32(x3, x7); + y3 = _mm_unpackhi_epi32(x3, x7); + + // there doesn't seem to be any way to do these low/high 8-byte stores + // without casting the registers to __m128d and using the movlpd/movhpd + // instructions... there's nothing technically wrong with this (it won't + // cause troubles regardless of the bit patterns in the registers). it's + // just kind of ugly. + _mm_storel_pd((double *)(output_buf[0] + output_col), (__m128d)y0); + _mm_storeh_pd((double *)(output_buf[1] + output_col), (__m128d)y0); + _mm_storel_pd((double *)(output_buf[2] + output_col), (__m128d)y1); + _mm_storeh_pd((double *)(output_buf[3] + output_col), (__m128d)y1); + _mm_storel_pd((double *)(output_buf[4] + output_col), (__m128d)y2); + _mm_storeh_pd((double *)(output_buf[5] + output_col), (__m128d)y2); + _mm_storel_pd((double *)(output_buf[6] + output_col), (__m128d)y3); + _mm_storeh_pd((double *)(output_buf[7] + output_col), (__m128d)y3); +} + +#endif /* DCT_IFAST_SUPPORTED */ Index: gsrc/makefile.cfg =================================================================== --- gsrc.orig/makefile.cfg 2006-08-25 14:54:28.000000000 -0700 +++ gsrc/makefile.cfg 2006-08-26 23:32:55.000000000 -0700 @@ -79,7 +79,7 @@ LIBSOURCES= jcapimin.c jcapistd.c jccoef jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \ jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \ jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \ - jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \ + jfdctint.c jidctflt.c jidctfst.c jidctfst_sse2.c jidctint.c jidctred.c jquant1.c \ jquant2.c jutils.c jmemmgr.c # memmgr back ends: compile only one of these into a working library SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c @@ -121,7 +121,7 @@ CLIBOBJECTS= jcapimin.$(O) jcapistd.$(O) DLIBOBJECTS= jdapimin.$(O) jdapistd.$(O) jdtrans.$(O) jdatasrc.$(O) \ jdmaster.$(O) jdinput.$(O) jdmarker.$(O) jdhuff.$(O) jdphuff.$(O) \ jdmainct.$(O) jdcoefct.$(O) jdpostct.$(O) jddctmgr.$(O) \ - jidctfst.$(O) jidctflt.$(O) jidctint.$(O) jidctred.$(O) \ + jidctfst.$(O) jidctfst_sse2.$(O) jidctflt.$(O) jidctint.$(O) jidctred.$(O) \ jdsample.$(O) jdcolor.$(O) jquant1.$(O) jquant2.$(O) jdmerge.$(O) # These objectfiles are included in libjpeg.a LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS) @@ -287,6 +287,7 @@ jfdctfst.$(O): jfdctfst.c jinclude.h jco jfdctint.$(O): jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h jidctflt.$(O): jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h jidctfst.$(O): jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h +jidctfst_sse2.$(O): jidctfst_sse2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h jidctint.$(O): jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h jidctred.$(O): jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h jquant1.$(O): jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h Index: gsrc/jddctmgr.c =================================================================== --- gsrc.orig/jddctmgr.c 2006-08-26 23:32:43.000000000 -0700 +++ gsrc/jddctmgr.c 2006-08-26 23:32:55.000000000 -0700 @@ -15,6 +15,7 @@ * dequantization multiplier table needed by the IDCT routine. */ +#include #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" @@ -259,9 +260,11 @@ jinit_inverse_dct (j_decompress_ptr cinf for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { /* Allocate and pre-zero a multiplier table for each component */ - compptr->dct_table = + /* Provide 16-byte alignment for SSE2 support. */ + char *t = (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(multiplier_table)); + SIZEOF(multiplier_table) + 15); + compptr->dct_table = (void *)(((uintptr_t)t + 15) & ~(uintptr_t)15); MEMZERO(compptr->dct_table, SIZEOF(multiplier_table)); /* Mark multiplier table not yet set up for any method */ idct->cur_method[ci] = -1; Index: gsrc/jdct.h =================================================================== --- gsrc.orig/jdct.h 2006-08-26 23:32:43.000000000 -0700 +++ gsrc/jdct.h 2006-08-26 23:32:55.000000000 -0700 @@ -55,7 +55,11 @@ typedef JMETHOD(void, float_DCT_method_p typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */ #if BITS_IN_JSAMPLE == 8 +#ifndef __SSE2__ typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */ +#else +typedef short IFAST_MULT_TYPE; /* SSE2 code requires short */ +#endif #define IFAST_SCALE_BITS 2 /* fractional bits in scale factors */ #else typedef INT32 IFAST_MULT_TYPE; /* need 32 bits for scaled quantizers */ Index: gsrc/jidctfst.c =================================================================== --- gsrc.orig/jidctfst.c 2006-08-25 15:17:25.000000000 -0700 +++ gsrc/jidctfst.c 2006-08-26 23:34:26.000000000 -0700 @@ -37,7 +37,7 @@ #include "jpeglib.h" #include "jdct.h" /* Private declarations for DCT subsystem */ -#ifdef DCT_IFAST_SUPPORTED +#if defined(DCT_IFAST_SUPPORTED) && !defined(__SSE2__) /*