diff -Naru -x .deps mpeg2dec-0.3.1.prev/configure.in mpeg2dec-0.3.1/configure.in
--- mpeg2dec-0.3.1.prev/configure.in 2002-12-13 23:35:48.000000000 -0700
+++ mpeg2dec-0.3.1/configure.in 2003-05-21 17:55:26.000000000 -0700
@@ -68,6 +68,13 @@
     AC_TRY_CFLAGS([$OPT_CFLAGS],
         [CFLAGS=$OPT_CFLAGS; AC_DEFINE([ARCH_ALPHA],,[alpha architecture])]);;
+    arm* | armv4l*)
+    dnl Default CCASFLAGS to CFLAGS; the leading ':' keeps the shell from running the expanded flags as a command.
+    : ${CCASFLAGS="$CFLAGS"}
+    AM_PROG_AS
+    OPT_CFLAGS="$CFLAGS"
+    AC_TRY_CFLAGS([$OPT_CFLAGS],
+        [CFLAGS=$OPT_CFLAGS; AC_DEFINE([ARCH_ARM],,[arm architecture])]);;
     esac
 elif test x"$CC" = x"tcc" -a x"`$CC -version 2>&1 | grep TenDRA`" != x""; then
     dnl TenDRA portability checking compiler
diff -Naru -x .deps mpeg2dec-0.3.1.prev/include/config.h.in mpeg2dec-0.3.1/include/config.h.in
--- mpeg2dec-0.3.1.prev/include/config.h.in 2002-12-13 23:43:29.000000000 -0700
+++ mpeg2dec-0.3.1/include/config.h.in 2003-05-02 22:00:35.000000000 -0700
@@ -6,6 +6,9 @@
 /* alpha architecture */
 #undef ARCH_ALPHA
 
+/* arm architecture */
+#undef ARCH_ARM
+
 /* ppc architecture */
 #undef ARCH_PPC
 
diff -Naru -x .deps mpeg2dec-0.3.1.prev/libmpeg2/coefficients.h mpeg2dec-0.3.1/libmpeg2/coefficients.h
--- mpeg2dec-0.3.1.prev/libmpeg2/coefficients.h 1969-12-31 17:00:00.000000000 -0700
+++ mpeg2dec-0.3.1/libmpeg2/coefficients.h 2003-05-08 23:12:54.000000000 -0700
@@ -0,0 +1,250 @@
+
+#ifndef __u64
+#define __u64 uint64_t
+#endif
+//the above define controls the precision of multiplications
+#define tSqrt2 1.4142135623730950488
+
+#define half 0x80000000
+#if 0
+#define t1 0.98078528 //cos(11.25)
+#define t2 0.923879532 //cos(22.5)
+#define t3 0.831469612 //cos(33.75)
+#define t4 0.7071067811865475244 //cos(45)
+#define t5 0.555570233 //cos(56.25)
+#define t6 0.382683432 //cos(67.5)
+#define t7 0.195090322 //cos(78.75)
+#define halfv(a) DWORD((a) * half)
+#else
+
+
+#define t1 ((__u64)(0xfb14be7e))
+#define t2 ((__u64)(0xec835e78))
+#define t3 ((__u64)(0xd4db3148))
+#define t4 ((__u64)(0xb504f334))
+#define t5 ((__u64)(0x8e39d9cc))
+#define t6
((__u64)(0x61f78a99)) +#define t7 ((__u64)(0x31f17078)) +#define halfv(a) ((a) >> 1) +#endif + + +//the defines below only affect the initial dct multiplies +//cos(a)cos(b) = (cos(a+b)+cos(a-b))/2 + +#define ccFracBits 32 + +#define c1c1 (halfv(t2)+half) //((one+c2)>>1) +#define c1c2 halfv(t1+t3) //((c1+c3)>>1) +#define c1c3 halfv(t2+t4) //((c2+c4)>>1) +#define c1c4 halfv(t3+t5) //((c3+c5)>>1) +#define c1c5 halfv(t4+t6) //((c4+c6)>>1) +#define c1c6 halfv(t5+t7) //((c5+c7)>>1) +#define c1c7 halfv(t6) //(c6>>1) + +#define c2c1 c1c2 +#define c2c2 (halfv(t4)+half) //((one+c4)>>1) +#define c2c3 halfv(t1+t5) //((c1+c5)>>1) +#define c2c4 halfv(t2+t6) //((c2+c6)>>1) +#define c2c5 halfv(t3+t7) //((c3+c7)>>1) +#define c2c6 halfv(t4) //(c4>>1) +#define c2c7 halfv(t5-t7) //((c5-c7)>>1) + +#define c3c1 c1c3 +#define c3c2 c2c3 +#define c3c3 (halfv(t6)+half) //((one+c6)>>1) +#define c3c4 halfv(t1+t7) //((c1+c7)>>1) +#define c3c5 halfv(t2) //(c2>>1) +#define c3c6 halfv(t3-t7) //((c3-c7)>>1) +#define c3c7 halfv(t4-t6) //((c4-c6)>>1) +#define c4c4 half +#define c4c1 c1c4 +#define c4c2 c2c4 +#define c4c3 c3c4 +#define C4C4 half +#define c4c5 halfv(t1-t7) //((c1-c7)>>1) +#define c4c6 halfv(t2-t6) //((c2-c6)>>1) +#define c4c7 halfv(t3-t5) //((c3-c5)>>1) + +#define c5c1 c1c5 +#define c5c2 c2c5 +#define c5c3 c3c5 +#define c5c4 c4c5 +#define c5c5 (half-halfv(t6)) //((one-c6)>>1) +#define c5c6 halfv(t1-t5) //((c1-c5)>>1) +#define c5c7 halfv(t2-t4) //((c2-c4)>>1) + +#define c6c1 c1c6 +#define c6c2 c2c6 +#define c6c3 c3c6 +#define c6c4 c4c6 +#define c6c5 c5c6 +#define c6c6 (half-halfv(t4)) //((one-c4)>>1) +#define c6c7 halfv(t1-t3) //((c1-c3)>>1) + +#define c7c1 c1c7 +#define c7c2 c2c7 +#define c7c3 c3c7 +#define c7c4 c4c7 +#define c7c5 c5c7 +#define c7c6 c6c7 +#define c7c7 (half-halfv(t2)) //((one-c2)>>1) + +//The initial matrix multiply of this algorithm is +//Q[] = |c4 0 0 0 0 0 0 0 | +// | 0 -c1 0 0 0 0 0 0 | +// | 0 0 -c2 0 0 0 0 0 | +// | 0 0 0 -c3 0 0 0 0 | +// | 0 0 0 0 -c4 0 0 0 | 
+// | 0 0 0 0 0 -c5 0 0 | +// | 0 0 0 0 0 0 -c6 0 | +// | 0 0 0 0 0 0 0 c7 | +//the sign is defined below +#define CE00 c4c4 +#define CE01 c4c1 +#define CE02 c4c2 +#define CE03 c4c3 +#define CE04 c4c4 +#define CE05 c4c5 +#define CE06 c4c6 +#define CE07 c4c7 + +#define CE10 c1c4 +#define CE11 c1c1 +#define CE12 c1c2 +#define CE13 c1c3 +#define CE14 c1c4 +#define CE15 c1c5 +#define CE16 c1c6 +#define CE17 c1c7 + +#define CE20 c2c4 +#define CE21 c2c1 +#define CE22 c2c2 +#define CE23 c2c3 +#define CE24 c2c4 +#define CE25 c2c5 +#define CE26 c2c6 +#define CE27 c2c7 + +#define CE30 c3c4 +#define CE31 c3c1 +#define CE32 c3c2 +#define CE33 c3c3 +#define CE34 c3c4 +#define CE35 c3c5 +#define CE36 c3c6 +#define CE37 c3c7 + +#define CE40 c4c4 +#define CE41 c4c1 +#define CE42 c4c2 +#define CE43 c4c3 +#define CE44 c4c4 +#define CE45 c4c5 +#define CE46 c4c6 +#define CE47 c4c7 + +#define CE50 c5c4 +#define CE51 c5c1 +#define CE52 c5c2 +#define CE53 c5c3 +#define CE54 c5c4 +#define CE55 c5c5 +#define CE56 c5c6 +#define CE57 c5c7 + +#define CE60 c6c4 +#define CE61 c6c1 +#define CE62 c6c2 +#define CE63 c6c3 +#define CE64 c6c4 +#define CE65 c6c5 +#define CE66 c6c6 +#define CE67 c6c7 + +#define CE70 c7c4 +#define CE71 c7c1 +#define CE72 c7c2 +#define CE73 c7c3 +#define CE74 c7c4 +#define CE75 c7c5 +#define CE76 c7c6 +#define CE77 c7c7 + + +#define SCE00 +#define SCE01 - +#define SCE02 - +#define SCE03 - +#define SCE04 - +#define SCE05 - +#define SCE06 - +#define SCE07 + +#define SCE10 - +#define SCE11 +#define SCE12 +#define SCE13 +#define SCE14 +#define SCE15 +#define SCE16 +#define SCE17 - + +#define SCE20 - +#define SCE21 +#define SCE22 +#define SCE23 +#define SCE24 +#define SCE25 +#define SCE26 +#define SCE27 - + +#define SCE30 - +#define SCE31 +#define SCE32 +#define SCE33 +#define SCE34 +#define SCE35 +#define SCE36 +#define SCE37 - + +#define SCE40 - +#define SCE41 +#define SCE42 +#define SCE43 +#define SCE44 +#define SCE45 +#define SCE46 +#define SCE47 - + +#define SCE50 - 
+#define SCE51 +#define SCE52 +#define SCE53 +#define SCE54 +#define SCE55 +#define SCE56 +#define SCE57 - + +#define SCE60 - +#define SCE61 +#define SCE62 +#define SCE63 +#define SCE64 +#define SCE65 +#define SCE66 +#define SCE67 - + +#define SCE70 +#define SCE71 - +#define SCE72 - +#define SCE73 - +#define SCE74 - +#define SCE75 - +#define SCE76 - +#define SCE77 + +//#define RD4(a) (((a)+1)>>1) //make sure answer fits in a signed 32 bit int, by dumping table when all positive +#define RD4(a) (int)(S##a (((a)+1)>>1)) + diff -Naru -x .deps mpeg2dec-0.3.1.prev/libmpeg2/GetIdct.c mpeg2dec-0.3.1/libmpeg2/GetIdct.c --- mpeg2dec-0.3.1.prev/libmpeg2/GetIdct.c 1969-12-31 17:00:00.000000000 -0700 +++ mpeg2dec-0.3.1/libmpeg2/GetIdct.c 2003-05-10 06:14:00.000000000 -0700 @@ -0,0 +1,1118 @@ +#include "config.h" +#include "linux/types.h" +typedef unsigned char BYTE; +typedef unsigned long DWORD; + +#define INLINE +//#define INLINE inline +#define __stdcall + +//#define QSCALE +//#define JPEG +//#define TESTING + +#include "coefficients.h" + +//this algoritm uses 144 (was 118 but too many memory accessses) multiples, 64 of which are the initial cosine multiplies +//jpeg can combine these 64 with the quantization table, but because of oddification mpeg can't +//but mpeg can still skip the multiple of zero coefficients + +//col 0->0, 4->1, 2->2, 6->3, 5->4, 1->5, 7->6, 3->7, +// or +//col 0->0, 1->5, 2->2, 3->7, 4->1, 5->4, 6->3, 7->6, + + +#define INDEX_CE00 0 +#define INDEX_CE77 (6*8+6) +void mpeg2_idct_arm_init() +{ + static unsigned char mapcol[] = {0,5,2,7,1,4,3,6}; +#define maprow mapcol + int i,j; + int row,col; + extern uint8_t mpeg2_scan_norm[64]; + extern uint8_t mpeg2_scan_alt[64]; + for (i = 0; i < 64; i++) { +//bit 0->2 1->0 2->1 3->5 4->3 5->4 +//col 0->0, 1->4, 2->1, 3->5, 4->2, 5->6, 7->7 +//row 0->0, 1->4, 2->1, 3->5, 4->2, 5->6, 7->7 + + j = mpeg2_scan_norm[i]; + row = maprow[j>>3]; + col = mapcol[j & 7]; + mpeg2_scan_norm[i] = (row<<3)|col; + + j = 
mpeg2_scan_alt[i];
+        row = maprow[j>>3];
+        col = mapcol[j & 7];
+        mpeg2_scan_alt[i] = (row<<3)|col;
+    }
+}
+
+//Select the full-IDCT (GETIDCT) and DC-only fill (FILLIDCT) routines: GetIDCT_ARM and
+//mpeg2_idct_fill_arm normally, GetIDCT and mpeg2_idct_fill_default when TESTING is defined.
+#ifndef TESTING
+void GetIDCT_ARM(int16_t * srcDCT,uint8_t * dest,const int stride,const int addFlag);
+void mpeg2_idct_fill_arm (int dct,uint8_t * dest,const int stride,const int addFlag);
+#define GETIDCT GetIDCT_ARM
+#define FILLIDCT mpeg2_idct_fill_arm //this is used when only the 1st DCT is non-zero
+#else
+void GetIDCT(int16_t * srcDCT,uint8_t * dest,const int stride,const int addFlag);
+void FillIDCT(int dct,uint8_t * dest,const int stride,const int addFlag);
+#define GETIDCT GetIDCT
+#define FILLIDCT mpeg2_idct_fill_default
+#endif
+
+////////////////////////////////////////////////////////////////////////
+#define unlikely(x) (x) //no-op here: expands to its (parenthesized) argument
+//Clamp a to 0..255: a is returned unchanged when it already fits, negative values
+//give 0 and values above 255 give 255 (same result as a<0 ? 0 : a>255 ? 255 : a).
+static BYTE INLINE rangeCheck1(int a)
+{
+    //relies on a>>31 being an arithmetic shift, i.e. all ones for a negative a
+    if ( unlikely (((unsigned int)a)>255)) a = ~(a>>31); //negative values get 0, >255 get -1 (or 255 in low byte)
+    return a;
+}
+//Fill an 8x8 block from one DC value. addFlag==0: every pixel is set to the clamped
+//(dct+4)>>3; otherwise that value is added to each existing pixel and the sum is clamped.
+void mpeg2_idct_fill_default(int dct,uint8_t * dest,const int stride,const int addFlag)
+{
+    int i = 8;
+    dct = (dct+4)>>3; //round and convert 0 : 2047 range, to 0 : 255 range
+    if (addFlag==0) {
+        dct = rangeCheck1(dct);
+        do {
+            *(dest+0) = dct;
+            *(dest+1) = dct;
+            *(dest+2) = dct;
+            *(dest+3) = dct;
+            *(dest+4) = dct;
+            *(dest+5) = dct;
+            *(dest+6) = dct;
+            *(dest+7) = dct;
+            dest+=stride;
+        } while (--i);
+    } else {
+        do {
+            *(dest+0) = rangeCheck1(dct + (*(dest+0)));
+            *(dest+1) = rangeCheck1(dct + (*(dest+1)));
+            *(dest+2) = rangeCheck1(dct + (*(dest+2)));
+            *(dest+3) = rangeCheck1(dct + (*(dest+3)));
+            *(dest+4) = rangeCheck1(dct + (*(dest+4)));
+            *(dest+5) = rangeCheck1(dct + (*(dest+5)));
+            *(dest+6) = rangeCheck1(dct + (*(dest+6)));
+            *(dest+7) = rangeCheck1(dct + (*(dest+7)));
+            dest+=stride;
+        } while (--i);
+    }
+}
+////////////////////////////////////////////////////////////////////////
+#define CE00_SHIFT (30-27)
+//Run the full IDCT (GETIDCT), or the cheaper DC-only fill (FILLIDCT) when just the first
+//coefficient (and possibly the oddification term at INDEX_CE77) is set.
+//addFlag: 0 stores the result into dest, non-zero adds it to dest.
+static inline void add_or_copy (const int last, int16_t * DCTblock, uint8_t * dest, const int stride,int addFlag)
+{
+    //NOTE(review): the tail of this test and the GETIDCT call were lost in this copy of the patch and are reconstructed ("==" mirrors the usual DC-only rounding check) -- confirm against the original
+    if ((last > 129) || (DCTblock[INDEX_CE77] && ((DCTblock[INDEX_CE00] & (7 << CE00_SHIFT)) == (4 << CE00_SHIFT)))) {
+        GETIDCT(DCTblock,dest,stride,addFlag);
+    } else {
+        FILLIDCT(DCTblock[INDEX_CE00] >> CE00_SHIFT,dest,stride,addFlag); //undo shift, this is used when only the 1st DCT is non-zero
+        DCTblock[INDEX_CE00] = DCTblock[INDEX_CE77] = 0; //the next 62 are already zero, the last may be non-zero if oddified
+    }
+}
+void mpeg2_idct_copy_arm (int16_t * srcDCT, uint8_t * dest, const int stride)
+{
+    add_or_copy(130,srcDCT,dest,stride,0); //130 > 129, so the full IDCT is always taken
+}
+void mpeg2_idct_add_arm (const int last, int16_t * srcDCT, uint8_t * dest, const int stride)
+{
+//    add_or_copy(last,srcDCT,dest,stride,1); //disabled: last is currently ignored
+    add_or_copy(130,srcDCT,dest,stride,1);
+}
+////////////////////////////////////////////////////////////////////////
+
+
+#ifdef TESTING
+
+//extern "C"
+//    void __stdcall aPQxoPQdiv4(int* ptr,int16_t * srcDCT,int* qTable);
+    void __stdcall aB1xoB1(int* ptr,int16_t * srcDCT);
+    void __stdcall aB2xoB2(int* ptr);
+    void __stdcall aMxoM(int* ptr);
+    void __stdcall aA1xoA1(int* ptr);
+    void __stdcall aA2xoA2(int* ptr);
+    void __stdcall aA3xoA3(int* ptr);
+    void __stdcall aA4xoA4(int* ptr);
+
+
+#if 1
+#define VC6x 0x61f78a99 // /2**32 ((t6 *(080000000h/400h))/tens); //t6=.382683432
+#define SHFT_C6x 32
+#define SHFT_C6x_M1 31
+#define SHFT_C6x_M2 30
+
+#define VC2mC6x 0x4545e9ef // /2**31 (((t2-t6)*(080000000h/400h))/tens); //t2=.923879532, t2-t6=.541
+#define VC2x 0x7641af3c // /2**31 ((t2 *(080000000h/400h))/tens);
+#define VC4x 0x5a82799a // /2**31 ((t4 *(080000000h/400h))/tens);
+#define VoneDivC4 0x5a82799a // /2**30 ((tSqrt2 *(040000000h/400h))/tens); //sqrt(2)
+#define VC2pC6x 0x539eba44 // /2**30 (((t2+t6)*(040000000h/400h))/tens); //+=1.306
+
+#else
+#define VC6x (int(t6*0x80000000))
+#define SHFT_C6x 31
+#define SHFT_C6x_M1 30
+#define SHFT_C6x_M2 29
+
+#define
VC2mC6x (int((t2-t6)*0x80000000)) +#define VC2x (int(t2*0x80000000)) +#define VC4x (int(t4*0x80000000)) +#define VoneDivC4 (int(tSqrt2*0x40000000)) +#define VC2pC6x (int((t2+t6)*0x40000000)) + +#endif + +#define SHFT_C2mC6x 31 +#define SHFT_C2x 31 +#define SHFT_C4x 31 +#define SHFT_oneDivC4 30 +#define SHFT_C2pC6x 30 + +#define SHFT_C2mC6x_M1 30 +#define SHFT_C2x_M1 30 +#define SHFT_C4x_M1 30 +#define SHFT_oneDivC4_M1 29 +#define SHFT_C2pC6x_M1 29 + +#define SHFT_C2mC6x_M2 29 +#define SHFT_C2x_M2 29 +#define SHFT_C4x_M2 29 +#define SHFT_oneDivC4_M2 28 +#define SHFT_C2pC6x_M2 28 + +#define oneDivC4 VoneDivC4 //sqrt(2) +#define C6x VC6x //t6=.382683432 +#define C2mC6x VC2mC6x //t2=.923879532, t2-t6=.541 +#define C2pC6x VC2pC6x //+=1.306 +#define C2x VC2x +#define C4x VC4x + + + +#if defined(ARCH_X86) +#define FASTCALL __fastcall +#else +#define FASTCALL +#endif + +#ifdef ARCH_ARM +//I don't know how to place a # in a macro and still replace shiftR with parameter +//The following does not work, nor does using \x23 for # +#define smullShift(a,b,shiftR) \ +({ register int __rTempLo,__rTempHi; register int __val=a; \ + __asm__ ("%@ Inlined smull \n\ + smull %1,%2,%3,%0 \n\ + mov %0,%2,LSL # \\\n 32-shiftR \n\ + add %0,%0,%1,LSR # \\\n shiftR" \ + : "=&r" (__val), \ + "=&r" (__rTempLo), "=&r" (__rTempHi) \ + : "r" (b), "0" (__val) ); \ + __val;}) + +#define __smull16(a,b) \ +({ register int __rTempLo,__rTempHi; register int __val=a; \ + __asm__ ("%@ Inlined smull \n\ + smull %1,%2,%3,%0 \n\ + mov %0,%2,LSL # (32-16) \n\ + add %0,%0,%1,LSR # (16)" \ + : "=&r" (__val), \ + "=&r" (__rTempLo), "=&r" (__rTempHi) \ + : "r" (b), "0" (__val) ); \ + __val;}) + + +#define __smull29(a,b) \ +({ register int __rTempLo,__rTempHi; register int __val=a; \ + __asm__ ("%@ Inlined smull \n\ + smull %1,%2,%3,%0 \n\ + mov %0,%2,LSL # (32-29) \n\ + add %0,%0,%1,LSR # (29)" \ + : "=&r" (__val), \ + "=&r" (__rTempLo), "=&r" (__rTempHi) \ + : "r" (b), "0" (__val) ); \ + __val;}) + +#define 
__smull30(a,b) \ +({ register int __rTempLo,__rTempHi; register int __val=a; \ + __asm__ ("%@ Inlined smull \n\ + smull %1,%2,%3,%0 \n\ + mov %0,%2,LSL # (32-30) \n\ + add %0,%0,%1,LSR # (30)" \ + : "=&r" (__val), \ + "=&r" (__rTempLo), "=&r" (__rTempHi) \ + : "r" (b), "0" (__val) ); \ + __val;}) +#define __smull31(a,b) \ +({ register int __rTempLo,__rTempHi; register int __val=a; \ + __asm__ ("%@ Inlined smull \n\ + smull %1,%2,%3,%0 \n\ + mov %0,%2,LSL # (32-31) \n\ + add %0,%0,%1,LSR # (31)" \ + : "=&r" (__val), \ + "=&r" (__rTempLo), "=&r" (__rTempHi) \ + : "r" (b), "0" (__val) ); \ + __val;}) + + + +//a is signed 16 bits, b is signed 32 bit +#define __SMultW(a,b) \ +({ register int __rTemp; \ + __asm__ ("%@ Inlined smull \n\ + smulwb %0,%1,%2" \ + : "=r" (__rTemp) \ + : "r" (b), "r" (a) ); \ + __rTemp;}) + +#define SMultW(a,b) __smull(a,b,16) //use this if not arm version 5 or above +//#define SMultW(a,b) __SMultW(a,b) + +#define __smull(a,b,shiftRCnt) __smull##shiftRCnt(a,b) +#define smull(a,b,shiftRCnt) __smull(a,b,shiftRCnt) +#endif + + + +#ifdef ARCH_ARM +#define DivC4(val) smull(val,oneDivC4,SHFT_oneDivC4) + +#elif defined(ARCH_X86) +static int INLINE FASTCALL DivC4(val) +{ + __asm + { + imul oneDivC4 + shrd eax,edx,SHFT_oneDivC4 + } +} +#else +#error routine undefined +#endif + + + + + + +//rows 0&7, 1&6, 2&5, 3&4 can be represented as (N xo A4) where N = | 1 1 | +// | 1 -1 | +// P[16,2] (A4 xo N) P[16,8] +//where A4 xo N = +// 0 1 2 3 4 5 6 7 +// | N 0 0 0 0 0 0 -N | +// | 0 N 0 0 0 0 -N 0 | +// | 0 0 N 0 0 -N 0 0 | +// | 0 0 0 N -N 0 0 0 | +// | 0 0 0 N N 0 0 0 | +// | 0 0 N 0 0 N 0 0 | +// | 0 N 0 0 0 0 N 0 | +// | N 0 0 0 0 0 0 N | +// + + +//ck = cos(k?/16) //? 
is pie, 3.14 +//c8=0, c9=-c7, c10=-c6, c11=-c5, c12=-c4, c13=-c3, c14=-c2, c15=-c1, c16=-1, c17=-c1, +//ck= - c(16-k) +//ck= - c(k-16) +//ck = c(32-k) +//ck = c(k-32) + + +//c4=1/sqrt(2) +//s(x) = (S0*c4 + S1*c(2x+1) + S2*c(4x+2) + S3*c(6x+3) + S4*c(8x+4) + S5*c(10x+5) + S6*c(12x+6) + S7*c(14x+7) )/2 +//s0 = (S0*c4 + S1*c1 + S2*c2 + S3*c3 + S4*c4 + S5*c5 + S6*c6 + S7*c7 )/2 +//s1 = (S0*c4 + S1*c3 + S2*c6 + S3*c9 + S4*c12 + S5*c15 + S6*c18 + S7*c21 )/2 +//s2 = (S0*c4 + S1*c5 + S2*c10 + S3*c15 + S4*c20 + S5*c25 + S6*c30 + S7*c35 )/2 +//s3 = (S0*c4 + S1*c7 + S2*c14 + S3*c21 + S4*c28 + S5*c35 + S6*c42 + S7*c49 )/2 +//s4 = (S0*c4 + S1*c9 + S2*c18 + S3*c27 + S4*c36 + S5*c45 + S6*c54 + S7*c63 )/2 +//s5 = (S0*c4 + S1*c11 + S2*c22 + S3*c33 + S4*c44 + S5*c55 + S6*c66 + S7*c77 )/2 +//s6 = (S0*c4 + S1*c13 + S2*c26 + S3*c39 + S4*c52 + S5*c65 + S6*c78 + S7*c91 )/2 +//s7 = (S0*c4 + S1*c15 + S2*c30 + S3*c45 + S4*c60 + S5*c75 + S6*c90 + S7*c105 )/2 + +//2s0 = (S0*c4 + S1*c1 + S2*c2 + S3*c3 + S4*c4 + S5*c5 + S6*c6 + S7*c7 ) +//2s1 = (S0*c4 + S1*c3 + S2*c6 - S3*c7 - S4*c4 - S5*c1 - S6*c2 - S7*c5 ) +//2s2 = (S0*c4 + S1*c5 - S2*c6 - S3*c1 - S4*c4 + S5*c7 + S6*c2 + S7*c3 ) +//2s3 = (S0*c4 + S1*c7 - S2*c2 - S3*c5 + S4*c4 + S5*c3 - S6*c6 - S7*c1 ) +//2s4 = (S0*c4 - S1*c7 - S2*c2 + S3*c5 + S4*c4 - S5*c3 - S6*c6 + S7*c1 ) +//2s5 = (S0*c4 - S1*c5 - S2*c6 + S3*c1 - S4*c4 - S5*c7 + S6*c2 - S7*c3 ) +//2s6 = (S0*c4 - S1*c3 + S2*c6 + S3*c7 - S4*c4 + S5*c1 - S6*c2 + S7*c5 ) +//2s7 = (S0*c4 - S1*c1 + S2*c2 - S3*c3 + S4*c4 - S5*c5 + S6*c6 - S7*c7 ) +// 1 4 2 4 1 4 2 4 +//22 multiplies +// +//s[] = 1/2 | c4 c1 c2 c3 c4 c5 c6 c7 | | S0 | +// | c4 c3 c6 -c7 -c4 -c1 -c2 -c5 | | S1 | +// | c4 c5 -c6 -c1 -c4 c7 c2 c3 | | S2 | +// | c4 c7 -c2 -c5 c4 c3 -c6 -c1 | | S3 | +// | c4 -c7 -c2 c5 c4 -c3 -c6 c1 | | S4 | +// | c4 -c5 -c6 c1 -c4 -c7 c2 -c3 | | S5 | +// | c4 -c3 c6 c7 -c4 c1 -c2 c5 | | S6 | +// | c4 -c1 c2 -c3 c4 -c5 c6 -c7 | | S7 | + + +//Note: first scaling almost converts to DFT values except c4 
would be replaced by 1/c4, so F(0) is times 2 +//Q'[] = |1/c4 0 0 0 0 0 0 0 | Q[] = |c4 0 0 0 0 0 0 0 | S[] = | S0 | +// | 0 -1/c1 0 0 0 0 0 0 | | 0 -c1 0 0 0 0 0 0 | | S1 | +// | 0 0 -1/c2 0 0 0 0 0 | | 0 0 -c2 0 0 0 0 0 | | S2 | +// | 0 0 0 -1/c3 0 0 0 0 | | 0 0 0 -c3 0 0 0 0 | | S3 | +// | 0 0 0 0 -1/c4 0 0 0 | | 0 0 0 0 -c4 0 0 0 | | S4 | +// | 0 0 0 0 0 -1/c5 0 0 | | 0 0 0 0 0 -c5 0 0 | | S5 | +// | 0 0 0 0 0 0 -1/c6 0 | | 0 0 0 0 0 0 -c6 0 | | S6 | +// | 0 0 0 0 0 0 0 1/c7| | 0 0 0 0 0 0 0 c7 | | S7 | + +//s[] = | c4 c1 c2 c3 c4 c5 c6 c7 | |1/c4 0 0 0 0 0 0 0 | 1/2 Q[] S[] +// | c4 c3 c6 -c7 -c4 -c1 -c2 -c5 | | 0 -1/c1 0 0 0 0 0 0 | +// | c4 c5 -c6 -c1 -c4 c7 c2 c3 | | 0 0 -1/c2 0 0 0 0 0 | +// | c4 c7 -c2 -c5 c4 c3 -c6 -c1 | | 0 0 0 -1/c3 0 0 0 0 | +// | c4 -c7 -c2 c5 c4 -c3 -c6 c1 | | 0 0 0 0 -1/c4 0 0 0 | +// | c4 -c5 -c6 c1 -c4 -c7 c2 -c3 | | 0 0 0 0 0 -1/c5 0 0 | +// | c4 -c3 c6 c7 -c4 c1 -c2 c5 | | 0 0 0 0 0 0 -1/c6 0 | +// | c4 -c1 c2 -c3 c4 -c5 c6 -c7 | | 0 0 0 0 0 0 0 1/c7| + + +//s[] = | 1 -1 -1 -1 -1 -1 -1 1 | 1/2 Q[] S[] +// | 1 -c3/c1 -c6/c2 c7/c3 1 c1/c5 c2/c6 -c5/c7 | +// | 1 -c5/c1 c6/c2 c1/c3 1 -c7/c5 -c2/c6 c3/c7 | +// | 1 -c7/c1 1 c5/c3 -1 -c3/c5 1 -c1/c7 | +// | 1 c7/c1 1 -c5/c3 -1 c3/c5 1 c1/c7 | +// | 1 c5/c1 c6/c2 -c1/c3 1 c7/c5 -c2/c6 -c3/c7 | +// | 1 c3/c1 -c6/c2 -c7/c3 1 -c1/c5 c2/c6 c5/c7 | +// | 1 1 -1 1 -1 1 -1 -1 | + + +//P'[]= | 1 0 0 0 0 0 0 0 | P[]=| 1 0 0 0 0 0 0 0 | +// | 0 0 0 0 0 1 0 0 | | 0 0 0 0 1 0 0 0 | +// | 0 0 1 0 0 0 0 0 | | 0 0 1 0 0 0 0 0 | +// | 0 0 0 0 0 0 0 1 | | 0 0 0 0 0 0 1 0 | +// | 0 1 0 0 0 0 0 0 | | 0 0 0 0 0 1 0 0 | +// | 0 0 0 0 1 0 0 0 | | 0 1 0 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | | 0 0 0 0 0 0 0 1 | +// | 0 0 0 0 0 0 1 0 | | 0 0 0 1 0 0 0 0 | +// + + +// +// 0 1 2 3 4 5 6 7 +//s[] = | 1 -1 -1 -1 -1 -1 -1 1 | | 1 0 0 0 0 0 0 0 | P[] 1/2 Q[] S[] +// | 1 -c3/c1 -c6/c2 c7/c3 1 c1/c5 c2/c6 -c5/c7 | | 0 0 0 0 0 1 0 0 | +// | 1 -c5/c1 c6/c2 c1/c3 1 -c7/c5 -c2/c6 c3/c7 | | 0 0 1 0 0 0 0 0 | +// | 1 -c7/c1 1 
c5/c3 -1 -c3/c5 1 -c1/c7 | | 0 0 0 0 0 0 0 1 | +// | 1 c7/c1 1 -c5/c3 -1 c3/c5 1 c1/c7 | | 0 1 0 0 0 0 0 0 | +// | 1 c5/c1 c6/c2 -c1/c3 1 c7/c5 -c2/c6 -c3/c7 | | 0 0 0 0 1 0 0 0 | +// | 1 c3/c1 -c6/c2 -c7/c3 1 -c1/c5 c2/c6 c5/c7 | | 0 0 0 1 0 0 0 0 | +// | 1 1 -1 1 -1 1 -1 -1 | | 0 0 0 0 0 0 1 0 | + +//0,4,2,6,5,1,7,3 +//s[] = | 1 -1 -1 -1 -1 -1 1 -1 | P[] 1/2 Q[] S[] +// | 1 1 -c6/c2 c2/c6 c1/c5 -c3/c1 -c5/c7 c7/c3 | +// | 1 1 c6/c2 -c2/c6 -c7/c5 -c5/c1 c3/c7 c1/c3 | +// | 1 -1 1 1 -c3/c5 -c7/c1 -c1/c7 c5/c3 | +// | 1 -1 1 1 c3/c5 c7/c1 c1/c7 -c5/c3 | +// | 1 1 c6/c2 -c2/c6 c7/c5 c5/c1 -c3/c7 -c1/c3 | +// | 1 1 -c6/c2 c2/c6 -c1/c5 c3/c1 c5/c7 -c7/c3 | +// | 1 -1 -1 -1 1 1 -1 1 | + + +//cos(a)cos(b) = (cos(a+b)+cos(a-b) )/2 + + //c1c7 = (0+c6)/2 = c6/2 + //1/(c1c7) = 2/c6 + //(c1/c7+c7/c1)c1c7 = c1c1+c7c7 = c1c1 + s1s1 = 1 +//(c1/c7+c7/c1) = 2/c6 + //(c1/c7-c7/c1)c1c7 = c1c1-c7c7 = c1c1 - s1s1 = c2 +//(c1/c7-c7/c1) = 2c2/c6 + //(c5/c1+c3/c7)c1c7 = c5c7+c3c1 = (c12+c2 + c4+c2)/2 = (-c4+c2 +c4+c2)/2 = c2 +//(c5/c1+c3/c7) = 2c2/c6 + //(c3/c1+c5/c7)c1c7 = c3c7+c1c5 = (c10+c4 + c6+c4)/2 = (-c6+c4+c6+c4)/2 = c4 +//(c3/c1+c5/c7) = 2c4/c6 + //(c3/c7-c5/c1)c1c7 = c3c1-c5c7 = (c4+c2 - c12-c2)/2 = (c4+c2 +c4-c2)/2 = c4 +//(c3/c7-c5/c1) = 2c4/c6 + //(c5/c7-c3/c1)c1c7 = c5c1-c3c7 (c6+c4-c10-c4)/2 = (c6+c4+c6-c4)/2 = c6 +//(c5/c7-c3/c1) = 2 + + //1/(c3c5) = 1/((0+c2)/2) = 2/c2 + //(c3/c5+c5/c3)c3c5 = c3*c3+c5*c5 = 1 +//(c3/c5+c5/c3) = 2/c2 + //(c1/c5+c7/c3)c3c5 = c1c3+c7c5 = c2 +//(c1/c5+c7/c3) = 2 + //(c1/c3+c7/c5)c3c5 = c7c3+c1c5 = c4 +//(c1/c3+c7/c5) = 2c4/c2 + //(c1/c5-c7/c3)c3c5 = c1c3-c7c5 = c4 +//(c1/c5-c7/c3) = 2c4/c2 + //(c1/c3-c7/c5)c3c5 = c1c5-c7c3 = c6 +//(c1/c3-c7/c5) = 2c6/c2 + //(c3/c5-c5/c3)c3c5 = c3*c3-c5*c5 = (c6+1 -c10-1)/2=(c6+c6)/2=c6 +//(c3/c5-c5/c3) = 2c6/c2 + + //(c2/c6+c6/c2)c2c6 = c2*c2+c6*c6 = 1 +//(c2/c6+c6/c2) = 1/(c2c6) = 1/(0+c4/2) = 2/c4 + //(c2/c6-c6/c2)c2c6 = c2*c2-c6*c6 = (c4+1-c12-1)/2 = (c4+c4)/2 = c4 +//(c2/c6-c6/c2) = 2 + +//B1' = | 1 0 0 0 
0 0 0 0 | B1= | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | | 0 1 0 0 0 0 0 0 | +// | 0 0 1 0 0 0 0 0 | | 0 0 1 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1/2 0 0 1/2 | | 0 0 0 0 1 0 0 -1 | +// | 0 0 0 0 0 1/2 1/2 0 | | 0 0 0 0 0 1 -1 0 | +// | 0 0 0 0 0 -1/2 1/2 0 | | 0 0 0 0 0 1 1 0 | +// | 0 0 0 0 -1/2 0 0 1/2 | | 0 0 0 0 1 0 0 1 | +// + + +// 0 1 2 3 4 5 6 7 +//s[] = | 1 -1 -1 -1 -1 -1 1 -1 | | 1 0 0 0 0 0 0 0 | B1[] P[] 1/2 Q[] S[] +// | 1 1 -c6/c2 c2/c6 c1/c5 -c3/c1 -c5/c7 c7/c3 | | 0 1 0 0 0 0 0 0 | +// | 1 1 c6/c2 -c2/c6 -c7/c5 -c5/c1 c3/c7 c1/c3 | | 0 0 1 0 0 0 0 0 | +// | 1 -1 1 1 -c3/c5 -c7/c1 -c1/c7 c5/c3 | | 0 0 0 1 0 0 0 0 | +// | 1 -1 1 1 c3/c5 c7/c1 c1/c7 -c5/c3 | | 0 0 0 0 1/2 0 0 1/2 | +// | 1 1 c6/c2 -c2/c6 c7/c5 c5/c1 -c3/c7 -c1/c3 | | 0 0 0 0 0 1/2 1/2 0 | +// | 1 1 -c6/c2 c2/c6 -c1/c5 c3/c1 c5/c7 -c7/c3 | | 0 0 0 0 0 -1/2 1/2 0 | +// | 1 -1 -1 -1 1 1 -1 1 | | 0 0 0 0 -1/2 0 0 1/2 | + + +//0,1,2,3,(4-7)/2,(5-6)/2,(6+5)/2,(7+4)/2 +//s[] = | 1 -1 -1 -1 0 -1 0 -1 | B1[] P[] 1/2 Q[] S[] +// | 1 1 -c6/c2 c2/c6 ( c1/c5-c7/c3)/2 (-c3/c1+c5/c7)/2 (-c5/c7-c3/c1)/2 ( c7/c3+c1/c5)/2 | +// | 1 1 c6/c2 -c2/c6 (-c7/c5-c1/c3)/2 (-c5/c1-c3/c7)/2 ( c3/c7-c5/c1)/2 ( c1/c3-c7/c5)/2 | +// | 1 -1 1 1 (-c3/c5-c5/c3)/2 (-c7/c1+c1/c7)/2 (-c1/c7-c7/c1)/2 ( c5/c3-c3/c5)/2 | +// | 1 -1 1 1 ( c3/c5+c5/c3)/2 ( c7/c1-c1/c7)/2 ( c1/c7+c7/c1)/2 (-c5/c3+c3/c5)/2 | +// | 1 1 c6/c2 -c2/c6 ( c7/c5+c1/c3)/2 ( c5/c1+c3/c7)/2 (-c3/c7+c5/c1)/2 (-c1/c3+c7/c5)/2 | +// | 1 1 -c6/c2 c2/c6 (-c1/c5+c7/c3)/2 ( c3/c1-c5/c7)/2 ( c5/c7+c3/c1)/2 (-c7/c3-c1/c5)/2 | +// | 1 -1 -1 -1 0 1 0 1 | + +//1/(2c1c7) = 1/c6 +//1/(2c3c5) = 1/c2 +//(c1/c5-c7/c3)/2 = c4/(2c3c5) = c4/c2 +//(c1/c3+c7/c5)/2 = c4/(2c3c5) = c4/c2 +//(c3/c5+c5/c3)/2 = 1/(2c3c5) = 1/c2 + +//(c5/c7-c3/c1)/2 = c6/(2c1c7) = 1 +//(c3/c7+c5/c1)/2 = c2/(2c1c7) = c2/c6 +//(c1/c7-c7/c1)/2 = c2/(2c1c7) = c2/c6 + +//(c3/c1+c5/c7)/2 = c4/(2c1c7) = c4/c6 +//(c3/c7-c5/c1)/2 = c4/(2c1c7) = c4/c6 +//(c1/c7+c7/c1)/2 = 
1/(2c1c7) = 1/c6 + +//(c1/c5+c7/c3)/2 = c2/(2c3c5) = 1 +//(c1/c3-c7/c5)/2 = c6/(2c3c5) = c6/c2 +//(c3/c5-c5/c3)/2 = c6/(2c3c5) = c6/c2 + +//s[] = | 1 -1 -1 -1 0 -1 0 -1 | B1[] P[] 1/2 Q[] S[] +// | 1 1 -c6/c2 c2/c6 c4/c2 1 -c4/c6 1 | +// | 1 1 c6/c2 -c2/c6 -c4/c2 -c2/c6 c4/c6 c6/c2 | +// | 1 -1 1 1 -1/c2 c2/c6 -1/c6 -c6/c2 | +// | 1 -1 1 1 1/c2 -c2/c6 1/c6 c6/c2 | +// | 1 1 c6/c2 -c2/c6 c4/c2 c2/c6 -c4/c6 -c6/c2 | +// | 1 1 -c6/c2 c2/c6 -c4/c2 -1 c4/c6 -1 | +// | 1 -1 -1 -1 0 1 0 1 | + +//B2'= | 1 0 0 0 0 0 0 0 | B2= | 1 0 0 0 0 0 0 0| +// | 0 1 0 0 0 0 0 0 | | 0 1 0 0 0 0 0 0| +// | 0 0 1/2 1/2 0 0 0 0 | | 0 0 1 -1 0 0 0 0| +// | 0 0 -1/2 1/2 0 0 0 0 | | 0 0 1 1 0 0 0 0| +// | 0 0 0 0 1 0 0 0 | | 0 0 0 0 1 0 0 0| +// | 0 0 0 0 0 1/2 0 1/2 | | 0 0 0 0 0 1 0 -1| +// | 0 0 0 0 0 0 1 0 | | 0 0 0 0 0 0 1 0| +// | 0 0 0 0 0 -1/2 0 1/2 | | 0 0 0 0 0 1 0 1| +// + +// 0 1 2 3 4 5 6 7 +//s[] = | 1 -1 -1 -1 0 -1 0 -1 | | 1 0 0 0 0 0 0 0 | B2[] B1[] P[] 1/2 Q[] S[] +// | 1 1 -c6/c2 c2/c6 c4/c2 1 -c4/c6 1 | | 0 1 0 0 0 0 0 0 | +// | 1 1 c6/c2 -c2/c6 -c4/c2 -c2/c6 c4/c6 c6/c2 | | 0 0 1/2 1/2 0 0 0 0 | +// | 1 -1 1 1 -1/c2 c2/c6 -1/c6 -c6/c2 | | 0 0 -1/2 1/2 0 0 0 0 | +// | 1 -1 1 1 1/c2 -c2/c6 1/c6 c6/c2 | | 0 0 0 0 1 0 0 0 | +// | 1 1 c6/c2 -c2/c6 c4/c2 c2/c6 -c4/c6 -c6/c2 | | 0 0 0 0 0 1/2 0 1/2 | +// | 1 1 -c6/c2 c2/c6 -c4/c2 -1 c4/c6 -1 | | 0 0 0 0 0 0 1 0 | +// | 1 -1 -1 -1 0 1 0 1 | | 0 0 0 0 0 -1/2 0 1/2 | + +//0,1,(2-3)/2,(3+2)/2,4,(5-7)/2,6,(7+5)/2 +//s[] = | 1 -1 0 -1 0 0 0 -1 | B2[] B1[] P[] 1/2 Q[] S[] +// | 1 1 (-c6/c2-c2/c6)/2 ( c2/c6-c6/c2)/2 c4/c2 0 -c4/c6 1 | +// | 1 1 ( c6/c2+c2/c6)/2 (-c2/c6+c6/c2)/2 -c4/c2 (-c2/c6-c6/c2)/2 c4/c6 ( c6/c2-c2/c6)/2 | +// | 1 -1 0 1 -1/c2 ( c2/c6+c6/c2)/2 -1/c6 (-c6/c2+c2/c6)/2 | +// | 1 -1 0 1 1/c2 (-c2/c6-c6/c2)/2 1/c6 ( c6/c2-c2/c6)/2 | +// | 1 1 ( c6/c2+c2/c6)/2 (-c2/c6+c6/c2)/2 c4/c2 ( c2/c6+c6/c2)/2 -c4/c6 (-c6/c2+c2/c6)/2 | +// | 1 1 (-c6/c2-c2/c6)/2 ( c2/c6-c6/c2)/2 -c4/c2 0 c4/c6 -1 | +// | 1 -1 0 -1 0 0 0 1 | + 
+//(c6/c2+c2/c6)/2 = 1/c4 +//(c2/c6-c6/c2)/2 = 1 +//s[] = | 1 -1 0 -1 0 0 0 -1 | B2[] B1[] P[] 1/2 Q[] S[] +// | 1 1 -1/c4 1 c4/c2 0 -c4/c6 1 | +// | 1 1 1/c4 -1 -c4/c2 -1/c4 c4/c6 -1 | +// | 1 -1 0 1 -1/c2 1/c4 -1/c6 1 | +// | 1 -1 0 1 1/c2 -1/c4 1/c6 -1 | +// | 1 1 1/c4 -1 c4/c2 1/c4 -c4/c6 1 | +// | 1 1 -1/c4 1 -c4/c2 0 c4/c6 -1 | +// | 1 -1 0 -1 0 0 0 1 | + +//(c4c2/c6+c4c6/c2)/2 = c4(c2/c6+c6/c2)/2 = c4(1/c4) = 1 +//(c2/c6-c6/c2)/2 = 1 + +// +//M' = | 1 0 0 0 0 0 0 0 | M = | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | | 0 1 0 0 0 0 0 0 | +// | 0 0 c4 0 0 0 0 0 | | 0 0 1/c4 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 c2/2 0 -c6/2 0 | | 0 0 0 0 2c2 0 2c6 0 | +// | 0 0 0 0 0 c4 0 0 | | 0 0 0 0 0 1/c4 0 0 | +// | 0 0 0 0 c6/2 0 c2/2 0 | | 0 0 0 0 -2c6 0 2c2 0 | +// | 0 0 0 0 0 0 0 1 | | 0 0 0 0 0 0 0 1 | +// +//y5 = -n5c2 - n7c6 +//y7 = -n5c6 + n7c2 + +//y5 = -n5(c2-c6) - (n5+n7)c6 = -n5c2+n5c6 -n5c6-n7c6 = -n5c2 - n7c6 +//y7 = -(n5+n7)c6 + n7(c2+c6) = -n5c6-n7c6 + n7c2+n7c6 = -n5c6 + n7c2 +//so.... only 5 multiples required. 
+ +// 0 1 2 3 4 5 6 7 +//s[] = | 1 -1 0 -1 0 0 0 -1 | | 1 0 0 0 0 0 0 0 | M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 1 1 -1/c4 1 c4/c2 0 -c4/c6 1 | | 0 1 0 0 0 0 0 0 | +// | 1 1 1/c4 -1 -c4/c2 -1/c4 c4/c6 -1 | | 0 0 c4 0 0 0 0 0 | +// | 1 -1 0 1 -1/c2 1/c4 -1/c6 1 | | 0 0 0 1 0 0 0 0 | +// | 1 -1 0 1 1/c2 -1/c4 1/c6 -1 | | 0 0 0 0 c2/2 0 -c6/2 0 | +// | 1 1 1/c4 -1 c4/c2 1/c4 -c4/c6 1 | | 0 0 0 0 0 c4 0 0 | +// | 1 1 -1/c4 1 -c4/c2 0 c4/c6 -1 | | 0 0 0 0 c6/2 0 c2/2 0 | +// | 1 -1 0 -1 0 0 0 1 | | 0 0 0 0 0 0 0 1 | + + +//0,1,c4*2,3, (c2*4+c6*6)/2, c4*5, (c2*6-c6*4)/2, 7 +//s[] = | 1 -1 0 -1 0 0 0 -1 | M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 1 1 -1 1 ( c4-c4)/2 0 (-c4c2/c6-c4c6/c2)/2 1 | +// | 1 1 1 -1 (-c4+c4)/2 -1 ( c4c2/c6+c4c6/c2)/2 -1 | +// | 1 -1 0 1 -1 1 (-c2/c6+c6/c2)/2 1 | +// | 1 -1 0 1 1 -1 ( c2/c6-c6/c2)/2 -1 | +// | 1 1 1 -1 ( c4-c4)/2 1 (-c4c2/c6-c4c6/c2)/2 1 | +// | 1 1 -1 1 (-c4+c4)/2 0 ( c4c2/c6+c4c6/c2)/2 -1 | +// | 1 -1 0 -1 0 0 0 1 | + +//(c4c2/c6+c4c6/c2)/2 = c4(c2/c6+c6/c2)/2 = c4(1/c4) = 1 +//(c2/c6-c6/c2)/2 = (c2c2-c6c6)/(2c2c6) = ((c4+1)/2 - (c12+1)/2)/(2c2c6) = (c4/2 +1/2 +c4/2 -1/2)/(2c2c6) = c4/(2c2c6) = c4/(2(0+c4)/2) = 1 +//s[] = | 1 -1 0 -1 0 0 0 -1 | M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 1 1 -1 1 0 0 -1 1 | +// | 1 1 1 -1 0 -1 1 -1 | +// | 1 -1 0 1 -1 1 -1 1 | +// | 1 -1 0 1 1 -1 1 -1 | +// | 1 1 1 -1 0 1 -1 1 | +// | 1 1 -1 1 0 0 1 -1 | +// | 1 -1 0 -1 0 0 0 1 | + + +//s[] = | 1 -1 0 -1 0 0 0 -1 | M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 1 1 -1 1 0 0 -1 1 | +// | 1 1 1 -1 0 -1 1 -1 | +// | 1 -1 0 1 -1 1 -1 1 | +// | 1 -1 0 1 1 -1 1 -1 | +// | 1 1 1 -1 0 1 -1 1 | +// | 1 1 -1 1 0 0 1 -1 | +// | 1 -1 0 -1 0 0 0 1 | + +//A1'= | 1/2 1/2 0 0 0 0 0 0 | A1= | 1 -1 0 0 0 0 0 0 | +// |-1/2 1/2 0 0 0 0 0 0 | | 1 1 0 0 0 0 0 0 | +// | 0 0 1 1 0 0 0 0 | | 0 0 1 -1 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 0 0 | | 0 0 0 0 0 1 0 0 | +// | 0 0 0 0 0 0 1 1 | | 0 0 0 0 0 0 1 -1 | +// | 0 0 0 0 
0 0 0 1 | | 0 0 0 0 0 0 0 1 | +// + +// 0 1 2 3 4 5 6 7 +//s[] = | 1 -1 0 -1 0 0 0 -1 | | 1/2 1/2 0 0 0 0 0 0 | A1[] M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 1 1 -1 1 0 0 -1 1 | |-1/2 1/2 0 0 0 0 0 0 | +// | 1 1 1 -1 0 -1 1 -1 | | 0 0 1 1 0 0 0 0 | +// | 1 -1 0 1 -1 1 -1 1 | | 0 0 0 1 0 0 0 0 | +// | 1 -1 0 1 1 -1 1 -1 | | 0 0 0 0 1 0 0 0 | +// | 1 1 1 -1 0 1 -1 1 | | 0 0 0 0 0 1 0 0 | +// | 1 1 -1 1 0 0 1 -1 | | 0 0 0 0 0 0 1 1 | +// | 1 -1 0 -1 0 0 0 1 | | 0 0 0 0 0 0 0 1 | + +//(0-1)/2, (1+0)/2, 2, 3+2, 4, 5, 6, 7+6 +//s[] = | 1 0 0 -1 0 0 0 -1 | A1[] M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 0 1 -1 0 0 0 -1 0 | +// | 0 1 1 0 0 -1 1 0 | +// | 1 0 0 1 -1 1 -1 0 | +// | 1 0 0 1 1 -1 1 0 | +// | 0 1 1 0 0 1 -1 0 | +// | 0 1 -1 0 0 0 1 0 | +// | 1 0 0 -1 0 0 0 1 | + + +//A2'= | 1/2 0 0 1/2 0 0 0 0 | A2= | 1 0 0 -1 0 0 0 0 | +// | 0 1/2 1/2 0 0 0 0 0 | | 0 1 -1 0 0 0 0 0 | +// | 0 -1/2 1/2 0 0 0 0 0 | | 0 1 1 0 0 0 0 0 | +// |-1/2 0 0 1/2 0 0 0 0 | | 1 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 1 0 | | 0 0 0 0 0 1 -1 0 | +// | 0 0 0 0 0 0 1 0 | | 0 0 0 0 0 0 1 0 | +// | 0 0 0 0 0 0 0 1 | | 0 0 0 0 0 0 0 1 | + + +// 0 1 2 3 4 5 6 7 +//s[] = | 1 0 0 -1 0 0 0 -1 | | 1/2 0 0 1/2 0 0 0 0 | A2[] A1[] M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 0 1 -1 0 0 0 -1 0 | | 0 1/2 1/2 0 0 0 0 0 | +// | 0 1 1 0 0 -1 1 0 | | 0 -1/2 1/2 0 0 0 0 0 | +// | 1 0 0 1 -1 1 -1 0 | |-1/2 0 0 1/2 0 0 0 0 | +// | 1 0 0 1 1 -1 1 0 | | 0 0 0 0 1 0 0 0 | +// | 0 1 1 0 0 1 -1 0 | | 0 0 0 0 0 1 1 0 | +// | 0 1 -1 0 0 0 1 0 | | 0 0 0 0 0 0 1 0 | +// | 1 0 0 -1 0 0 0 1 | | 0 0 0 0 0 0 0 1 | + +//(0-3)/2,(1-2)/2,(2+1)/2,(3+0)/2,4,5,6+5,7 +//s[] = | 1 0 0 0 0 0 0 -1 | A2[] A1[] M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 0 1 0 0 0 0 -1 0 | +// | 0 0 1 0 0 -1 0 0 | +// | 0 0 0 1 -1 1 0 0 | +// | 0 0 0 1 1 -1 0 0 | +// | 0 0 1 0 0 1 0 0 | +// | 0 1 0 0 0 0 1 0 | +// | 1 0 0 0 0 0 0 1 | + +//A3'= | 1 0 0 0 0 0 0 0 | A3 = | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | | 0 1 0 0 0 0 0 0 | +// | 0 0 1 0 0 
0 0 0 | | 0 0 1 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 1 0 0 | | 0 0 0 0 1 -1 0 0 | +// | 0 0 0 0 0 1 0 0 | | 0 0 0 0 0 1 0 0 | +// | 0 0 0 0 0 0 1 0 | | 0 0 0 0 0 0 1 0 | +// | 0 0 0 0 0 0 0 1 | | 0 0 0 0 0 0 0 1 | + + +// 0 1 2 3 4 5 6 7 +//s[] = | 1 0 0 1 0 0 0 -1 | | 1 0 0 0 0 0 0 0 | A3[] A2[] A1[] M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 0 1 0 0 0 0 -1 0 | | 0 1 0 0 0 0 0 0 | +// | 0 0 1 0 0 -1 0 0 | | 0 0 1 0 0 0 0 0 | +// | 0 0 0 1 -1 1 0 0 | | 0 0 0 1 0 0 0 0 | +// | 0 0 0 1 1 -1 0 0 | | 0 0 0 0 1 1 0 0 | +// | 0 0 1 0 0 1 0 0 | | 0 0 0 0 0 1 0 0 | +// | 0 1 0 0 0 0 1 0 | | 0 0 0 0 0 0 1 0 | +// | 1 0 0 0 0 0 0 1 | | 0 0 0 0 0 0 0 1 | + +//0,1,2,3,4,5+4,6,7 +//s[] = | 1 0 0 0 0 0 0 -1 | A3[] A2[] A1[] M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 0 1 0 0 0 0 -1 0 | +// | 0 0 1 0 0 -1 0 0 | +// | 0 0 0 1 -1 0 0 0 | +// | 0 0 0 1 1 0 0 0 | +// | 0 0 1 0 0 1 0 0 | +// | 0 1 0 0 0 0 1 0 | +// | 1 0 0 0 0 0 0 1 | + + +//A4'= | 1/2 0 0 0 0 0 0 1/2 | A4= | 1 0 0 0 0 0 0 -1| +// | 0 1/2 0 0 0 0 1/2 0 | | 0 1 0 0 0 0 -1 0| +// | 0 0 1/2 0 0 1/2 0 0 | | 0 0 1 0 0 -1 0 0| +// | 0 0 0 1/2 1/2 0 0 0 | | 0 0 0 1 -1 0 0 0| +// | 0 0 0 -1/2 1/2 0 0 0 | | 0 0 0 1 1 0 0 0| +// | 0 0 -1/2 0 0 1/2 0 0 | | 0 0 1 0 0 1 0 0| +// | 0 -1/2 0 0 0 0 1/2 0 | | 0 1 0 0 0 0 1 0| +// |-1/2 0 0 0 0 0 0 1/2 | | 1 0 0 0 0 0 0 1| + +// 0 1 2 3 4 5 6 7 +//s[] = | 1 0 0 0 0 0 0 -1 | | 1/2 0 0 0 0 0 0 1/2 | A4[] A3[] A2[] A1[] M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 0 1 0 0 0 0 -1 0 | | 0 1/2 0 0 0 0 1/2 0 | +// | 0 0 1 0 0 -1 0 0 | | 0 0 1/2 0 0 1/2 0 0 | +// | 0 0 0 1 -1 0 0 0 | | 0 0 0 1/2 1/2 0 0 0 | +// | 0 0 0 1 1 0 0 0 | | 0 0 0 -1/2 1/2 0 0 0 | +// | 0 0 1 0 0 1 0 0 | | 0 0 -1/2 0 0 1/2 0 0 | +// | 0 1 0 0 0 0 1 0 | | 0 -1/2 0 0 0 0 1/2 0 | +// | 1 0 0 0 0 0 0 1 | |-1/2 0 0 0 0 0 0 1/2 | + +//(0-7)/2,(1-6)/2,(2-5)/2,(3-4)/2,(4+3)/2,(5+2)/2,(6+1)/2,(7+0)/2 +//s[] = | 1 0 0 0 0 0 0 0 | A4[] A3[] A2[] A1[] M[] B2[] B1[] P[] 1/2 Q[] S[] +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1 0 0 
0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 0 0 | +// | 0 0 0 0 0 0 1 0 | +// | 0 0 0 0 0 0 0 1 | + +//verified.....We got the I matrix, stop. + + + + +// s[]= | 1 0 0 0 0 0 0 -1 | | 1 0 0 0 0 0 0 0 | | 1 0 0 -1 0 0 0 0 | | 1 -1 0 0 0 0 0 0 | | 1 0 0 0 0 0 0 0 | |1 0 0 0 0 0 0 0 | | 1 0 0 0 0 0 0 0 | | 1 0 0 0 0 0 0 0| |c4 0 0 0 0 0 0 0 | | S0 | +// | 0 1 0 0 0 0 -1 0 | | 0 1 0 0 0 0 0 0 | | 0 1 -1 0 0 0 0 0 | | 1 1 0 0 0 0 0 0 | | 0 1 0 0 0 0 0 0 | |0 1 0 0 0 0 0 0 | | 0 1 0 0 0 0 0 0 | | 0 0 0 0 1 0 0 0| | 0 -c1 0 0 0 0 0 0 | | S1 | +// | 0 0 1 0 0 -1 0 0 | | 0 0 1 0 0 0 0 0 | | 0 1 1 0 0 0 0 0 | | 0 0 1 -1 0 0 0 0 | | 0 0 1/c4 0 0 0 0 0 | |0 0 1 -1 0 0 0 0 | | 0 0 1 0 0 0 0 0 | | 0 0 1 0 0 0 0 0| | 0 0 -c2 0 0 0 0 0 | | S2 | +// | 0 0 0 1 -1 0 0 0 | | 0 0 0 1 0 0 0 0 | | 1 0 0 1 0 0 0 0 | | 0 0 0 1 0 0 0 0 | | 0 0 0 1 0 0 0 0 | |0 0 1 1 0 0 0 0 | | 0 0 0 1 0 0 0 0 | | 0 0 0 0 0 0 1 0| | 0 0 0 -c3 0 0 0 0 | | S3 | +// | 0 0 0 1 1 0 0 0 | | 0 0 0 0 1 -1 0 0 | | 0 0 0 0 1 0 0 0 | | 0 0 0 0 1 0 0 0 | | 0 0 0 0 2c2 0 2c6 0 | |0 0 0 0 1 0 0 0 | | 0 0 0 0 1 0 0 -1 | | 0 0 0 0 0 1 0 0| 1/2 | 0 0 0 0 -c4 0 0 0 | | S4 | +// | 0 0 1 0 0 1 0 0 | | 0 0 0 0 0 1 0 0 | | 0 0 0 0 0 1 -1 0 | | 0 0 0 0 0 1 0 0 | | 0 0 0 0 0 1/c4 0 0 | |0 0 0 0 0 1 0 -1 | | 0 0 0 0 0 1 -1 0 | | 0 1 0 0 0 0 0 0| | 0 0 0 0 0 -c5 0 0 | | S5 | +// | 0 1 0 0 0 0 1 0 | | 0 0 0 0 0 0 1 0 | | 0 0 0 0 0 0 1 0 | | 0 0 0 0 0 0 1 -1 | | 0 0 0 0 -2c6 0 2c2 0 | |0 0 0 0 0 0 1 0 | | 0 0 0 0 0 1 1 0 | | 0 0 0 0 0 0 0 1| | 0 0 0 0 0 0 -c6 0 | | S6 | +// | 1 0 0 0 0 0 0 1 | | 0 0 0 0 0 0 0 1 | | 0 0 0 0 0 0 0 1 | | 0 0 0 0 0 0 0 1 | | 0 0 0 0 0 0 0 1 | |0 0 0 0 0 1 0 1 | | 0 0 0 0 1 0 0 1 | | 0 0 0 1 0 0 0 0| | 0 0 0 0 0 0 0 c7 | | S7 | +// A4 8 additions A3 1 addition A2 5 additions A1 4 additions M 5 mult 3 additions B2 4 additions B1 4 additions P' Q' 8 mult + +//A4 A3 A2 A1 = 18 additions, *16 = 288 additions +//M' = 3 additions, 5 multiplications * 16 = 80 +64 initial 
multiplications, 1 per coefficient, if coefficient is skipped, mult can be skipped :) +//B2'B1' = 8 additions, *16 = 128 additions +// 416, + +static void idct_row(int* dest,int16_t * src) +{ + int a0 = *(src+0); + int a1 = *(src+1); + int a2 = *(src+2); + int a3 = *(src+3); + int a4 = *(src+4); + int a5 = *(src+5); + int a6 = *(src+6); + int a7 = *(src+7); + +// P' and Q' are already done + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 -1 | +// | 0 0 0 0 0 1 -1 0 | +// | 0 0 0 0 0 1 1 0 | +// | 0 0 0 0 1 0 0 1 | +// B1 4 additions + a4 -= a7; + a5 -= a6; + a7 = a4 + (a7<<1); + a6 = a5 + (a6<<1); + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1 -1 0 0 0 0 | +// | 0 0 1 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 0 -1 | +// | 0 0 0 0 0 0 1 0 | +// | 0 0 0 0 0 1 0 1 | +// B2 4 additions + a2 -= a3; + a5 -= a7; + a3 = a2 + (a3<<1); + a7 = a5 + (a7<<1); + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1/c4 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 2c2 0 2c6 0 | +// | 0 0 0 0 0 1/c4 0 0 | +// | 0 0 0 0 -2c6 0 2c2 0 | +// | 0 0 0 0 0 0 0 1 | +// M 5 mult 3 additions + a2 = DivC4(a2); + +// a4' = a4*(2c2) + a6*(2c6); +// a6' = a6*(2c2) - a4*(2c6); +//is equal to +// a4' = (a4)*(2c2-2c6) + (a4+a6)*(2c6); +// a6' = (a6)*(2c2+2c6) - (a4+a6)*(2c6); + { + int mab = a4+a6; + mab = smull(mab,C6x,SHFT_C6x_M1); //31-1 + a4 = smull(a4,C2mC6x,SHFT_C2mC6x_M1) + mab; //31-1 + a6 = smull(a6,C2pC6x,SHFT_C2pC6x_M1) - mab; //30-1 + } + a5 = DivC4(a5); + + + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 -1 0 0 0 0 0 0 | +// | 1 1 0 0 0 0 0 0 | +// | 0 0 1 -1 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 0 0 | +// | 0 0 0 0 0 0 1 -1 | +// | 0 0 0 0 0 0 0 1 | +// A1 4 additions + a0 -= a1; + a2 -= a3; + a6 -= a7; + a1 = a0 + (a1<<1); + + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 -1 
0 0 0 0 | +// | 0 1 -1 0 0 0 0 0 | +// | 0 1 1 0 0 0 0 0 | +// | 1 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 -1 0 | +// | 0 0 0 0 0 0 1 0 | +// | 0 0 0 0 0 0 0 1 | +// A2 5 additions + + a0 -= a3; + a1 -= a2; + a5 -= a6; + a3 = a0 + (a3<<1); + a2 = a1 + (a2<<1); + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 -1 0 0 | +// | 0 0 0 0 0 1 0 0 | +// | 0 0 0 0 0 0 1 0 | +// | 0 0 0 0 0 0 0 1 | +// A3 1 addition + a4 -= a5; + + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 -1 | +// | 0 1 0 0 0 0 -1 0 | +// | 0 0 1 0 0 -1 0 0 | +// | 0 0 0 1 -1 0 0 0 | +// | 0 0 0 1 1 0 0 0 | +// | 0 0 1 0 0 1 0 0 | +// | 0 1 0 0 0 0 1 0 | +// | 1 0 0 0 0 0 0 1 | +// A4 8 additions + a0 -= a7; + a1 -= a6; + a2 -= a5; + a3 -= a4; + a7 = a0 + (a7<<1); + a6 = a1 + (a6<<1); + a5 = a2 + (a5<<1); + a4 = a3 + (a4<<1); + + *(dest+0) = a0; + *(dest+1) = a1; + *(dest+2) = a2; + *(dest+3) = a3; + *(dest+4) = a4; + *(dest+5) = a5; + *(dest+6) = a6; + *(dest+7) = a7; +} + +static void idct_col(int* dest) +{ + int a0 = *(dest+0*8); + int a1 = *(dest+1*8); + int a2 = *(dest+2*8); + int a3 = *(dest+3*8); + int a4 = *(dest+4*8); + int a5 = *(dest+5*8); + int a6 = *(dest+6*8); + int a7 = *(dest+7*8); + +// B1 4 additions + a4 -= a7; + a5 -= a6; + a7 = a4 + (a7<<1); + a6 = a5 + (a6<<1); + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1 -1 0 0 0 0 | +// | 0 0 1 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 0 -1 | +// | 0 0 0 0 0 0 1 0 | +// | 0 0 0 0 0 1 0 1 | +// B2 4 additions + a2 -= a3; + a5 -= a7; + a3 = a2 + (a3<<1); + a7 = a5 + (a7<<1); + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1/c4 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 2c2 0 2c6 0 | +// | 0 0 0 0 0 1/c4 0 0 | +// | 0 0 0 0 -2c6 0 2c2 0 | +// | 0 0 0 0 0 0 0 1 | +// M 5 mult 3 additions + a2 = DivC4(a2); + +// a4' = a4*(2c2) + a6*(2c6); 
+// a6' = a6*(2c2) - a4*(2c6); +//is equal to +// a4' = (a4)*(2c2-2c6) + (a4+a6)*(2c6); +// a6' = (a6)*(2c2+2c6) - (a4+a6)*(2c6); + { + int mab = a4+a6; + mab = smull(mab,C6x,SHFT_C6x_M1); //31-1 + a4 = smull(a4,C2mC6x,SHFT_C2mC6x_M1) + mab; //31-1 + a6 = smull(a6,C2pC6x,SHFT_C2pC6x_M1) - mab; //30-1 + } + a5 = DivC4(a5); + + + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 -1 0 0 0 0 0 0 | +// | 1 1 0 0 0 0 0 0 | +// | 0 0 1 -1 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 0 0 | +// | 0 0 0 0 0 0 1 -1 | +// | 0 0 0 0 0 0 0 1 | +// A1 4 additions + a0 -= a1; + a2 -= a3; + a6 -= a7; + a1 = a0 + (a1<<1); + + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 -1 0 0 0 0 | +// | 0 1 -1 0 0 0 0 0 | +// | 0 1 1 0 0 0 0 0 | +// | 1 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 -1 0 | +// | 0 0 0 0 0 0 1 0 | +// | 0 0 0 0 0 0 0 1 | +// A2 5 additions + + a0 -= a3; + a1 -= a2; + a5 -= a6; + a3 = a0 + (a3<<1); + a2 = a1 + (a2<<1); + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 -1 0 0 | +// | 0 0 0 0 0 1 0 0 | +// | 0 0 0 0 0 0 1 0 | +// | 0 0 0 0 0 0 0 1 | +// A3 1 addition + a4 -= a5; + + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 -1 | +// | 0 1 0 0 0 0 -1 0 | +// | 0 0 1 0 0 -1 0 0 | +// | 0 0 0 1 -1 0 0 0 | +// | 0 0 0 1 1 0 0 0 | +// | 0 0 1 0 0 1 0 0 | +// | 0 1 0 0 0 0 1 0 | +// | 1 0 0 0 0 0 0 1 | +// A4 8 additions + a0 -= a7; + a1 -= a6; + a2 -= a5; + a3 -= a4; + a7 = a0 + (a7<<1); + a6 = a1 + (a6<<1); + a5 = a2 + (a5<<1); + a4 = a3 + (a4<<1); + + *(dest+0*8) = a0; + *(dest+1*8) = a1; + *(dest+2*8) = a2; + *(dest+3*8) = a3; + *(dest+4*8) = a4; + *(dest+5*8) = a5; + *(dest+6*8) = a6; + *(dest+7*8) = a7; +} + + + + + +#ifdef QSCALE +#define fracBits 16 +#else +#define fracBits 6 +#endif + +#ifdef JPEG +#define levelShiftRound (0x0800000+(1<<(fracBits-1))) //jpeg +#else +#define levelShiftRound (1<<(fracBits-1)) //mpeg +#endif + +static BYTE INLINE 
rangeCheck(int dct) +{ + int a = (dct+levelShiftRound)>>fracBits; //include level shift and round +// if (((unsigned int)a)<=255) return (BYTE)a; +// if (a<0) return 0; +// else return 255; + if ( unlikely(((unsigned int)a)>255)) a = ~(a>>31); //negative values get 0, >255 get -1 (or 255 in low byte) + return a; +} + +void GetIDCT(int16_t * srcDCT,uint8_t * dest,const int stride,const int addFlag) //inverse DCT of the 64 coefficients at srcDCT into 8 rows of 8 bytes at dest (rows stride bytes apart); addFlag!=0 adds to the bytes already there instead of overwriting them +{ + int workspace[64+8]; //the +8 leaves slack for rounding the start up to a 32-byte boundary + int* ws = (int*)((((uintptr_t)workspace)+0x1f)& ~(uintptr_t)0x1f); //align through uintptr_t: casting the pointer to int truncates it wherever pointers are wider than int + int* dct =ws; + int i; + + for (i = 0; i < 8; i++) { + idct_row (ws,srcDCT); + srcDCT+=8; + ws+=8; + } + ws = dct; + for (i = 0; i < 8; i++) { + idct_col (ws); + ws++; + } + + i=8; + if (addFlag==0) { + do + { + *(dest+0) = rangeCheck(*dct++); + *(dest+1) = rangeCheck(*dct++); + *(dest+2) = rangeCheck(*dct++); + *(dest+3) = rangeCheck(*dct++); + *(dest+4) = rangeCheck(*dct++); + *(dest+5) = rangeCheck(*dct++); + *(dest+6) = rangeCheck(*dct++); + *(dest+7) = rangeCheck(*dct++); + dest+=stride; + } while (--i); + } else { + do + { + *(dest+0) = rangeCheck(*dct++ + ( (*(dest+0)) << fracBits)); + *(dest+1) = rangeCheck(*dct++ + ( (*(dest+1)) << fracBits)); + *(dest+2) = rangeCheck(*dct++ + ( (*(dest+2)) << fracBits)); + *(dest+3) = rangeCheck(*dct++ + ( (*(dest+3)) << fracBits)); + *(dest+4) = rangeCheck(*dct++ + ( (*(dest+4)) << fracBits)); + *(dest+5) = rangeCheck(*dct++ + ( (*(dest+5)) << fracBits)); + *(dest+6) = rangeCheck(*dct++ + ( (*(dest+6)) << fracBits)); + *(dest+7) = rangeCheck(*dct++ + ( (*(dest+7)) << fracBits)); + dest+=stride; + } while (--i); + } +} +#endif + diff -Naru -x .deps mpeg2dec-0.3.1.prev/libmpeg2/header.c mpeg2dec-0.3.1/libmpeg2/header.c --- mpeg2dec-0.3.1.prev/libmpeg2/header.c 2002-12-13 03:02:47.000000000 -0700 +++ mpeg2dec-0.3.1/libmpeg2/header.c 2003-05-09 01:57:27.000000000 -0700 @@ -74,9 +74,80 @@ 53, 61, 22, 30, 7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63 }; +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE +#include "coefficients.h" +//zigzag normal scan order 
+// 0( 0) 1( 1) 5( 2) 6( 3) 14( 4) 15( 5) 27( 6) 28( 7) +// 2( 8) 4( 9) 7(10) 13(11) 16(12) 26(13) 29(14) 42(15) +// 3(16) 8(17) 12(18) 17(19) 25(20) 30(21) 41(22) 43(23) +// 9(24) 11(25) 18(26) 24(27) 31(28) 40(29) 44(30) 53(31) +//10(32) 19(33) 23(34) 32(35) 39(36) 45(37) 52(38) 54(39) +//20(40) 22(41) 33(42) 38(43) 46(44) 51(45) 55(46) 60(47) +//21(48) 34(49) 37(50) 47(51) 50(52) 56(53) 59(54) 61(55) +//35(56) 36(57) 48(58) 49(59) 57(60) 58(61) 62(62) 63(63) + + +static const int qscale_normal[64] __attribute__ ((aligned (32))) = + { RD4(CE00), + RD4(CE01), RD4(CE10), + RD4(CE20), RD4(CE11), RD4(CE02), + RD4(CE03), RD4(CE12), RD4(CE21), RD4(CE30), + RD4(CE40), RD4(CE31), RD4(CE22), RD4(CE13), RD4(CE04), + RD4(CE05), RD4(CE14), RD4(CE23), RD4(CE32), RD4(CE41), RD4(CE50), + RD4(CE60), RD4(CE51), RD4(CE42), RD4(CE33), RD4(CE24), RD4(CE15), RD4(CE06), + RD4(CE07), RD4(CE16), RD4(CE25), RD4(CE34), RD4(CE43), RD4(CE52), RD4(CE61), RD4(CE70), + RD4(CE71), RD4(CE62), RD4(CE53), RD4(CE44), RD4(CE35), RD4(CE26), RD4(CE17), + RD4(CE27), RD4(CE36), RD4(CE45), RD4(CE54), RD4(CE63), RD4(CE72), + RD4(CE73), RD4(CE64), RD4(CE55), RD4(CE46), RD4(CE37), + RD4(CE47), RD4(CE56), RD4(CE65), RD4(CE74), + RD4(CE75), RD4(CE66), RD4(CE57), + RD4(CE67), RD4(CE76), + RD4(CE77) + }; + +//zigzag alternate scan order +// \ col +//row\ 0 1 2 3 4 5 6 7 +// 0 0( 0) 4( 1) 6( 2) 20( 3) 22( 4) 36( 5) 38( 6) 52( 7) +// 1 1( 8) 5( 9) 7(10) 21(11) 23(12) 37(13) 39(14) 53(15) +// 2 2(16) 8(17) 19(18) 24(19) 34(20) 40(21) 50(22) 54(23) +// 3 3(24) 9(25) 18(26) 25(27) 35(28) 41(29) 51(30) 55(31) +// 4 10(32) 17(33) 26(34) 30(35) 42(36) 46(37) 56(38) 60(39) +// 5 11(40) 16(41) 27(42) 31(43) 43(44) 47(45) 57(46) 61(47) +// 6 12(48) 15(49) 28(50) 32(51) 44(52) 48(53) 58(54) 62(55) +// 7 13(56) 14(57) 29(58) 33(59) 45(60) 49(61) 59(62) 63(63) + +// \ col +//row\ 0 1 2 3 4 5 6 7 +// 0 0( 0) 4( 1) 6( 2) 20( 3) 22( 4) 36( 5) 38( 6) 52( 7) +// 1 1(10) 5(11) 7(12) 21(13) 23(14) 37(15) 39(16) 53(17) +// 2 2(20) 
8(21) 19(22) 24(23) 34(24) 40(25) 50(26) 54(27) +// 3 3(30) 9(31) 18(32) 25(33) 35(34) 41(35) 51(36) 55(37) +// 4 10(40) 17(41) 26(42) 30(43) 42(44) 46(45) 56(46) 60(47) +// 5 11(50) 16(51) 27(52) 31(53) 43(54) 47(55) 57(56) 61(57) +// 6 12(60) 15(61) 28(62) 32(63) 44(64) 48(65) 58(66) 62(67) +// 7 13(70) 14(71) 29(72) 33(73) 45(74) 49(75) 59(76) 63(77) + +static const int qscale_alternate[64] __attribute__ ((aligned (32))) = //entry k is RD4(CErc) for the cell marked k(rc) in the table above +{ + RD4(CE00), RD4(CE10), RD4(CE20), RD4(CE30), RD4(CE01), RD4(CE11), RD4(CE02), RD4(CE12), + RD4(CE21), RD4(CE31), RD4(CE40), RD4(CE50), RD4(CE60), RD4(CE70), RD4(CE71), RD4(CE61), + RD4(CE51), RD4(CE41), RD4(CE32), RD4(CE22), RD4(CE03), RD4(CE13), RD4(CE04), RD4(CE14), + RD4(CE23), RD4(CE33), RD4(CE42), RD4(CE52), RD4(CE62), RD4(CE72), RD4(CE43), RD4(CE53), + RD4(CE63), RD4(CE73), RD4(CE24), RD4(CE34), RD4(CE05), RD4(CE15), RD4(CE06), RD4(CE16), + RD4(CE25), RD4(CE35), RD4(CE44), RD4(CE54), RD4(CE64), RD4(CE74), RD4(CE45), RD4(CE55), + RD4(CE65), RD4(CE75), RD4(CE26), RD4(CE36), RD4(CE07), RD4(CE17), RD4(CE27), RD4(CE37), + RD4(CE46), RD4(CE56), RD4(CE66), RD4(CE76), RD4(CE47), RD4(CE57), RD4(CE67), RD4(CE77) +}; +#endif + + void mpeg2_header_state_init (mpeg2dec_t * mpeg2dec) { mpeg2dec->decoder.scan = mpeg2_scan_norm; +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE + mpeg2dec->decoder.qscale_scan_order = qscale_normal; +#endif mpeg2dec->picture = mpeg2dec->pictures; mpeg2dec->fbuf[0] = &mpeg2dec->fbuf_alloc[0].fbuf; mpeg2dec->fbuf[1] = &mpeg2dec->fbuf_alloc[1].fbuf; @@ -94,6 +165,34 @@ info->user_data = NULL; info->user_data_len = 0; } +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE +void update_quantizer_scaled(uint16_t* dest,uint8_t* src,int scale) //dest[k] = src[k]*scale for k = 0..63 +{ + int i = 8; //8 passes of 8 unrolled stores + do { + *dest++ = *src++ * scale; + *dest++ = *src++ * scale; + *dest++ = *src++ * scale; + *dest++ = *src++ * scale; + + *dest++ = *src++ * scale; + *dest++ = *src++ * scale; + *dest++ = *src++ * scale; + *dest++ = *src++ * scale; + } while (--i); +} +void 
init_quantizer_scaled(decoder_t * decoder) //builds both scaled tables for the current quantizer_scale and points intra_scaled/non_intra_scaled at them +{ + decoder->scale_value[0] = decoder->quantizer_scale; //slot 0 records the scale its tables were built with; NOTE(review): confirm quantizer_scale already holds a valid value when this runs from mpeg2_header_sequence + decoder->scale_value[1] = 0xffff; //mark not initialized + update_quantizer_scaled(decoder->scaled[INTRA_SCALED_BASE], decoder->intra_quantizer_matrix, decoder->scale_value[0]); + update_quantizer_scaled(decoder->scaled[NON_INTRA_SCALED_BASE],decoder->non_intra_quantizer_matrix,decoder->scale_value[0]); + decoder->intra_scaled = decoder->scaled[INTRA_SCALED_BASE]; + decoder->non_intra_scaled = decoder->scaled[NON_INTRA_SCALED_BASE]; +} +#endif + + int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec) { uint8_t * buffer = mpeg2dec->chunk_start; @@ -141,6 +240,8 @@ decoder->intra_quantizer_matrix[mpeg2_scan_norm[i]] = default_intra_quantizer_matrix [i]; + + if (buffer[7] & 1) for (i = 0; i < 64; i++) decoder->non_intra_quantizer_matrix[mpeg2_scan_norm[i]] = @@ -149,6 +250,8 @@ for (i = 0; i < 64; i++) decoder->non_intra_quantizer_matrix[i] = 16; + + INIT_QUANTIZER_SCALED(decoder); //expands to init_quantizer_scaled() or to nothing, see mpeg2_internal.h sequence->profile_level_id = 0x80; sequence->colour_primaries = 1; sequence->transfer_characteristics = 1; @@ -160,6 +263,9 @@ decoder->q_scale_type = 0; decoder->concealment_motion_vectors = 0; decoder->scan = mpeg2_scan_norm; +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE + decoder->qscale_scan_order = qscale_normal; +#endif decoder->picture_structure = FRAME_PICTURE; mpeg2dec->ext_state = SEQ_EXT; @@ -170,6 +276,7 @@ return 0; } + static int sequence_ext (mpeg2dec_t * mpeg2dec) { uint8_t * buffer = mpeg2dec->chunk_start; @@ -519,6 +626,9 @@ decoder->q_scale_type = (buffer[3] >> 4) & 1; decoder->intra_vlc_format = (buffer[3] >> 3) & 1; decoder->scan = (buffer[3] & 4) ? mpeg2_scan_alt : mpeg2_scan_norm; +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE + decoder->qscale_scan_order = (buffer[3] & 4) ? qscale_alternate :qscale_normal; +#endif flags |= (buffer[4] & 0x80) ? 
PIC_FLAG_PROGRESSIVE_FRAME : 0; if (buffer[4] & 0x40) flags |= (((buffer[4]<<26) | (buffer[5]<<18) | (buffer[6]<<10)) & @@ -575,13 +685,25 @@ decoder->intra_quantizer_matrix[mpeg2_scan_norm[i]] = (buffer[i] << 5) | (buffer[i+1] >> 3); buffer += 64; +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE + if (decoder->scale_value[0] == decoder->quantizer_scale) i = 0; + else i=1; + decoder->scale_value[i^1] = 0xffff; //invalidate + UPDATE_QUANTIZER_SCALED(decoder->scaled[INTRA_SCALED_BASE+i], decoder->intra_quantizer_matrix, decoder->scale_value[i]); +#endif } - if (buffer[0] & 4) + if (buffer[0] & 4) { for (i = 0; i < 64; i++) decoder->non_intra_quantizer_matrix[mpeg2_scan_norm[i]] = (buffer[i] << 6) | (buffer[i+1] >> 2); - +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE + if (decoder->scale_value[0] == decoder->quantizer_scale) i = 0; + else i=1; + decoder->scale_value[i^1] = 0xffff; //invalidate + UPDATE_QUANTIZER_SCALED(decoder->scaled[NON_INTRA_SCALED_BASE+i],decoder->non_intra_quantizer_matrix,decoder->scale_value[i]); +#endif + } return 0; } diff -Naru -x .deps mpeg2dec-0.3.1.prev/libmpeg2/idct.c mpeg2dec-0.3.1/libmpeg2/idct.c --- mpeg2dec-0.3.1.prev/libmpeg2/idct.c 2002-12-09 21:41:57.000000000 -0700 +++ mpeg2dec-0.3.1/libmpeg2/idct.c 2003-05-09 03:32:36.000000000 -0700 @@ -30,6 +30,8 @@ #include "mpeg2_internal.h" #include "attributes.h" +//#undef ARCH_ARM + #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */ @@ -38,13 +40,15 @@ #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */ /* idct main entry point */ -void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride); -void (* mpeg2_idct_add) (int last, int16_t * block, - uint8_t * dest, int stride); +void (* mpeg2_idct_copy) ( int16_t * block, uint8_t * dest, int stride); +void (* mpeg2_idct_add) (int last, int16_t * block, uint8_t * dest, int stride); +void (* mpeg2_idct_fill) (int dct, uint8_t * dest, int 
stride, int addFlag); + + +#ifndef ARCH_ARM static uint8_t clip_lut[1024]; #define CLIP(i) ((clip_lut+384)[(i)]) - #if 0 #define BUTTERFLY(t0,t1,W0,W1,d0,d1) \ do { \ @@ -153,8 +157,7 @@ block[8*6] = (a1 - b1) >> 17; block[8*7] = (a0 - b0) >> 17; } - -static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest, +static void mpeg2_idct_copy_c ( int16_t * block, uint8_t * dest, const int stride) { int i; @@ -226,9 +229,17 @@ } while (--i); } } +#endif + + + void mpeg2_idct_init (uint32_t accel) { +#ifndef ARCH_ARM + mpeg2_idct_fill = mpeg2_idct_fill_default; +#endif + #ifdef ARCH_X86 if (accel & MPEG2_ACCEL_X86_MMXEXT) { mpeg2_idct_copy = mpeg2_idct_copy_mmxext; @@ -266,19 +277,30 @@ } else #endif { +#ifdef ARCH_ARM + extern void mpeg2_idct_arm_init(); + mpeg2_idct_copy = mpeg2_idct_copy_arm; + mpeg2_idct_add = mpeg2_idct_add_arm; + mpeg2_idct_arm_init(); + mpeg2_idct_fill = mpeg2_idct_fill_arm; +#else + int i,j; extern uint8_t mpeg2_scan_norm[64]; extern uint8_t mpeg2_scan_alt[64]; - int i, j; - mpeg2_idct_copy = mpeg2_idct_copy_c; mpeg2_idct_add = mpeg2_idct_add_c; for (i = -384; i < 640; i++) clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 
255 : i); for (i = 0; i < 64; i++) { j = mpeg2_scan_norm[i]; +//bit 0->2 1->0 2->1 3->5 4->3 5->4 +//col 0->0, 1->4, 2->1, 3->5, 4->2, 5->6, 6->3, 7->7 +//row 0->0, 1->4, 2->1, 3->5, 4->2, 5->6, 6->3, 7->7 + mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2); j = mpeg2_scan_alt[i]; mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2); } +#endif } } diff -Naru -x .deps mpeg2dec-0.3.1.prev/libmpeg2/invDCT.S mpeg2dec-0.3.1/libmpeg2/invDCT.S --- mpeg2dec-0.3.1.prev/libmpeg2/invDCT.S 1969-12-31 17:00:00.000000000 -0700 +++ mpeg2dec-0.3.1/libmpeg2/invDCT.S 2003-05-10 17:04:51.000000000 -0700 @@ -0,0 +1,431 @@ +/* +//------------------------------------------------------------------------------ +//FILE DESCRIPTION: Contains assembler routines for the Inverse DCT +//------------------------------------------------------------------------------ +void GetIDCT_ARM(short *srcDCT, unsigned char* dest, int rowAdvance, int addFlag) + +srcDCT is an already dequantized array of 64 coefficients. + +dest receives 8 rows of 8 bytes, rowAdvance bytes apart +results are clamped to be between 0 min and 255 max + + Copyright (c) 2003 Troy Kisky (troy.kisky@boundarydevices.com) + +*/ + +//#define QSCALE +//#define JPEG + + + +//tens textequ <1.0000000000000000000> +//t2 textequ <.9238795320000000000> +//t4 textequ <.7071067811865475244> +//t6 textequ <.3826834320000000000> +//tSqrt2 textequ <1.4142135623730950488> + +//t2mt6 <.5411961000000000000> +//t2pt6 <1.3065629640000000000> +#define oneDivC4 0x5a82799a // /2**30 ((tSqrt2 *(040000000h/400h))/tens); //sqrt(2) +#define C6x 0x61f78a99 // /2**32 ((t6 *(080000000h/400h))/tens); //t6=.382683432 +#define C2mC6x 0x4545e9ef // /2**31 (((t2-t6)*(080000000h/400h))/tens); //t2=.923879532, t2-t6=.541 +#define C2pC6x 0x539eba44 // /2**30 (((t2+t6)*(040000000h/400h))/tens); //+=1.306 +#define C2x 0x7641af3c // /2**31 ((t2 *(080000000h/400h))/tens); +#define C4x 0x5a82799a // /2**31 ((t4 *(080000000h/400h))/tens); + +#define SHFT_C2mC6x 31 +#define SHFT_C2mC6x_M1 30 + +#define SHFT_oneDivC4 30 + + 
+#define SHFT_C6x 32 +#define SHFT_C6x_M1 31 + +#define SHFT_C2pC6x 30 +#define SHFT_C2pC6x_M1 29 + +iOneDivC4: .int oneDivC4 +iC6x: .int C6x +iC2mC6x: .int C2mC6x +iC2pC6x: .int C2pC6x + + + +.macro doAdd1 reg1,reg2,dct,src,offset //adds dct to the two bytes at src+offset and src+offset+1 and stores them back clamped to 0..255; reg1,reg2 are scratch + ldrb \reg1,[\src,#\offset] + ldrb \reg2,[\src,#\offset+1] + add \reg1,\reg1,\dct + add \reg2,\reg2,\dct + cmp \reg1,#255 + mvnhi \reg1,\reg1,asr #31 //neg vals get 0, pos vals get 0xffffffff, or 255 if just worried about low byte + + cmp \reg2,#255 + mvnhi \reg2,\reg2,asr #31 + + strb \reg1,[\src,#\offset] + strb \reg2,[\src,#\offset+1] +.endm +// r0 r1 r2 r3 +//void mpeg2_idct_fill_arm(int dct,uint8_t * dest,const int stride,const int addFlag) + .global mpeg2_idct_fill_arm +mpeg2_idct_fill_arm: + add r0,r0,#4 //round + mov r0,r0,asr #3 //r0 = (dct+4)>>3, the one value applied to all 64 bytes + cmp r3,#0 + beq mm2 //addFlag==0: overwrite path + stmdb sp!, { r4, lr } // all callee saved regs + mov lr,#8 // int i=8; +mm1: + doAdd1 r4,r3,r0,r1,0 //r3 (addFlag) is no longer needed and serves as scratch + doAdd1 r4,r3,r0,r1,2 + doAdd1 r4,r3,r0,r1,4 + doAdd1 r4,r3,r0,r1,6 + subs lr,lr,#1 + add r1,r1,r2 // dest+=stride; + bne mm1 // } while (--i); + ldmia sp!, { r4, pc } // restore callee saved regs and return + +mm2: + cmp r0,#255 //overwrite path: clamp the value once + orr r3,r1,r2 //dest|stride, tested below for 8-byte alignment; leaves the flags of the cmp intact + mvnhi r0,r0,asr #31 //neg vals get 0, pos vals get 0xffffffff, or 255 in each byte + orrls r0,r0,r0,lsl #8 //value was in 0..255: replicate it into all four bytes of r0 + orrls r0,r0,r0,lsl #16 + tst r3,#7 + bne mm3 //br if alignment problems + mov r3,r1 + mov r1,r0 //strd writes the register pair r0,r1, i.e. 8 identical bytes per row + + strd r0,[r3],r2 //0 + strd r0,[r3],r2 //1 + strd r0,[r3],r2 //2 + strd r0,[r3],r2 //3 + strd r0,[r3],r2 //4 + strd r0,[r3],r2 //5 + strd r0,[r3],r2 //6 + strd r0,[r3] //7 + mov pc,lr +mm3: + mov r3,#8 //unaligned dest or stride: 8 rows of byte stores +mm4: + strb r0,[r1,#0] + strb r0,[r1,#1] + strb r0,[r1,#2] + strb r0,[r1,#3] + strb r0,[r1,#4] + strb r0,[r1,#5] + strb r0,[r1,#6] + strb r0,[r1,#7] + add r1,r1,r2 + subs r3,r3,#1 + bne mm4 + mov pc,lr + +/////////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////////// 
+/////////////////////////////////////////////////////////////////////////////////////////////////////// + +.macro StoreIntCol src + str r0,[\src,#1*8*4] + str r1,[\src,#2*8*4] + str r2,[\src,#3*8*4] + str r3,[\src,#4*8*4] + str r4,[\src,#5*8*4] + str r5,[\src,#6*8*4] + str r6,[\src,#7*8*4] + str r7,[\src,#8*8*4] + add \src,\src,#4 +.endm + +.macro LoadInt16Col src + ldrsh r0,[\src],#2 + ldrsh r1,[\src,#1*8*2 -2] + ldrsh r2,[\src,#2*8*2 -2] + ldrsh r3,[\src,#3*8*2 -2] + ldrsh r4,[\src,#4*8*2 -2] + ldrsh r5,[\src,#5*8*2 -2] + ldrsh r6,[\src,#6*8*2 -2] + ldrsh r7,[\src,#7*8*2 -2] +.endm + +.macro DivC4 reg,rTempLo,rTemp + ldr \rTemp,iOneDivC4 + smull \rTempLo,\reg,\rTemp,\reg + mov \reg,\reg,LSL #32-SHFT_oneDivC4 + add \reg,\reg,\rTempLo,LSR #SHFT_oneDivC4 +.endm + +.macro SMullI a,constVal,shift,rTempLo,rTemp + ldr \rTemp,\constVal + smull \rTempLo,\a,\rTemp,\a + mov \a,\a,LSL #32-\shift + add \a,\a,\rTempLo,LSR #\shift +.endm + +.macro DoCalculate rTempLo,rTemp,rTemp2 +// P' and Q' are already done + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 -1 | +// | 0 0 0 0 0 1 -1 0 | +// | 0 0 0 0 0 1 1 0 | +// | 0 0 0 0 1 0 0 1 | +// B1 4 additions + sub r4,r4,r7 // a4 -= a7; + sub r5,r5,r6 // a5 -= a6; + add r7,r4,r7,lsl #1 // a7 = a4 + (a7<<1); + add r6,r5,r6,lsl #1 // a6 = a5 + (a6<<1); + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1 -1 0 0 0 0 | +// | 0 0 1 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 0 -1 | +// | 0 0 0 0 0 0 1 0 | +// | 0 0 0 0 0 1 0 1 | +// B2 4 additions + ldr \rTemp,iOneDivC4 + sub r2,r2,r3 // a2 -= a3; + sub r5,r5,r7 // a5 -= a7; + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1/c4 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 2c2 0 2c6 0 | +// | 0 0 0 0 0 1/c4 0 0 | +// | 0 0 0 0 -2c6 0 2c2 0 | +// | 0 0 0 0 0 0 0 1 | +// M 5 mult 3 additions + +// DivC4 
r2,\rTempLo,\rTemp // a2 = DivC4(a2); + smull \rTempLo,\rTemp2,\rTemp,r2 + add r7,r5,r7,lsl #1 // a7 = a5 + (a7<<1); from above + add r3,r2,r3,lsl #1 // a3 = a2 + (a3<<1); + mov r2,\rTemp2,LSL #32-SHFT_oneDivC4 + add r2,r2,\rTempLo,LSR #SHFT_oneDivC4 + +// DivC4 r5,\rTempLo,\rTemp // a5 = DivC4(a5); + smull \rTempLo,r5,\rTemp,r5 + ldr \rTemp,iC6x + add \rTemp2,r4,r6 // int mab = a4+a6; + mov r5,r5,LSL #32-SHFT_oneDivC4 + add r5,r5,\rTempLo,LSR #SHFT_oneDivC4 + +// a4' = a4*(2c2) + a6*(2c6); +// a6' = a6*(2c2) - a4*(2c6); +//is equal to +// a4' = (a4)*(2c2-2c6) + (a4+a6)*(2c6); +// a6' = (a6)*(2c2+2c6) - (a4+a6)*(2c6); + // { +// SMullI \rTemp2,iC6x,SHFT_C6x_M1,\rTempLo,\rTemp // mab = smull(mab,C6x,SHFT_C6x_M1); //31-1 +// SMullI r4,iC2mC6x,SHFT_C2mC6x_M1,\rTempLo,\rTemp // a4 = smull(a4,C2mC6x,SHFT_C2mC6x_M1) + mab; //31-1 +// SMullI r6,iC2pC6x,SHFT_C2pC6x_M1,\rTempLo,\rTemp // a6 = smull(a6,C2pC6x,SHFT_C2pC6x_M1) - mab; //30-1 + + + smull \rTempLo,\rTemp2,\rTemp,\rTemp2 + ldr \rTemp,iC2mC6x + sub r0,r0,r1 // a0 -= a1; //from below + mov \rTemp2,\rTemp2,LSL #32-SHFT_C6x_M1 + add \rTemp2,\rTemp2,\rTempLo,LSR #SHFT_C6x_M1 + + smull \rTempLo,r4,\rTemp,r4 + ldr \rTemp,iC2pC6x + sub r2,r2,r3 // a2 -= a3; + mov r4,r4,LSL #32-SHFT_C2mC6x_M1 + add r4,r4,\rTempLo,LSR #SHFT_C2mC6x_M1 + + smull \rTempLo,r6,\rTemp,r6 + mov r6,r6,LSL #32-SHFT_C2pC6x_M1 + add r6,r6,\rTempLo,LSR #SHFT_C2pC6x_M1 +///////////////////////////////// + add r4,r4,\rTemp2 + sub r6,r6,\rTemp2 + // } + + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 -1 0 0 0 0 0 0 | +// | 1 1 0 0 0 0 0 0 | +// | 0 0 1 -1 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 0 0 | +// | 0 0 0 0 0 0 1 -1 | +// | 0 0 0 0 0 0 0 1 | +// A1 4 additions + sub r6,r6,r7 // a6 -= a7; + add r1,r0,r1,lsl #1 // a1 = a0 + (a1<<1); + + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 -1 0 0 0 0 | +// | 0 1 -1 0 0 0 0 0 | +// | 0 1 1 0 0 0 0 0 | +// | 1 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 0 0 0 | +// | 0 0 0 0 0 1 -1 0 | +// | 0 0 0 0 0 
0 1 0 | +// | 0 0 0 0 0 0 0 1 | +// A2 5 additions + + sub r0,r0,r3 // a0 -= a3; + sub r1,r1,r2 // a1 -= a2; + sub r5,r5,r6 // a5 -= a6; + add r3,r0,r3,lsl #1 // a3 = a0 + (a3<<1); + add r2,r1,r2,lsl #1 // a2 = a1 + (a2<<1); + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 0 | +// | 0 1 0 0 0 0 0 0 | +// | 0 0 1 0 0 0 0 0 | +// | 0 0 0 1 0 0 0 0 | +// | 0 0 0 0 1 -1 0 0 | +// | 0 0 0 0 0 1 0 0 | +// | 0 0 0 0 0 0 1 0 | +// | 0 0 0 0 0 0 0 1 | +// A3 1 addition + sub r4,r4,r5 // a4 -= a5; + + +// a0 a1 a2 a3 a4 a5 a6 a7 +// | 1 0 0 0 0 0 0 -1 | +// | 0 1 0 0 0 0 -1 0 | +// | 0 0 1 0 0 -1 0 0 | +// | 0 0 0 1 -1 0 0 0 | +// | 0 0 0 1 1 0 0 0 | +// | 0 0 1 0 0 1 0 0 | +// | 0 1 0 0 0 0 1 0 | +// | 1 0 0 0 0 0 0 1 | +// A4 8 additions + sub r0,r0,r7 // a0 -= a7; + sub r1,r1,r6 // a1 -= a6; + sub r2,r2,r5 // a2 -= a5; + sub r3,r3,r4 // a3 -= a4; + add r7,r0,r7,lsl #1 // a7 = a0 + (a7<<1); + add r6,r1,r6,lsl #1 // a6 = a1 + (a6<<1); + add r5,r2,r5,lsl #1 // a5 = a2 + (a5<<1); + add r4,r3,r4,lsl #1 // a4 = a3 + (a4<<1); +.endm + +#ifdef QSCALE +#define fracBits 16 +#else +#define fracBits 6 +#endif + +#ifdef JPEG +#define levelShiftRound (0x0800000+(1<<(fracBits-1))) //jpeg +#else +#define levelShiftRound (1<<(fracBits-1)) //mpeg +#endif + + +.macro rangeCheck reg1,reg2,regLSR +#ifdef JPEG + add \reg1,\reg1,\regLSR //levelShiftRound + add \reg2,\reg2,\regLSR +#else + add \reg1,\reg1,#levelShiftRound + add \reg2,\reg2,#levelShiftRound +#endif + mov \reg1,\reg1,asr #fracBits //int a = ((*dct)+levelShiftRound)>>fracBits; + mov \reg2,\reg2,asr #fracBits + cmp \reg1,#255 + mvnhi \reg1,\reg1,asr #31 //neg vals get 0, pos vals get 0xffffffff, or 255 if just worried about low byte + + cmp \reg2,#255 + mvnhi \reg2,\reg2,asr #31 + +.endm + + +#define rTemp r8 //rTemp,rTemp2 are even/odd pair +#define rTemp2 r9 +#define rTempLo r10 +#define rSrcDCT r12 +#define rDest r12 //double duty +#define rLoop lr + +#define rRowAdvance rTemp +#define rAddFlag rTemp2 + +//void GetIDCT_ARM(short 
*srcDCT, BYTE *dest, DWORD rowAdvance, int addFlag) + + .global GetIDCT_ARM +GetIDCT_ARM: + stmdb sp!, { r1 - r12, lr } // callee saved regs, plus r1-r3 (dest,rowAdvance,addFlag) so they can be reloaded through fp + add fp,sp,#3*4 //fp = address just above the saved r1-r3 + sub sp, sp, #(64+8)*4 // reserve some space on the stack, the top 8 entries are not used, but allow the + // loop to increase the sp as it goes without losing good data (signals won't trash good data) + bic sp, sp, #31 // get to a cache line boundary + mov rSrcDCT,r0 + mov rLoop,#16 //8 column passes (rLoop 15..8) followed by 8 row passes (rLoop 7..0) +nextCol: + LoadInt16Col rSrcDCT +Calc1: + sub rLoop,rLoop,#1 + DoCalculate rTempLo,rTemp,rTemp2 + cmp rLoop,#8 + blo SaveRow //rLoop<8: this was a row pass + StoreIntCol sp //advances sp by 4; the stores land 32 bytes above sp, so after 8 columns the first row sits at sp + bhi nextCol //flags are still those of the cmp above; falls through once, when rLoop==8 +////// sub sp,sp, #8*4 //reset pointer, this is no longer needed + ldr rDest,[fp,#-12] //saved r1 = dest; rDest shares its register with rSrcDCT, which is finished + ldmia sp!,{r0-r7} //first row of the workspace + b Calc1 +SaveRow: + ldmdb fp, { rRowAdvance,rAddFlag} // restore saved parameters + + cmp rAddFlag,#0 + beq ll2 + + ldrb rAddFlag,[rDest,#0] //rAddFlag doubles as scratch here; it is reloaded at SaveRow for every row + ldrb rTempLo,[rDest,#1] + add r0,r0,rAddFlag,LSL #fracBits + add r1,r1,rTempLo,LSL #fracBits + + ldrb rAddFlag,[rDest,#2] + ldrb rTempLo,[rDest,#3] + add r2,r2,rAddFlag,LSL #fracBits + add r3,r3,rTempLo,LSL #fracBits + + ldrb rAddFlag,[rDest,#4] + ldrb rTempLo,[rDest,#5] + add r4,r4,rAddFlag,LSL #fracBits + add r5,r5,rTempLo,LSL #fracBits + + ldrb rAddFlag,[rDest,#6] + ldrb rTempLo,[rDest,#7] + add r6,r6,rAddFlag,LSL #fracBits + add r7,r7,rTempLo,LSL #fracBits + +ll2: + rangeCheck r0,r1,rLevelShiftRound // *dest++ = rangeCheck(dct++); *dest++ = rangeCheck(dct++); (rLevelShiftRound is only expanded when JPEG is defined; this file does not define it) + rangeCheck r2,r3,rLevelShiftRound // *dest++ = rangeCheck(dct++); *dest++ = rangeCheck(dct++); + rangeCheck r4,r5,rLevelShiftRound // *dest++ = rangeCheck(dct++); *dest++ = rangeCheck(dct++); + rangeCheck r6,r7,rLevelShiftRound // *dest++ = rangeCheck(dct++); *dest++ = rangeCheck(dct++); + strb r1,[rDest,#1] + strb r2,[rDest,#2] + strb r3,[rDest,#3] + strb r4,[rDest,#4] + strb r5,[rDest,#5] + strb r6,[rDest,#6] + strb r7,[rDest,#7] + strb r0,[rDest],rRowAdvance //byte 0 goes last so the post-index can step rDest to the next row + + tst rLoop,rLoop //zero after the last row + ldmneia sp!,{r0-r7} + bne Calc1 + + mov sp,fp //drop the workspace and the saved r1-r3 + ldmia sp!, 
{ r4 - r12, pc } // restore callee saved regs and return + diff -Naru -x .deps mpeg2dec-0.3.1.prev/libmpeg2/Makefile.am mpeg2dec-0.3.1/libmpeg2/Makefile.am --- mpeg2dec-0.3.1.prev/libmpeg2/Makefile.am 2002-12-08 18:16:01.000000000 -0700 +++ mpeg2dec-0.3.1/libmpeg2/Makefile.am 2003-05-21 17:45:48.000000000 -0700 @@ -6,7 +6,7 @@ motion_comp_altivec.c idct_altivec.c \ motion_comp_alpha.c idct_alpha.c \ motion_comp_mlib.c idct_mlib.c \ - motion_comp.c idct.c + motion_comp.c idct.c GetIdct.c invDCT.S libmpeg2_la_LIBADD = @LIBMPEG2_LIBS@ libmpeg2_la_LDFLAGS = -no-undefined diff -Naru -x .deps mpeg2dec-0.3.1.prev/libmpeg2/mpeg2_internal.h mpeg2dec-0.3.1/libmpeg2/mpeg2_internal.h --- mpeg2dec-0.3.1.prev/libmpeg2/mpeg2_internal.h 2002-12-13 03:02:47.000000000 -0700 +++ mpeg2dec-0.3.1/libmpeg2/mpeg2_internal.h 2003-05-09 17:42:49.000000000 -0700 @@ -21,6 +21,19 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE + +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE +void update_quantizer_scaled(uint16_t* dest,uint8_t* src,int scale); +void init_quantizer_scaled(decoder_t * decoder); +#define UPDATE_QUANTIZER_SCALED(dest,src,scale) update_quantizer_scaled(dest,src,scale) +#define INIT_QUANTIZER_SCALED(decoder) init_quantizer_scaled(decoder) +#else +#define UPDATE_QUANTIZER_SCALED(dest,src,scale) +#define INIT_QUANTIZER_SCALED(decoder) +#endif + + /* macroblock modes */ #define MACROBLOCK_INTRA 1 #define MACROBLOCK_PATTERN 2 @@ -59,7 +72,7 @@ /* next inside a slice, and is never used outside of mpeg2_slice() */ /* DCT coefficients - should be kept aligned ! 
*/ - int16_t DCTblock[64]; + int16_t DCTblock[64] __attribute__ ((aligned (32))); /* bit parsing stuff */ uint32_t bitstream_buf; /* current 32 bit working set */ @@ -89,15 +102,23 @@ /* predictor for DC coefficients in intra blocks */ int16_t dc_dct_pred[3]; - int quantizer_scale; /* remove */ int dmv_offset; /* remove */ unsigned int v_offset; /* remove */ + int quantizer_scale; /* remove */ /* now non-slice-specific information */ /* sequence header stuff */ uint8_t intra_quantizer_matrix [64]; uint8_t non_intra_quantizer_matrix [64]; +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE +#define INTRA_SCALED_BASE 0 +#define NON_INTRA_SCALED_BASE 2 + uint16_t scaled[4][64]; + uint16_t* intra_scaled; + uint16_t* non_intra_scaled; + uint16_t scale_value[2]; +#endif /* The width and height of the picture snapped to macroblock units */ int width; @@ -131,7 +152,9 @@ /* pointer to the zigzag scan we're supposed to be using */ const uint8_t * scan; - +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE + const int* qscale_scan_order; +#endif int second_field; int mpeg1; @@ -239,37 +262,37 @@ void mpeg2_idct_init (uint32_t accel); /* idct_mlib.c */ -void mpeg2_idct_add_mlib (int last, int16_t * block, - uint8_t * dest, int stride); -void mpeg2_idct_copy_mlib_non_ieee (int16_t * block, uint8_t * dest, - int stride); -void mpeg2_idct_add_mlib_non_ieee (int last, int16_t * block, - uint8_t * dest, int stride); +void mpeg2_idct_add_mlib (int last, int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_copy_mlib_non_ieee ( int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_add_mlib_non_ieee (int last, int16_t * block, uint8_t * dest, int stride); /* idct_mmx.c */ -void mpeg2_idct_copy_mmxext (int16_t * block, uint8_t * dest, int stride); -void mpeg2_idct_add_mmxext (int last, int16_t * block, - uint8_t * dest, int stride); -void mpeg2_idct_copy_mmx (int16_t * block, uint8_t * dest, int stride); -void mpeg2_idct_add_mmx (int last, int16_t * block, - uint8_t * 
dest, int stride); +void mpeg2_idct_copy_mmxext ( int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_add_mmxext (int last, int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_copy_mmx ( int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_add_mmx (int last, int16_t * block, uint8_t * dest, int stride); void mpeg2_idct_mmx_init (void); /* idct_altivec.c */ -void mpeg2_idct_copy_altivec (int16_t * block, uint8_t * dest, int stride); -void mpeg2_idct_add_altivec (int last, int16_t * block, - uint8_t * dest, int stride); +void mpeg2_idct_copy_altivec ( int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_add_altivec (int last, int16_t * block, uint8_t * dest, int stride); void mpeg2_idct_altivec_init (void); /* idct_alpha.c */ -void mpeg2_idct_copy_mvi (int16_t * block, uint8_t * dest, int stride); -void mpeg2_idct_add_mvi (int last, int16_t * block, - uint8_t * dest, int stride); -void mpeg2_idct_copy_alpha (int16_t * block, uint8_t * dest, int stride); -void mpeg2_idct_add_alpha (int last, int16_t * block, - uint8_t * dest, int stride); +void mpeg2_idct_copy_mvi ( int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_add_mvi (int last, int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_copy_alpha ( int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_add_alpha (int last, int16_t * block, uint8_t * dest, int stride); void mpeg2_idct_alpha_init(int no_mvi); +#ifdef ARCH_ARM +void mpeg2_idct_copy_arm ( int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_add_arm (int last, int16_t * block, uint8_t * dest, int stride); +void mpeg2_idct_fill_arm (int dct,uint8_t * dest,const int stride,const int addFlag); +#endif + +void mpeg2_idct_fill_default(int dct,uint8_t * dest,const int stride,const int addFlag); + /* motion_comp.c */ void mpeg2_mc_init (uint32_t accel); diff -Naru -x .deps mpeg2dec-0.3.1.prev/libmpeg2/slice.c mpeg2dec-0.3.1/libmpeg2/slice.c --- mpeg2dec-0.3.1.prev/libmpeg2/slice.c 
2002-11-29 03:47:44.000000000 -0700 +++ mpeg2dec-0.3.1/libmpeg2/slice.c 2003-05-10 06:09:54.000000000 -0700 @@ -31,8 +31,8 @@ extern mpeg2_mc_t mpeg2_mc; extern void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride); -extern void (* mpeg2_idct_add) (int last, int16_t * block, - uint8_t * dest, int stride); +extern void (* mpeg2_idct_add) (int last, int16_t * block, uint8_t * dest, int stride); +extern void (* mpeg2_idct_fill) (int dct, uint8_t * dest, int stride,int addFlag); extern void (* mpeg2_cpu_state_save) (cpu_state_t * state); extern void (* mpeg2_cpu_state_restore) (cpu_state_t * state); @@ -138,21 +138,37 @@ #undef bit_ptr } -static inline int get_quantizer_scale (decoder_t * const decoder) +static inline void set_quantizer_scale (decoder_t * const decoder) { #define bit_buf (decoder->bitstream_buf) #define bits (decoder->bitstream_bits) #define bit_ptr (decoder->bitstream_ptr) - int quantizer_scale_code; + int val; + int i; - quantizer_scale_code = UBITS (bit_buf, 5); + val = UBITS (bit_buf, 5); DUMPBITS (bit_buf, bits, 5); - if (decoder->q_scale_type) - return non_linear_quantizer_scale [quantizer_scale_code]; - else - return quantizer_scale_code << 1; + if (decoder->q_scale_type) val = non_linear_quantizer_scale [val]; + else val = val << 1; + + if (decoder->quantizer_scale != val) { + decoder->quantizer_scale = val; +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE + if (decoder->scale_value[0] == val) i = 0; + else if (decoder->scale_value[1] == val) i = 1; + else { + if (decoder->intra_scaled == decoder->scaled[INTRA_SCALED_BASE]) i = 1; //use buffer that was idle before call + else i=0; + decoder->scale_value[i] = val; + UPDATE_QUANTIZER_SCALED(decoder->scaled[INTRA_SCALED_BASE+i], decoder->intra_quantizer_matrix, val); + UPDATE_QUANTIZER_SCALED(decoder->scaled[NON_INTRA_SCALED_BASE+i],decoder->non_intra_quantizer_matrix,val); + } + decoder->intra_scaled = decoder->scaled[INTRA_SCALED_BASE+i]; + decoder->non_intra_scaled = 
decoder->scaled[NON_INTRA_SCALED_BASE+i]; +#endif + } #undef bit_buf #undef bits #undef bit_ptr @@ -355,14 +371,53 @@ val = SBITS (val, 1) ^ 2047; \ } while (0) -static void get_intra_block_B14 (decoder_t * const decoder) + +#ifdef COMBINE_QUANTIZER_MATRIX_QUANTIZER_SCALE +#define CE00_SHIFT (30-27) +#define INDEX_CE00 0 //scan[0] +#define INDEX_CE77 (6*8+6) //scan[63] +#define __smull27(a,b) \ +({ register int __rTempLo,__rTempHi; register int __val=a; \ + __asm__ ("%@ Inlined smull \n\ + smull %1,%2,%3,%0 \n\ + mov %0,%2,LSL # (32-27) \n\ + add %0,%0,%1,LSR # (27)" \ + : "=&r" (__val), \ + "=&r" (__rTempLo), "=&r" (__rTempHi) \ + : "r" (b), "0" (__val) ); \ + __val;}) + +#define QMI(name, decoder) const uint16_t * name = decoder->intra_scaled; \ + const int* qscale = decoder->qscale_scan_order +#define QMNI(name, decoder) const uint16_t * name = decoder->non_intra_scaled; \ + const int* qscale = decoder->qscale_scan_order +#define QMULT(a,c) ((a)*(c)) +#define QCOMBINE(src,index) __smull27(src,qscale[index]) //doing the multiply early avoids multiplying Zero coefficients + //S&11bit# * S&31bit# = S&42bit#, >>27 = S&15bit# +#define QCOMBINE0(dest) dest[INDEX_CE00] <<= CE00_SHIFT + +#else +#define CE00_SHIFT 0 +#define INDEX_CE00 0 +#define INDEX_CE77 63 +#define QMI(name, decoder) const uint8_t * name = decoder->intra_quantizer_matrix; \ + int quantizer_scale = decoder->quantizer_scale +#define QMNI(name, decoder) const uint8_t * name = decoder->non_intra_quantizer_matrix; \ + int quantizer_scale = decoder->quantizer_scale +#define QMULT(a,c) ((a)*(quantizer_scale)*(c)) +#define QCOMBINE(src,index) src +#define QCOMBINE0(dest) +#endif + + + +static int get_intra_block_B14 (decoder_t * const decoder) { int i; int j; - int val; + int val=0; const uint8_t * scan = decoder->scan; - const uint8_t * quant_matrix = decoder->intra_quantizer_matrix; - int quantizer_scale = decoder->quantizer_scale; + QMI(quant_matrix, decoder); int mismatch; const DCTtab * tab; uint32_t 
bit_buf; @@ -372,8 +427,8 @@ dest = decoder->DCTblock; i = 0; - mismatch = ~dest[0]; - + mismatch = ~dest[INDEX_CE00]; + QCOMBINE0(dest); bit_buf = decoder->bitstream_buf; bits = decoder->bitstream_bits; bit_ptr = decoder->bitstream_ptr; @@ -393,13 +448,13 @@ j = scan[i]; bit_buf <<= tab->len; bits += tab->len + 1; - val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4; + val = QMULT(tab->level, quant_matrix[j]) >> 4; /* if (bitstream_get (1)) val = -val; */ val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1); SATURATE (val); - dest[j] = val; + dest[j] = QCOMBINE(val, i); mismatch ^= val; bit_buf <<= 1; @@ -425,11 +480,10 @@ DUMPBITS (bit_buf, bits, 12); NEEDBITS (bit_buf, bits, bit_ptr); - val = (SBITS (bit_buf, 12) * - quantizer_scale * quant_matrix[j]) / 16; + val = QMULT(SBITS (bit_buf, 12), quant_matrix[j]) / 16; SATURATE (val); - dest[j] = val; + dest[j] = QCOMBINE(val, i); mismatch ^= val; DUMPBITS (bit_buf, bits, 12); @@ -462,21 +516,22 @@ } break; /* illegal, check needed to avoid buffer overflow */ } - dest[63] ^= mismatch & 1; + if (mismatch & 1) dest[INDEX_CE77] = QCOMBINE((dest[INDEX_CE77]) ? 
val^1 : 1,63); +// dest[INDEX_CE77] ^= mismatch & 1; DUMPBITS (bit_buf, bits, 2); /* dump end of block code */ decoder->bitstream_buf = bit_buf; decoder->bitstream_bits = bits; decoder->bitstream_ptr = bit_ptr; + return i; } -static void get_intra_block_B15 (decoder_t * const decoder) +static int get_intra_block_B15 (decoder_t * const decoder) { int i; int j; - int val; + int val=0; const uint8_t * scan = decoder->scan; - const uint8_t * quant_matrix = decoder->intra_quantizer_matrix; - int quantizer_scale = decoder->quantizer_scale; + QMI(quant_matrix, decoder); int mismatch; const DCTtab * tab; uint32_t bit_buf; @@ -486,7 +541,8 @@ dest = decoder->DCTblock; i = 0; - mismatch = ~dest[0]; + mismatch = ~dest[INDEX_CE00]; + QCOMBINE0(dest); bit_buf = decoder->bitstream_buf; bits = decoder->bitstream_bits; @@ -506,13 +562,13 @@ j = scan[i]; bit_buf <<= tab->len; bits += tab->len + 1; - val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4; + val = QMULT(tab->level, quant_matrix[j]) >> 4; /* if (bitstream_get (1)) val = -val; */ val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1); SATURATE (val); - dest[j] = val; + dest[j] = QCOMBINE(val, i); mismatch ^= val; bit_buf <<= 1; @@ -537,11 +593,10 @@ DUMPBITS (bit_buf, bits, 12); NEEDBITS (bit_buf, bits, bit_ptr); - val = (SBITS (bit_buf, 12) * - quantizer_scale * quant_matrix[j]) / 16; + val = QMULT(SBITS (bit_buf, 12), quant_matrix[j]) / 16; SATURATE (val); - dest[j] = val; + dest[j] = QCOMBINE(val, i); mismatch ^= val; DUMPBITS (bit_buf, bits, 12); @@ -575,21 +630,22 @@ } break; /* illegal, check needed to avoid buffer overflow */ } - dest[63] ^= mismatch & 1; + if (mismatch & 1) dest[INDEX_CE77] = QCOMBINE((dest[INDEX_CE77]) ? 
val^1 : 1,63); +// dest[INDEX_CE77] ^= mismatch & 1; DUMPBITS (bit_buf, bits, 4); /* dump end of block code */ decoder->bitstream_buf = bit_buf; decoder->bitstream_bits = bits; decoder->bitstream_ptr = bit_ptr; + return i; } static int get_non_intra_block (decoder_t * const decoder) { int i; int j; - int val; + int val=0; const uint8_t * scan = decoder->scan; - const uint8_t * quant_matrix = decoder->non_intra_quantizer_matrix; - int quantizer_scale = decoder->quantizer_scale; + QMNI(quant_matrix, decoder); int mismatch; const DCTtab * tab; uint32_t bit_buf; @@ -626,13 +682,13 @@ j = scan[i]; bit_buf <<= tab->len; bits += tab->len + 1; - val = ((2*tab->level+1) * quantizer_scale * quant_matrix[j]) >> 5; + val = QMULT((2*tab->level+1), quant_matrix[j]) >> 5; /* if (bitstream_get (1)) val = -val; */ val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1); SATURATE (val); - dest[j] = val; + dest[j] = QCOMBINE(val, i); mismatch ^= val; bit_buf <<= 1; @@ -662,10 +718,10 @@ DUMPBITS (bit_buf, bits, 12); NEEDBITS (bit_buf, bits, bit_ptr); val = 2 * (SBITS (bit_buf, 12) + SBITS (bit_buf, 1)) + 1; - val = (val * quantizer_scale * quant_matrix[j]) / 32; + val = QMULT(val, quant_matrix[j]) / 32; SATURATE (val); - dest[j] = val; + dest[j] = QCOMBINE(val, i); mismatch ^= val; DUMPBITS (bit_buf, bits, 12); @@ -698,7 +754,8 @@ } break; /* illegal, check needed to avoid buffer overflow */ } - dest[63] ^= mismatch & 1; + if (mismatch & 1) dest[INDEX_CE77] = QCOMBINE((dest[INDEX_CE77]) ? 
val^1 : 1,63); +// dest[INDEX_CE77] ^= mismatch & 1; DUMPBITS (bit_buf, bits, 2); /* dump end of block code */ decoder->bitstream_buf = bit_buf; decoder->bitstream_bits = bits; @@ -706,14 +763,13 @@ return i; } -static void get_mpeg1_intra_block (decoder_t * const decoder) +static int get_mpeg1_intra_block (decoder_t * const decoder) { int i; int j; int val; const uint8_t * scan = decoder->scan; - const uint8_t * quant_matrix = decoder->intra_quantizer_matrix; - int quantizer_scale = decoder->quantizer_scale; + QMI(quant_matrix, decoder); const DCTtab * tab; uint32_t bit_buf; int bits; @@ -722,6 +778,7 @@ i = 0; dest = decoder->DCTblock; + QCOMBINE0(dest); bit_buf = decoder->bitstream_buf; bits = decoder->bitstream_bits; @@ -742,7 +799,7 @@ j = scan[i]; bit_buf <<= tab->len; bits += tab->len + 1; - val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4; + val = QMULT(tab->level, quant_matrix[j]) >> 4; /* oddification */ val = (val - 1) | 1; @@ -751,7 +808,7 @@ val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1); SATURATE (val); - dest[j] = val; + dest[j] = QCOMBINE(val, i); bit_buf <<= 1; NEEDBITS (bit_buf, bits, bit_ptr); @@ -781,13 +838,13 @@ DUMPBITS (bit_buf, bits, 8); val = UBITS (bit_buf, 8) + 2 * val; } - val = (val * quantizer_scale * quant_matrix[j]) / 16; + val = QMULT(val, quant_matrix[j]) / 16; /* oddification */ val = (val + ~SBITS (val, 1)) | 1; SATURATE (val); - dest[j] = val; + dest[j] = QCOMBINE(val, i); DUMPBITS (bit_buf, bits, 8); NEEDBITS (bit_buf, bits, bit_ptr); @@ -823,6 +880,7 @@ decoder->bitstream_buf = bit_buf; decoder->bitstream_bits = bits; decoder->bitstream_ptr = bit_ptr; + return i; } static int get_mpeg1_non_intra_block (decoder_t * const decoder) @@ -831,8 +889,7 @@ int j; int val; const uint8_t * scan = decoder->scan; - const uint8_t * quant_matrix = decoder->non_intra_quantizer_matrix; - int quantizer_scale = decoder->quantizer_scale; + QMNI(quant_matrix, decoder); const DCTtab * tab; uint32_t bit_buf; int bits; @@ -867,7 
+924,7 @@ j = scan[i]; bit_buf <<= tab->len; bits += tab->len + 1; - val = ((2*tab->level+1) * quantizer_scale * quant_matrix[j]) >> 5; + val = QMULT((2*tab->level+1), quant_matrix[j]) >> 5; /* oddification */ val = (val - 1) | 1; @@ -876,7 +933,7 @@ val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1); SATURATE (val); - dest[j] = val; + dest[j] = QCOMBINE(val, i); bit_buf <<= 1; NEEDBITS (bit_buf, bits, bit_ptr); @@ -910,13 +967,13 @@ val = UBITS (bit_buf, 8) + 2 * val; } val = 2 * (val + SBITS (val, 1)) + 1; - val = (val * quantizer_scale * quant_matrix[j]) / 32; + val = QMULT(val, quant_matrix[j]) / 32; /* oddification */ val = (val + ~SBITS (val, 1)) | 1; SATURATE (val); - dest[j] = val; + dest[j] = QCOMBINE(val, i); DUMPBITS (bit_buf, bits, 8); NEEDBITS (bit_buf, bits, bit_ptr); @@ -955,9 +1012,28 @@ return i; } +#if 1 + +#if 1 +void GetIDCT_ARM(int16_t * srcDCT,uint8_t * dest,const int stride,const int addFlag); +#define FILLIDCT mpeg2_idct_fill_arm //this is used when only the 1st DCT is non-zero +#define GETIDCT_COPY(a,b,c) GetIDCT_ARM(a,b,c,0) +#else +void GetIDCT(int16_t * srcDCT,uint8_t * dest,const int stride,const int addFlag); +#define FILLIDCT mpeg2_idct_fill_default //this is used when only the 1st DCT is non-zero +#define GETIDCT_COPY(a,b,c) GetIDCT(a,b,c,0) +#endif + +#else +#define FILLIDCT mpeg2_idct_fill //this is used when only the 1st DCT is non-zero +#define GETIDCT_COPY(a,b,c) mpeg2_idct_copy(a,b,c) +#endif + static inline void slice_intra_DCT (decoder_t * const decoder, const int cc, uint8_t * const dest, const int stride) { + int last; + int16_t * DCTblock = decoder->DCTblock; #define bit_buf (decoder->bitstream_buf) #define bits (decoder->bitstream_bits) #define bit_ptr (decoder->bitstream_ptr) @@ -967,17 +1043,29 @@ decoder->dc_dct_pred[0] += get_luma_dc_dct_diff (decoder); else decoder->dc_dct_pred[cc] += get_chroma_dc_dct_diff (decoder); - decoder->DCTblock[0] = - decoder->dc_dct_pred[cc] << (3 - decoder->intra_dc_precision); + + 
DCTblock[INDEX_CE00] = decoder->dc_dct_pred[cc] << (3 - decoder->intra_dc_precision); if (decoder->mpeg1) { if (decoder->coding_type != D_TYPE) - get_mpeg1_intra_block (decoder); + last = get_mpeg1_intra_block (decoder); + else { + FILLIDCT (DCTblock[INDEX_CE00], dest, stride,0); + DCTblock[INDEX_CE00] = 0; + return; + } } else if (decoder->intra_vlc_format) - get_intra_block_B15 (decoder); + last = get_intra_block_B15 (decoder); else - get_intra_block_B14 (decoder); - mpeg2_idct_copy (decoder->DCTblock, dest, stride); + last = get_intra_block_B14 (decoder); + + if ((last > 129) || (DCTblock[INDEX_CE77] && ((DCTblock[INDEX_CE00] & (7<>CE00_SHIFT,dest,stride,0); //undo shift, this is used when only the 1st DCT is non-zero + DCTblock[INDEX_CE00] = DCTblock[INDEX_CE77] = 0; //the next 62 are already zero, the last may be non-zero if oddified + } #undef bit_buf #undef bits #undef bit_ptr @@ -1525,7 +1613,7 @@ decoder->dest[1] = decoder->picture_dest[1] + offset; decoder->dest[2] = decoder->picture_dest[2] + offset; - decoder->quantizer_scale = get_quantizer_scale (decoder); + set_quantizer_scale (decoder); /* ignore intra_slice and all the extra data */ while (bit_buf & 0x80000000) { @@ -1605,7 +1693,7 @@ /* maybe integrate MACROBLOCK_QUANT test into get_macroblock_modes ? */ if (macroblock_modes & MACROBLOCK_QUANT) - decoder->quantizer_scale = get_quantizer_scale (decoder); + set_quantizer_scale (decoder); if (macroblock_modes & MACROBLOCK_INTRA) { diff -Naru -x .deps mpeg2dec-0.3.1.prev/libvo/yuv2rgb.c mpeg2dec-0.3.1/libvo/yuv2rgb.c --- mpeg2dec-0.3.1.prev/libvo/yuv2rgb.c 2002-12-13 17:41:23.000000000 -0700 +++ mpeg2dec-0.3.1/libvo/yuv2rgb.c 2003-04-21 02:35:06.000000000 -0700 @@ -329,7 +329,7 @@ for (i = -232; i < 256+232; i++) { int j = table_Y[i+384] >> 3; - if (order == CONVERT_RGB) + if (order == CONVERT_BGR) j <<= ((bpp==16) ? 11 : 10); ((uint16_t *)table_b)[i] = j;