Index: lib/cpu.c
===================================================================
--- lib/cpu.c	(revision 14319)
+++ lib/cpu.c	(working copy)
@@ -9,25 +9,57 @@
  * by the Xiph.Org Foundation http://www.xiph.org/                  *
  *                                                                  *
  ********************************************************************
-
+  CPU capability detection for x86 processors.
+  Originally written by Rudolf Marek.
+
  function:
  last mod: $Id$
 ********************************************************************/
-
 #include "cpu.h"
 
+#if !defined(USE_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+#else
+#if defined(_MSC_VER)
+
+/* Visual C cpuid helper function. For VS2005 we could
+   as well use the __cpuid intrinsic, but that wouldn't work
+   for VS2003 users, so we do it in inline assembler */
+
+static void oc_cpuid_helper (ogg_uint32_t * CpuInfo, ogg_uint32_t op){
+  _asm {
+    mov eax, [op]
+    mov esi, CpuInfo
+    cpuid
+    mov [esi + 0], eax
+    mov [esi + 4], ebx
+    mov [esi + 8], ecx
+    mov [esi +12], edx
+  }
+}
+
+#define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  { \
+    ogg_uint32_t nfo[4]; \
+    oc_cpuid_helper (nfo, (_op)); \
+    (_eax) = nfo[0],(_ebx) = nfo[1]; \
+    (_ecx) = nfo[2],(_edx) = nfo[3]; \
+  }
+#endif
+
+
 ogg_uint32_t oc_cpu_flags_get(void){
   ogg_uint32_t flags = 0;
-#if defined(USE_ASM)
   ogg_uint32_t eax;
   ogg_uint32_t ebx;
   ogg_uint32_t ecx;
   ogg_uint32_t edx;
+#if !defined (_MSC_VER)
 #if (defined(__amd64__) || defined(__x86_64__))
 # define cpuid(_op,_eax,_ebx,_ecx,_edx) \
   __asm__ __volatile__( \
@@ -75,6 +107,7 @@
   /*No cpuid.*/
   if(eax==ebx)return 0;
 #endif
+#endif
   cpuid(0,eax,ebx,ecx,edx);
   if(ebx==0x756e6547&&edx==0x49656e69&&ecx==0x6c65746e){
     /*Intel:*/
@@ -102,7 +135,7 @@
     /*Implement me.*/
     flags=0;
   }
-
+
 #ifdef DEBUG
   if (flags) {
     TH_DEBUG("vectorized instruction sets supported:");
@@ -115,8 +148,8 @@
     TH_DEBUG("\n");
   }
 #endif
-#endif
-
   return flags;
 }
+#endif
+
Index: lib/dec/state.c
===================================================================
--- lib/dec/state.c	(revision 14319)
+++ lib/dec/state.c	(working copy)
@@ -20,8 +20,12 @@
 #include "../internal.h"
 #include "idct.h"
 #if defined(USE_ASM)
+#if defined(_MSC_VER)
+# include "x86_vc/x86int.h"
+#else
 # include "x86/x86int.h"
 #endif
+#endif
 #if defined(OC_DUMP_IMAGES)
 # include <stdio.h>
 # include "png.h"
Index: lib/dec/x86_vc/mmxfrag.c
===================================================================
--- lib/dec/x86_vc/mmxfrag.c	(revision 0)
+++ lib/dec/x86_vc/mmxfrag.c	(revision 0)
@@ -0,0 +1,215 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id:
+
+ ********************************************************************/
+#include "../../internal.h"
+
+/* ------------------------------------------------------------------------
+   MMX reconstruction fragment routines for Visual Studio.
+   Tested with VS2005. Should compile for VS2003 and VC6 as well.
+
+   Initial implementation 2007 by Nils Pipenbrinck.
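+
+   For orientation, a scalar model of the intra path below (illustrative
+   only, not part of the original routine):
+
+     dst[i] = clamp_255(residue[i] + 128);    /* for each of 64 pixels */
+
+   The replicated 0x0080 words in mm0 supply the +128 bias four pixels
+   at a time, and packuswb provides the unsigned clamp to 0..255.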
+   ---------------------------------------------------------------------*/
+
+#if defined(USE_ASM)
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue){
+  /* ---------------------------------------------------------------------
+     This function does the intra reconstruction step with 8 iterations
+     unrolled. The iteration for each instruction is noted by the #id in
+     the comments (in case you want to reconstruct it)
+     --------------------------------------------------------------------- */
+  _asm{
+    mov edi, [_residue]     /* load residue ptr */
+    mov eax, 0x00800080     /* generate constant */
+    mov ebx, [_dst_ystride] /* load dst-stride */
+    mov edx, [_dst]         /* load dest pointer */
+
+    /* unrolled loop begins here */
+
+    movd mm0, eax           /* load constant */
+    movq mm1, [edi+ 8*0]    /* #1 load low residue */
+    movq mm2, [edi+ 8*1]    /* #1 load high residue */
+    punpckldq mm0, mm0      /* build constant */
+    movq mm3, [edi+ 8*2]    /* #2 load low residue */
+    movq mm4, [edi+ 8*3]    /* #2 load high residue */
+    movq mm5, [edi+ 8*4]    /* #3 load low residue */
+    movq mm6, [edi+ 8*5]    /* #3 load high residue */
+    paddsw mm1, mm0         /* #1 bias low residue */
+    paddsw mm2, mm0         /* #1 bias high residue */
+    packuswb mm1, mm2       /* #1 pack to byte */
+    paddsw mm3, mm0         /* #2 bias low residue */
+    paddsw mm4, mm0         /* #2 bias high residue */
+    packuswb mm3, mm4       /* #2 pack to byte */
+    paddsw mm5, mm0         /* #3 bias low residue */
+    paddsw mm6, mm0         /* #3 bias high residue */
+    packuswb mm5, mm6       /* #3 pack to byte */
+    movq [edx], mm1         /* #1 write row */
+    movq [edx + ebx], mm3   /* #2 write row */
+    movq [edx + ebx*2], mm5 /* #3 write row */
+    movq mm1, [edi+ 8*6]    /* #4 load low residue */
+    lea ecx, [ebx + ebx*2]  /* make dst_ystride * 3 */
+    movq mm2, [edi+ 8*7]    /* #4 load high residue */
+    movq mm3, [edi+ 8*8]    /* #5 load low residue */
+    lea esi, [ebx*4 + ebx]  /* make dst_ystride * 5 */
+    movq mm4, [edi+ 8*9]    /* #5 load high residue */
+    movq mm5, [edi+ 8*10]   /* #6 load low residue */
+    lea eax, [ecx*2 + ebx]  /* make dst_ystride * 7 */
+    movq mm6, [edi+ 8*11]   /* #6 load high residue */
+    paddsw mm1, mm0         /* #4 bias low residue */
+    paddsw mm2, mm0         /* #4 bias high residue */
+    packuswb mm1, mm2       /* #4 pack to byte */
+    paddsw mm3, mm0         /* #5 bias low residue */
+    paddsw mm4, mm0         /* #5 bias high residue */
+    packuswb mm3, mm4       /* #5 pack to byte */
+    paddsw mm5, mm0         /* #6 bias low residue */
+    paddsw mm6, mm0         /* #6 bias high residue */
+    packuswb mm5, mm6       /* #6 pack to byte */
+    movq [edx + ecx], mm1   /* #4 write row */
+    movq [edx + ebx*4], mm3 /* #5 write row */
+    movq [edx + esi], mm5   /* #6 write row */
+    movq mm1, [edi+ 8*12]   /* #7 load low residue */
+    movq mm2, [edi+ 8*13]   /* #7 load high residue */
+    movq mm3, [edi+ 8*14]   /* #8 load low residue */
+    movq mm4, [edi+ 8*15]   /* #8 load high residue */
+    paddsw mm1, mm0         /* #7 bias low residue */
+    paddsw mm2, mm0         /* #7 bias high residue */
+    packuswb mm1, mm2       /* #7 pack to byte */
+    paddsw mm3, mm0         /* #8 bias low residue */
+    paddsw mm4, mm0         /* #8 bias high residue */
+    packuswb mm3, mm4       /* #8 pack to byte */
+    movq [edx + ecx*2], mm1 /* #7 write row */
+    movq [edx + eax], mm3   /* #8 write row */
+  }
+}
+
+
+
+void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride,
+ const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){
+  /* ---------------------------------------------------------------------
+     This function does the inter reconstruction step with two iterations
+     running in parallel to hide some load-latencies and break the dependency
+     chains. The iteration for each instruction is noted by the #id in the
+     comments (in case you want to reconstruct it)
+     --------------------------------------------------------------------- */
+  _asm{
+    pxor mm0, mm0           /* generate constant 0 */
+    mov esi, [_src]
+    mov edi, [_residue]
+    mov eax, [_src_ystride]
+    mov edx, [_dst]
+    mov ebx, [_dst_ystride]
+    mov ecx, 4
+
+    align 16
+
+nextchunk:
+    movq mm3, [esi]         /* #1 load source */
+    movq mm1, [edi+0]       /* #1 load residue low */
+    movq mm2, [edi+8]       /* #1 load residue high */
+    movq mm7, [esi+eax]     /* #2 load source */
+    movq mm4, mm3           /* #1 get copy of src */
+    movq mm5, [edi+16]      /* #2 load residue low */
+    punpckhbw mm4, mm0      /* #1 expand high source */
+    movq mm6, [edi+24]      /* #2 load residue high */
+    punpcklbw mm3, mm0      /* #1 expand low source */
+    paddsw mm4, mm2         /* #1 add residue high */
+    movq mm2, mm7           /* #2 get copy of src */
+    paddsw mm3, mm1         /* #1 add residue low */
+    punpckhbw mm2, mm0      /* #2 expand high source */
+    packuswb mm3, mm4       /* #1 final row pixels */
+    punpcklbw mm7, mm0      /* #2 expand low source */
+    movq [edx], mm3         /* #1 write row */
+    paddsw mm2, mm6         /* #2 add residue high */
+    add edi, 32             /* residue += 16 words (two rows) */
+    paddsw mm7, mm5         /* #2 add residue low */
+    sub ecx, 1              /* update loop counter */
+    packuswb mm7, mm2       /* #2 final row */
+    lea esi, [esi+eax*2]    /* src += stride * 2 */
+    movq [edx + ebx], mm7   /* #2 write row */
+    lea edx, [edx+ebx*2]    /* dst += stride * 2 */
+    jne nextchunk
+  }
+}
+
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst, int _dst_ystride,
+ const unsigned char *_src1, int _src1_ystride, const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue){
+  /* ---------------------------------------------------------------------
+     This function does the inter2 reconstruction step. The building of the
+     average is done with a bit-twiddling trick to avoid excessive register
+     copy work during byte to word conversion.
+
+       average = (a & b) + (((a ^ b) & 0xfe) >> 1);
+
+     (shown for a single byte; it's done with 8 of them at a time)
+
+     Slightly faster than the obvious method using add and shift, but not
+     an earth-shaking improvement either.
+
+     If anyone comes up with a way that produces bit-identical outputs
+     using the pavgb instruction let me know and I'll do the 3dnow codepath.
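+
+     Why the trick is exact (illustrative aside, not part of the original
+     comment): a + b == (a ^ b) + 2*(a & b), so (a + b) >> 1 equals
+     (a & b) + ((a ^ b) >> 1). MMX has no per-byte shift, so psrlq shifts
+     all 8 bytes at once; masking the xor with 0xfe first clears each
+     byte's low bit so nothing bleeds into the neighboring byte.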
+     --------------------------------------------------------------------- */
+  _asm{
+    mov eax, 0xfefefefe
+    mov esi, [_src1]
+    mov edi, [_src2]
+    movd mm1, eax
+    mov ebx, [_residue]
+    mov edx, [_dst]
+    mov eax, [_dst_ystride]
+    punpckldq mm1, mm1      /* replicate lsb32 */
+    mov ecx, 8              /* init loop counter */
+    pxor mm0, mm0           /* constant zero */
+    sub edx, eax            /* dst -= dst_stride */
+
+    align 16
+
+nextrow:
+    movq mm2, [esi]         /* load source1 */
+    movq mm3, [edi]         /* load source2 */
+    movq mm5, [ebx + 0]     /* load lower residue */
+    movq mm6, [ebx + 8]     /* load higher residue */
+    add esi, _src1_ystride  /* src1 += src1_stride */
+    add edi, _src2_ystride  /* src2 += src2_stride */
+    movq mm4, mm2           /* get copy of source1 */
+    pand mm2, mm3           /* s1 & s2 (avg part) */
+    pxor mm3, mm4           /* s1 ^ s2 (avg part) */
+    add ebx, 16             /* residue += 8 words (one row) */
+    pand mm3, mm1           /* mask out low bits */
+    psrlq mm3, 1            /* shift xor avg-part */
+    paddd mm3, mm2          /* build final average */
+    add edx, eax            /* dst += dst_stride */
+    movq mm2, mm3           /* get copy of average */
+    punpckhbw mm3, mm0      /* average high */
+    punpcklbw mm2, mm0      /* average low */
+    paddsw mm3, mm6         /* high + residue */
+    paddsw mm2, mm5         /* low + residue */
+    sub ecx, 1              /* update loop counter */
+    packuswb mm2, mm3       /* pack and saturate */
+    movq [edx], mm2         /* write row */
+    jne nextrow
+  }
+}
+
+void oc_restore_fpu_mmx(void){
+  _asm { emms }
+}
+
+#endif
+
Index: lib/dec/x86_vc/mmxloopfilter.c
===================================================================
--- lib/dec/x86_vc/mmxloopfilter.c	(revision 0)
+++ lib/dec/x86_vc/mmxloopfilter.c	(revision 0)
@@ -0,0 +1,378 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id:
+
+ ********************************************************************/
+
+/* -------------------------------------------------------------------
+   MMX based loop filter for the theora codec.
+
+   Originally written by Rudolf Marek, based on code from On2's VP3.
+   Converted to Visual Studio inline assembly by Nils Pipenbrinck.
+
+   Note: I can't test these since my example files never get into the
+   loop filters, but the code has been converted semi-automatically from
+   the GCC sources, so it ought to work.
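+
+   For orientation, the scalar filter being vectorized here (illustrative
+   only; the C decoder expresses the lflim() clamp via the _bv table):
+
+     f = pix[0] - pix[ystride*3] + 3*(pix[ystride*2] - pix[ystride]);
+     f = lflim((f + 4) >> 3, L);        /* cf. Section 7.10 of the spec */
+     pix[ystride]   = clamp_255(pix[ystride]   + f);
+     pix[ystride*2] = clamp_255(pix[ystride*2] - f);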
+   ---------------------------------------------------------------------*/
+#include "../../internal.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+#if defined(USE_ASM)
+
+
+
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+  _asm {
+    mov eax, [_pix]
+    mov edx, [_ystride]
+    mov ebx, [_ll]
+
+    /* _pix -= ystride */
+    sub eax, edx
+    /* mm0=0 */
+    pxor mm0, mm0
+    /* _pix -= ystride */
+    sub eax, edx
+    /* esi=_ystride*3 */
+    lea esi, [edx + edx*2]
+
+    /* mm7=_pix[0...8]*/
+    movq mm7, [eax]
+    /* mm4=_pix[0...8+_ystride*3]*/
+    movq mm4, [eax + esi]
+    /* mm6=_pix[0...8]*/
+    movq mm6, mm7
+    /* Expand unsigned _pix[0...3] to 16 bits.*/
+    punpcklbw mm6, mm0
+    movq mm5, mm4
+    /* Expand unsigned _pix[4...7] to 16 bits.*/
+    punpckhbw mm7, mm0
+    punpcklbw mm4, mm0
+    /* Expand other arrays too.*/
+    punpckhbw mm5, mm0
+    /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/
+    psubw mm6, mm4
+    psubw mm7, mm5
+    /*mm5=mm4=_pix[0...7+_ystride]*/
+    movq mm4, [eax + edx]
+    /*mm1=mm3=mm2=_pix[0..7]+_ystride*2]*/
+    movq mm2, [eax + edx*2]
+    movq mm5, mm4
+    movq mm3, mm2
+    movq mm1, mm2
+    /*Expand these arrays.*/
+    punpckhbw mm5, mm0
+    punpcklbw mm4, mm0
+    punpckhbw mm3, mm0
+    punpcklbw mm2, mm0
+    pcmpeqw mm0, mm0
+    /*mm0=3 3 3 3
+      mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+    psubw mm3, mm5
+    psrlw mm0, 14
+    psubw mm2, mm4
+    /*Scale by 3.*/
+    pmullw mm3, mm0
+    pmullw mm2, mm0
+    /*mm0=4 4 4 4
+      f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+       3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+    psrlw mm0, 1
+    paddw mm3, mm7
+    psllw mm0, 2
+    paddw mm2, mm6
+    /*Add 4.*/
+    paddw mm3, mm0
+    paddw mm2, mm0
+    /*"Divide" by 8.*/
+    psraw mm3, 3
+    psraw mm2, 3
+    /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
+    /*Free up mm5.*/
+    packuswb mm4, mm5
+    /*mm0=L L L L*/
+    movq mm0, [ebx]
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    movq mm5, mm2
+    pxor mm6, mm6
+    movq mm7, mm0
+    psubw mm6, mm0
+    psllw mm7, 1
+    psllw mm6, 1
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    pcmpgtw mm7, mm2
+    pcmpgtw mm5, mm6
+    pand mm2, mm7
+    movq mm7, mm0
+    pand mm2, mm5
+    psllw mm7, 1
+    movq mm5, mm3
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    pcmpgtw mm7, mm3
+    pcmpgtw mm5, mm6
+    pand mm3, mm7
+    movq mm7, mm0
+    pand mm3, mm5
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    psraw mm6, 1
+    movq mm5, mm2
+    psllw mm7, 1
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm5=R_i>L?FF:00*/
+    pcmpgtw mm5, mm0
+    /*mm6=-L>R_i?FF:00*/
+    pcmpgtw mm6, mm2
+    /*mm7=R_i>L?2L:0*/
+    pand mm7, mm5
+    /*mm2=R_i>L?R_i-2L:R_i*/
+    psubw mm2, mm7
+    movq mm7, mm0
+    /*mm5=-L>R_i||R_i>L*/
+    por mm5, mm6
+    psllw mm7, 1
+    /*mm7=-L>R_i?2L:0*/
+    pand mm7, mm6
+    pxor mm6, mm6
+    /*mm2=-L>R_i?R_i+2L:R_i*/
+    paddw mm2, mm7
+    psubw mm6, mm0
+    /*mm5=-L>R_i||R_i>L?-R_i':0*/
+    pand mm5, mm2
+    movq mm7, mm0
+    /*mm2=-L>R_i||R_i>L?0:R_i*/
+    psubw mm2, mm5
+    psllw mm7, 1
+    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+    psubw mm2, mm5
+    movq mm5, mm3
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm6=-L>R_i?FF:00*/
+    pcmpgtw mm6, mm3
+    /*mm5=R_i>L?FF:00*/
+    pcmpgtw mm5, mm0
+    /*mm7=R_i>L?2L:0*/
+    pand mm7, mm5
+    /*mm3=R_i>L?R_i-2L:R_i*/
+    psubw mm3, mm7
+    psllw mm0, 1
+    /*mm5=-L>R_i||R_i>L*/
+    por mm5, mm6
+    /*mm0=-L>R_i?2L:0*/
+    pand mm0, mm6
+    /*mm3=-L>R_i?R_i+2L:R_i*/
+    paddw mm3, mm0
+    /*mm5=-L>R_i||R_i>L?-R_i':0*/
+    pand mm5, mm3
+    /*mm3=-L>R_i||R_i>L?0:R_i*/
+    psubw mm3, mm5
+    /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+    psubw mm3, mm5
+    /*Unfortunately, there's no unsigned byte+signed byte with unsigned
+      saturation opcode, so we have to promote things back to 16 bits.*/
+    pxor mm0, mm0
+    movq mm5, mm4
+    punpcklbw mm4, mm0
+    punpckhbw mm5, mm0
+    movq mm6, mm1
+    punpcklbw mm1, mm0
+    punpckhbw mm6, mm0
+    /*_pix[0...8+_ystride]+=R_i*/
+    paddw mm4, mm2
+    paddw mm5, mm3
+    /*_pix[0...8+_ystride*2]-=R_i*/
+    psubw mm1, mm2
+    psubw mm6, mm3
+    packuswb mm4, mm5
+    packuswb mm1, mm6
+    /*Write it back out.*/
+    movq [eax + edx], mm4
+    movq [eax + edx*2], mm1
+  }
+}
+
+/*This code implements the bulk of loop_filter_h().
+  Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
+  four p0's to one register we must transpose the values in four mmx regs.
+  When half is done we repeat this for the rest.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+ const ogg_int16_t *_ll){
+  /* todo: merge the comments from the GCC sources */
+  _asm {
+    mov ecx, [_pix]
+    mov edx, [_ystride]
+    mov eax, [_ll]
+    /*esi=_ystride*3*/
+    lea esi, [edx + edx*2]
+
+    movd mm0, dword ptr [ecx]
+    movd mm1, dword ptr [ecx + edx]
+    movd mm2, dword ptr [ecx + edx*2]
+    movd mm3, dword ptr [ecx + esi]
+    punpcklbw mm0, mm1
+    punpcklbw mm2, mm3
+    movq mm1, mm0
+    punpckhwd mm0, mm2
+    punpcklwd mm1, mm2
+    pxor mm7, mm7
+    movq mm5, mm1
+    punpcklbw mm1, mm7
+    punpckhbw mm5, mm7
+    movq mm3, mm0
+    punpcklbw mm0, mm7
+    punpckhbw mm3, mm7
+    psubw mm1, mm3
+    movq mm4, mm0
+    pcmpeqw mm2, mm2
+    psubw mm0, mm5
+    psrlw mm2, 14
+    pmullw mm0, mm2
+    psrlw mm2, 1
+    paddw mm0, mm1
+    psllw mm2, 2
+    paddw mm0, mm2
+    psraw mm0, 3
+    movq mm6, qword ptr [eax]
+    movq mm1, mm0
+    pxor mm2, mm2
+    movq mm3, mm6
+    psubw mm2, mm6
+    psllw mm3, 1
+    psllw mm2, 1
+    pcmpgtw mm3, mm0
+    pcmpgtw mm1, mm2
+    pand mm0, mm3
+    pand mm0, mm1
+    psraw mm2, 1
+    movq mm1, mm0
+    movq mm3, mm6
+    pcmpgtw mm2, mm0
+    pcmpgtw mm1, mm6
+    psllw mm3, 1
+    psllw mm6, 1
+    pand mm3, mm1
+    pand mm6, mm2
+    psubw mm0, mm3
+    por mm1, mm2
+    paddw mm0, mm6
+    pand mm1, mm0
+    psubw mm0, mm1
+    psubw mm0, mm1
+    paddw mm5, mm0
+    psubw mm4, mm0
+    packuswb mm5, mm7
+    packuswb mm4, mm7
+    punpcklbw mm5, mm4
+    movd edi, mm5
+    mov word ptr [ecx + 01H], di
+    psrlq mm5, 32
+    shr edi, 16
+    mov word ptr [ecx + edx + 01H], di
+    movd edi, mm5
+    mov word ptr [ecx + edx*2 + 01H], di
+    shr edi, 16
+    mov word ptr [ecx + esi + 01H], di
+  }
+}
+
+static void loop_filter_h(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+  _pix-=2;
+  loop_filter_h4(_pix,_ystride,_ll);
+  loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
+}
+
+
+/*We copy the whole function because the MMX routines will be inlined 4 times,
+  and we can do just a single emms call at the end this way.
+  We also do not use the _bv lookup table, instead computing the values that
+  would lie in it on the fly.*/
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row
+  of fragments, so this row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+  ogg_int16_t __declspec(align(8)) ll[4];
+  th_img_plane *iplane;
+  oc_fragment_plane *fplane;
+  oc_fragment *frag_top;
+  oc_fragment *frag0;
+  oc_fragment *frag;
+  oc_fragment *frag_end;
+  oc_fragment *frag0_end;
+  oc_fragment *frag_bot;
+  ll[0]=ll[1]=ll[2]=ll[3]=
+   (ogg_int16_t)_state->loop_filter_limits[_state->qis[0]];
+  iplane=_state->ref_frame_bufs[_refi]+_pli;
+  fplane=_state->fplanes+_pli;
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+    it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+    the somewhat strange ordering used below.*/
+  frag_top=_state->frags+fplane->froffset;
+  frag0=frag_top+_fragy0*fplane->nhfrags;
+  frag0_end=frag0+(_fragy_end-_fragy0)*fplane->nhfrags;
+  frag_bot=_state->frags+fplane->froffset+fplane->nfrags;
+  while(frag0<frag0_end){
+    frag=frag0;
+    frag_end=frag0+fplane->nhfrags;
+    while(frag<frag_end){
+      if(frag->coded){
+        if(frag>frag0){
+          loop_filter_h(frag->buffer[_refi],iplane->ystride,ll);
+        }
+        if(frag0>frag_top){
+          loop_filter_v(frag->buffer[_refi],iplane->ystride,ll);
+        }
+        if(frag+1<frag_end&&!(frag+1)->coded){
+          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll);
+        }
+        if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
+          loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
+           iplane->ystride,ll);
+        }
+      }
+      frag++;
+    }
+    frag0+=fplane->nhfrags;
+  }
+
+  /*This needs to be removed when decode specific functions are implemented:*/
+  _mm_empty();
+}
+
+#endif
+
Index: lib/dec/x86_vc/mmxstate.c
===================================================================
--- lib/dec/x86_vc/mmxstate.c	(revision 0)
+++ lib/dec/x86_vc/mmxstate.c	(revision 0)
@@ -0,0 +1,191 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id:
+
+ ********************************************************************/
+
+/* ------------------------------------------------------------------------
+   MMX acceleration of complete fragment reconstruction algorithm.
+   Originally written by Rudolf Marek.
+
+   Conversion to MSC intrinsics by Nils Pipenbrinck.
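+
+   As a minimal illustration of the intrinsic style used in this file
+   (not part of the original comment), broadcasting a 16-bit value across
+   an __m64 with <mmintrin.h>:
+
+     __m64 p = _m_from_int(v);    /* 00 00 00 0v */
+     p = _m_punpcklwd(p, p);      /* 00 00 0v 0v */
+     p = _m_punpckldq(p, p);      /* 0v 0v 0v 0v */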
+   ---------------------------------------------------------------------*/
+#if defined(USE_ASM)
+
+#include "../../internal.h"
+#include "../idct.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+static const unsigned char OC_FZIG_ZAGMMX[64]=
+{
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63
+};
+
+/* Fill a block with value */
+static __inline void loc_fill_mmx_value (__m64 * _dst, __m64 _value){
+  __m64 t = _value;
+  _dst[0] = t; _dst[1] = t; _dst[2] = t; _dst[3] = t;
+  _dst[4] = t; _dst[5] = t; _dst[6] = t; _dst[7] = t;
+  _dst[8] = t; _dst[9] = t; _dst[10] = t; _dst[11] = t;
+  _dst[12] = t; _dst[13] = t; _dst[14] = t; _dst[15] = t;
+}
+
+/* copy a block of 8 byte elements using different strides */
+static __inline void loc_blockcopy_mmx (unsigned char * _dst, int _dst_ystride,
+ unsigned char * _src, int _src_ystride){
+  __m64 a,b,c,d,e,f,g,h;
+  a = *(__m64*)(_src + 0 * _src_ystride);
+  b = *(__m64*)(_src + 1 * _src_ystride);
+  c = *(__m64*)(_src + 2 * _src_ystride);
+  d = *(__m64*)(_src + 3 * _src_ystride);
+  e = *(__m64*)(_src + 4 * _src_ystride);
+  f = *(__m64*)(_src + 5 * _src_ystride);
+  g = *(__m64*)(_src + 6 * _src_ystride);
+  h = *(__m64*)(_src + 7 * _src_ystride);
+  *(__m64*)(_dst + 0 * _dst_ystride) = a;
+  *(__m64*)(_dst + 1 * _dst_ystride) = b;
+  *(__m64*)(_dst + 2 * _dst_ystride) = c;
+  *(__m64*)(_dst + 3 * _dst_ystride) = d;
+  *(__m64*)(_dst + 4 * _dst_ystride) = e;
+  *(__m64*)(_dst + 5 * _dst_ystride) = f;
+  *(__m64*)(_dst + 6 * _dst_ystride) = g;
+  *(__m64*)(_dst + 7 * _dst_ystride) = h;
+}
+
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+  ogg_int16_t __declspec(align(16)) res_buf[64];
+  int dst_framei;
+  int dst_ystride;
+  int zzi;
+  /*_last_zzi is subtly different from an actual count of the number of
+    coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+    decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+    previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+    the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+    be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then
+    _last_zzi will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+    coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+    but we still process the DC coefficient, which might have a non-zero value
+    due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us
+    to dequantize fewer coefficients and use a smaller transform when the
+    block ends with a long zero run instead of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+    will fool it, but an encoder that generates these really deserves what it
+    gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    __m64 p;
+    /*Why is the iquant product rounded in this case and no others?
+      Who knows.*/
+    p = _m_from_int((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
+    /* broadcast 16 bits into all 4 mmx subregisters */
+    p = _m_punpcklwd (p,p);
+    p = _m_punpckldq (p,p);
+    loc_fill_mmx_value ((__m64 *)res_buf, p);
+  }
+  else{
+    /*Then, fill in the remainder of the coefficients with 0's, and perform
+      the iDCT.*/
+    /*First zero the buffer.*/
+    /*On K7, etc., this could be replaced with movntq and sfence.*/
+    loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64());
+
+    res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
+    /*This is planned to be rewritten in MMX.*/
+    for(zzi=1;zzi<_ncoefs;zzi++)
+    {
+      int ci;
+      ci=OC_FZIG_ZAG[zzi];
+      res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*
+       _ac_iquant[ci]);
+    }
+
+    if(_last_zzi<10){
+      oc_idct8x8_10_mmx(res_buf);
+    }
+    else {
+      oc_idct8x8_mmx(res_buf);
+    }
+  }
+  /*Fill in the target buffer.*/
+  dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  /*For now ystride values in all ref frames assumed to be equal.*/
+  if(_frag->mbmode==OC_MODE_INTRA){
+    oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf);
+  }
+  else{
+    int ref_framei;
+    int ref_ystride;
+    int mvoffset0;
+    int mvoffset1;
+    ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
+    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
+    if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
+     _frag->mv[1],ref_ystride,_pli)>1){
+      oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
+       _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
+    }
+    else{
+      oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
+    }
+  }
+
+  _mm_empty();
+}
+
+
+void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli){
+  const int *fragi;
+  const int *fragi_end;
+  int dst_framei;
+  int dst_ystride;
+  int src_framei;
+  int src_ystride;
+  dst_framei=_state->ref_frame_idx[_dst_frame];
+  src_framei=_state->ref_frame_idx[_src_frame];
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
+  fragi_end=_fragis+_nfragis;
+  for(fragi=_fragis;fragi<fragi_end;fragi++){
+    oc_fragment *frag;
+    frag=_state->frags+*fragi;
+    loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride,
+     frag->buffer[src_framei], src_ystride);
+  }
+  _m_empty();
+}
+
+#endif
+
Index: lib/dec/x86_vc/mmxidct.c
===================================================================
--- lib/dec/x86_vc/mmxidct.c	(revision 0)
+++ lib/dec/x86_vc/mmxidct.c	(revision 0)
@@ -0,0 +1,1007 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id:
+
+ ********************************************************************/
+
+/* -------------------------------------------------------------------
+   MMX based IDCT for the theora codec.
+
+   Originally written by Rudolf Marek, based on code from On2's VP3.
+   Converted to Visual Studio inline assembly by Nils Pipenbrinck.
+
+   ---------------------------------------------------------------------*/
+#if defined(USE_ASM)
+
+#include <mmintrin.h>
+#include "../dct.h"
+#include "../idct.h"
+#include "x86int.h"
+
+/*A table of constants used by the MMX routines.*/
+static const __declspec(align(16)) ogg_uint16_t
+ OC_IDCT_CONSTS[(7+1)*4]={
+  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+      8,    8,    8,    8
+};
+
+
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
+  _asm {
+    mov edx, [_y]
+    mov eax, offset OC_IDCT_CONSTS
+    movq mm2, [edx + 30H]
+    movq mm6, [eax + 10H]
+    movq mm4, mm2
+    movq mm7, [edx + 18H]
+    pmulhw mm4, mm6
+    movq mm1, [eax + 20H]
+    pmulhw mm6, mm7
+    movq mm5, mm1
+    pmulhw mm1, mm2
+    movq mm3, [edx + 10H]
+    pmulhw mm5, mm7
+    movq mm0, [eax]
+    paddw mm4, mm2
+    paddw mm6, mm7
+    paddw mm2, mm1
+    movq mm1, [edx + 38H]
+    paddw mm7, mm5
+    movq mm5, mm0
+    pmulhw mm0, mm3
+    paddw mm4, mm7
+    pmulhw mm5, mm1
+    movq mm7, [eax + 30H]
+    psubw mm6, mm2
+    paddw mm0, mm3
+    pmulhw mm3, mm7
+    movq mm2, [edx + 20H]
+    pmulhw mm7, mm1
+    paddw mm5, mm1
+    movq mm1, mm2
+    pmulhw mm2, [eax + 08H]
+    psubw mm3, mm5
+    movq mm5, [edx + 28H]
+    paddw mm0, mm7
+    movq mm7, mm5
+    psubw mm0, mm4
+    pmulhw mm5, [eax + 08H]
+    paddw mm2, mm1
+    pmulhw mm1, [eax + 28H]
+    paddw mm4, mm4
+    paddw mm4, mm0
+    psubw mm3, mm6
+    paddw mm5, mm7
+    paddw mm6, mm6
+    pmulhw mm7, [eax + 28H]
+    paddw mm6, mm3
+    movq [edx + 10H], mm4
+    psubw mm1, mm5
+    movq mm4, [eax + 18H]
+    movq mm5, mm3
+    pmulhw mm3, mm4
+    paddw mm7, mm2
+    movq [edx + 20H], mm6
+    movq mm2, mm0
+    movq mm6, [edx]
+    pmulhw mm0, mm4
+    paddw mm5, mm3
+    movq mm3, [edx + 08H]
+    psubw mm5, mm1
+    paddw mm2, mm0
+    psubw mm6, mm3
+    movq mm0, mm6
+    pmulhw mm6, mm4
+    paddw mm3, mm3
+    paddw mm1, mm1
+    paddw mm3, mm0
+    paddw mm1, mm5
+    pmulhw mm4, mm3
+    paddw mm6, mm0
+    psubw mm6, mm2
+    paddw mm2, mm2
+    movq mm0, [edx + 10H]
+    paddw mm2, mm6
+    paddw mm4, mm3
+    psubw mm2, mm1
+    movq mm3, [edx + 20H]
+    psubw mm4, mm7
+    paddw mm1, mm1
+    paddw mm7, mm7
+    paddw mm1, mm2
+    paddw mm7, mm4
+    psubw mm4, mm3
+    paddw mm3, mm3
+    psubw mm6, mm5
+    paddw mm5, mm5
+    paddw mm3, mm4
+    paddw mm5, mm6
+    psubw mm7, mm0
+    paddw mm0, mm0
+    movq [edx + 10H], mm1
+    paddw mm0, mm7
+    movq mm1, mm4
+    punpcklwd mm4, mm5
+    movq [edx], mm0
+    punpckhwd mm1, mm5
+    movq mm0, mm6
+    punpcklwd mm6, mm7
+    movq mm5, mm4
+    punpckldq mm4, mm6
+    punpckhdq mm5, mm6
+    movq mm6, mm1
+    movq [edx + 08H], mm4
+    punpckhwd mm0, mm7
+
movq [edx + 18H], mm5 + punpckhdq mm6, mm0 + movq mm4, [edx] + punpckldq mm1, mm0 + movq mm5, [edx + 10H] + movq mm0, mm4 + movq [edx + 38H], mm6 + punpcklwd mm0, mm5 + movq [edx + 28H], mm1 + punpckhwd mm4, mm5 + movq mm5, mm2 + punpcklwd mm2, mm3 + movq mm1, mm0 + punpckldq mm0, mm2 + punpckhdq mm1, mm2 + movq mm2, mm4 + movq [edx], mm0 + punpckhwd mm5, mm3 + movq [edx + 10H], mm1 + punpckhdq mm4, mm5 + punpckldq mm2, mm5 + movq [edx + 30H], mm4 + movq [edx + 20H], mm2 + movq mm2, [edx + 70H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 58H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 50H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 78H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 60H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 68H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 50H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 60H], mm6 + movq mm2, mm0 + movq mm6, [edx + 40H] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 48H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 50H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + movq mm3, [edx + 60H] + psubw mm4, mm7 + paddw mm1, mm1 + paddw mm7, mm7 + paddw mm1, mm2 + paddw mm7, mm4 + psubw mm4, mm3 + paddw mm3, mm3 + psubw mm6, mm5 + paddw mm5, mm5 + paddw mm3, mm4 + paddw mm5, mm6 + psubw mm7, mm0 + paddw mm0, mm0 + movq [edx + 50H], mm1 + paddw mm0, mm7 + movq mm1, mm4 + punpcklwd mm4, mm5 + movq [edx + 40H], mm0 + punpckhwd mm1, mm5 + movq mm0, mm6 + punpcklwd mm6, mm7 + movq mm5, mm4 + punpckldq mm4, mm6 + punpckhdq mm5, mm6 + movq mm6, mm1 + movq [edx + 48H], mm4 + punpckhwd mm0, mm7 + movq [edx + 58H], mm5 + punpckhdq mm6, mm0 + movq mm4, [edx + 40H] + punpckldq mm1, mm0 + movq mm5, [edx + 50H] + movq mm0, mm4 + movq [edx + 78H], mm6 + punpcklwd mm0, mm5 + movq [edx + 68H], mm1 + punpckhwd mm4, mm5 + movq mm5, mm2 + punpcklwd mm2, mm3 + movq mm1, mm0 + punpckldq mm0, mm2 + punpckhdq mm1, mm2 + movq mm2, mm4 + movq [edx + 40H], mm0 + punpckhwd mm5, mm3 + movq [edx + 50H], mm1 + punpckhdq mm4, mm5 + punpckldq mm2, mm5 + movq [edx + 70H], mm4 + movq [edx + 60H], mm2 + movq mm2, [edx + 30H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 50H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 10H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 70H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 20H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 60H] + paddw mm0, mm7 + movq mm7, mm5 + 
psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 10H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 20H], mm6 + movq mm2, mm0 + movq mm6, [edx] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 40H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 10H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + paddw mm2, [eax + 38H] + paddw mm1, mm1 + paddw mm1, mm2 + psraw mm2, 4 + psubw mm4, mm7 + psraw mm1, 4 + movq mm3, [edx + 20H] + paddw mm7, mm7 + movq [edx + 20H], mm2 + paddw mm7, mm4 + movq [edx + 10H], mm1 + psubw mm4, mm3 + paddw mm4, [eax + 38H] + paddw mm3, mm3 + paddw mm3, mm4 + psraw mm4, 4 + psubw mm6, mm5 + psraw mm3, 4 + paddw mm6, [eax + 38H] + paddw mm5, mm5 + paddw mm5, mm6 + psraw mm6, 4 + movq [edx + 40H], mm4 + psraw mm5, 4 + movq [edx + 30H], mm3 + psubw mm7, mm0 + paddw mm7, [eax + 38H] + paddw mm0, mm0 + paddw mm0, mm7 + psraw mm7, 4 + movq [edx + 60H], mm6 + psraw mm0, 4 + movq [edx + 50H], mm5 + movq [edx + 70H], mm7 + movq [edx], mm0 + movq mm2, [edx + 38H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 58H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 18H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 78H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 28H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 68H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 18H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 28H], mm6 + movq mm2, mm0 + movq mm6, [edx + 08H] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 48H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 18H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + paddw mm2, [eax + 38H] + paddw mm1, mm1 + paddw mm1, mm2 + psraw mm2, 4 + psubw mm4, mm7 + psraw mm1, 4 + movq mm3, [edx + 28H] + paddw mm7, mm7 + movq [edx + 28H], mm2 + paddw mm7, mm4 + movq [edx + 18H], mm1 + psubw mm4, mm3 + paddw mm4, [eax + 38H] + paddw mm3, mm3 + paddw mm3, mm4 + psraw mm4, 4 + psubw mm6, mm5 + psraw mm3, 4 + paddw mm6, [eax + 38H] + paddw mm5, mm5 + paddw mm5, mm6 + psraw mm6, 4 + movq [edx + 48H], mm4 + psraw mm5, 4 + movq [edx + 38H], mm3 + psubw mm7, mm0 + paddw mm7, [eax + 38H] + paddw mm0, mm0 + paddw mm0, mm7 + psraw mm7, 4 + movq [edx + 68H], mm6 + psraw mm0, 4 + movq [edx + 58H], mm5 + movq [edx + 78H], mm7 + movq [edx + 08H], mm0 + /* emms */ + } +} + + +void oc_idct8x8_mmx(ogg_int16_t _y[64]){ 
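+  /* Same algorithm as oc_idct8x8_10_mmx above, but without the shortcuts
+     that assume only the first 10 zig-zag coefficients can be non-zero. */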
+ _asm { + mov edx, [_y] + mov eax, offset OC_IDCT_CONSTS + movq mm2, [edx + 30H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 18H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 10H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 38H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 20H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 28H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 10H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 20H], mm6 + movq mm2, mm0 + movq mm6, [edx] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 08H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 10H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + movq mm3, [edx + 20H] + psubw mm4, mm7 + paddw mm1, mm1 + paddw mm7, mm7 + paddw mm1, mm2 + paddw mm7, mm4 + psubw mm4, mm3 + paddw mm3, mm3 + psubw mm6, mm5 + paddw mm5, mm5 + paddw mm3, mm4 + paddw mm5, mm6 + psubw mm7, mm0 + paddw mm0, mm0 + movq [edx + 10H], mm1 + paddw mm0, mm7 + movq mm1, mm4 + punpcklwd mm4, mm5 + movq [edx], mm0 + punpckhwd mm1, mm5 + movq mm0, mm6 + punpcklwd mm6, mm7 + movq mm5, mm4 + punpckldq mm4, mm6 + punpckhdq mm5, mm6 + movq mm6, mm1 + movq [edx + 08H], mm4 + punpckhwd mm0, mm7 + movq [edx + 18H], mm5 + punpckhdq mm6, mm0 + movq mm4, [edx] + punpckldq mm1, mm0 + movq mm5, [edx + 10H] + movq mm0, mm4 + movq [edx + 38H], mm6 + punpcklwd mm0, mm5 + movq [edx + 28H], mm1 + punpckhwd mm4, mm5 + movq mm5, mm2 + punpcklwd mm2, mm3 + movq mm1, mm0 + punpckldq mm0, mm2 + punpckhdq mm1, mm2 + movq mm2, mm4 + movq [edx], mm0 + punpckhwd mm5, mm3 + movq [edx + 10H], mm1 + punpckhdq mm4, mm5 + punpckldq mm2, mm5 + movq [edx + 30H], mm4 + movq [edx + 20H], mm2 + movq mm2, [edx + 70H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 58H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 50H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 78H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 60H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 68H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 50H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 60H], mm6 + movq mm2, mm0 + movq mm6, [edx + 40H] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 
48H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 50H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + movq mm3, [edx + 60H] + psubw mm4, mm7 + paddw mm1, mm1 + paddw mm7, mm7 + paddw mm1, mm2 + paddw mm7, mm4 + psubw mm4, mm3 + paddw mm3, mm3 + psubw mm6, mm5 + paddw mm5, mm5 + paddw mm3, mm4 + paddw mm5, mm6 + psubw mm7, mm0 + paddw mm0, mm0 + movq [edx + 50H], mm1 + paddw mm0, mm7 + movq mm1, mm4 + punpcklwd mm4, mm5 + movq [edx + 40H], mm0 + punpckhwd mm1, mm5 + movq mm0, mm6 + punpcklwd mm6, mm7 + movq mm5, mm4 + punpckldq mm4, mm6 + punpckhdq mm5, mm6 + movq mm6, mm1 + movq [edx + 48H], mm4 + punpckhwd mm0, mm7 + movq [edx + 58H], mm5 + punpckhdq mm6, mm0 + movq mm4, [edx + 40H] + punpckldq mm1, mm0 + movq mm5, [edx + 50H] + movq mm0, mm4 + movq [edx + 78H], mm6 + punpcklwd mm0, mm5 + movq [edx + 68H], mm1 + punpckhwd mm4, mm5 + movq mm5, mm2 + punpcklwd mm2, mm3 + movq mm1, mm0 + punpckldq mm0, mm2 + punpckhdq mm1, mm2 + movq mm2, mm4 + movq [edx + 40H], mm0 + punpckhwd mm5, mm3 + movq [edx + 50H], mm1 + punpckhdq mm4, mm5 + punpckldq mm2, mm5 + movq [edx + 70H], mm4 + movq [edx + 60H], mm2 + movq mm2, [edx + 30H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 50H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 10H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 70H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 20H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 60H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 10H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 20H], mm6 + movq mm2, mm0 + movq mm6, [edx] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 40H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 10H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + paddw mm2, [eax + 38H] + paddw mm1, mm1 + paddw mm1, mm2 + psraw mm2, 4 + psubw mm4, mm7 + psraw mm1, 4 + movq mm3, [edx + 20H] + paddw mm7, mm7 + movq [edx + 20H], mm2 + paddw mm7, mm4 + movq [edx + 10H], mm1 + psubw mm4, mm3 + paddw mm4, [eax + 38H] + paddw mm3, mm3 + paddw mm3, mm4 + psraw mm4, 4 + psubw mm6, mm5 + psraw mm3, 4 + paddw mm6, [eax + 38H] + paddw mm5, mm5 + paddw mm5, mm6 + psraw mm6, 4 + movq [edx + 40H], mm4 + psraw mm5, 4 + movq [edx + 30H], mm3 + psubw mm7, mm0 + paddw mm7, [eax + 38H] + paddw mm0, mm0 + paddw mm0, mm7 + psraw mm7, 4 + movq [edx + 60H], mm6 + psraw mm0, 4 + movq [edx + 50H], mm5 + movq [edx + 70H], mm7 + movq [edx], mm0 + movq mm2, [edx + 38H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 58H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw 
mm1, mm2 + movq mm3, [edx + 18H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 78H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 28H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 68H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 18H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 28H], mm6 + movq mm2, mm0 + movq mm6, [edx + 08H] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 48H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 18H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + paddw mm2, [eax + 38H] + paddw mm1, mm1 + paddw mm1, mm2 + psraw mm2, 4 + psubw mm4, mm7 + psraw mm1, 4 + movq mm3, [edx + 28H] + paddw mm7, mm7 + movq [edx + 28H], mm2 + paddw mm7, mm4 + movq [edx + 18H], mm1 + psubw mm4, mm3 + paddw mm4, [eax + 38H] + paddw mm3, mm3 + paddw mm3, mm4 + psraw mm4, 4 + psubw mm6, mm5 + psraw mm3, 4 + paddw mm6, [eax + 38H] + paddw mm5, mm5 + paddw mm5, mm6 + psraw mm6, 4 + movq [edx + 48H], mm4 + psraw mm5, 4 + movq [edx + 38H], mm3 + psubw mm7, mm0 + paddw mm7, [eax + 38H] + paddw mm0, mm0 + paddw mm0, mm7 + psraw mm7, 4 + movq [edx + 68H], mm6 + psraw mm0, 4 + movq [edx + 58H], mm5 + movq [edx + 78H], mm7 + movq [edx + 08H], mm0 + /* emms */ + } +} + +#endif + Index: lib/dec/x86_vc/x86int.h =================================================================== --- lib/dec/x86_vc/x86int.h (revision 0) +++ lib/dec/x86_vc/x86int.h (revision 0) @@ -0,0 +1,49 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: x86int.h 13884 2007-09-22 08:38:10Z giles $ + + ********************************************************************/ + +#if !defined(_x86_x86int_vc_H) +# define _x86_x86int_vc_H (1) +# include "../../internal.h" + +void oc_state_vtable_init_x86(oc_theora_state *_state); + +void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride, + const ogg_int16_t *_residue); + +void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride, + const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue); + +void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride, + const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2, + int _src2_ystride,const ogg_int16_t *_residue); + +void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis, + int _nfragis,int _dst_frame,int _src_frame,int _pli); + +void oc_restore_fpu_mmx(void); + +void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs, + ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]); + +void oc_idct8x8_mmx(ogg_int16_t _y[64]); +void oc_idct8x8_10_mmx(ogg_int16_t _y[64]); + +void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv, + int _refi,int _pli,int _fragy0,int _fragy_end); + +#endif Index: lib/dec/x86_vc/x86state.c =================================================================== --- lib/dec/x86_vc/x86state.c (revision 0) +++ lib/dec/x86_vc/x86state.c (revision 0) @@ -0,0 +1,42 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
*
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: x86state.c 13884 2007-09-22 08:38:10Z giles $
+
+ ********************************************************************/
+
+#if defined(USE_ASM)
+
+#include "x86int.h"
+#include "../../cpu.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state){
+  _state->cpu_flags=oc_cpu_flags_get();
+
+  /* fill with defaults */
+  oc_state_vtable_init_c(_state);
+
+  /* patch MMX functions */
+  if(_state->cpu_flags&OC_CPU_X86_MMX){
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
+    _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+    _state->opt_vtable.state_loop_filter_frag_rows=oc_state_loop_filter_frag_rows_mmx;
+  }
+}
+
+#endif
The iteration for each instruction is noted by the #id in the + comments (in case you want to reconstruct it) + --------------------------------------------------------------------- */ + _asm{ + mov edi, [_residue] /* load residue ptr */ + mov eax, 0x00800080 /* generate constant */ + mov ebx, [_dst_ystride] /* load dst-stride */ + mov edx, [_dst] /* load dest pointer */ + + /* unrolled loop begins here */ + + movd mm0, eax /* load constant */ + movq mm1, [edi+ 8*0] /* #1 load low residue */ + movq mm2, [edi+ 8*1] /* #1 load high residue */ + punpckldq mm0, mm0 /* build constant */ + movq mm3, [edi+ 8*2] /* #2 load low residue */ + movq mm4, [edi+ 8*3] /* #2 load high residue */ + movq mm5, [edi+ 8*4] /* #3 load low residue */ + movq mm6, [edi+ 8*5] /* #3 load high residue */ + paddsw mm1, mm0 /* #1 bias low residue */ + paddsw mm2, mm0 /* #1 bias high residue */ + packuswb mm1, mm2 /* #1 pack to byte */ + paddsw mm3, mm0 /* #2 bias low residue */ + paddsw mm4, mm0 /* #2 bias high residue */ + packuswb mm3, mm4 /* #2 pack to byte */ + paddsw mm5, mm0 /* #3 bias low residue */ + paddsw mm6, mm0 /* #3 bias high residue */ + packuswb mm5, mm6 /* #3 pack to byte */ + movq [edx], mm1 /* #1 write row */ + movq [edx + ebx], mm3 /* #2 write row */ + movq [edx + ebx*2], mm5 /* #3 write row */ + movq mm1, [edi+ 8*6] /* #4 load low residue */ + lea ecx, [ebx + ebx*2] /* make dst_ystride * 3 */ + movq mm2, [edi+ 8*7] /* #4 load high residue */ + movq mm3, [edi+ 8*8] /* #5 load low residue */ + lea esi, [ebx*4 + ebx] /* make dst_ystride * 5 */ + movq mm4, [edi+ 8*9] /* #5 load high residue */ + movq mm5, [edi+ 8*10] /* #6 load low residue */ + lea eax, [ecx*2 + ebx] /* make dst_ystride * 7 */ + movq mm6, [edi+ 8*11] /* #6 load high residue */ + paddsw mm1, mm0 /* #4 bias low residue */ + paddsw mm2, mm0 /* #4 bias high residue */ + packuswb mm1, mm2 /* #4 pack to byte */ + paddsw mm3, mm0 /* #5 bias low residue */ + paddsw mm4, mm0 /* #5 bias high residue */ + packuswb mm3, mm4 /* #5 pack to byte */ + paddsw mm5, mm0 /* #6 bias low residue */ + paddsw mm6, mm0 /* #6 bias high residue */ + packuswb mm5, mm6 /* #6 pack to byte */ + movq [edx + ecx], mm1 /* #4 write row */ + movq [edx + ebx*4], mm3 /* #5 write row */ + movq [edx + esi], mm5 /* #6 write row */ + movq mm1, [edi+ 8*12] /* #7 load low residue */ + movq mm2, [edi+ 8*13] /* #7 load high residue */ + movq mm3, [edi+ 8*14] /* #8 load low residue */ + movq mm4, [edi+ 8*15] /* #8 load high residue */ + paddsw mm1, mm0 /* #7 bias low residue */ + paddsw mm2, mm0 /* #7 bias high residue */ + packuswb mm1, mm2 /* #7 pack to byte */ + paddsw mm3, mm0 /* #8 bias low residue */ + paddsw mm4, mm0 /* #8 bias high residue */ + packuswb mm3, mm4 /* #8 pack to byte */ + movq [edx + ecx*2], mm1 /* #7 write row */ + movq [edx + eax], mm3 /* #8 write row */ + } +} + + + +void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride, + const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){ + /* --------------------------------------------------------------------- + This function does the inter reconstruction step with two iterations + running in parallel to hide some load-latencies and break the dependency + chains. 
The iteration for each instruction is noted by the #id in the + comments (in case you want to reconstruct it) + --------------------------------------------------------------------- */ + _asm{ + pxor mm0, mm0 /* generate constant 0 */ + mov esi, [_src] + mov edi, [_residue] + mov eax, [_src_ystride] + mov edx, [_dst] + mov ebx, [_dst_ystride] + mov ecx, 4 + + align 16 + +nextchunk: + movq mm3, [esi] /* #1 load source */ + movq mm1, [edi+0] /* #1 load residium low */ + movq mm2, [edi+8] /* #1 load residium high */ + movq mm7, [esi+eax] /* #2 load source */ + movq mm4, mm3 /* #1 get copy of src */ + movq mm5, [edi+16] /* #2 load residium low */ + punpckhbw mm4, mm0 /* #1 expand high source */ + movq mm6, [edi+24] /* #2 load residium high */ + punpcklbw mm3, mm0 /* #1 expand low source */ + paddsw mm4, mm2 /* #1 add residium high */ + movq mm2, mm7 /* #2 get copy of src */ + paddsw mm3, mm1 /* #1 add residium low */ + punpckhbw mm2, mm0 /* #2 expand high source */ + packuswb mm3, mm4 /* #1 final row pixels */ + punpcklbw mm7, mm0 /* #2 expand low source */ + movq [edx], mm3 /* #1 write row */ + paddsw mm2, mm6 /* #2 add residium high */ + add edi, 32 /* residue += 4 */ + paddsw mm7, mm5 /* #2 add residium low */ + sub ecx, 1 /* update loop counter */ + packuswb mm7, mm2 /* #2 final row */ + lea esi, [esi+eax*2] /* src += stride * 2 */ + movq [edx + ebx], mm7 /* #2 write row */ + lea edx, [edx+ebx*2] /* dst += stride * 2 */ + jne nextchunk + } +} + + +void oc_frag_recon_inter2_mmx(unsigned char *_dst, int _dst_ystride, + const unsigned char *_src1, int _src1_ystride, const unsigned char *_src2, + int _src2_ystride,const ogg_int16_t *_residue){ + /* --------------------------------------------------------------------- + This function does the inter2 reconstruction step.The building of the + average is done with a bit-twiddeling trick to avoid excessive register + copy work during byte to word conversion. + + average = (a & b) + (((a ^ b) & 0xfe) >> 1); + + (shown for a single byte; it's done with 8 of them at a time) + + Slightly faster than the obvious method using add and shift, but not + earthshaking improvement either. + + If anyone comes up with a way that produces bit-identical outputs + using the pavgb instruction let me know and I'll do the 3dnow codepath. 
+   --------------------------------------------------------------------- */
+  _asm{
+    mov        eax, 0xfefefefe
+    mov        esi, [_src1]
+    mov        edi, [_src2]
+    movd       mm1, eax
+    mov        ebx, [_residue]
+    mov        edx, [_dst]
+    mov        eax, [_dst_ystride]
+    punpckldq  mm1, mm1             /* replicate lsb32 */
+    mov        ecx, 8               /* init loop counter */
+    pxor       mm0, mm0             /* constant zero */
+    sub        edx, eax             /* dst -= dst_stride */
+
+    align 16
+
+nextrow:
+    movq       mm2, [esi]           /* load source1 */
+    movq       mm3, [edi]           /* load source2 */
+    movq       mm5, [ebx + 0]       /* load lower residue */
+    movq       mm6, [ebx + 8]       /* load higher residue */
+    add        esi, _src1_ystride   /* src1 += src1_stride */
+    add        edi, _src2_ystride   /* src2 += src2_stride */
+    movq       mm4, mm2             /* get copy of source1 */
+    pand       mm2, mm3             /* s1 & s2 (avg part) */
+    pxor       mm3, mm4             /* s1 ^ s2 (avg part) */
+    add        ebx, 16              /* residue += 8 words */
+    pand       mm3, mm1             /* mask out low bits */
+    psrlq      mm3, 1               /* shift xor avg-part */
+    paddd      mm3, mm2             /* build final average */
+    add        edx, eax             /* dst += dst_stride */
+    movq       mm2, mm3             /* get copy of average */
+    punpckhbw  mm3, mm0             /* average high */
+    punpcklbw  mm2, mm0             /* average low */
+    paddsw     mm3, mm6             /* high + residue */
+    paddsw     mm2, mm5             /* low + residue */
+    sub        ecx, 1               /* update loop counter */
+    packuswb   mm2, mm3             /* pack and saturate */
+    movq       [edx], mm2           /* write row */
+    jne        nextrow
+  }
+}
+
+void oc_restore_fpu_mmx(void){
+  _asm { emms }
+}
+
+#endif
+
Index: lib/dec/x86_vc/mmxidct.c
===================================================================
--- lib/dec/x86_vc/mmxidct.c	(revision 0)
+++ lib/dec/x86_vc/mmxidct.c	(revision 0)
@@ -0,0 +1,1007 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id:
+
+ ********************************************************************/
+
+/* -------------------------------------------------------------------
+   MMX based IDCT for the theora codec.
+
+   Originally written by Rudolf Marek, based on code from On2's VP3.
+   Converted to Visual Studio inline assembly by Nils Pipenbrinck.
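+
+   A note on the constant table below: OC_C1S7 through OC_C5S3 exceed
+   0x7fff and wrap negative when truncated to 16 bits, so pmulhw by one
+   of them computes ((x*c)>>16)-x instead of (x*c)>>16. That is why
+   several of the multiplies are paired with a paddw that adds the
+   input back in; OC_C6S2 and OC_C7S1 fit in 15 bits and need no such
+   correction.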
+ + ---------------------------------------------------------------------*/ +#if defined(USE_ASM) + +#include +#include "../dct.h" +#include "../idct.h" +#include "x86int.h" + +/*A table of constants used by the MMX routines.*/ +static const __declspec(align(16)) ogg_uint16_t + OC_IDCT_CONSTS[(7+1)*4]={ + (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, + (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, + (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, + (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, + (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, + (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, + (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, + (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, + (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, + (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, + (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, + (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, + (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, + (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, + 8, 8, 8, 8 +}; + + +void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){ + _asm { + mov edx, [_y] + mov eax, offset OC_IDCT_CONSTS + movq mm2, [edx + 30H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 18H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 10H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 38H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 20H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 28H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 10H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 20H], mm6 + movq mm2, mm0 + movq mm6, [edx] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 08H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 10H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + movq mm3, [edx + 20H] + psubw mm4, mm7 + paddw mm1, mm1 + paddw mm7, mm7 + paddw mm1, mm2 + paddw mm7, mm4 + psubw mm4, mm3 + paddw mm3, mm3 + psubw mm6, mm5 + paddw mm5, mm5 + paddw mm3, mm4 + paddw mm5, mm6 + psubw mm7, mm0 + paddw mm0, mm0 + movq [edx + 10H], mm1 + paddw mm0, mm7 + movq mm1, mm4 + punpcklwd mm4, mm5 + movq [edx], mm0 + punpckhwd mm1, mm5 + movq mm0, mm6 + punpcklwd mm6, mm7 + movq mm5, mm4 + punpckldq mm4, mm6 + punpckhdq mm5, mm6 + movq mm6, mm1 + movq [edx + 08H], mm4 + punpckhwd mm0, mm7 + movq [edx + 18H], mm5 + punpckhdq mm6, mm0 + movq mm4, [edx] + punpckldq mm1, mm0 + movq mm5, [edx + 10H] + movq mm0, mm4 + movq [edx + 38H], mm6 + punpcklwd mm0, mm5 + movq [edx + 28H], mm1 + punpckhwd mm4, mm5 + movq mm5, mm2 + punpcklwd mm2, mm3 + movq mm1, mm0 + punpckldq mm0, mm2 + punpckhdq mm1, mm2 + movq mm2, mm4 + movq [edx], mm0 + punpckhwd mm5, mm3 + movq [edx + 10H], mm1 + punpckhdq mm4, mm5 + punpckldq mm2, mm5 + movq [edx + 30H], mm4 + movq [edx + 20H], mm2 + movq mm2, [edx + 70H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq 
mm7, [edx + 58H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 50H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 78H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 60H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 68H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 50H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 60H], mm6 + movq mm2, mm0 + movq mm6, [edx + 40H] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 48H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 50H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + movq mm3, [edx + 60H] + psubw mm4, mm7 + paddw mm1, mm1 + paddw mm7, mm7 + paddw mm1, mm2 + paddw mm7, mm4 + psubw mm4, mm3 + paddw mm3, mm3 + psubw mm6, mm5 + paddw mm5, mm5 + paddw mm3, mm4 + paddw mm5, mm6 + psubw mm7, mm0 + paddw mm0, mm0 + movq [edx + 50H], mm1 + paddw mm0, mm7 + movq mm1, mm4 + punpcklwd mm4, mm5 + movq [edx + 40H], mm0 + punpckhwd mm1, mm5 + movq mm0, mm6 + punpcklwd mm6, mm7 + movq mm5, mm4 + punpckldq mm4, mm6 + punpckhdq mm5, mm6 + movq mm6, mm1 + movq [edx + 48H], mm4 + punpckhwd mm0, mm7 + movq [edx + 58H], mm5 + punpckhdq mm6, mm0 + movq mm4, [edx + 40H] + punpckldq mm1, mm0 + movq mm5, [edx + 50H] + movq mm0, mm4 + movq [edx + 78H], mm6 + punpcklwd mm0, mm5 + movq [edx + 68H], mm1 + punpckhwd mm4, mm5 + movq mm5, mm2 + punpcklwd mm2, mm3 + movq mm1, mm0 + punpckldq mm0, mm2 + punpckhdq mm1, mm2 + movq mm2, mm4 + movq [edx + 40H], mm0 + punpckhwd mm5, mm3 + movq [edx + 50H], mm1 + punpckhdq mm4, mm5 + punpckldq mm2, mm5 + movq [edx + 70H], mm4 + movq [edx + 60H], mm2 + movq mm2, [edx + 30H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 50H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 10H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 70H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 20H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 60H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 10H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 20H], mm6 + movq mm2, mm0 + movq mm6, [edx] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 40H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + 
paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 10H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + paddw mm2, [eax + 38H] + paddw mm1, mm1 + paddw mm1, mm2 + psraw mm2, 4 + psubw mm4, mm7 + psraw mm1, 4 + movq mm3, [edx + 20H] + paddw mm7, mm7 + movq [edx + 20H], mm2 + paddw mm7, mm4 + movq [edx + 10H], mm1 + psubw mm4, mm3 + paddw mm4, [eax + 38H] + paddw mm3, mm3 + paddw mm3, mm4 + psraw mm4, 4 + psubw mm6, mm5 + psraw mm3, 4 + paddw mm6, [eax + 38H] + paddw mm5, mm5 + paddw mm5, mm6 + psraw mm6, 4 + movq [edx + 40H], mm4 + psraw mm5, 4 + movq [edx + 30H], mm3 + psubw mm7, mm0 + paddw mm7, [eax + 38H] + paddw mm0, mm0 + paddw mm0, mm7 + psraw mm7, 4 + movq [edx + 60H], mm6 + psraw mm0, 4 + movq [edx + 50H], mm5 + movq [edx + 70H], mm7 + movq [edx], mm0 + movq mm2, [edx + 38H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 58H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 18H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 78H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 28H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 68H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 18H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 28H], mm6 + movq mm2, mm0 + movq mm6, [edx + 08H] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 48H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 18H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + paddw mm2, [eax + 38H] + paddw mm1, mm1 + paddw mm1, mm2 + psraw mm2, 4 + psubw mm4, mm7 + psraw mm1, 4 + movq mm3, [edx + 28H] + paddw mm7, mm7 + movq [edx + 28H], mm2 + paddw mm7, mm4 + movq [edx + 18H], mm1 + psubw mm4, mm3 + paddw mm4, [eax + 38H] + paddw mm3, mm3 + paddw mm3, mm4 + psraw mm4, 4 + psubw mm6, mm5 + psraw mm3, 4 + paddw mm6, [eax + 38H] + paddw mm5, mm5 + paddw mm5, mm6 + psraw mm6, 4 + movq [edx + 48H], mm4 + psraw mm5, 4 + movq [edx + 38H], mm3 + psubw mm7, mm0 + paddw mm7, [eax + 38H] + paddw mm0, mm0 + paddw mm0, mm7 + psraw mm7, 4 + movq [edx + 68H], mm6 + psraw mm0, 4 + movq [edx + 58H], mm5 + movq [edx + 78H], mm7 + movq [edx + 08H], mm0 + /* emms */ + } +} + + +void oc_idct8x8_mmx(ogg_int16_t _y[64]){ + _asm { + mov edx, [_y] + mov eax, offset OC_IDCT_CONSTS + movq mm2, [edx + 30H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 18H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 10H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 38H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, 
[edx + 20H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 28H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 10H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 20H], mm6 + movq mm2, mm0 + movq mm6, [edx] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 08H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 10H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + movq mm3, [edx + 20H] + psubw mm4, mm7 + paddw mm1, mm1 + paddw mm7, mm7 + paddw mm1, mm2 + paddw mm7, mm4 + psubw mm4, mm3 + paddw mm3, mm3 + psubw mm6, mm5 + paddw mm5, mm5 + paddw mm3, mm4 + paddw mm5, mm6 + psubw mm7, mm0 + paddw mm0, mm0 + movq [edx + 10H], mm1 + paddw mm0, mm7 + movq mm1, mm4 + punpcklwd mm4, mm5 + movq [edx], mm0 + punpckhwd mm1, mm5 + movq mm0, mm6 + punpcklwd mm6, mm7 + movq mm5, mm4 + punpckldq mm4, mm6 + punpckhdq mm5, mm6 + movq mm6, mm1 + movq [edx + 08H], mm4 + punpckhwd mm0, mm7 + movq [edx + 18H], mm5 + punpckhdq mm6, mm0 + movq mm4, [edx] + punpckldq mm1, mm0 + movq mm5, [edx + 10H] + movq mm0, mm4 + movq [edx + 38H], mm6 + punpcklwd mm0, mm5 + movq [edx + 28H], mm1 + punpckhwd mm4, mm5 + movq mm5, mm2 + punpcklwd mm2, mm3 + movq mm1, mm0 + punpckldq mm0, mm2 + punpckhdq mm1, mm2 + movq mm2, mm4 + movq [edx], mm0 + punpckhwd mm5, mm3 + movq [edx + 10H], mm1 + punpckhdq mm4, mm5 + punpckldq mm2, mm5 + movq [edx + 30H], mm4 + movq [edx + 20H], mm2 + movq mm2, [edx + 70H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 58H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 50H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 78H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 60H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 68H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 50H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 60H], mm6 + movq mm2, mm0 + movq mm6, [edx + 40H] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 48H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 50H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + movq mm3, [edx + 60H] + psubw mm4, mm7 + paddw mm1, mm1 + paddw mm7, mm7 + paddw mm1, mm2 + paddw mm7, mm4 + psubw mm4, mm3 + paddw mm3, mm3 + psubw mm6, mm5 + paddw mm5, mm5 + paddw mm3, mm4 + paddw mm5, mm6 + psubw mm7, mm0 + paddw mm0, mm0 + 
movq [edx + 50H], mm1 + paddw mm0, mm7 + movq mm1, mm4 + punpcklwd mm4, mm5 + movq [edx + 40H], mm0 + punpckhwd mm1, mm5 + movq mm0, mm6 + punpcklwd mm6, mm7 + movq mm5, mm4 + punpckldq mm4, mm6 + punpckhdq mm5, mm6 + movq mm6, mm1 + movq [edx + 48H], mm4 + punpckhwd mm0, mm7 + movq [edx + 58H], mm5 + punpckhdq mm6, mm0 + movq mm4, [edx + 40H] + punpckldq mm1, mm0 + movq mm5, [edx + 50H] + movq mm0, mm4 + movq [edx + 78H], mm6 + punpcklwd mm0, mm5 + movq [edx + 68H], mm1 + punpckhwd mm4, mm5 + movq mm5, mm2 + punpcklwd mm2, mm3 + movq mm1, mm0 + punpckldq mm0, mm2 + punpckhdq mm1, mm2 + movq mm2, mm4 + movq [edx + 40H], mm0 + punpckhwd mm5, mm3 + movq [edx + 50H], mm1 + punpckhdq mm4, mm5 + punpckldq mm2, mm5 + movq [edx + 70H], mm4 + movq [edx + 60H], mm2 + movq mm2, [edx + 30H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 50H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 10H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 70H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 20H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 60H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 10H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 20H], mm6 + movq mm2, mm0 + movq mm6, [edx] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 40H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 10H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + paddw mm2, [eax + 38H] + paddw mm1, mm1 + paddw mm1, mm2 + psraw mm2, 4 + psubw mm4, mm7 + psraw mm1, 4 + movq mm3, [edx + 20H] + paddw mm7, mm7 + movq [edx + 20H], mm2 + paddw mm7, mm4 + movq [edx + 10H], mm1 + psubw mm4, mm3 + paddw mm4, [eax + 38H] + paddw mm3, mm3 + paddw mm3, mm4 + psraw mm4, 4 + psubw mm6, mm5 + psraw mm3, 4 + paddw mm6, [eax + 38H] + paddw mm5, mm5 + paddw mm5, mm6 + psraw mm6, 4 + movq [edx + 40H], mm4 + psraw mm5, 4 + movq [edx + 30H], mm3 + psubw mm7, mm0 + paddw mm7, [eax + 38H] + paddw mm0, mm0 + paddw mm0, mm7 + psraw mm7, 4 + movq [edx + 60H], mm6 + psraw mm0, 4 + movq [edx + 50H], mm5 + movq [edx + 70H], mm7 + movq [edx], mm0 + movq mm2, [edx + 38H] + movq mm6, [eax + 10H] + movq mm4, mm2 + movq mm7, [edx + 58H] + pmulhw mm4, mm6 + movq mm1, [eax + 20H] + pmulhw mm6, mm7 + movq mm5, mm1 + pmulhw mm1, mm2 + movq mm3, [edx + 18H] + pmulhw mm5, mm7 + movq mm0, [eax] + paddw mm4, mm2 + paddw mm6, mm7 + paddw mm2, mm1 + movq mm1, [edx + 78H] + paddw mm7, mm5 + movq mm5, mm0 + pmulhw mm0, mm3 + paddw mm4, mm7 + pmulhw mm5, mm1 + movq mm7, [eax + 30H] + psubw mm6, mm2 + paddw mm0, mm3 + pmulhw mm3, mm7 + movq mm2, [edx + 28H] + pmulhw mm7, mm1 + paddw mm5, mm1 + movq mm1, mm2 + pmulhw mm2, [eax + 08H] + psubw mm3, mm5 + movq mm5, [edx + 68H] + paddw mm0, mm7 + movq mm7, mm5 + psubw mm0, mm4 + pmulhw mm5, [eax + 08H] + paddw mm2, mm1 + 
pmulhw mm1, [eax + 28H] + paddw mm4, mm4 + paddw mm4, mm0 + psubw mm3, mm6 + paddw mm5, mm7 + paddw mm6, mm6 + pmulhw mm7, [eax + 28H] + paddw mm6, mm3 + movq [edx + 18H], mm4 + psubw mm1, mm5 + movq mm4, [eax + 18H] + movq mm5, mm3 + pmulhw mm3, mm4 + paddw mm7, mm2 + movq [edx + 28H], mm6 + movq mm2, mm0 + movq mm6, [edx + 08H] + pmulhw mm0, mm4 + paddw mm5, mm3 + movq mm3, [edx + 48H] + psubw mm5, mm1 + paddw mm2, mm0 + psubw mm6, mm3 + movq mm0, mm6 + pmulhw mm6, mm4 + paddw mm3, mm3 + paddw mm1, mm1 + paddw mm3, mm0 + paddw mm1, mm5 + pmulhw mm4, mm3 + paddw mm6, mm0 + psubw mm6, mm2 + paddw mm2, mm2 + movq mm0, [edx + 18H] + paddw mm2, mm6 + paddw mm4, mm3 + psubw mm2, mm1 + paddw mm2, [eax + 38H] + paddw mm1, mm1 + paddw mm1, mm2 + psraw mm2, 4 + psubw mm4, mm7 + psraw mm1, 4 + movq mm3, [edx + 28H] + paddw mm7, mm7 + movq [edx + 28H], mm2 + paddw mm7, mm4 + movq [edx + 18H], mm1 + psubw mm4, mm3 + paddw mm4, [eax + 38H] + paddw mm3, mm3 + paddw mm3, mm4 + psraw mm4, 4 + psubw mm6, mm5 + psraw mm3, 4 + paddw mm6, [eax + 38H] + paddw mm5, mm5 + paddw mm5, mm6 + psraw mm6, 4 + movq [edx + 48H], mm4 + psraw mm5, 4 + movq [edx + 38H], mm3 + psubw mm7, mm0 + paddw mm7, [eax + 38H] + paddw mm0, mm0 + paddw mm0, mm7 + psraw mm7, 4 + movq [edx + 68H], mm6 + psraw mm0, 4 + movq [edx + 58H], mm5 + movq [edx + 78H], mm7 + movq [edx + 08H], mm0 + /* emms */ + } +} + +#endif + Index: lib/dec/x86_vc/mmxloopfilter.c =================================================================== --- lib/dec/x86_vc/mmxloopfilter.c (revision 0) +++ lib/dec/x86_vc/mmxloopfilter.c (revision 0) @@ -0,0 +1,378 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: + + ********************************************************************/ + +/* ------------------------------------------------------------------- + MMX based loop filter for the theora codec. + + Originally written by Rudolf Marek, based on code from On2's VP3. + Converted to Visual Studio inline assembly by Nils Pipenbrinck. + + Note: I can't test these since my example files never get into the + loop filters, but the code has been converted semi-automatic from + the GCC sources, so it ought to work. 
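+
+   For reference, a plain-C sketch of what gets computed per filtered
+   edge below (derived from the annotated assembly; p0..p3 are the four
+   pixels straddling the block edge, L is the loop filter limit, and
+   clamp() saturates to 0..255):
+
+     R  = (p0 - p3 + 3*(p2 - p1) + 4) >> 3;
+     R  = lflim(R, L);            /* cf. Section 7.10 of the spec */
+     p1 = clamp(p1 + R);
+     p2 = clamp(p2 - R);
+
+   where lflim() folds R back towards zero: R is kept as-is for
+   |R| <= L, becomes 2L-R for L < R <= 2L, -2L-R for -2L <= R < -L,
+   and 0 beyond that.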
+   ---------------------------------------------------------------------*/
+#include "../../internal.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+#if defined(USE_ASM)
+
+
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+  const ogg_int16_t *_ll){
+  _asm {
+    mov eax, [_pix]
+    mov edx, [_ystride]
+    mov ebx, [_ll]
+
+    /* _pix -= ystride */
+    sub eax, edx
+    /* mm0=0 */
+    pxor mm0, mm0
+    /* _pix -= ystride */
+    sub eax, edx
+    /* esi=_ystride*3 */
+    lea esi, [edx + edx*2]
+
+    /* mm7=_pix[0...8]*/
+    movq mm7, [eax]
+    /* mm4=_pix[0...8+_ystride*3]*/
+    movq mm4, [eax + esi]
+    /* mm6=_pix[0...8]*/
+    movq mm6, mm7
+    /* Expand unsigned _pix[0...3] to 16 bits.*/
+    punpcklbw mm6, mm0
+    movq mm5, mm4
+    /* Expand unsigned _pix[4...7] to 16 bits.*/
+    punpckhbw mm7, mm0
+    punpcklbw mm4, mm0
+    /* Expand other arrays too.*/
+    punpckhbw mm5, mm0
+    /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/
+    psubw mm6, mm4
+    psubw mm7, mm5
+    /*mm5=mm4=_pix[0...7+_ystride]*/
+    movq mm4, [eax + edx]
+    /*mm1=mm3=mm2=_pix[0...7+_ystride*2]*/
+    movq mm2, [eax + edx*2]
+    movq mm5, mm4
+    movq mm3, mm2
+    movq mm1, mm2
+    /*Expand these arrays.*/
+    punpckhbw mm5, mm0
+    punpcklbw mm4, mm0
+    punpckhbw mm3, mm0
+    punpcklbw mm2, mm0
+    pcmpeqw mm0, mm0
+    /*mm0=3 3 3 3
+      mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+    psubw mm3, mm5
+    psrlw mm0, 14
+    psubw mm2, mm4
+    /*Scale by 3.*/
+    pmullw mm3, mm0
+    pmullw mm2, mm0
+    /*mm0=4 4 4 4
+      f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+       3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+    psrlw mm0, 1
+    paddw mm3, mm7
+    psllw mm0, 2
+    paddw mm2, mm6
+    /*Add 4.*/
+    paddw mm3, mm0
+    paddw mm2, mm0
+    /*"Divide" by 8.*/
+    psraw mm3, 3
+    psraw mm2, 3
+    /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
+    /*Free up mm5.*/
+    packuswb mm4, mm5
+    /*mm0=L L L L*/
+    movq mm0, [ebx]
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    movq mm5, mm2
+    pxor mm6, mm6
+    movq mm7, mm0
+    psubw mm6, mm0
+    psllw mm7, 1
+    psllw mm6, 1
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    pcmpgtw mm7, mm2
+    pcmpgtw mm5, mm6
+    pand mm2, mm7
+    movq mm7, mm0
+    pand mm2, mm5
+    psllw mm7, 1
+    movq mm5, mm3
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    pcmpgtw mm7, mm3
+    pcmpgtw mm5, mm6
+    pand mm3, mm7
+    movq mm7, mm0
+    pand mm3, mm5
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    psraw mm6, 1
+    movq mm5, mm2
+    psllw mm7, 1
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm5=R_i>L?FF:00*/
+    pcmpgtw mm5, mm0
+    /*mm6=-L>R_i?FF:00*/
+    pcmpgtw mm6, mm2
+    /*mm7=R_i>L?2L:0*/
+    pand mm7, mm5
+    /*mm2=R_i>L?R_i-2L:R_i*/
+    psubw mm2, mm7
+    movq mm7, mm0
+    /*mm5=-L>R_i||R_i>L*/
+    por mm5, mm6
+    psllw mm7, 1
+    /*mm7=-L>R_i?2L:0*/
+    pand mm7, mm6
+    pxor mm6, mm6
+    /*mm2=-L>R_i?R_i+2L:R_i*/
+    paddw mm2, mm7
+    psubw mm6, mm0
+    /*mm5=-L>R_i||R_i>L?-R_i':0*/
+    pand mm5, mm2
+    movq mm7, mm0
+    /*mm2=-L>R_i||R_i>L?0:R_i*/
+    psubw mm2, mm5
+    psllw mm7, 1
+    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+    psubw mm2, mm5
+    movq mm5, mm3
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm6=-L>R_i?FF:00*/
+    pcmpgtw mm6, mm3
+    /*mm5=R_i>L?FF:00*/
+    pcmpgtw mm5, mm0
+    /*mm7=R_i>L?2L:0*/
+    pand mm7, mm5
+    /*mm3=R_i>L?R_i-2L:R_i*/
+    psubw mm3, mm7
+    psllw mm0, 1
+    /*mm5=-L>R_i||R_i>L*/
+    por mm5, mm6
+    /*mm0=-L>R_i?2L:0*/
+    pand mm0, mm6
+    /*mm3=-L>R_i?R_i+2L:R_i*/
+    paddw mm3, mm0
+    /*mm5=-L>R_i||R_i>L?-R_i':0*/
+    pand mm5, mm3
+    /*mm3=-L>R_i||R_i>L?0:R_i*/
+    psubw mm3, mm5
+    /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+    psubw mm3, mm5
+    /*Unfortunately, there's no unsigned byte+signed byte with unsigned
+      saturation opcode, so we have to promote things back to 16 bits.*/
+    pxor mm0, mm0
+    movq mm5, mm4
+    punpcklbw mm4, mm0
+    punpckhbw mm5, mm0
+    movq mm6, mm1
+    punpcklbw mm1, mm0
+    punpckhbw mm6, mm0
+    /*_pix[0...8+_ystride]+=R_i*/
+    paddw mm4, mm2
+    paddw mm5, mm3
+    /*_pix[0...8+_ystride*2]-=R_i*/
+    psubw mm1, mm2
+    psubw mm6, mm3
+    packuswb mm4, mm5
+    packuswb mm1, mm6
+    /*Write it back out.*/
+    movq [eax + edx], mm4
+    movq [eax + edx*2], mm1
+  }
+}
+
+/*This code implements the bulk of loop_filter_h().
+  Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
+  four p0's to one register we must transpose the values in four mmx regs.
+  When half is done we repeat this for the rest.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+  const ogg_int16_t *_ll){
+  /* todo: merge the comments from the GCC sources */
+  _asm {
+    mov ecx, [_pix]
+    mov edx, [_ystride]
+    mov eax, [_ll]
+    /*esi=_ystride*3*/
+    lea esi, [edx + edx*2]
+
+    movd mm0, dword ptr [ecx]
+    movd mm1, dword ptr [ecx + edx]
+    movd mm2, dword ptr [ecx + edx*2]
+    movd mm3, dword ptr [ecx + esi]
+    punpcklbw mm0, mm1
+    punpcklbw mm2, mm3
+    movq mm1, mm0
+    punpckhwd mm0, mm2
+    punpcklwd mm1, mm2
+    pxor mm7, mm7
+    movq mm5, mm1
+    punpcklbw mm1, mm7
+    punpckhbw mm5, mm7
+    movq mm3, mm0
+    punpcklbw mm0, mm7
+    punpckhbw mm3, mm7
+    psubw mm1, mm3
+    movq mm4, mm0
+    pcmpeqw mm2, mm2
+    psubw mm0, mm5
+    psrlw mm2, 14
+    pmullw mm0, mm2
+    psrlw mm2, 1
+    paddw mm0, mm1
+    psllw mm2, 2
+    paddw mm0, mm2
+    psraw mm0, 3
+    movq mm6, qword ptr [eax]
+    movq mm1, mm0
+    pxor mm2, mm2
+    movq mm3, mm6
+    psubw mm2, mm6
+    psllw mm3, 1
+    psllw mm2, 1
+    pcmpgtw mm3, mm0
+    pcmpgtw mm1, mm2
+    pand mm0, mm3
+    pand mm0, mm1
+    psraw mm2, 1
+    movq mm1, mm0
+    movq mm3, mm6
+    pcmpgtw mm2, mm0
+    pcmpgtw mm1, mm6
+    psllw mm3, 1
+    psllw mm6, 1
+    pand mm3, mm1
+    pand mm6, mm2
+    psubw mm0, mm3
+    por mm1, mm2
+    paddw mm0, mm6
+    pand mm1, mm0
+    psubw mm0, mm1
+    psubw mm0, mm1
+    paddw mm5, mm0
+    psubw mm4, mm0
+    packuswb mm5, mm7
+    packuswb mm4, mm7
+    punpcklbw mm5, mm4
+    movd edi, mm5
+    mov word ptr [ecx + 01H], di
+    psrlq mm5, 32
+    shr edi, 16
+    mov word ptr [ecx + edx + 01H], di
+    movd edi, mm5
+    mov word ptr [ecx + edx*2 + 01H], di
+    shr edi, 16
+    mov word ptr [ecx + esi + 01H], di
+  }
+}
+
+static void loop_filter_h(unsigned char *_pix,int _ystride,
+  const ogg_int16_t *_ll){
+  _pix-=2;
+  loop_filter_h4(_pix,_ystride,_ll);
+  loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
+}
+
+
+/*We copy the whole function because the MMX routines will be inlined 4
+  times, and we can do just a single emms call at the end this way.
+  We also do not use the _bv lookup table, instead computing the values that
+  would lie in it on the fly.*/
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row
+  of fragments, so this row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+  int _refi,int _pli,int _fragy0,int _fragy_end){
+  ogg_int16_t __declspec(align(8)) ll[4];
+  th_img_plane      *iplane;
+  oc_fragment_plane *fplane;
+  oc_fragment       *frag_top;
+  oc_fragment       *frag0;
+  oc_fragment       *frag;
+  oc_fragment       *frag_end;
+  oc_fragment       *frag0_end;
+  oc_fragment       *frag_bot;
+  ll[0]=ll[1]=ll[2]=ll[3]=
+   (ogg_int16_t)_state->loop_filter_limits[_state->qis[0]];
+  iplane=_state->ref_frame_bufs[_refi]+_pli;
+  fplane=_state->fplanes+_pli;
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+    it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3
+    chose the somewhat strange ordering used below.*/
+  frag_top=_state->frags+fplane->froffset;
+  frag0=frag_top+_fragy0*fplane->nhfrags;
+  frag0_end=frag0+(_fragy_end-_fragy0)*fplane->nhfrags;
+  frag_bot=_state->frags+fplane->froffset+fplane->nfrags;
+  while(frag0<frag0_end&&frag0<frag_bot){
+    frag=frag0;
+    frag_end=frag0+fplane->nhfrags;
+    while(frag<frag_end){
+      if(frag->coded){
+        if(frag>frag0){
+          loop_filter_h(frag->buffer[_refi],iplane->ystride,ll);
+        }
+        if(frag0>frag_top){
+          loop_filter_v(frag->buffer[_refi],iplane->ystride,ll);
+        }
+        if(frag+1<frag_end&&!(frag+1)->coded){
+          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll);
+        }
+        if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
+          loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
+           iplane->ystride,ll);
+        }
+      }
+      frag++;
+    }
+    frag0+=fplane->nhfrags;
+  }
+
+  /*This needs to be removed when decode-specific functions are
+    implemented:*/
+  _mm_empty();
+}
+
+#endif
+
Index: lib/dec/x86_vc/mmxstate.c
===================================================================
--- lib/dec/x86_vc/mmxstate.c	(revision 0)
+++ lib/dec/x86_vc/mmxstate.c	(revision 0)
@@ -0,0 +1,191 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id:
+
+ ********************************************************************/
+
+/* ------------------------------------------------------------------------
+   MMX acceleration of the complete fragment reconstruction algorithm.
+   Originally written by Rudolf Marek.
+
+   Conversion to MSC intrinsics by Nils Pipenbrinck.
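+
+   Note: the MMX iDCT consumes its coefficients in its own interleaved
+   layout, so dequantization in oc_state_frag_recon_mmx() scatters them
+   through the OC_FZIG_ZAGMMX table below rather than the plain
+   OC_FZIG_ZAG raster de-zigzag.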
+   ---------------------------------------------------------------------*/
+#if defined(USE_ASM)
+
+#include "../../internal.h"
+#include "../idct.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+static const unsigned char OC_FZIG_ZAGMMX[64]=
+{
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63
+};
+
+/* Fill a block with a single value */
+static __inline void loc_fill_mmx_value (__m64 * _dst, __m64 _value){
+  __m64 t = _value;
+  _dst[0]  = t; _dst[1]  = t; _dst[2]  = t; _dst[3]  = t;
+  _dst[4]  = t; _dst[5]  = t; _dst[6]  = t; _dst[7]  = t;
+  _dst[8]  = t; _dst[9]  = t; _dst[10] = t; _dst[11] = t;
+  _dst[12] = t; _dst[13] = t; _dst[14] = t; _dst[15] = t;
+}
+
+/* Copy an 8x8 block of bytes between two possibly different strides */
+static __inline void loc_blockcopy_mmx (unsigned char * _dst,
+  int _dst_ystride, unsigned char * _src, int _src_ystride){
+  __m64 a,b,c,d,e,f,g,h;
+  a = *(__m64*)(_src + 0 * _src_ystride);
+  b = *(__m64*)(_src + 1 * _src_ystride);
+  c = *(__m64*)(_src + 2 * _src_ystride);
+  d = *(__m64*)(_src + 3 * _src_ystride);
+  e = *(__m64*)(_src + 4 * _src_ystride);
+  f = *(__m64*)(_src + 5 * _src_ystride);
+  g = *(__m64*)(_src + 6 * _src_ystride);
+  h = *(__m64*)(_src + 7 * _src_ystride);
+  *(__m64*)(_dst + 0 * _dst_ystride) = a;
+  *(__m64*)(_dst + 1 * _dst_ystride) = b;
+  *(__m64*)(_dst + 2 * _dst_ystride) = c;
+  *(__m64*)(_dst + 3 * _dst_ystride) = d;
+  *(__m64*)(_dst + 4 * _dst_ystride) = e;
+  *(__m64*)(_dst + 5 * _dst_ystride) = f;
+  *(__m64*)(_dst + 6 * _dst_ystride) = g;
+  *(__m64*)(_dst + 7 * _dst_ystride) = h;
+}
+
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+  ogg_int16_t __declspec(align(16)) res_buf[64];
+  int dst_framei;
+  int dst_ystride;
+  int zzi;
+  /*_last_zzi is subtly different from an actual count of the number of
+    coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+    decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from
+    a previous block counts), and so this is the same as the coefficient
+    count.
+    However, in the case that the last token was NOT an EOB token, but
+    filled the block up with exactly 64 coefficients, _last_zzi will be less
+    than 64.
+    Provided the last token was not a pure zero run, the minimum value it
+    can be is 46, and so that doesn't affect any of the cases in this
+    routine.
+    However, if the last token WAS a pure zero run of length 63, then
+    _last_zzi will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+    coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+    but we still process the DC coefficient, which might have a non-zero
+    value due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows
+    us to dequantize fewer coefficients and use a smaller transform when
+    the block ends with a long zero run instead of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a
+    block will fool it, but an encoder that generates these really deserves
+    what it gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    __m64 p;
+    /*Why is the iquant product rounded in this case and no others?
+      Who knows.*/
+    p = _m_from_int((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
+    /* broadcast the 16-bit value into all 4 mmx subregisters */
+    p = _m_punpcklwd (p,p);
+    p = _m_punpckldq (p,p);
+    loc_fill_mmx_value ((__m64 *)res_buf, p);
+  }
+  else{
+    /*Then, fill in the remainder of the coefficients with 0's, and perform
+      the iDCT.*/
+    /*First zero the buffer.*/
+    /*On K7, etc., this could be replaced with movntq and sfence.*/
+    loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64());
+
+    res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
+    /*This is planned to be rewritten in MMX.*/
+    for(zzi=1;zzi<_ncoefs;zzi++)
+    {
+      int ci;
+      ci=OC_FZIG_ZAG[zzi];
+      res_buf[OC_FZIG_ZAGMMX[zzi]]=
+       (ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*_ac_iquant[ci]);
+    }
+
+    if(_last_zzi<10){
+      oc_idct8x8_10_mmx(res_buf);
+    }
+    else {
+      oc_idct8x8_mmx(res_buf);
+    }
+  }
+  /*Fill in the target buffer.*/
+  dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  /*For now, ystride values in all ref frames are assumed to be equal.*/
+  if(_frag->mbmode==OC_MODE_INTRA){
+    oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf);
+  }
+  else{
+    int ref_framei;
+    int ref_ystride;
+    int mvoffset0;
+    int mvoffset1;
+    ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
+    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
+    if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
+     _frag->mv[1],ref_ystride,_pli)>1){
+      oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
+       _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
+    }
+    else{
+      oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
+    }
+  }
+
+  _mm_empty();
+}
+
+
+void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
+  int _nfragis,int _dst_frame,int _src_frame,int _pli){
+  const int *fragi;
+  const int *fragi_end;
+  int dst_framei;
+  int dst_ystride;
+  int src_framei;
+  int src_ystride;
+  dst_framei=_state->ref_frame_idx[_dst_frame];
+  src_framei=_state->ref_frame_idx[_src_frame];
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
+  fragi_end=_fragis+_nfragis;
+  for(fragi=_fragis;fragi<fragi_end;fragi++){
+    oc_fragment *frag;
+    frag=_state->frags+*fragi;
+    loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride,
+      frag->buffer[src_framei], src_ystride);
+  }
+  _m_empty();
+}
+
+#endif
+
Index: lib/dec/x86_vc/x86int.h
===================================================================
--- lib/dec/x86_vc/x86int.h	(revision 0)
+++ lib/dec/x86_vc/x86int.h	(revision 0)
@@ -0,0 +1,49 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: x86int.h 13884 2007-09-22 08:38:10Z giles $
+
+ ********************************************************************/
+
+#if !defined(_x86_x86int_vc_H)
+# define _x86_x86int_vc_H (1)
+# include "../../internal.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state);
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+  const ogg_int16_t *_residue);
+
+void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
+  const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
+  const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+  int _src2_ystride,const ogg_int16_t *_residue);
+
+void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
+  int _nfragis,int _dst_frame,int _src_frame,int _pli);
+
+void oc_restore_fpu_mmx(void);
+
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+
+void oc_idct8x8_mmx(ogg_int16_t _y[64]);
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]);
+
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+  int _refi,int _pli,int _fragy0,int _fragy_end);
+
+#endif
Index: lib/dec/x86_vc/x86state.c
===================================================================
--- lib/dec/x86_vc/x86state.c	(revision 0)
+++ lib/dec/x86_vc/x86state.c	(revision 0)
@@ -0,0 +1,42 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: x86state.c 13884 2007-09-22 08:38:10Z giles $
+
+ ********************************************************************/
+
+#if defined(USE_ASM)
+
+#include "x86int.h"
+#include "../../cpu.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state){
+  _state->cpu_flags=oc_cpu_flags_get();
+
+  /* fill with defaults */
+  oc_state_vtable_init_c(_state);
+
+  /* patch MMX functions */
+  if(_state->cpu_flags&OC_CPU_X86_MMX){
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
+    _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmx;
+  }
+}
+
+#endif
+
Index: lib/enc/dct_decode.c
===================================================================
--- lib/enc/dct_decode.c	(revision 14319)
+++ lib/enc/dct_decode.c	(working copy)
@@ -1309,8 +1309,11 @@
   funcs->FilterVert = FilterVert__c;
   funcs->FilterHoriz = FilterHoriz__c;
 #if defined(USE_ASM)
+  // Todo: Port the dct for MSC one day.
+#if !defined (_MSC_VER)
   if (cpu_flags & OC_CPU_X86_MMX) {
     dsp_mmx_dct_decode_init(funcs);
   }
 #endif
+#endif
 }
Index: lib/enc/encoder_idct.c
===================================================================
--- lib/enc/encoder_idct.c	(revision 14319)
+++ lib/enc/encoder_idct.c	(working copy)
@@ -562,8 +562,11 @@
   funcs->IDct10 = IDct10__c;
   funcs->IDct3 = IDct10__c;
 #if defined(USE_ASM)
+  // todo: make mmx encoder idct for MSC one day...
+#if !defined (_MSC_VER)
   if (cpu_flags & OC_CPU_X86_MMX) {
     dsp_mmx_idct_init(funcs);
   }
 #endif
+#endif
 }
Index: lib/internal.h
===================================================================
--- lib/internal.h	(revision 14319)
+++ lib/internal.h	(working copy)
@@ -40,6 +40,7 @@
 /*Thank you Microsoft, I know the order of operations.*/
 # if defined(_MSC_VER)
 # pragma warning(disable:4554)
+# pragma warning(disable:4799) /* disable missing EMMS warnings */
 # endif
 
 /*This library's version.*/
@@ -497,15 +498,4 @@
   oc_state_granule_time_func granule_time;
 };
 
-#if defined(_MSC_VER) && !defined(TH_REALLY_NO_ASSEMBLY)
-# error You are compiling theora without inline assembly.\
- This is probably not what you want. Instead, please either\
- (1) download the assembly .lib binaries or\
- (2) compile them yourself using MinGW, and make Visual Studio\
- link against them.\
- Please seriously consider this before defining TH_REALLY_NO_ASSEMBLY\
- to disable this message and compile without inline assembly.\
- Thank you!
 #endif
-
-#endif
Index: win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- win32/VS2005/libtheora/libtheora.vcproj	(revision 14319)
+++ win32/VS2005/libtheora/libtheora.vcproj	(working copy)
@@ -42,7 +42,7 @@
 			Name="VCCLCompilerTool"
 			Optimization="0"
 			AdditionalIncludeDirectories="..\..\..\include;..\..\..\lib;..\..\..\lib\enc;..\..\..\..\libogg\include;"
-			PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;"
+			PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;USE_ASM"
 			MinimalRebuild="true"
 			BasicRuntimeChecks="3"
 			RuntimeLibrary="1"
@@ -129,7 +129,7 @@
 			EnableIntrinsicFunctions="true"
 			FavorSizeOrSpeed="1"
 			AdditionalIncludeDirectories="..\..\..\include;..\..\..\lib;..\..\..\lib\enc;..\..\..\..\libogg\include;"
-			PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;"
+			PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;USE_ASM"
 			StringPooling="true"
 			ExceptionHandling="0"
 			RuntimeLibrary="0"
@@ -1477,26 +1477,314 @@
 [This hunk adds roughly 300 lines of project XML registering the new
  lib\dec\x86_vc source files with the VS2005 project; the XML elements
  themselves are not recoverable here.]