/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: mmxfrag.c 14154 2007-11-15 14:40:39Z tterribe $

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.*/

/*-------------------------------------------------------------------
  Optimized fragment reconstruction for motion compensation for the
  Theora codec.

  MMX optimization by Nils Pipenbrinck.

  Note: loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments
  as #i.
 -------------------------------------------------------------------*/
#include "x86int.h"

#if defined(USE_ASM)

void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
 const ogg_int16_t *_residue){
  __asm__ __volatile__(
    "pcmpeqw    %%mm0,%%mm0\n\t"               /* 0xFFFFFFFFFFFFFFFF */
    "movq  0*8(%[residue]),%%mm1\n\t"          /* #1 load low residue */
    "movq  1*8(%[residue]),%%mm2\n\t"          /* #1 load high residue */
    "psllw      $15,%%mm0\n\t"                 /* 0x8000800080008000 */
    "movq  2*8(%[residue]),%%mm3\n\t"          /* #2 load low residue */
    "movq  3*8(%[residue]),%%mm4\n\t"          /* #2 load high residue */
    "psrlw      $8,%%mm0\n\t"                  /* 0x0080008000800080 */
    "movq  4*8(%[residue]),%%mm5\n\t"          /* #3 load low residue */
    "movq  5*8(%[residue]),%%mm6\n\t"          /* #3 load high residue */
    "paddsw     %%mm0,%%mm1\n\t"               /* #1 bias low residue */
    "paddsw     %%mm0,%%mm2\n\t"               /* #1 bias high residue */
    "packuswb   %%mm2,%%mm1\n\t"               /* #1 pack to byte */
    "paddsw     %%mm0,%%mm3\n\t"               /* #2 bias low residue */
    "paddsw     %%mm0,%%mm4\n\t"               /* #2 bias high residue */
    "packuswb   %%mm4,%%mm3\n\t"               /* #2 pack to byte */
    "paddsw     %%mm0,%%mm5\n\t"               /* #3 bias low residue */
    "paddsw     %%mm0,%%mm6\n\t"               /* #3 bias high residue */
    "packuswb   %%mm6,%%mm5\n\t"               /* #3 pack to byte */
    "movq       %%mm1,(%[dst])\n\t"            /* #1 write row */
    "movq       %%mm3,(%[dst],%[stride])\n\t"  /* #2 write row */
    "movq       %%mm5,(%[dst],%[stride],2)\n\t"/* #3 write row */
    "movq  6*8(%[residue]),%%mm1\n\t"          /* #4 load low residue */
    "movq  7*8(%[residue]),%%mm2\n\t"          /* #4 load high residue */
    "movq  8*8(%[residue]),%%mm3\n\t"          /* #5 load low residue */
    "movq  9*8(%[residue]),%%mm4\n\t"          /* #5 load high residue */
    "movq 10*8(%[residue]),%%mm5\n\t"          /* #6 load low residue */
    "movq 11*8(%[residue]),%%mm6\n\t"          /* #6 load high residue */
    "paddsw     %%mm0,%%mm1\n\t"               /* #4 bias low residue */
    "paddsw     %%mm0,%%mm2\n\t"               /* #4 bias high residue */
    "packuswb   %%mm2,%%mm1\n\t"               /* #4 pack to byte */
    "paddsw     %%mm0,%%mm3\n\t"               /* #5 bias low residue */
    "paddsw     %%mm0,%%mm4\n\t"               /* #5 bias high residue */
    "packuswb   %%mm4,%%mm3\n\t"               /* #5 pack to byte */
    "paddsw     %%mm0,%%mm5\n\t"               /* #6 bias low residue */
    "paddsw     %%mm0,%%mm6\n\t"               /* #6 bias high residue */
    "packuswb   %%mm6,%%mm5\n\t"               /* #6 pack to byte */
    "movq       %%mm1,(%[dst],%[by3])\n\t"     /* #4 write row */
    "movq       %%mm3,(%[dst],%[stride],4)\n\t"/* #5 write row */
    "movq       %%mm5,(%[dst],%[by5])\n\t"     /* #6 write row */
    "movq 12*8(%[residue]),%%mm1\n\t"          /* #7 load low residue */
    "movq 13*8(%[residue]),%%mm2\n\t"          /* #7 load high residue */
    "movq 14*8(%[residue]),%%mm3\n\t"          /* #8 load low residue */
    "movq 15*8(%[residue]),%%mm4\n\t"          /* #8 load high residue */
    "paddsw     %%mm0,%%mm1\n\t"               /* #7 bias low residue */
    "paddsw     %%mm0,%%mm2\n\t"               /* #7 bias high residue */
    "packuswb   %%mm2,%%mm1\n\t"               /* #7 pack to byte */
    "paddsw     %%mm0,%%mm3\n\t"               /* #8 bias low residue */
    "paddsw     %%mm0,%%mm4\n\t"               /* #8 bias high residue */
    "packuswb   %%mm4,%%mm3\n\t"               /* #8 pack to byte */
    "movq       %%mm1,(%[dst],%[by3],2)\n\t"   /* #7 write row */
    "movq       %%mm3,(%[dst],%[by7])\n\t"     /* #8 write row */
    :
    /* The strides are cast to long so they fill a full register and can be
       used as index operands on 64-bit targets. */
    :[dst]"r"(_dst),[residue]"r"(_residue),
     [stride]"r"((long)_dst_ystride),
     [by3]"r"((long)_dst_ystride*3),
     [by5]"r"((long)_dst_ystride*5),
     [by7]"r"((long)_dst_ystride*7)
    :"memory"
  );
}
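
/*For reference, a plain-C sketch of the operation the routine above
  implements: bias each residue value by 128 (the 0x0080 words added with
  paddsw) and saturate the result to [0,255] (packuswb). The block is
  compiled out and the function name is illustrative, not part of the
  library API.*/
#if 0
static void frag_recon_intra_ref(unsigned char *_dst,int _dst_ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=_residue[i*8+j]+128;          /* bias */
      _dst[j]=p<0?0:p>255?255:p;      /* saturate to byte */
    }
    _dst+=_dst_ystride;
  }
}
#endif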

void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue){
  int counter;
  counter=4;
  __asm__ __volatile__(
    "pxor       %%mm0,%%mm0\n\t"               /* generate zero */
    ".p2align 4,,15\n\t"
    "L%=:\n\t"
    "movq       (%[src]),%%mm3\n\t"            /* #1 load source */
    "movq       (%[src],%[s_stride]),%%mm7\n\t"/* #2 load source */
    "movq       %%mm3,%%mm4\n\t"               /* #1 get copy of src */
    "punpckhbw  %%mm0,%%mm4\n\t"               /* #1 expand high source */
    "punpcklbw  %%mm0,%%mm3\n\t"               /* #1 expand low source */
    "paddsw     8(%[residue]),%%mm4\n\t"       /* #1 add residue high */
    "movq       %%mm7,%%mm2\n\t"               /* #2 get copy of src */
    "paddsw     (%[residue]),%%mm3\n\t"        /* #1 add residue low */
    "punpckhbw  %%mm0,%%mm2\n\t"               /* #2 expand high source */
    "packuswb   %%mm4,%%mm3\n\t"               /* #1 final row pixels */
    "punpcklbw  %%mm0,%%mm7\n\t"               /* #2 expand low source */
    "paddsw     16(%[residue]),%%mm7\n\t"      /* #2 add residue low */
    "paddsw     24(%[residue]),%%mm2\n\t"      /* #2 add residue high */
    "sub        $1,%[counter]\n\t"             /* update loop counter */
    "lea        32(%[residue]),%[residue]\n\t" /* residue += 2 rows */
    "packuswb   %%mm2,%%mm7\n\t"               /* #2 final row pixels */
    "lea        (%[src],%[s_stride],2),%[src]\n\t" /* src += stride*2 */
    "movq       %%mm3,(%[dst])\n\t"            /* #1 write row */
    "movq       %%mm7,(%[dst],%[d_stride])\n\t"/* #2 write row */
    "lea        (%[dst],%[d_stride],2),%[dst]\n\t" /* dst += stride*2 */
    "jne L%=\n\t"
    /* The loop modifies these operands, so they are declared read-write
       ("+r") rather than input-only. */
    :[counter]"+r"(counter),[src]"+r"(_src),[dst]"+r"(_dst),
     [residue]"+r"(_residue)
    :[s_stride]"r"((long)_src_ystride),[d_stride]"r"((long)_dst_ystride)
    :"memory"
  );
}
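
/*For reference, a plain-C sketch of the inter reconstruction above: add the
  residue to the motion-compensated source and saturate to [0,255]. Compiled
  out; the function name is illustrative only.*/
#if 0
static void frag_recon_inter_ref(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=_src[j]+_residue[i*8+j];      /* source + residue */
      _dst[j]=p<0?0:p>255?255:p;      /* saturate to byte */
    }
    _dst+=_dst_ystride;
    _src+=_src_ystride;
  }
}
#endif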

void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
 int _src2_ystride,const ogg_int16_t *_residue){
  int counter;
  counter=4;
  if(_dst_ystride==_src1_ystride&&_dst_ystride==_src2_ystride){
    /*Fast path: all three strides are equal.*/
    __asm__ __volatile__(
      "pxor       %%mm7,%%mm7\n\t"                 /* generate zero */
      ".p2align 4,,15\n\t"
      "L%=:\n\t"
      "movq       (%[src1]),%%mm0\n\t"             /* #1 load source1 */
      "movq       (%[src2]),%%mm2\n\t"             /* #1 load source2 */
      "movq       %%mm0,%%mm1\n\t"                 /* #1 copy of source1 */
      "movq       %%mm2,%%mm3\n\t"                 /* #1 copy of source2 */
      "movq       (%[src1],%[stride]),%%mm4\n\t"   /* #2 load source1 */
      "punpcklbw  %%mm7,%%mm0\n\t"                 /* #1 lower src 1 */
      "movq       (%[src2],%[stride]),%%mm5\n\t"   /* #2 load source2 */
      "punpckhbw  %%mm7,%%mm1\n\t"                 /* #1 higher src 1 */
      "punpcklbw  %%mm7,%%mm2\n\t"                 /* #1 lower src 2 */
      "punpckhbw  %%mm7,%%mm3\n\t"                 /* #1 higher src 2 */
      "lea        (%[src1],%[stride],2),%[src1]\n\t" /* advance src1 ptr */
      "lea        (%[src2],%[stride],2),%[src2]\n\t" /* advance src2 ptr */
      "paddsw     %%mm2,%%mm0\n\t"                 /* #1 lo src1 + src2 */
      "paddsw     %%mm3,%%mm1\n\t"                 /* #1 hi src1 + src2 */
      "movq       %%mm4,%%mm2\n\t"                 /* #2 copy of source1 */
      "psraw      $1,%%mm0\n\t"                    /* #1 build lo average */
      "movq       %%mm5,%%mm3\n\t"                 /* #2 copy of source2 */
      "punpcklbw  %%mm7,%%mm4\n\t"                 /* #2 lower src 1 */
      "psraw      $1,%%mm1\n\t"                    /* #1 build hi average */
      "punpckhbw  %%mm7,%%mm2\n\t"                 /* #2 higher src 1 */
      "paddsw     (%[residue]),%%mm0\n\t"          /* #1 low + residue */
      "punpcklbw  %%mm7,%%mm5\n\t"                 /* #2 lower src 2 */
      "paddsw     8(%[residue]),%%mm1\n\t"         /* #1 high + residue */
      "punpckhbw  %%mm7,%%mm3\n\t"                 /* #2 higher src 2 */
      "paddsw     %%mm4,%%mm5\n\t"                 /* #2 lower src1 + src2 */
      "packuswb   %%mm1,%%mm0\n\t"                 /* #1 pack and saturate */
      "paddsw     %%mm2,%%mm3\n\t"                 /* #2 higher src1 + src2 */
      "movq       %%mm0,(%[dst])\n\t"              /* #1 write row */
      "psraw      $1,%%mm5\n\t"                    /* #2 build lo average */
      "psraw      $1,%%mm3\n\t"                    /* #2 build hi average */
      "paddsw     16(%[residue]),%%mm5\n\t"        /* #2 low + residue */
      "paddsw     24(%[residue]),%%mm3\n\t"        /* #2 high + residue */
      "packuswb   %%mm3,%%mm5\n\t"                 /* #2 pack and saturate */
      "movq       %%mm5,(%[dst],%[stride])\n\t"    /* #2 write row */
      "add        $32,%[residue]\n\t"              /* advance residue */
      "sub        $1,%[counter]\n\t"               /* decrement counter */
      "lea        (%[dst],%[stride],2),%[dst]\n\t" /* advance dest */
      "jne L%=\n\t"
      /* The loop modifies these operands, so they are read-write ("+r"). */
      :[counter]"+r"(counter),[src1]"+r"(_src1),[src2]"+r"(_src2),
       [dst]"+r"(_dst),[residue]"+r"(_residue)
      :[stride]"r"((long)_dst_ystride)
      :"memory"
    );
  }
  else{
    /*Slightly slower version for distinct strides. This codepath is rarely
      taken, but is kept for compatibility with the old code.*/
    long src1_ystride;
    long src2_ystride;
    src1_ystride=_src1_ystride;
    src2_ystride=_src2_ystride;
    __asm__ __volatile__(
      "pxor       %%mm7,%%mm7\n\t"                 /* generate zero */
      ".p2align 4,,15\n\t"
      "L%=:\n\t"
      "movq       (%[src1]),%%mm0\n\t"             /* #1 load source1 */
      "movq       (%[src2]),%%mm2\n\t"             /* #1 load source2 */
      "add        %[src1_ystride],%[src1]\n\t"     /* #1 src1 += stride */
      "add        %[src2_ystride],%[src2]\n\t"     /* #1 src2 += stride */
      "movq       %%mm0,%%mm1\n\t"                 /* #1 copy of source1 */
      "movq       %%mm2,%%mm3\n\t"                 /* #1 copy of source2 */
      "movq       (%[src1]),%%mm4\n\t"             /* #2 load source1 */
      "punpcklbw  %%mm7,%%mm0\n\t"                 /* #1 lower src 1 */
      "movq       (%[src2]),%%mm5\n\t"             /* #2 load source2 */
      "punpckhbw  %%mm7,%%mm1\n\t"                 /* #1 higher src 1 */
      "punpcklbw  %%mm7,%%mm2\n\t"                 /* #1 lower src 2 */
      "add        %[src1_ystride],%[src1]\n\t"     /* #2 src1 += stride */
      "add        %[src2_ystride],%[src2]\n\t"     /* #2 src2 += stride */
      "punpckhbw  %%mm7,%%mm3\n\t"                 /* #1 higher src 2 */
      "paddsw     %%mm2,%%mm0\n\t"                 /* #1 lo src1 + src2 */
      "paddsw     %%mm3,%%mm1\n\t"                 /* #1 hi src1 + src2 */
      "movq       %%mm4,%%mm2\n\t"                 /* #2 copy of source1 */
      "psraw      $1,%%mm0\n\t"                    /* #1 build lo average */
      "movq       %%mm5,%%mm3\n\t"                 /* #2 copy of source2 */
      "punpcklbw  %%mm7,%%mm4\n\t"                 /* #2 lower src 1 */
      "psraw      $1,%%mm1\n\t"                    /* #1 build hi average */
      "punpckhbw  %%mm7,%%mm2\n\t"                 /* #2 higher src 1 */
      "paddsw     (%[residue]),%%mm0\n\t"          /* #1 low + residue */
      "punpcklbw  %%mm7,%%mm5\n\t"                 /* #2 lower src 2 */
      "paddsw     8(%[residue]),%%mm1\n\t"         /* #1 high + residue */
      "punpckhbw  %%mm7,%%mm3\n\t"                 /* #2 higher src 2 */
      "paddsw     %%mm4,%%mm5\n\t"                 /* #2 lower src1 + src2 */
      "packuswb   %%mm1,%%mm0\n\t"                 /* #1 pack and saturate */
      "paddsw     %%mm2,%%mm3\n\t"                 /* #2 higher src1 + src2 */
      "movq       %%mm0,(%[dst])\n\t"              /* #1 write row */
      "psraw      $1,%%mm5\n\t"                    /* #2 build lo average */
      "psraw      $1,%%mm3\n\t"                    /* #2 build hi average */
      "paddsw     16(%[residue]),%%mm5\n\t"        /* #2 low + residue */
      "paddsw     24(%[residue]),%%mm3\n\t"        /* #2 high + residue */
      "packuswb   %%mm3,%%mm5\n\t"                 /* #2 pack and saturate */
      "movq       %%mm5,(%[dst],%[dst_stride])\n\t"/* #2 write row */
      "add        $32,%[residue]\n\t"              /* advance residue */
      "sub        $1,%[counter]\n\t"               /* decrement counter */
      "lea        (%[dst],%[dst_stride],2),%[dst]\n\t" /* advance dest */
      "jne L%=\n\t"
      /* The loop modifies these operands, so they are read-write ("+r"). */
      :[counter]"+r"(counter),[src1]"+r"(_src1),[src2]"+r"(_src2),
       [dst]"+r"(_dst),[residue]"+r"(_residue)
      :[dst_stride]"r"((long)_dst_ystride),
       [src1_ystride]"m"(src1_ystride),
       [src2_ystride]"m"(src2_ystride)
      :"memory"
    );
  }
}
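
/*For reference, a plain-C sketch of the bidirectional reconstruction above:
  average the two sources with truncation ((a+b)>>1, matching psraw $1), add
  the residue, and saturate to [0,255]. Compiled out; the function name is
  illustrative only.*/
#if 0
static void frag_recon_inter2_ref(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
 int _src2_ystride,const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=((_src1[j]+_src2[j])>>1)+_residue[i*8+j];
      _dst[j]=p<0?0:p>255?255:p;      /* saturate to byte */
    }
    _dst+=_dst_ystride;
    _src1+=_src1_ystride;
    _src2+=_src2_ystride;
  }
}
#endif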
[src2_ystride]"m"((long)_src2_ystride) :"memory"); } } void oc_restore_fpu_mmx(void){ __asm__ __volatile__("emms\n\t"); } #endif