Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members

X86.c

Go to the documentation of this file.
00001 /*
00002 
00003     x86 specific optimized assembler dsp routines
00004     Copyright (C) 2001-2004 Jussi Laako
00005 
00006     This program is free software; you can redistribute it and/or modify
00007     it under the terms of the GNU General Public License as published by
00008     the Free Software Foundation; either version 2 of the License, or
00009     (at your option) any later version.
00010 
00011     This program is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014     GNU General Public License for more details.
00015 
00016     You should have received a copy of the GNU General Public License
00017     along with this program; if not, write to the Free Software
00018     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019 
00020 */
00021 
00022 
00023 #ifdef DSP_X86
00024 
00025 
00026 #include <stdio.h>
00027 #include <string.h>
00028 #include <limits.h>
00029 #include <math.h>
00030 #include <float.h>
00031 
00032 #include "dsp/X86.h"
00033 
00034 
00035 #ifndef DSP_X86_64
00036 static char cpCPUid[13]; /* 12 CPUID vendor bytes plus terminating NUL; written by dsp_x86_cpuid() */
00037 #endif
00038 
00039 
00040 #ifdef __cplusplus
00041 extern "C"
00042 {
00043 #endif
00044 
00045 
00046 #ifndef DSP_X86_64
00047 const char *dsp_x86_cpuid ()
00048 {
00049     unsigned int *ipCPUid = (unsigned int *) cpCPUid;
00050     
00051     X86_ASM (
00052         "pushl %%eax\n\t" \
00053         "pushl %%ebx\n\t" \
00054         "pushl %%ecx\n\t" \
00055         "pushl %%edx\n\t" \
00056         "xorl %%eax, %%eax\n\t" \
00057         "cpuid\n\t" \
00058         "movl %%ebx, %0\n\t" \
00059         "movl %%ecx, %2\n\t" \
00060         "movl %%edx, %1\n\t" \
00061         "popl %%edx\n\t" \
00062         "popl %%ecx\n\t" \
00063         "popl %%ebx\n\t" \
00064         "popl %%eax\n\t"
00065         : "=m" (ipCPUid[0]),
00066           "=m" (ipCPUid[1]),
00067           "=m" (ipCPUid[2])
00068         :
00069         : "eax", "ebx", "ecx", "edx", "memory");
00070     cpCPUid[12] = '\0';
00071 
00072     return cpCPUid;
00073 }
00074 
00075 
00076 unsigned int dsp_x86_features ()
00077 {
00078     unsigned int uiFeatures = 0;
00079     
00080     X86_ASM (
00081         "pushl %%eax\n\t" \
00082         "pushl %%ebx\n\t" \
00083         "pushl %%ecx\n\t" \
00084         "pushl %%edx\n\t" \
00085         "movl $1, %%eax\n\t" \
00086         "cpuid\n\t" \
00087         "movl %%edx, %0\n\t" \
00088         "popl %%edx\n\t" \
00089         "popl %%ecx\n\t" \
00090         "popl %%ebx\n\t" \
00091         "popl %%eax\n\t"
00092         : "=m" (uiFeatures)
00093         :
00094         : "eax", "ebx", "ecx", "edx", "memory");
00095     
00096     return uiFeatures;
00097 }
00098 
00099 
00100 unsigned int dsp_x86_amd_features ()
00101 {
00102     unsigned int uiFunction = 0x80000001;
00103     unsigned int uiFeatures = 0;
00104     
00105     X86_ASM (
00106         "pushl %%eax\n\t" \
00107         "pushl %%ebx\n\t" \
00108         "pushl %%ecx\n\t" \
00109         "pushl %%edx\n\t" \
00110         "movl %1, %%eax\n\t" \
00111         "cpuid\n\t" \
00112         "movl %%edx, %0\n\t" \
00113         "popl %%edx\n\t" \
00114         "popl %%ecx\n\t" \
00115         "popl %%ebx\n\t" \
00116         "popl %%eax\n\t"
00117         : "=m" (uiFeatures)
00118         : "m" (uiFunction)
00119         : "eax", "ebx", "ecx", "edx", "memory");
00120     
00121     return uiFeatures;
00122 }
00123 #endif
00124 
00125 
/*
    Report whether the extended 3DNow! code paths may be used.

    On 32-bit builds the CPU must identify itself as "AuthenticAMD" and have
    both bit 31 and bit 30 set in the EDX word of extended CPUID function
    0x80000001 (3DNow! and extended 3DNow! in AMD's CPUID specification).
    Returns 1 when available, 0 otherwise.

    NOTE(review): with DSP_X86_64 defined the function returns 1 without
    probing the processor - confirm that this is still intended.
*/
extern int dsp_x86_have_e3dnow ()
{
    #ifndef DSP_X86_64
    /* Unsigned constants: shifting a signed 1 into bit 31 is undefined. */
    const unsigned int uiRequired = (1u << 31) | (1u << 30);

    if (strcmp(dsp_x86_cpuid(), "AuthenticAMD") != 0)
        return 0;
    if ((dsp_x86_amd_features() & uiRequired) == uiRequired)
        return 1;
    return 0;
    #else
    return 1;
    #endif
}
00142 
00143 
/*
    Report whether the SSE/SSE2 code paths may be used.

    On 32-bit builds both bit 25 and bit 26 of the word returned by
    dsp_x86_features() must be set.  With DSP_X86_64 defined the answer is
    always 1.  Returns 1 when available, 0 otherwise.
*/
extern int dsp_x86_have_sse2 ()
{
    #ifdef DSP_X86_64
    return 1;
    #else
    const unsigned int uiRequired = (1 << 25) | (1 << 26);

    return ((dsp_x86_features() & uiRequired) == uiRequired) ? 1 : 0;
    #endif
}
00157 
00158 
00159 void dsp_x86_3dnow_copyf (float *fpDest, const float *fpSrc, int iDataLength)
00160 {
00161     int iStartIdx;
00162     int iDataCntr;
00163     int iDataCount;
00164     stpm64 m64pDest = (stpm64) fpDest;
00165     stpm64 m64pSrc = (stpm64) fpSrc;
00166     
00167     iStartIdx = 0;
00168     X86_ASM (
00169         "prefetchnta %0\n\t" \
00170         "prefetchnta %1\n\t" \
00171         "prefetchnta %2\n\t" \
00172         "prefetchnta %3\n\t"
00173         :
00174         : "m" (m64pSrc[0]),
00175           "m" (m64pSrc[8]),
00176           "m" (m64pSrc[16]),
00177           "m" (m64pSrc[24]));
00178     iDataCount = ((iDataLength & 0xfffffff0) >> 1);
00179     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00180     {
00181         X86_ASM (
00182             "prefetchnta %16\n\t" \
00183             "movq %8, %%mm0\n\t" \
00184             "movq %9, %%mm1\n\t" \
00185             "movq %10, %%mm2\n\t" \
00186             "movq %11, %%mm3\n\t" \
00187             "movq %12, %%mm4\n\t" \
00188             "movq %13, %%mm5\n\t" \
00189             "movq %14, %%mm6\n\t" \
00190             "movq %15, %%mm7\n\t" \
00191             "movntq %%mm0, %0\n\t" \
00192             "movntq %%mm1, %1\n\t" \
00193             "movntq %%mm2, %2\n\t" \
00194             "movntq %%mm3, %3\n\t" \
00195             "movntq %%mm4, %4\n\t" \
00196             "movntq %%mm5, %5\n\t" \
00197             "movntq %%mm6, %6\n\t" \
00198             "movntq %%mm7, %7\n\t"
00199             : "=m" (m64pDest[iDataCntr]),
00200               "=m" (m64pDest[iDataCntr + 1]),
00201               "=m" (m64pDest[iDataCntr + 2]),
00202               "=m" (m64pDest[iDataCntr + 3]),
00203               "=m" (m64pDest[iDataCntr + 4]),
00204               "=m" (m64pDest[iDataCntr + 5]),
00205               "=m" (m64pDest[iDataCntr + 6]),
00206               "=m" (m64pDest[iDataCntr + 7])
00207             : "m" (m64pSrc[iDataCntr]),
00208               "m" (m64pSrc[iDataCntr + 1]),
00209               "m" (m64pSrc[iDataCntr + 2]),
00210               "m" (m64pSrc[iDataCntr + 3]),
00211               "m" (m64pSrc[iDataCntr + 4]),
00212               "m" (m64pSrc[iDataCntr + 5]),
00213               "m" (m64pSrc[iDataCntr + 6]),
00214               "m" (m64pSrc[iDataCntr + 7]),
00215               "m" (m64pSrc[iDataCntr + 32])
00216             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00217     }
00218     iStartIdx = iDataCount;
00219     iDataCount = ((iDataLength & 0xfffffffe) >> 1);
00220     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00221     {
00222         X86_ASM (
00223             "prefetchnta %2\n\t" \
00224             "movq %1, %%mm0\n\t" \
00225             "movntq %%mm0, %0\n\t"
00226             : "=m" (m64pDest[iDataCntr])
00227             : "m" (m64pSrc[iDataCntr]),
00228               "m" (m64pSrc[iDataCntr + 32])
00229             : "mm0", "memory");
00230     }
00231     if (iDataLength & 0x1)
00232     {
00233         X86_ASM (
00234             "movd %1, %%mm0\n\t" \
00235             "movd %%mm0, %0\n\t"
00236             : "=m" (fpDest[iDataLength - 1])
00237             : "m" (fpSrc[iDataLength - 1])
00238             : "mm0", "memory");
00239     }
00240     X86_ASM (
00241         "femms\n\t" \
00242         "sfence\n\t");
00243 }
00244 
00245 
00246 void dsp_x86_3dnow_copyd (double *dpDest, const double *dpSrc, int iDataLength)
00247 {
00248     int iStartIdx;
00249     int iDataCntr;
00250     int iDataCount;
00251     
00252     iStartIdx = 0;
00253     X86_ASM (
00254         "prefetchnta %0\n\t" \
00255         "prefetchnta %1\n\t" \
00256         "prefetchnta %2\n\t" \
00257         "prefetchnta %3\n\t"
00258         :
00259         : "m" (dpSrc[0]),
00260           "m" (dpSrc[8]),
00261           "m" (dpSrc[16]),
00262           "m" (dpSrc[24]));
00263     iDataCount = (iDataLength & 0xfffffff8);
00264     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00265     {
00266         X86_ASM (
00267             "prefetchnta %16\n\t" \
00268             "movq %8, %%mm0\n\t" \
00269             "movq %9, %%mm1\n\t" \
00270             "movq %10, %%mm2\n\t" \
00271             "movq %11, %%mm3\n\t" \
00272             "movq %12, %%mm4\n\t" \
00273             "movq %13, %%mm5\n\t" \
00274             "movq %14, %%mm6\n\t" \
00275             "movq %15, %%mm7\n\t" \
00276             "movntq %%mm0, %0\n\t" \
00277             "movntq %%mm1, %1\n\t" \
00278             "movntq %%mm2, %2\n\t" \
00279             "movntq %%mm3, %3\n\t" \
00280             "movntq %%mm4, %4\n\t" \
00281             "movntq %%mm5, %5\n\t" \
00282             "movntq %%mm6, %6\n\t" \
00283             "movntq %%mm7, %7\n\t"
00284             : "=m" (dpDest[iDataCntr]),
00285               "=m" (dpDest[iDataCntr + 1]),
00286               "=m" (dpDest[iDataCntr + 2]),
00287               "=m" (dpDest[iDataCntr + 3]),
00288               "=m" (dpDest[iDataCntr + 4]),
00289               "=m" (dpDest[iDataCntr + 5]),
00290               "=m" (dpDest[iDataCntr + 6]),
00291               "=m" (dpDest[iDataCntr + 7])
00292             : "m" (dpSrc[iDataCntr]),
00293               "m" (dpSrc[iDataCntr + 1]),
00294               "m" (dpSrc[iDataCntr + 2]),
00295               "m" (dpSrc[iDataCntr + 3]),
00296               "m" (dpSrc[iDataCntr + 4]),
00297               "m" (dpSrc[iDataCntr + 5]),
00298               "m" (dpSrc[iDataCntr + 6]),
00299               "m" (dpSrc[iDataCntr + 7]),
00300               "m" (dpSrc[iDataCntr + 32])
00301             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00302     }
00303     iStartIdx = iDataCount;
00304     iDataCount = iDataLength;
00305     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00306     {
00307         X86_ASM (
00308             "prefetchnta %2\n\t" \
00309             "movq %1, %%mm0\n\t" \
00310             "movntq %%mm0, %0\n\t"
00311             : "=m" (dpDest[iDataCntr])
00312             : "m" (dpSrc[iDataCntr]),
00313               "m" (dpSrc[iDataCntr + 32])
00314             : "mm0", "memory");
00315     }
00316     X86_ASM (
00317         "femms\n\t" \
00318         "sfence\n\t");
00319 }
00320 
00321 
00322 void dsp_x86_3dnow_addf (float *fpVect, float fSrc, int iDataLength)
00323 {
00324     int iDataCntr;
00325     int iDataCount;
00326     stpm64 m64pVect = (stpm64) fpVect;
00327     stm64 m64Src;
00328 
00329     m64Src.f[0] = m64Src.f[1] = fSrc;
00330     iDataCount = (iDataLength >> 1);
00331     X86_ASM (
00332         "movq %0, %%mm1\n\t"
00333         :
00334         : "m" (m64Src)
00335         : "mm1", "memory");
00336     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00337     {
00338         X86_ASM (
00339             "movq %1, %%mm0\n\t" \
00340             "pfadd %%mm1, %%mm0\n\t" \
00341             "movntq %%mm0, %0\n\t"
00342             : "=m" (m64pVect[iDataCntr])
00343             : "0" (m64pVect[iDataCntr])
00344             : "mm0", "mm1", "memory");
00345     }
00346     if (iDataLength & 0x1)
00347     {
00348         X86_ASM (
00349             "movd %1, %%mm0\n\t" \
00350             "pfadd %%mm1, %%mm0\n\t" \
00351             "movd %%mm0, %0\n\t"
00352             : "=m" (fpVect[iDataLength - 1])
00353             : "0" (fpVect[iDataLength - 1])
00354             : "mm0", "mm1", "memory");
00355     }
00356     X86_ASM (
00357         "femms\n\t" \
00358         "sfence\n\t");
00359 }
00360 
00361 
00362 void dsp_x86_sse_addf (float *fpVect, float fSrc, int iDataLength)
00363 {
00364     int iDataCntr;
00365     
00366     X86_ASM (
00367         "movss %0, %%xmm1\n\t"
00368         :
00369         : "m" (fSrc)
00370         : "xmm1", "memory");
00371     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00372     {
00373         X86_ASM (
00374             "movss %1, %%xmm0\n\t" \
00375             "addss %%xmm1, %%xmm0\n\t" \
00376             "movss %%xmm0, %0\n\t"
00377             : "=m" (fpVect[iDataCntr])
00378             : "0" (fpVect[iDataCntr])
00379             : "xmm0", "xmm1", "memory");
00380     }
00381 }
00382 
00383 
00384 void dsp_x86_sse_add (double *dpVect, double dSrc, int iDataLength)
00385 {
00386     int iDataCntr;
00387     
00388     X86_ASM (
00389         "movsd %0, %%xmm1\n\t"
00390         :
00391         : "m" (dSrc)
00392         : "xmm1", "memory");
00393     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00394     {
00395         X86_ASM (
00396             "movsd %1, %%xmm0\n\t" \
00397             "addsd %%xmm1, %%xmm0\n\t" \
00398             "movsd %%xmm0, %0\n\t"
00399             : "=m" (dpVect[iDataCntr])
00400             : "0" (dpVect[iDataCntr])
00401             : "xmm0", "xmm1", "memory");
00402     }
00403 }
00404 
00405 
00406 void dsp_x86_3dnow_mulf (float *fpVect, float fSrc, int iDataLength)
00407 {
00408     int iDataCntr;
00409     int iDataCount;
00410     stpm64 m64pVect = (stpm64) fpVect;
00411     stm64 m64Src;
00412 
00413     m64Src.f[0] = m64Src.f[1] = fSrc;
00414     iDataCount = (iDataLength >> 1);
00415     X86_ASM (
00416         "movq %0, %%mm1\n\t"
00417         :
00418         : "m" (m64Src)
00419         : "mm1", "memory");
00420     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00421     {
00422         X86_ASM (
00423             "movq %1, %%mm0\n\t" \
00424             "pfmul %%mm1, %%mm0\n\t" \
00425             "movntq %%mm0, %0\n\t"
00426             : "=m" (m64pVect[iDataCntr])
00427             : "0" (m64pVect[iDataCntr])
00428             : "mm0", "mm1", "memory");
00429     }
00430     if (iDataLength & 0x1)
00431     {
00432         X86_ASM (
00433             "movd %1, %%mm0\n\t" \
00434             "pfmul %%mm1, %%mm0\n\t" \
00435             "movd %%mm0, %0\n\t"
00436             : "=m" (fpVect[iDataLength - 1])
00437             : "0" (fpVect[iDataLength - 1])
00438             : "mm0", "mm1", "memory");
00439     }
00440     X86_ASM (
00441         "femms\n\t" \
00442         "sfence\n\t");
00443 }
00444 
00445 
00446 void dsp_x86_sse_mulf (float *fpVect, float fSrc, int iDataLength)
00447 {
00448     int iDataCntr;
00449 
00450     X86_ASM (
00451         "movss %0, %%xmm1\n\t"
00452         :
00453         : "m" (fSrc)
00454         : "xmm1", "memory");
00455     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00456     {
00457         X86_ASM (
00458             "movss %1, %%xmm0\n\t" \
00459             "mulss %%xmm1, %%xmm0\n\t" \
00460             "movss %%xmm0, %0\n\t"
00461             : "=m" (fpVect[iDataCntr])
00462             : "0" (fpVect[iDataCntr])
00463             : "xmm0", "xmm1", "memory");
00464     }
00465 }
00466 
00467 
00468 void dsp_x86_sse_mul (double *dpVect, double dSrc, int iDataLength)
00469 {
00470     int iDataCntr;
00471     
00472     X86_ASM (
00473         "movsd %0, %%xmm1\n\t"
00474         :
00475         : "m" (dSrc)
00476         : "xmm1", "memory");
00477     for (iDataCntr = 0; iDataCntr <iDataLength; iDataCntr++)
00478     {
00479         X86_ASM (
00480             "movsd %1, %%xmm0\n\t" \
00481             "mulsd %%xmm1, %%xmm0\n\t" \
00482             "movsd %%xmm0, %0\n\t"
00483             : "=m" (dpVect[iDataCntr])
00484             : "0" (dpVect[iDataCntr])
00485             : "xmm0", "xmm1", "memory");
00486     }
00487 }
00488 
00489 
00490 void dsp_x86_3dnow_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2, 
00491     int iDataLength)
00492 {
00493     int iDataCntr;
00494     int iDataCount;
00495     stpm64 m64pDest = (stpm64) fpDest;
00496     stpm64 m64pSrc1 = (stpm64) fpSrc1;
00497     stm64 m64Src2;
00498 
00499     m64Src2.f[0] = m64Src2.f[1] = fSrc2;
00500     iDataCount = (iDataLength >> 1);
00501     X86_ASM (
00502         "movq %0, %%mm1\n\t"
00503         :
00504         : "m" (m64Src2)
00505         : "mm1", "memory");
00506     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00507     {
00508         X86_ASM (
00509             "movq %1, %%mm0\n\t" \
00510             "pfmul %%mm1, %%mm0\n\t" \
00511             "movntq %%mm0, %0\n\t"
00512             : "=m" (m64pDest[iDataCntr])
00513             : "m" (m64pSrc1[iDataCntr])
00514             : "mm0", "mm1", "memory");
00515     }
00516     if (iDataLength & 0x1)
00517     {
00518         X86_ASM (
00519             "movd %1, %%mm0\n\t" \
00520             "pfmul %%mm1, %%mm0\n\t" \
00521             "movd %%mm0, %0\n\t"
00522             : "=m" (fpDest[iDataLength - 1])
00523             : "m" (fpSrc1[iDataLength - 1])
00524             : "mm0", "mm1", "memory");
00525     }
00526     X86_ASM (
00527         "femms\n\t" \
00528         "sfence\n\t");
00529 }
00530 
00531 
00532 void dsp_x86_sse_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2,
00533     int iDataLength)
00534 {
00535     int iDataCntr;
00536 
00537     X86_ASM (
00538         "movss %0, %%xmm1\n\t"
00539         :
00540         : "m" (fSrc2)
00541         : "xmm1", "memory");
00542     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00543     {
00544         X86_ASM (
00545             "movss %1, %%xmm0\n\t" \
00546             "mulss %%xmm1, %%xmm0\n\t" \
00547             "movss %%xmm0, %0\n\t"
00548             : "=m" (fpDest[iDataCntr])
00549             : "m" (fpSrc1[iDataCntr])
00550             : "xmm0", "xmm1", "memory");
00551     }
00552 }
00553 
00554 
00555 void dsp_x86_sse_mul_nip (double *dpDest, const double *dpSrc1, double dSrc2,
00556     int iDataLength)
00557 {
00558     int iDataCntr;
00559     
00560     X86_ASM (
00561         "movsd %0, %%xmm1\n\t"
00562         :
00563         : "m" (dSrc2)
00564         : "xmm1", "memory");
00565     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00566     {
00567         X86_ASM (
00568             "movsd %1, %%xmm0\n\t" \
00569             "mulsd %%xmm1, %%xmm0\n\t" \
00570             "movsd %%xmm0, %0\n\t"
00571             : "=m" (dpDest[iDataCntr])
00572             : "m" (dpSrc1[iDataCntr])
00573             : "xmm0", "xmm1", "memory");
00574     }
00575 }
00576 
00577 
00578 void dsp_x86_3dnow_add2f (float *fpDest, const float *fpSrc, int iDataLength)
00579 {
00580     int iDataCntr;
00581     int iDataCount;
00582     stpm64 m64pDest = (stpm64) fpDest;
00583     stpm64 m64pSrc = (stpm64) fpSrc;
00584 
00585     iDataCount = (iDataLength >> 1);
00586     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00587     {
00588         X86_ASM (
00589             "movq %1, %%mm0\n\t" \
00590             "movq %2, %%mm1\n\t" \
00591             "pfadd %%mm1, %%mm0\n\t" \
00592             "movntq %%mm0, %0\n\t"
00593             : "=m" (m64pDest[iDataCntr])
00594             : "0" (m64pDest[iDataCntr]),
00595               "m" (m64pSrc[iDataCntr])
00596             : "mm0", "mm1", "memory");
00597     }
00598     if (iDataLength & 0x1)
00599     {
00600         X86_ASM (
00601             "movd %1, %%mm0\n\t" \
00602             "movd %2, %%mm1\n\t" \
00603             "pfadd %%mm1, %%mm0\n\t" \
00604             "movd %%mm0, %0\n\t"
00605             : "=m" (fpDest[iDataLength - 1])
00606             : "0" (fpDest[iDataLength - 1]),
00607               "m" (fpSrc[iDataLength - 1])
00608             : "mm0", "mm1", "memory");
00609     }
00610     X86_ASM (
00611         "femms\n\t" \
00612         "sfence\n\t");
00613 }
00614 
00615 
00616 void dsp_x86_sse_add2f (float *fpDest, const float *fpSrc, int iDataLength)
00617 {
00618     int iDataCntr;
00619     
00620     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00621     {
00622         X86_ASM (
00623             "movss %1, %%xmm0\n\t" \
00624             "addss %2, %%xmm0\n\t" \
00625             "movss %%xmm0, %0\n\t"
00626             : "=m" (fpDest[iDataCntr])
00627             : "0" (fpDest[iDataCntr]),
00628               "m" (fpSrc[iDataCntr])
00629             : "xmm0", "memory");
00630     }
00631 }
00632 
00633 
00634 void dsp_x86_sse_add2 (double *dpDest, const double *dpSrc, int iDataLength)
00635 {
00636     int iDataCntr;
00637     
00638     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00639     {
00640         X86_ASM (
00641             "movsd %1, %%xmm0\n\t" \
00642             "addsd %2, %%xmm0\n\t" \
00643             "movsd %%xmm0, %0\n\t"
00644             : "=m" (dpDest[iDataCntr])
00645             : "0" (dpDest[iDataCntr]),
00646               "m" (dpSrc[iDataCntr])
00647             : "xmm0", "memory");
00648     }
00649 }
00650 
00651 
00652 void dsp_x86_3dnow_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
00653 {
00654     int iDataCntr;
00655     int iDataCount;
00656     stpm64 m64pDest = (stpm64) fpDest;
00657     stpm64 m64pSrc = (stpm64) fpSrc;
00658 
00659     iDataCount = (iDataLength >> 1);
00660     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00661     {
00662         X86_ASM (
00663             "movq %1, %%mm0\n\t" \
00664             "movq %2, %%mm1\n\t" \
00665             "pfmul %%mm1, %%mm0\n\t" \
00666             "movntq %%mm0, %0\n\t"
00667             : "=m" (m64pDest[iDataCntr])
00668             : "0" (m64pDest[iDataCntr]),
00669               "m" (m64pSrc[iDataCntr])
00670             : "mm0", "mm1", "memory");
00671     }
00672     if (iDataLength & 0x1)
00673     {
00674         X86_ASM (
00675             "movd %1, %%mm0\n\t" \
00676             "movd %2, %%mm1\n\t" \
00677             "pfmul %%mm1, %%mm0\n\t" \
00678             "movd %%mm0, %0\n\t"
00679             : "=m" (fpDest[iDataLength - 1])
00680             : "0" (fpDest[iDataLength - 1]),
00681               "m" (fpSrc[iDataLength - 1])
00682             : "mm0", "mm1", "memory");
00683     }
00684     X86_ASM (
00685         "femms\n\t" \
00686         "sfence\n\t");
00687 }
00688 
00689 
00690 void dsp_x86_sse_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
00691 {
00692     int iDataCntr;
00693     
00694     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00695     {
00696         X86_ASM (
00697             "movss %1, %%xmm0\n\t" \
00698             "mulss %2, %%xmm0\n\t" \
00699             "movss %%xmm0, %0\n\t"
00700             : "=m" (fpDest[iDataCntr])
00701             : "0" (fpDest[iDataCntr]),
00702               "m" (fpSrc[iDataCntr])
00703             : "xmm0", "memory");
00704     }
00705 }
00706 
00707 
00708 void dsp_x86_sse_mul2 (double *dpDest, const double *dpSrc, int iDataLength)
00709 {
00710     int iDataCntr;
00711     
00712     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00713     {
00714         X86_ASM (
00715             "movsd %1, %%xmm0\n\t" \
00716             "mulsd %2, %%xmm0\n\t" \
00717             "movsd %%xmm0, %0\n\t"
00718             : "=m" (dpDest[iDataCntr])
00719             : "0" (dpDest[iDataCntr]),
00720               "m" (dpSrc[iDataCntr])
00721             : "xmm0", "memory");
00722     }
00723 }
00724 
00725 
00726 void dsp_x86_3dnow_add3f (float *fpDest, const float *fpSrc1, 
00727     const float *fpSrc2, int iDataLength)
00728 {
00729     int iDataCntr;
00730     int iDataCount;
00731     stpm64 m64pDest = (stpm64) fpDest;
00732     stpm64 m64pSrc1 = (stpm64) fpSrc1;
00733     stpm64 m64pSrc2 = (stpm64) fpSrc2;
00734 
00735     iDataCount = (iDataLength >> 1);
00736     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00737     {
00738         X86_ASM (
00739             "movq %1, %%mm0\n\t" \
00740             "movq %2, %%mm1\n\t" \
00741             "pfadd %%mm1, %%mm0\n\t" \
00742             "movntq %%mm0, %0\n\t"
00743             : "=m" (m64pDest[iDataCntr])
00744             : "m" (m64pSrc1[iDataCntr]),
00745               "m" (m64pSrc2[iDataCntr])
00746             : "mm0", "mm1", "memory");
00747     }
00748     if (iDataLength & 0x1)
00749     {
00750         X86_ASM (
00751             "movd %1, %%mm0\n\t" \
00752             "movd %2, %%mm1\n\t" \
00753             "pfadd %%mm1, %%mm0\n\t" \
00754             "movd %%mm0, %0\n\t"
00755             : "=m" (fpDest[iDataLength - 1])
00756             : "m" (fpSrc1[iDataLength - 1]),
00757               "m" (fpSrc2[iDataLength - 1])
00758             : "mm0", "mm1", "memory");
00759     }
00760     X86_ASM (
00761         "femms\n\t" \
00762         "sfence\n\t");
00763 }
00764 
00765 
00766 void dsp_x86_sse_add3f (float *fpDest, const float *fpSrc1, 
00767     const float *fpSrc2, int iDataLength)
00768 {
00769     int iDataCntr;
00770     
00771     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00772     {
00773         X86_ASM (
00774             "movss %1, %%xmm0\n\t" \
00775             "addss %2, %%xmm0\n\t" \
00776             "movss %%xmm0, %0\n\t"
00777             : "=m" (fpDest[iDataCntr])
00778             : "m" (fpSrc1[iDataCntr]),
00779               "m" (fpSrc2[iDataCntr])
00780             : "xmm0", "memory");
00781     }
00782 }
00783 
00784 
00785 void dsp_x86_sse_add3 (double *dpDest, const double *dpSrc1, 
00786     const double *dpSrc2, int iDataLength)
00787 {
00788     int iDataCntr;
00789     
00790     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00791     {
00792         X86_ASM (
00793             "movsd %1, %%xmm0\n\t" \
00794             "addsd %2, %%xmm0\n\t" \
00795             "movsd %%xmm0, %0\n\t"
00796             : "=m" (dpDest[iDataCntr])
00797             : "m" (dpSrc1[iDataCntr]),
00798               "m" (dpSrc2[iDataCntr])
00799             : "xmm0", "memory");
00800     }
00801 }
00802 
00803 
00804 void dsp_x86_3dnow_mul3f (float *fpDest, const float *fpSrc1, 
00805     const float *fpSrc2, int iDataLength)
00806 {
00807     int iDataCntr;
00808     int iDataCount;
00809     stpm64 m64pDest = (stpm64) fpDest;
00810     stpm64 m64pSrc1 = (stpm64) fpSrc1;
00811     stpm64 m64pSrc2 = (stpm64) fpSrc2;
00812 
00813     iDataCount = (iDataLength >> 1);
00814     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00815     {
00816         X86_ASM (
00817             "movq %1, %%mm0\n\t" \
00818             "movq %2, %%mm1\n\t" \
00819             "pfmul %%mm1, %%mm0\n\t" \
00820             "movntq %%mm0, %0\n\t"
00821             : "=m" (m64pDest[iDataCntr])
00822             : "m" (m64pSrc1[iDataCntr]),
00823               "m" (m64pSrc2[iDataCntr])
00824             : "mm0", "mm1", "memory");
00825     }
00826     if (iDataLength & 0x1)
00827     {
00828         X86_ASM (
00829             "movd %1, %%mm0\n\t" \
00830             "movd %2, %%mm1\n\t" \
00831             "pfmul %%mm1, %%mm0\n\t" \
00832             "movd %%mm0, %0\n\t"
00833             : "=m" (fpDest[iDataLength - 1])
00834             : "m" (fpSrc1[iDataLength - 1]),
00835               "m" (fpSrc2[iDataLength - 1])
00836             : "mm0", "mm1", "memory");
00837     }
00838     X86_ASM (
00839         "femms\n\t" \
00840         "sfence\n\t");
00841 }
00842 
00843 
00844 void dsp_x86_sse_mul3f (float *fpDest, const float *fpSrc1, 
00845     const float *fpSrc2, int iDataLength)
00846 {
00847     int iDataCntr;
00848     
00849     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00850     {
00851         X86_ASM (
00852             "movss %1, %%xmm0\n\t" \
00853             "mulss %2, %%xmm0\n\t" \
00854             "movss %%xmm0, %0\n\t"
00855             : "=m" (fpDest[iDataCntr])
00856             : "m" (fpSrc1[iDataCntr]),
00857               "m" (fpSrc2[iDataCntr])
00858             : "xmm0", "memory");
00859     }
00860 }
00861 
00862 
00863 void dsp_x86_sse_mul3 (double *dpDest, const double *dpSrc1, 
00864     const double *dpSrc2, int iDataLength)
00865 {
00866     int iDataCntr;
00867     
00868     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00869     {
00870         X86_ASM (
00871             "movsd %1, %%xmm0\n\t" \
00872             "mulsd %2, %%xmm0\n\t" \
00873             "movsd %%xmm0, %0\n\t"
00874             : "=m" (dpDest[iDataCntr])
00875             : "m" (dpSrc1[iDataCntr]),
00876               "m" (dpSrc2[iDataCntr])
00877             : "xmm0", "memory");
00878     }
00879 }
00880 
00881 
00882 void dsp_x86_3dnow_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
00883 {
00884     int iDataCntr;
00885     stpm64 m64pDest = (stpm64) fpDest;
00886     
00887     X86_ASM (
00888         "movq %0, %%mm3\n\t"
00889         :
00890         : "m" (fpSrc[0])
00891         : "mm3", "memory");
00892     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00893     {
00894         X86_ASM (
00895             "movq %1, %%mm0\n\t" \
00896             "movq %%mm3, %%mm1\n\t" \
00897             "pswapd %%mm1, %%mm2\n\t" \
00898             "pfmul %%mm0, %%mm1\n\t" \
00899             "pfmul %%mm0, %%mm2\n\t" \
00900             "pfpnacc %%mm2, %%mm1\n\t"
00901             "movntq %%mm1, %0\n\t"
00902             : "=m" (m64pDest[iDataCntr])
00903             : "0" (m64pDest[iDataCntr])
00904             : "mm0", "mm1", "mm2", "mm3", "memory");
00905     }
00906     X86_ASM (
00907         "femms\n\t" \
00908         "sfence\n\t");
00909 }
00910 
00911 
00912 void dsp_x86_sse_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
00913 {
00914     int iDataCntr;
00915     int iDataCount;
00916     
00917     X86_ASM (
00918         "movss %0, %%xmm2\n\t" \
00919         "movss %1, %%xmm3\n\t"
00920         :
00921         : "m" (fpSrc[0]),
00922           "m" (fpSrc[1])
00923         : "xmm2", "xmm3", "memory");
00924     iDataCount = (iDataLength << 1);
00925     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
00926     {
00927         X86_ASM (
00928             "movss %2, %%xmm0\n\t" \
00929             "movss %%xmm0, %%xmm1\n\t" \
00930             "movss %3, %%xmm4\n\t" \
00931             \
00932             "mulss %%xmm2, %%xmm0\n\t" \
00933             "movss %%xmm4, %%xmm5\n\t" \
00934             "mulss %%xmm3, %%xmm5\n\t" \
00935             "subss %%xmm0, %%xmm5\n\t" \
00936             \
00937             "mulss %%xmm3, %%xmm1\n\t" \
00938             "movss %%xmm4, %%xmm5\n\t" \
00939             "mulss %%xmm2, %%xmm5\n\t" \
00940             "addss %%xmm5, %%xmm1\n\t" \
00941             \
00942             "movss %%xmm0, %0\n\t" \
00943             "movss %%xmm1, %1\n\t"
00944             : "=m" (fpDest[iDataCntr]),
00945               "=m" (fpDest[iDataCntr + 1])
00946             : "0" (fpDest[iDataCntr]),
00947               "1" (fpDest[iDataCntr + 1])
00948             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
00949     }
00950 }
00951 
00952 
/*
 * In-place complex multiply of a vector by one complex constant
 * (double precision, scalar SSE2 instructions).
 *
 *   dpDest      - iDataLength interleaved (re, im) pairs, updated in place
 *   dpSrc       - the constant: dpSrc[0] is the real part, dpSrc[1] the
 *                 imaginary part
 *   iDataLength - number of complex elements in dpDest
 *
 * With dest = a + bi and src = c + di each element becomes
 * (a*c - b*d) + (a*d + b*c)i.
 *
 * Fix: the subtraction used to be "subsd %xmm0, %xmm5", which left
 * b*d - a*c in xmm5 (overwritten two instructions later) and stored the
 * bare product a*c as the real part.  The operands are swapped so that
 * xmm0 = a*c - b*d, matching what the 3DNow! complex multiplies compute.
 *
 * NOTE(review): xmm2/xmm3 are loaded by the first asm statement and read by
 * the asm inside the loop; nothing tells the compiler they are live in
 * between - confirm this is safe for the build flags in use.
 */
00953 void dsp_x86_sse_cmul (double *dpDest, const double *dpSrc, int iDataLength)
00954 {
00955     int iDataCntr;
00956     int iDataCount;
00957     
          /* xmm2 = c (src real), xmm3 = d (src imaginary) */
00958     X86_ASM (
00959         "movsd %0, %%xmm2\n\t" \
00960         "movsd %1, %%xmm3\n\t"
00961         :
00962         : "m" (dpSrc[0]),
00963           "m" (dpSrc[1])
00964         : "xmm2", "xmm3", "memory");
          /* Two doubles per complex element */
00965     iDataCount = (iDataLength << 1);
00966     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
00967     {
              /* xmm0 = xmm1 = a, xmm4 = b; real part ends up in xmm0,
                 imaginary part in xmm1 */
00968         X86_ASM (
00969             "movsd %2, %%xmm0\n\t" \
00970             "movsd %%xmm0, %%xmm1\n\t" \
00971             "movsd %3, %%xmm4\n\t" \
00972             \
00973             "mulsd %%xmm2, %%xmm0\n\t" \
00974             "movsd %%xmm4, %%xmm5\n\t" \
00975             "mulsd %%xmm3, %%xmm5\n\t" \
00976             "subsd %%xmm5, %%xmm0\n\t" \
00977             \
00978             "mulsd %%xmm3, %%xmm1\n\t" \
00979             "movsd %%xmm4, %%xmm5\n\t" \
00980             "mulsd %%xmm2, %%xmm5\n\t" \
00981             "addsd %%xmm5, %%xmm1\n\t" \
00982             \
00983             "movsd %%xmm0, %0\n\t" \
00984             "movsd %%xmm1, %1\n\t"
00985             : "=m" (dpDest[iDataCntr]),
00986               "=m" (dpDest[iDataCntr + 1])
00987             : "0" (dpDest[iDataCntr]),
00988               "1" (dpDest[iDataCntr + 1])
00989             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
00990     }
00991 }
00992 
00993 
/*
 * In-place element-wise complex multiply, dest[i] = dest[i] * src[i]
 * (single precision, 3DNow! instructions).
 *
 *   fpDest      - destination vector, read and overwritten
 *   fpSrc       - source vector
 *   iDataLength - number of 64-bit elements processed
 *
 * Both arrays are accessed through stpm64 (declared outside this chunk) one
 * 64-bit element at a time; assumes each element is one interleaved
 * (re, im) float pair - TODO confirm against the stm64 definition.
 *
 * Per element: mm0 = dest, mm1 = src, mm2 = src with its halves swapped;
 * after the two pfmul the pfpnacc forms (low1 - high1, low2 + high2), i.e.
 * the real and imaginary parts of the product.
 *
 * Results are written with movntq; the trailing asm issues femms and sfence.
 */
00994 void dsp_x86_3dnow_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
00995 {
00996     int iDataCntr;
00997     stpm64 m64pDest = (stpm64) fpDest;
00998     stpm64 m64pSrc = (stpm64) fpSrc;
00999     
01000     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01001     {
01002         X86_ASM (
01003             "movq %1, %%mm0\n\t" \
01004             "movq %2, %%mm1\n\t" \
01005             "pswapd %%mm1, %%mm2\n\t" \
01006             "pfmul %%mm0, %%mm1\n\t" \
01007             "pfmul %%mm0, %%mm2\n\t" \
01008             "pfpnacc %%mm2, %%mm1\n\t"
01009             "movntq %%mm1, %0\n\t"
01010             : "=m" (m64pDest[iDataCntr])
01011             : "0" (m64pDest[iDataCntr]),
01012               "m" (m64pSrc[iDataCntr])
01013             : "mm0", "mm1", "mm2", "memory");
01014     }
          /* Leave MMX/3DNow! state and fence the non-temporal stores */
01015     X86_ASM (
01016         "femms\n\t" \
01017         "sfence\n\t");
01018 }
01019 
01020 
/*
 * In-place element-wise complex multiply, dest[i] = dest[i] * src[i]
 * (single precision, scalar SSE instructions).
 *
 *   fpDest      - iDataLength interleaved (re, im) pairs, updated in place
 *   fpSrc       - iDataLength interleaved (re, im) pairs
 *   iDataLength - number of complex elements
 *
 * With dest = a + bi and src = c + di each element becomes
 * (a*c - b*d) + (a*d + b*c)i.
 *
 * Fix: the subtraction used to be "subss %xmm0, %xmm5", which left
 * b*d - a*c in xmm5 (overwritten two instructions later) and stored the
 * bare product a*c as the real part.  The operands are swapped so that
 * xmm0 = a*c - b*d.
 */
01021 void dsp_x86_sse_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
01022 {
01023     int iDataCntr;
01024     int iDataCount;
01025     
          /* Two floats per complex element */
01026     iDataCount = (iDataLength << 1);
01027     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01028     {
              /* xmm2 = c, xmm3 = d, xmm0 = xmm1 = a, xmm4 = b; real part
                 ends up in xmm0, imaginary part in xmm1 */
01029         X86_ASM (
01030             "movss %4, %%xmm2\n\t" \
01031             "movss %5, %%xmm3\n\t" \
01032             \
01033             "movss %2, %%xmm0\n\t" \
01034             "movss %%xmm0, %%xmm1\n\t" \
01035             "movss %3, %%xmm4\n\t" \
01036             \
01037             "mulss %%xmm2, %%xmm0\n\t" \
01038             "movss %%xmm4, %%xmm5\n\t" \
01039             "mulss %%xmm3, %%xmm5\n\t" \
01040             "subss %%xmm5, %%xmm0\n\t" \
01041             \
01042             "mulss %%xmm3, %%xmm1\n\t" \
01043             "movss %%xmm4, %%xmm5\n\t" \
01044             "mulss %%xmm2, %%xmm5\n\t" \
01045             "addss %%xmm5, %%xmm1\n\t" \
01046             \
01047             "movss %%xmm0, %0\n\t" \
01048             "movss %%xmm1, %1\n\t"
01049             : "=m" (fpDest[iDataCntr]),
01050               "=m" (fpDest[iDataCntr + 1])
01051             : "0" (fpDest[iDataCntr]),
01052               "1" (fpDest[iDataCntr + 1]),
01053               "m" (fpSrc[iDataCntr]),
01054               "m" (fpSrc[iDataCntr + 1])
01055             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01056     }
01057 }
01058 
01059 
/*
 * In-place element-wise complex multiply, dest[i] = dest[i] * src[i]
 * (double precision, scalar SSE2 instructions).
 *
 *   dpDest      - iDataLength interleaved (re, im) pairs, updated in place
 *   dpSrc       - iDataLength interleaved (re, im) pairs
 *   iDataLength - number of complex elements
 *
 * With dest = a + bi and src = c + di each element becomes
 * (a*c - b*d) + (a*d + b*c)i.
 *
 * Fix: the subtraction used to be "subsd %xmm0, %xmm5", which left
 * b*d - a*c in xmm5 (overwritten two instructions later) and stored the
 * bare product a*c as the real part.  The operands are swapped so that
 * xmm0 = a*c - b*d.
 */
01060 void dsp_x86_sse_cmul2 (double *dpDest, const double *dpSrc, int iDataLength)
01061 {
01062     int iDataCntr;
01063     int iDataCount;
01064     
          /* Two doubles per complex element */
01065     iDataCount = (iDataLength << 1);
01066     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01067     {
              /* xmm2 = c, xmm3 = d, xmm0 = xmm1 = a, xmm4 = b; real part
                 ends up in xmm0, imaginary part in xmm1 */
01068         X86_ASM (
01069             "movsd %4, %%xmm2\n\t" \
01070             "movsd %5, %%xmm3\n\t" \
01071             \
01072             "movsd %2, %%xmm0\n\t" \
01073             "movsd %%xmm0, %%xmm1\n\t" \
01074             "movsd %3, %%xmm4\n\t" \
01075             \
01076             "mulsd %%xmm2, %%xmm0\n\t" \
01077             "movsd %%xmm4, %%xmm5\n\t" \
01078             "mulsd %%xmm3, %%xmm5\n\t" \
01079             "subsd %%xmm5, %%xmm0\n\t" \
01080             \
01081             "mulsd %%xmm3, %%xmm1\n\t" \
01082             "movsd %%xmm4, %%xmm5\n\t" \
01083             "mulsd %%xmm2, %%xmm5\n\t" \
01084             "addsd %%xmm5, %%xmm1\n\t" \
01085             \
01086             "movsd %%xmm0, %0\n\t" \
01087             "movsd %%xmm1, %1\n\t"
01088             : "=m" (dpDest[iDataCntr]),
01089               "=m" (dpDest[iDataCntr + 1])
01090             : "0" (dpDest[iDataCntr]),
01091               "1" (dpDest[iDataCntr + 1]),
01092               "m" (dpSrc[iDataCntr]),
01093               "m" (dpSrc[iDataCntr + 1])
01094             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01095     }
01096 }
01097 
01098 
/*
 * Element-wise complex multiply into a separate destination,
 * dest[i] = src1[i] * src2[i] (single precision, 3DNow! instructions).
 *
 *   fpDest      - destination vector (write only)
 *   fpSrc1      - first source vector
 *   fpSrc2      - second source vector
 *   iDataLength - number of 64-bit elements processed
 *
 * All three arrays are accessed through stpm64 (declared outside this
 * chunk) one 64-bit element at a time; assumes each element is one
 * interleaved (re, im) float pair - TODO confirm against the stm64
 * definition.
 *
 * Per element: mm0 = src1, mm1 = src2, mm2 = src2 with its halves swapped;
 * after the two pfmul the pfpnacc forms (low1 - high1, low2 + high2), i.e.
 * the real and imaginary parts of the product.
 *
 * Results are written with movntq; the trailing asm issues femms and sfence.
 */
01099 void dsp_x86_3dnow_cmul3f (float *fpDest, const float *fpSrc1, 
01100     const float *fpSrc2, int iDataLength)
01101 {
01102     int iDataCntr;
01103     stpm64 m64pDest = (stpm64) fpDest;
01104     stpm64 m64pSrc1 = (stpm64) fpSrc1;
01105     stpm64 m64pSrc2 = (stpm64) fpSrc2;
01106     
01107     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01108     {
01109         X86_ASM (
01110             "movq %1, %%mm0\n\t" \
01111             "movq %2, %%mm1\n\t" \
01112             "pswapd %%mm1, %%mm2\n\t" \
01113             "pfmul %%mm0, %%mm1\n\t" \
01114             "pfmul %%mm0, %%mm2\n\t" \
01115             "pfpnacc %%mm2, %%mm1\n\t"
01116             "movntq %%mm1, %0\n\t"
01117             : "=m" (m64pDest[iDataCntr])
01118             : "m" (m64pSrc1[iDataCntr]),
01119               "m" (m64pSrc2[iDataCntr])
01120             : "mm0", "mm1", "mm2", "memory");
01121     }
          /* Leave MMX/3DNow! state and fence the non-temporal stores */
01122     X86_ASM (
01123         "femms\n\t" \
01124         "sfence\n\t");
01125 }
01126 
01127 
/*
 * Element-wise complex multiply into a separate destination,
 * dest[i] = src1[i] * src2[i] (single precision, scalar SSE instructions).
 *
 *   fpDest      - iDataLength interleaved (re, im) pairs, written
 *   fpSrc1      - iDataLength interleaved (re, im) pairs
 *   fpSrc2      - iDataLength interleaved (re, im) pairs
 *   iDataLength - number of complex elements
 *
 * With src1 = a + bi and src2 = c + di each result is
 * (a*c - b*d) + (a*d + b*c)i.
 *
 * Fix: the subtraction used to be "subss %xmm0, %xmm5", which left
 * b*d - a*c in xmm5 (overwritten two instructions later) and stored the
 * bare product a*c as the real part.  The operands are swapped so that
 * xmm0 = a*c - b*d.
 */
01128 void dsp_x86_sse_cmul3f (float *fpDest, const float *fpSrc1, 
01129     const float *fpSrc2, int iDataLength)
01130 {
01131     int iDataCntr;
01132     int iDataCount;
01133     
          /* Two floats per complex element */
01134     iDataCount = (iDataLength << 1);
01135     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01136     {
              /* xmm2 = c, xmm3 = d, xmm0 = xmm1 = a, xmm4 = b; real part
                 ends up in xmm0, imaginary part in xmm1 */
01137         X86_ASM (
01138             "movss %4, %%xmm2\n\t" \
01139             "movss %5, %%xmm3\n\t" \
01140             \
01141             "movss %2, %%xmm0\n\t" \
01142             "movss %%xmm0, %%xmm1\n\t" \
01143             "movss %3, %%xmm4\n\t" \
01144             \
01145             "mulss %%xmm2, %%xmm0\n\t" \
01146             "movss %%xmm4, %%xmm5\n\t" \
01147             "mulss %%xmm3, %%xmm5\n\t" \
01148             "subss %%xmm5, %%xmm0\n\t" \
01149             \
01150             "mulss %%xmm3, %%xmm1\n\t" \
01151             "movss %%xmm4, %%xmm5\n\t" \
01152             "mulss %%xmm2, %%xmm5\n\t" \
01153             "addss %%xmm5, %%xmm1\n\t" \
01154             \
01155             "movss %%xmm0, %0\n\t" \
01156             "movss %%xmm1, %1\n\t"
01157             : "=m" (fpDest[iDataCntr]),
01158               "=m" (fpDest[iDataCntr + 1])
01159             : "m" (fpSrc1[iDataCntr]),
01160               "m" (fpSrc1[iDataCntr + 1]),
01161               "m" (fpSrc2[iDataCntr]),
01162               "m" (fpSrc2[iDataCntr + 1])
01163             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01164     }
01165 }
01166 
01167 
/*
 * Element-wise complex multiply into a separate destination,
 * dest[i] = src1[i] * src2[i] (double precision, scalar SSE2 instructions).
 *
 *   dpDest      - iDataLength interleaved (re, im) pairs, written
 *   dpSrc1      - iDataLength interleaved (re, im) pairs
 *   dpSrc2      - iDataLength interleaved (re, im) pairs
 *   iDataLength - number of complex elements
 *
 * With src1 = a + bi and src2 = c + di each result is
 * (a*c - b*d) + (a*d + b*c)i.
 *
 * Fix: the subtraction used to be "subsd %xmm0, %xmm5", which left
 * b*d - a*c in xmm5 (overwritten two instructions later) and stored the
 * bare product a*c as the real part.  The operands are swapped so that
 * xmm0 = a*c - b*d.
 */
01168 void dsp_x86_sse_cmul3 (double *dpDest, const double *dpSrc1, 
01169     const double *dpSrc2, int iDataLength)
01170 {
01171     int iDataCntr;
01172     int iDataCount;
01173     
          /* Two doubles per complex element */
01174     iDataCount = (iDataLength << 1);
01175     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01176     {
              /* xmm2 = c, xmm3 = d, xmm0 = xmm1 = a, xmm4 = b; real part
                 ends up in xmm0, imaginary part in xmm1 */
01177         X86_ASM (
01178             "movsd %4, %%xmm2\n\t" \
01179             "movsd %5, %%xmm3\n\t" \
01180             \
01181             "movsd %2, %%xmm0\n\t" \
01182             "movsd %%xmm0, %%xmm1\n\t" \
01183             "movsd %3, %%xmm4\n\t" \
01184             \
01185             "mulsd %%xmm2, %%xmm0\n\t" \
01186             "movsd %%xmm4, %%xmm5\n\t" \
01187             "mulsd %%xmm3, %%xmm5\n\t" \
01188             "subsd %%xmm5, %%xmm0\n\t" \
01189             \
01190             "mulsd %%xmm3, %%xmm1\n\t" \
01191             "movsd %%xmm4, %%xmm5\n\t" \
01192             "mulsd %%xmm2, %%xmm5\n\t" \
01193             "addsd %%xmm5, %%xmm1\n\t" \
01194             \
01195             "movsd %%xmm0, %0\n\t" \
01196             "movsd %%xmm1, %1\n\t"
01197             : "=m" (dpDest[iDataCntr]),
01198               "=m" (dpDest[iDataCntr + 1])
01199             : "m" (dpSrc1[iDataCntr]),
01200               "m" (dpSrc1[iDataCntr + 1]),
01201               "m" (dpSrc2[iDataCntr]),
01202               "m" (dpSrc2[iDataCntr + 1])
01203             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01204     }
01205 }
01206 
01207 
/*
 * In-place multiply-then-add, fpVect[i] = fpVect[i] * fMul + fAdd
 * (single precision, 3DNow! instructions).
 *
 *   fpVect      - vector of iDataLength floats, updated in place
 *   fMul        - multiplier
 *   fAdd        - addend
 *   iDataLength - number of floats
 *
 * fMul and fAdd are duplicated into both floats of an stm64 (type declared
 * outside this chunk) and kept in mm1 and mm2.  The main loop handles two
 * floats per iteration through 64-bit loads and movntq stores; an odd
 * trailing element is handled with 32-bit movd load/store.  The final asm
 * issues femms and sfence.
 *
 * NOTE(review): mm1/mm2 are loaded by the first asm statement and read by
 * the later ones; this relies on the compiler leaving them untouched in
 * between.
 */
01208 void dsp_x86_3dnow_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
01209 {
01210     int iDataCntr;
01211     int iDataCount;
01212     stpm64 m64pVect = (stpm64) fpVect;
01213     stm64 m64Mul;
01214     stm64 m64Add;
01215 
01216     m64Mul.f[0] = m64Mul.f[1] = fMul;
01217     m64Add.f[0] = m64Add.f[1] = fAdd;
          /* Number of complete float pairs */
01218     iDataCount = (iDataLength >> 1);
01219     X86_ASM (
01220         "movq %0, %%mm1\n\t" \
01221         "movq %1, %%mm2\n\t"
01222         :
01223         : "m" (m64Mul),
01224           "m" (m64Add)
01225         : "mm1", "mm2", "memory");
01226     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01227     {
01228         X86_ASM (
01229             "movq %1, %%mm0\n\t" \
01230             "pfmul %%mm1, %%mm0\n\t" \
01231             "pfadd %%mm2, %%mm0\n\t" \
01232             "movntq %%mm0, %0\n\t"
01233             : "=m" (m64pVect[iDataCntr])
01234             : "0" (m64pVect[iDataCntr])
01235             : "mm0", "mm1", "mm2", "memory");
01236     }
          /* Odd length: process the last float on its own */
01237     if (iDataLength & 0x1)
01238     {
01239         X86_ASM (
01240             "movd %1, %%mm0\n\t" \
01241             "pfmul %%mm1, %%mm0\n\t" \
01242             "pfadd %%mm2, %%mm0\n\t" \
01243             "movd %%mm0, %0\n\t"
01244             : "=m" (fpVect[iDataLength - 1])
01245             : "0" (fpVect[iDataLength - 1])
01246             : "mm0", "mm1", "mm2", "memory");
01247     }
01248     X86_ASM (
01249         "femms\n\t" \
01250         "sfence\n\t");
01251 }
01252 
01253 
/*
 * In-place multiply-then-add, fpVect[i] = fpVect[i] * fMul + fAdd
 * (single precision, scalar SSE instructions).
 *
 *   fpVect      - vector of iDataLength floats, updated in place
 *   fMul        - multiplier (held in xmm1)
 *   fAdd        - addend (held in xmm2)
 *   iDataLength - number of floats
 *
 * NOTE(review): xmm1/xmm2 are loaded by the first asm statement and read by
 * the asm inside the loop; this relies on the compiler leaving them
 * untouched in between.
 */
01254 void dsp_x86_sse_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
01255 {
01256     int iDataCntr;
01257     
01258     X86_ASM (
01259         "movss %0, %%xmm1\n\t" \
01260         "movss %1, %%xmm2\n\t"
01261         :
01262         : "m" (fMul),
01263           "m" (fAdd)
01264         : "xmm1", "xmm2", "memory");
01265     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01266     {
01267         X86_ASM (
01268             "movss %1, %%xmm0\n\t" \
01269             "mulss %%xmm1, %%xmm0\n\t" \
01270             "addss %%xmm2, %%xmm0\n\t" \
01271             "movss %%xmm0, %0\n\t"
01272             : "=m" (fpVect[iDataCntr])
01273             : "0" (fpVect[iDataCntr])
01274             : "xmm0", "xmm1", "xmm2", "memory");
01275     }
01276 }
01277 
01278 
/*
 * In-place multiply-then-add, dpVect[i] = dpVect[i] * dMul + dAdd
 * (double precision, scalar SSE2 instructions).
 *
 *   dpVect      - vector of iDataLength doubles, updated in place
 *   dMul        - multiplier (held in xmm1)
 *   dAdd        - addend (held in xmm2)
 *   iDataLength - number of doubles
 *
 * NOTE(review): xmm1/xmm2 are loaded by the first asm statement and read by
 * the asm inside the loop; this relies on the compiler leaving them
 * untouched in between.
 */
01279 void dsp_x86_sse_ma (double *dpVect, double dMul, double dAdd, int iDataLength)
01280 {
01281     int iDataCntr;
01282     
01283     X86_ASM (
01284         "movsd %0, %%xmm1\n\t" \
01285         "movsd %1, %%xmm2\n\t"
01286         :
01287         : "m" (dMul),
01288           "m" (dAdd)
01289         : "xmm1", "xmm2", "memory");
01290     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01291     {
01292         X86_ASM (
01293             "movsd %1, %%xmm0\n\t" \
01294             "mulsd %%xmm1, %%xmm0\n\t" \
01295             "addsd %%xmm2, %%xmm0\n\t" \
01296             "movsd %%xmm0, %0\n\t"
01297             : "=m" (dpVect[iDataCntr])
01298             : "0" (dpVect[iDataCntr])
01299             : "xmm0", "xmm1", "xmm2", "memory");
01300     }
01301 }
01302 
01303 
/*
 * Multiply-then-add into a separate destination,
 * fpDest[i] = fpSrc[i] * fMul + fAdd (single precision, 3DNow!).
 *
 *   fpDest      - destination vector of iDataLength floats
 *   fpSrc       - source vector of iDataLength floats
 *   fMul        - multiplier
 *   fAdd        - addend
 *   iDataLength - number of floats
 *
 * fMul and fAdd are duplicated into both floats of an stm64 (type declared
 * outside this chunk) and kept in mm1 and mm2.  The main loop handles two
 * floats per iteration through 64-bit loads and movntq stores; an odd
 * trailing element is handled with 32-bit movd load/store.  The final asm
 * issues femms and sfence.
 *
 * NOTE(review): mm1/mm2 are loaded by the first asm statement and read by
 * the later ones; this relies on the compiler leaving them untouched in
 * between.
 */
01304 void dsp_x86_3dnow_ma2f (float *fpDest, const float *fpSrc,
01305     float fMul, float fAdd, int iDataLength)
01306 {
01307     int iDataCntr;
01308     int iDataCount;
01309     stpm64 m64pDest = (stpm64) fpDest;
01310     stpm64 m64pSrc = (stpm64) fpSrc;
01311     stm64 m64Mul;
01312     stm64 m64Add;
01313 
01314     m64Mul.f[0] = m64Mul.f[1] = fMul;
01315     m64Add.f[0] = m64Add.f[1] = fAdd;
          /* Number of complete float pairs */
01316     iDataCount = (iDataLength >> 1);
01317     X86_ASM (
01318         "movq %0, %%mm1\n\t" \
01319         "movq %1, %%mm2\n\t"
01320         :
01321         : "m" (m64Mul),
01322           "m" (m64Add)
01323         : "mm1", "mm2", "memory");
01324     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01325     {
01326         X86_ASM (
01327             "movq %1, %%mm0\n\t" \
01328             "pfmul %%mm1, %%mm0\n\t" \
01329             "pfadd %%mm2, %%mm0\n\t" \
01330             "movntq %%mm0, %0\n\t"
01331             : "=m" (m64pDest[iDataCntr])
01332             : "m" (m64pSrc[iDataCntr])
01333             : "mm0", "mm1", "mm2", "memory");
01334     }
          /* Odd length: process the last float on its own */
01335     if (iDataLength & 0x1)
01336     {
01337         X86_ASM (
01338             "movd %1, %%mm0\n\t" \
01339             "pfmul %%mm1, %%mm0\n\t" \
01340             "pfadd %%mm2, %%mm0\n\t" \
01341             "movd %%mm0, %0\n\t"
01342             : "=m" (fpDest[iDataLength - 1])
01343             : "m" (fpSrc[iDataLength - 1])
01344             : "mm0", "mm1", "mm2", "memory");
01345     }
01346     X86_ASM (
01347         "femms\n\t" \
01348         "sfence\n\t");
01349 }
01350 
01351 
/*
 * Multiply-then-add into a separate destination,
 * fpDest[i] = fpSrc[i] * fMul + fAdd (single precision, scalar SSE).
 *
 *   fpDest      - destination vector of iDataLength floats
 *   fpSrc       - source vector of iDataLength floats
 *   fMul        - multiplier (held in xmm1)
 *   fAdd        - addend (held in xmm2)
 *   iDataLength - number of floats
 *
 * NOTE(review): xmm1/xmm2 are loaded by the first asm statement and read by
 * the asm inside the loop; this relies on the compiler leaving them
 * untouched in between.
 */
01352 void dsp_x86_sse_ma2f (float *fpDest, const float *fpSrc, 
01353     float fMul, float fAdd, int iDataLength)
01354 {
01355     int iDataCntr;
01356     
01357     X86_ASM (
01358         "movss %0, %%xmm1\n\t" \
01359         "movss %1, %%xmm2\n\t"
01360         :
01361         : "m" (fMul),
01362           "m" (fAdd)
01363         : "xmm1", "xmm2", "memory");
01364     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01365     {
01366         X86_ASM (
01367             "movss %1, %%xmm0\n\t" \
01368             "mulss %%xmm1, %%xmm0\n\t" \
01369             "addss %%xmm2, %%xmm0\n\t" \
01370             "movss %%xmm0, %0\n\t"
01371             : "=m" (fpDest[iDataCntr])
01372             : "m" (fpSrc[iDataCntr])
01373             : "xmm0", "xmm1", "xmm2", "memory");
01374     }
01375 }
01376 
01377 
/*
 * Multiply-then-add into a separate destination,
 * dpDest[i] = dpSrc[i] * dMul + dAdd (double precision, scalar SSE2).
 *
 *   dpDest      - destination vector of iDataLength doubles
 *   dpSrc       - source vector of iDataLength doubles
 *   dMul        - multiplier (held in xmm1)
 *   dAdd        - addend (held in xmm2)
 *   iDataLength - number of doubles
 *
 * NOTE(review): xmm1/xmm2 are loaded by the first asm statement and read by
 * the asm inside the loop; this relies on the compiler leaving them
 * untouched in between.
 */
01378 void dsp_x86_sse_ma2 (double *dpDest, const double *dpSrc, 
01379     double dMul, double dAdd, int iDataLength)
01380 {
01381     int iDataCntr;
01382     
01383     X86_ASM (
01384         "movsd %0, %%xmm1\n\t" \
01385         "movsd %1, %%xmm2\n\t"
01386         :
01387         : "m" (dMul),
01388           "m" (dAdd)
01389         : "xmm1", "xmm2", "memory");
01390     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01391     {
01392         X86_ASM (
01393             "movsd %1, %%xmm0\n\t" \
01394             "mulsd %%xmm1, %%xmm0\n\t" \
01395             "addsd %%xmm2, %%xmm0\n\t" \
01396             "movsd %%xmm0, %0\n\t"
01397             : "=m" (dpDest[iDataCntr])
01398             : "m" (dpSrc[iDataCntr])
01399             : "xmm0", "xmm1", "xmm2", "memory");
01400     }
01401 }
01402 
01403 
/*
 * In-place add-then-multiply, fpVect[i] = (fpVect[i] + fAdd) * fMul
 * (single precision, 3DNow! instructions).
 *
 *   fpVect      - vector of iDataLength floats, updated in place
 *   fAdd        - addend, applied first
 *   fMul        - multiplier, applied to the sum
 *   iDataLength - number of floats
 *
 * fAdd and fMul are duplicated into both floats of an stm64 (type declared
 * outside this chunk) and kept in mm1 and mm2.  The main loop handles two
 * floats per iteration through 64-bit loads and movntq stores; an odd
 * trailing element is handled with 32-bit movd load/store.  The final asm
 * issues femms and sfence.
 *
 * NOTE(review): mm1/mm2 are loaded by the first asm statement and read by
 * the later ones; this relies on the compiler leaving them untouched in
 * between.
 */
01404 void dsp_x86_3dnow_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
01405 {
01406     int iDataCntr;
01407     int iDataCount;
01408     stpm64 m64pVect = (stpm64) fpVect;
01409     stm64 m64Add;
01410     stm64 m64Mul;
01411 
01412     m64Add.f[0] = m64Add.f[1] = fAdd;
01413     m64Mul.f[0] = m64Mul.f[1] = fMul;
          /* Number of complete float pairs */
01414     iDataCount = (iDataLength >> 1);
01415     X86_ASM (
01416         "movq %0, %%mm1\n\t" \
01417         "movq %1, %%mm2\n\t"
01418         :
01419         : "m" (m64Add),
01420           "m" (m64Mul)
01421         : "mm1", "mm2", "memory");
01422     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01423     {
01424         X86_ASM (
01425             "movq %1, %%mm0\n\t" \
01426             "pfadd %%mm1, %%mm0\n\t" \
01427             "pfmul %%mm2, %%mm0\n\t" \
01428             "movntq %%mm0, %0\n\t"
01429             : "=m" (m64pVect[iDataCntr])
01430             : "0" (m64pVect[iDataCntr])
01431             : "mm0", "mm1", "mm2", "memory");
01432     }
          /* Odd length: process the last float on its own */
01433     if (iDataLength & 0x1)
01434     {
01435         X86_ASM (
01436             "movd %1, %%mm0\n\t" \
01437             "pfadd %%mm1, %%mm0\n\t" \
01438             "pfmul %%mm2, %%mm0\n\t" \
01439             "movd %%mm0, %0\n\t"
01440             : "=m" (fpVect[iDataLength - 1])
01441             : "0" (fpVect[iDataLength - 1])
01442             : "mm0", "mm1", "mm2", "memory");
01443     }
01444     X86_ASM (
01445         "femms\n\t" \
01446         "sfence\n\t");
01447 }
01448 
01449 
/*
 * In-place add-then-multiply, fpVect[i] = (fpVect[i] + fAdd) * fMul
 * (single precision, scalar SSE instructions).
 *
 *   fpVect      - vector of iDataLength floats, updated in place
 *   fAdd        - addend, applied first (held in xmm1)
 *   fMul        - multiplier, applied to the sum (held in xmm2)
 *   iDataLength - number of floats
 *
 * NOTE(review): xmm1/xmm2 are loaded by the first asm statement and read by
 * the asm inside the loop; this relies on the compiler leaving them
 * untouched in between.
 */
01450 void dsp_x86_sse_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
01451 {
01452     int iDataCntr;
01453 
01454     X86_ASM (
01455         "movss %0, %%xmm1\n\t" \
01456         "movss %1, %%xmm2\n\t"
01457         :
01458         : "m" (fAdd),
01459           "m" (fMul)
01460         : "xmm1", "xmm2", "memory");
01461     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01462     {
01463         X86_ASM (
01464             "movss %1, %%xmm0\n\t" \
01465             "addss %%xmm1, %%xmm0\n\t" \
01466             "mulss %%xmm2, %%xmm0\n\t" \
01467             "movss %%xmm0, %0\n\t"
01468             : "=m" (fpVect[iDataCntr])
01469             : "0" (fpVect[iDataCntr])
01470             : "xmm0", "xmm1", "xmm2", "memory");
01471     }
01472 }
01473 
01474 
/*
 * Multiply-accumulate: returns the sum over i of fpSrc1[i] * fpSrc2[i]
 * (single precision, 3DNow! instructions).
 *
 *   fpSrc1      - first vector of iDataLength floats
 *   fpSrc2      - second vector of iDataLength floats
 *   iDataLength - number of floats
 *
 * mm0 is cleared and used as the accumulator.  Two products are formed per
 * iteration and folded into mm0 with pfacc; an odd trailing element is
 * loaded with movd and folded in the same way.  A final "pfacc mm0, mm0"
 * adds the two halves of the accumulator before the low float is stored to
 * fRes.  femms is issued before returning.
 *
 * NOTE(review): mm0 carries the running sum from one asm statement to the
 * next; this relies on the compiler leaving it untouched in between.
 */
01475 float dsp_x86_3dnow_macf (const float *fpSrc1, const float *fpSrc2, 
01476     int iDataLength)
01477 {
01478     int iDataCntr;
01479     int iDataCount;
01480     float fRes;
01481     stpm64 m64pSrc1 = (stpm64) fpSrc1;
01482     stpm64 m64pSrc2 = (stpm64) fpSrc2;
01483 
          /* Number of complete float pairs */
01484     iDataCount = (iDataLength >> 1);
01485     X86_ASM (
01486         "pxor %%mm0, %%mm0\n\t"
01487         :
01488         :
01489         : "mm0");
01490     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01491     {
01492         X86_ASM (
01493             "movq %0, %%mm1\n\t" \
01494             "movq %1, %%mm2\n\t" \
01495             "pfmul %%mm2, %%mm1\n\t" \
01496             "pfacc %%mm1, %%mm0\n\t"
01497             :
01498             : "m" (m64pSrc1[iDataCntr]),
01499               "m" (m64pSrc2[iDataCntr])
01500             : "mm0", "mm1", "mm2", "memory");
01501     }
          /* Odd length: fold in the last product on its own */
01502     if (iDataLength & 0x1)
01503     {
01504         X86_ASM (
01505             "movd %0, %%mm1\n\t" \
01506             "movd %1, %%mm2\n\t" \
01507             "pfmul %%mm2, %%mm1\n\t" \
01508             "pfacc %%mm1, %%mm0\n\t"
01509             :
01510             : "m" (fpSrc1[iDataLength - 1]),
01511               "m" (fpSrc2[iDataLength - 1])
01512             : "mm0", "mm1", "mm2", "memory");
01513     }
          /* Horizontal add of the accumulator, then store the result */
01514     X86_ASM (
01515         "pfacc %%mm0, %%mm0\n\t" \
01516         "movd %%mm0, %0\n\t"
01517         : "=m" (fRes)
01518         :
01519         : "mm0", "memory");
01520     X86_ASM ("femms\n\t");
01521 
01522     return fRes;
01523 }
01524 
01525 
/*
 * Multiply-accumulate: returns the sum over i of fpSrc1[i] * fpSrc2[i]
 * (single precision, scalar SSE instructions).
 *
 *   fpSrc1      - first vector of iDataLength floats
 *   fpSrc2      - second vector of iDataLength floats
 *   iDataLength - number of floats
 *
 * xmm0 is cleared, accumulates one product per iteration and is stored to
 * fRes at the end.
 *
 * NOTE(review): xmm0 carries the running sum from one asm statement to the
 * next; this relies on the compiler leaving it untouched in between.
 */
01526 float dsp_x86_sse_macf (const float *fpSrc1, const float *fpSrc2,
01527     int iDataLength)
01528 {
01529     int iDataCntr;
01530     float fRes;
01531     
01532     X86_ASM (
01533         "xorps %%xmm0, %%xmm0\n\t"
01534         :
01535         :
01536         : "xmm0");
01537     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01538     {
01539         X86_ASM (
01540             "movss %0, %%xmm1\n\t" \
01541             "mulss %1, %%xmm1\n\t" \
01542             "addss %%xmm1, %%xmm0\n\t"
01543             :
01544             : "m" (fpSrc1[iDataCntr]),
01545               "m" (fpSrc2[iDataCntr])
01546             : "xmm0", "xmm1", "xmm2", "memory");
01547     }
01548     X86_ASM (
01549         "movss %%xmm0, %0\n\t"
01550         : "=m" (fRes)
01551         :
01552         : "xmm0");
01553 
01554     return fRes;
01555 }
01556 
01557 
/*
 * Multiply-accumulate: returns the sum over i of dpSrc1[i] * dpSrc2[i]
 * (double precision, scalar SSE2 instructions).
 *
 *   dpSrc1      - first vector of iDataLength doubles
 *   dpSrc2      - second vector of iDataLength doubles
 *   iDataLength - number of doubles
 *
 * xmm0 is cleared, accumulates one product per iteration and is stored to
 * dRes at the end.
 *
 * NOTE(review): xmm0 carries the running sum from one asm statement to the
 * next; this relies on the compiler leaving it untouched in between.
 */
01558 double dsp_x86_sse_mac (const double *dpSrc1, const double *dpSrc2,
01559     int iDataLength)
01560 {
01561     int iDataCntr;
01562     double dRes;
01563     
01564     X86_ASM (
01565         "xorpd %%xmm0, %%xmm0\n\t"
01566         :
01567         :
01568         : "xmm0");
01569     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01570     {
01571         X86_ASM (
01572             "movsd %0, %%xmm1\n\t" \
01573             "mulsd %1, %%xmm1\n\t" \
01574             "addsd %%xmm1, %%xmm0\n\t"
01575             :
01576             : "m" (dpSrc1[iDataCntr]),
01577               "m" (dpSrc2[iDataCntr])
01578             : "xmm0", "xmm1", "xmm2", "memory");
01579     }
01580     X86_ASM (
01581         "movsd %%xmm0, %0\n\t"
01582         : "=m" (dRes)
01583         :
01584         : "xmm0");
01585 
01586     return dRes;
01587 }
01588 
01589 
/*
 * Find the smallest and largest value of a float vector (3DNow!).
 *
 *   fpMin       - receives the minimum
 *   fpMax       - receives the maximum
 *   fpSrc       - vector of iDataLength floats
 *   iDataLength - number of floats
 *
 * mm1 holds two running minima (seeded with FLT_MAX) and mm2 two running
 * maxima (seeded with -FLT_MAX).  The main loop compares two floats per
 * iteration; afterwards the two lanes of each register are reduced with
 * pswapd + pfmin/pfmax and the low float is stored.  For iDataLength == 0
 * the seeds are returned unchanged.  femms is issued before returning.
 *
 * Fix: for an odd iDataLength the last element was loaded with movd, which
 * zeroes the upper lane of mm0, so 0.0 was folded into both the minimum and
 * the maximum (e.g. {1, 2, 3} reported a minimum of 0).  The element is now
 * duplicated into both lanes with punpckldq before the comparison.
 *
 * NOTE(review): mm1/mm2 carry the running results from one asm statement to
 * the next; this relies on the compiler leaving them untouched in between.
 */
01590 void dsp_x86_3dnow_minmaxf (float *fpMin, float *fpMax, const float *fpSrc, 
01591     int iDataLength)
01592 {
01593     int iDataCntr;
01594     int iDataCount;
01595     stm64 m64Min;
01596     stm64 m64Max;
01597     stpm64 m64pSrc = (stpm64) fpSrc;
01598     
01599     m64Min.f[0] = m64Min.f[1] = FLT_MAX;
01600     m64Max.f[0] = m64Max.f[1] = -FLT_MAX;
          /* Number of complete float pairs */
01601     iDataCount = (iDataLength >> 1);
01602     X86_ASM (
01603         "movq %0, %%mm1\n\t" \
01604         "movq %1, %%mm2\n\t"
01605         :
01606         : "m" (m64Min),
01607           "m" (m64Max)
01608         : "mm1", "mm2", "memory");
01609     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01610     {
01611         X86_ASM (
01612             "movq %0, %%mm0\n\t" \
01613             "pfmin %%mm0, %%mm1\n\t" \
01614             "pfmax %%mm0, %%mm2\n\t"
01615             :
01616             : "m" (m64pSrc[iDataCntr])
01617             : "mm0", "mm1", "mm2", "memory");
01618     }
          /* Odd length: compare the last float, replicated into both lanes
             so that the zeroed upper half of movd cannot win */
01619     if (iDataLength & 0x1)
01620     {
01621         X86_ASM (
01622             "movd %0, %%mm0\n\t" \
                  "punpckldq %%mm0, %%mm0\n\t" \
01623             "pfmin %%mm0, %%mm1\n\t" \
01624             "pfmax %%mm0, %%mm2\n\t"
01625             :
01626             : "m" (fpSrc[iDataLength - 1])
01627             : "mm0", "mm1", "mm2", "memory");
01628     }
          /* Reduce the two lanes of each register and store the results */
01629     X86_ASM (
01630         "pswapd %%mm1, %%mm3\n\t" \
01631         "pfmin %%mm3, %%mm1\n\t" \
01632         "pswapd %%mm2, %%mm3\n\t" \
01633         "pfmax %%mm3, %%mm2\n\t" \
01634         "movd %%mm1, %0\n\t" \
01635         "movd %%mm2, %1\n\t"
01636         : "=m" (*fpMin),
01637           "=m" (*fpMax)
01638         :
01639         : "mm1", "mm2", "mm3", "memory");
01640     X86_ASM ("femms\n\t");
01641 }
01642 
01643 
/*
 * Find the smallest and largest value of a float vector (scalar SSE).
 *
 *   fpMin       - receives the minimum
 *   fpMax       - receives the maximum
 *   fpSrc       - vector of iDataLength floats
 *   iDataLength - number of floats
 *
 * *fpMin and *fpMax are first set to FLT_MAX and -FLT_MAX and loaded into
 * xmm0 and xmm1, which are then updated with minss/maxss for every element
 * and stored back at the end.  For iDataLength == 0 the seeds are returned
 * unchanged.
 *
 * NOTE(review): xmm0/xmm1 carry the running results from one asm statement
 * to the next; this relies on the compiler leaving them untouched in
 * between.
 */
01644 void dsp_x86_sse_minmaxf (float *fpMin, float *fpMax, const float *fpSrc, 
01645     int iDataLength)
01646 {
01647     int iDataCntr;
01648 
01649     *fpMin = FLT_MAX;
01650     *fpMax = -FLT_MAX;
01651     X86_ASM (
01652         "movss %0, %%xmm0\n\t" \
01653         "movss %1, %%xmm1\n\t"
01654         :
01655         : "m" (*fpMin),
01656           "m" (*fpMax)
01657         : "xmm0", "xmm1", "memory");
01658     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01659     {
01660         X86_ASM (
01661             "movss %0, %%xmm2\n\t" \
01662             "minss %%xmm2, %%xmm0\n\t" \
01663             "maxss %%xmm2, %%xmm1\n\t"
01664             :
01665             : "m" (fpSrc[iDataCntr])
01666             : "xmm0", "xmm1", "xmm2", "memory");
01667     }
01668     X86_ASM (
01669         "movss %%xmm0, %0\n\t" \
01670         "movss %%xmm1, %1\n\t"
01671         : "=m" (*fpMin),
01672           "=m" (*fpMax)
01673         :
01674         : "xmm0", "xmm1", "memory");
01675 }
01676 
01677 
/*
 * Find the smallest and largest value of a double vector (scalar SSE2).
 *
 *   dpMin       - receives the minimum
 *   dpMax       - receives the maximum
 *   dpSrc       - vector of iDataLength doubles
 *   iDataLength - number of doubles
 *
 * *dpMin and *dpMax are first set to DBL_MAX and -DBL_MAX and loaded into
 * xmm0 and xmm1, which are then updated with minsd/maxsd for every element
 * and stored back at the end.  For iDataLength == 0 the seeds are returned
 * unchanged.
 *
 * Fixes:
 *  - the results were stored with movss, which writes only the low 32 bits
 *    of each double and left the upper half of the seed in place; they are
 *    now stored with movsd;
 *  - the seeds were FLT_MAX / -FLT_MAX, so data lying entirely outside the
 *    float range produced a wrong result; DBL_MAX / -DBL_MAX are used
 *    instead.
 *
 * NOTE(review): xmm0/xmm1 carry the running results from one asm statement
 * to the next; this relies on the compiler leaving them untouched in
 * between.
 */
01678 void dsp_x86_sse_minmax (double *dpMin, double *dpMax, const double *dpSrc, 
01679     int iDataLength)
01680 {
01681     int iDataCntr;
01682 
01683     *dpMin = DBL_MAX;
01684     *dpMax = -DBL_MAX;
01685     X86_ASM (
01686         "movsd %0, %%xmm0\n\t" \
01687         "movsd %1, %%xmm1\n\t"
01688         :
01689         : "m" (*dpMin),
01690           "m" (*dpMax)
01691         : "xmm0", "xmm1", "memory");
01692     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01693     {
01694         X86_ASM (
01695             "movsd %0, %%xmm2\n\t" \
01696             "minsd %%xmm2, %%xmm0\n\t" \
01697             "maxsd %%xmm2, %%xmm1\n\t"
01698             :
01699             : "m" (dpSrc[iDataCntr])
01700             : "xmm0", "xmm1", "xmm2", "memory");
01701     }
          /* Full 64-bit stores of the double results */
01702     X86_ASM (
01703         "movsd %%xmm0, %0\n\t" \
01704         "movsd %%xmm1, %1\n\t"
01705         : "=m" (*dpMin),
01706           "=m" (*dpMax)
01707         :
01708         : "xmm0", "xmm1", "memory");
01709 }
01710 
01711 
/*
 * Normalised cross-correlation of two float vectors (3DNow!).
 *
 *   fpSrc1      - first vector of iDataLength floats
 *   fpSrc2      - second vector of iDataLength floats
 *   iDataLength - number of floats
 *
 * Returns the value stored to fRes by the final asm block.
 *
 * Three accumulators are cleared and then updated with pfacc, two floats
 * per iteration plus an odd trailing element loaded with movd:
 *   mm5 - sum of src1[i] * src2[i]
 *   mm3 - sum of src1[i] * src1[i]
 *   mm4 - sum of src2[i] * src2[i]
 *
 * The final block converts iDataLength to float and takes its reciprocal,
 * forms sqrt(mm3 * mm4) through the pfrsqrt/pfrsqit1/pfrcpit2 sequence,
 * scales it, takes the reciprocal of that and multiplies it with the scaled
 * product sum.
 * NOTE(review): this appears to mirror the formula of
 * dsp_x86_sse_crosscorrf, (prodsum * scale) / (sqrt(sq1 * sq2) * scale);
 * the scale used in the multiplies is mm6, the pfrcp estimate, not the
 * refined value left in mm7 - confirm this is intended.
 *
 * femms is issued before returning.
 *
 * NOTE(review): mm3/mm4/mm5 carry the running sums from one asm statement
 * to the next; this relies on the compiler leaving them untouched in
 * between.
 */
01712 float dsp_x86_3dnow_crosscorrf (const float *fpSrc1, const float *fpSrc2,
01713     int iDataLength)
01714 {
01715     int iDataCntr;
01716     int iDataCount;
01717     float fRes;
01718     stpm64 m64pSrc1 = (stpm64) fpSrc1;
01719     stpm64 m64pSrc2 = (stpm64) fpSrc2;
01720     
          /* Number of complete float pairs */
01721     iDataCount = (iDataLength >> 1);
01722     X86_ASM (
01723         "pxor %%mm3, %%mm3\n\t" \
01724         "pxor %%mm4, %%mm4\n\t" \
01725         "pxor %%mm5, %%mm5\n\t"
01726         :
01727         :
01728         : "mm3", "mm4", "mm5");
01729     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01730     {
01731         X86_ASM (
01732             "movq %0, %%mm0\n\t" \
01733             "movq %1, %%mm1\n\t" \
01734             "movq %%mm1, %%mm2\n\t" \
01735             "pfmul %%mm0, %%mm2\n\t" \
01736             "pfacc %%mm2, %%mm5\n\t" \
01737             "pfmul %%mm0, %%mm0\n\t" \
01738             "pfacc %%mm0, %%mm3\n\t" \
01739             "pfmul %%mm1, %%mm1\n\t" \
01740             "pfacc %%mm1, %%mm4\n\t"
01741             :
01742             : "m" (m64pSrc1[iDataCntr]),
01743               "m" (m64pSrc2[iDataCntr])
01744             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01745     }
          /* Odd length: fold in the last element pair on its own */
01746     if (iDataLength & 0x1)
01747     {
01748         X86_ASM (
01749             "movd %0, %%mm0\n\t" \
01750             "movd %1, %%mm1\n\t" \
01751             "movq %%mm1, %%mm2\n\t" \
01752             "pfmul %%mm0, %%mm2\n\t" \
01753             "pfacc %%mm2, %%mm5\n\t" \
01754             "pfmul %%mm0, %%mm0\n\t" \
01755             "pfacc %%mm0, %%mm3\n\t" \
01756             "pfmul %%mm1, %%mm1\n\t" \
01757             "pfacc %%mm1, %%mm4\n\t"
01758             :
01759             : "m" (fpSrc1[iDataLength - 1]),
01760               "m" (fpSrc2[iDataLength - 1])
01761             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01762     }
          /* Horizontal adds, then scale / square root / divide (see the
             header comment) */
01763     X86_ASM (
01764         "pfacc %%mm3, %%mm3\n\t" \
01765         "pfacc %%mm4, %%mm4\n\t" \
01766         "pfacc %%mm5, %%mm5\n\t" \
01767         \
01768         "movd %1, %%mm6\n\t" \
01769         "pswapd %%mm6, %%mm7\n\t" \
01770         "paddd %%mm7, %%mm6\n\t" \
01771         "pi2fd %%mm6, %%mm7\n\t" \
01772         \
01773         "pfrcp %%mm7, %%mm6\n\t" \
01774         "pfrcpit1 %%mm6, %%mm7\n\t" \
01775         "pfrcpit2 %%mm6, %%mm7\n\t" \
01776         \
01777         "pfmul %%mm3, %%mm4\n\t" \
01778         \
01779         "movq %%mm4, %%mm0\n\t" \
01780         "pfrsqrt %%mm4, %%mm1\n\t" \
01781         "movq %%mm1, %%mm2\n\t" \
01782         "pfmul %%mm1, %%mm1\n\t" \
01783         "pfrsqit1 %%mm4, %%mm1\n\t" \
01784         "pfrcpit2 %%mm2, %%mm1\n\t" \
01785         "pfmul %%mm1, %%mm4\n\t" \
01786         \
01787         "pfmul %%mm6, %%mm4\n\t" \
01788         \
01789         "pfrcp %%mm4, %%mm0\n\t" \
01790         "pfrcpit1 %%mm0, %%mm4\n\t" \
01791         "pfrcpit2 %%mm0, %%mm4\n\t" \
01792         \
01793         "pfmul %%mm6, %%mm5\n\t" \
01794         "pfmul %%mm4, %%mm5\n\t" \
01795         "movd %%mm5, %0\n\t"
01796         : "=m" (fRes)
01797         : "m" (iDataLength)
01798         : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
01799     X86_ASM ("femms\n\t");
01800 
01801     return fRes;
01802 }
01803 
01804 
/*
 * Normalised cross-correlation of two float vectors (scalar SSE).
 *
 *   fpSrc1      - first vector of iDataLength floats
 *   fpSrc2      - second vector of iDataLength floats
 *   iDataLength - number of floats
 *
 * Returns (P * s) / (sqrtf(S1 * S2) * s) with
 *   P  = sum of src1[i] * src2[i]   (xmm0)
 *   S1 = sum of src1[i] * src1[i]   (xmm1)
 *   S2 = sum of src2[i] * src2[i]   (xmm2)
 *   s  = 1 / iDataLength
 *
 * The running sums are written back to the C variables on every iteration;
 * the final arithmetic is done in C.
 *
 * Fix: the three sums are now initialised to zero.  They are only written
 * inside the loop, so for iDataLength <= 0 the function used to read
 * uninitialised variables.  Results for iDataLength > 0 are unchanged; for
 * zero length the expression now evaluates to NaN under IEEE arithmetic
 * rather than an indeterminate value.
 *
 * NOTE(review): xmm0..xmm2 carry the running sums from one asm statement to
 * the next; this relies on the compiler leaving them untouched in between.
 */
01805 float dsp_x86_sse_crosscorrf (const float *fpSrc1, const float *fpSrc2,
01806     int iDataLength)
01807 {
01808     int iDataCntr;
01809     float fScale;
01810     float fNormFact;
01811     float fProdSum = 0.0F;
01812     float fSqSum1 = 0.0F;
01813     float fSqSum2 = 0.0F;
01814     float fRes;
01815     
01816     X86_ASM (
01817         "xorps %%xmm0, %%xmm0\n\t" \
01818         "xorps %%xmm1, %%xmm1\n\t" \
01819         "xorps %%xmm2, %%xmm2\n\t"
01820         :
01821         :
01822         : "xmm0", "xmm1", "xmm2");
01823     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01824     {
01825         X86_ASM (
01826             "movss %3, %%xmm3\n\t" \
01827             "movss %4, %%xmm4\n\t" \
01828             \
01829             "movss %%xmm4, %%xmm5\n\t" \
01830             "mulss %%xmm3, %%xmm5\n\t" \
01831             "addss %%xmm5, %%xmm0\n\t" \
01832             \
01833             "movss %%xmm3, %%xmm5\n\t" \
01834             "mulss %%xmm3, %%xmm5\n\t" \
01835             "addss %%xmm5, %%xmm1\n\t" \
01836             \
01837             "movss %%xmm4, %%xmm5\n\t" \
01838             "mulss %%xmm4, %%xmm5\n\t" \
01839             "addss %%xmm5, %%xmm2\n\t" \
01840             \
01841             "movss %%xmm0, %0\n\t" \
01842             "movss %%xmm1, %1\n\t" \
01843             "movss %%xmm2, %2\n\t"
01844             : "=m" (fProdSum),
01845               "=m" (fSqSum1),
01846               "=m" (fSqSum2)
01847             : "m" (fpSrc1[iDataCntr]),
01848               "m" (fpSrc2[iDataCntr])
01849             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
01850     }
01851     fScale = 1.0F / iDataLength;
01852     fNormFact = sqrtf(fSqSum1 * fSqSum2) * fScale;
01853     fRes = (fProdSum * fScale) / fNormFact;
01854 
01855     return fRes;
01856 }
01857 
01858 
/*
 * Normalised cross-correlation of two double vectors (scalar SSE2).
 *
 *   dpSrc1      - first vector of iDataLength doubles
 *   dpSrc2      - second vector of iDataLength doubles
 *   iDataLength - number of doubles
 *
 * Returns (P * s) / (sqrt(S1 * S2) * s) with
 *   P  = sum of src1[i] * src2[i]   (xmm0)
 *   S1 = sum of src1[i] * src1[i]   (xmm1)
 *   S2 = sum of src2[i] * src2[i]   (xmm2)
 *   s  = 1 / iDataLength
 *
 * The running sums are written back to the C variables on every iteration;
 * the final arithmetic is done in C.
 *
 * Fix: the three sums are now initialised to zero.  They are only written
 * inside the loop, so for iDataLength <= 0 the function used to read
 * uninitialised variables.  Results for iDataLength > 0 are unchanged; for
 * zero length the expression now evaluates to NaN under IEEE arithmetic
 * rather than an indeterminate value.
 *
 * NOTE(review): xmm0..xmm2 carry the running sums from one asm statement to
 * the next; this relies on the compiler leaving them untouched in between.
 */
01859 double dsp_x86_sse_crosscorr (const double *dpSrc1, const double *dpSrc2,
01860     int iDataLength)
01861 {
01862     int iDataCntr;
01863     double dScale;
01864     double dNormFact;
01865     double dProdSum = 0.0;
01866     double dSqSum1 = 0.0;
01867     double dSqSum2 = 0.0;
01868     double dRes;
01869     
01870     X86_ASM (
01871         "xorpd %%xmm0, %%xmm0\n\t" \
01872         "xorpd %%xmm1, %%xmm1\n\t" \
01873         "xorpd %%xmm2, %%xmm2\n\t"
01874         :
01875         :
01876         : "xmm0", "xmm1", "xmm2");
01877     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01878     {
01879         X86_ASM (
01880             "movsd %3, %%xmm3\n\t" \
01881             "movsd %4, %%xmm4\n\t" \
01882             \
01883             "movsd %%xmm4, %%xmm5\n\t" \
01884             "mulsd %%xmm3, %%xmm5\n\t" \
01885             "addsd %%xmm5, %%xmm0\n\t" \
01886             \
01887             "movsd %%xmm3, %%xmm5\n\t" \
01888             "mulsd %%xmm3, %%xmm5\n\t" \
01889             "addsd %%xmm5, %%xmm1\n\t" \
01890             \
01891             "movsd %%xmm4, %%xmm5\n\t" \
01892             "mulsd %%xmm4, %%xmm5\n\t" \
01893             "addsd %%xmm5, %%xmm2\n\t" \
01894             \
01895             "movsd %%xmm0, %0\n\t" \
01896             "movsd %%xmm1, %1\n\t" \
01897             "movsd %%xmm2, %2\n\t"
01898             : "=m" (dProdSum),
01899               "=m" (dSqSum1),
01900               "=m" (dSqSum2)
01901             : "m" (dpSrc1[iDataCntr]),
01902               "m" (dpSrc2[iDataCntr])
01903             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
01904     }
01905     dScale = 1.0 / iDataLength;
01906     dNormFact = sqrt(dSqSum1 * dSqSum2) * dScale;
01907     dRes = (dProdSum * dScale) / dNormFact;
01908 
01909     return dRes;
01910 }
01911 
01912 
/*
 * Convert 16-bit integers to floats scaled by 1 / iIntMax (3DNow!).
 *
 *   fpDest      - destination vector of iDataLength floats
 *   ipSrc       - source vector of iDataLength shorts
 *   iDataLength - number of samples
 *   iIntMax     - divisor; every sample is multiplied by its reciprocal
 *
 * The first asm block converts iIntMax to float, computes its reciprocal
 * with the pfrcp/pfrcpit1/pfrcpit2 sequence, leaves it in both lanes of mm1
 * and also stores it to fScale.  The loop converts two samples per
 * iteration (32-bit load, 64-bit movntq store); an odd trailing sample is
 * converted in plain C after femms, using fScale.
 *
 * Fix: the pair loop used the bound "iDataCntr < iDataLength", so for an
 * odd length its last iteration read ipSrc[iDataLength] and wrote
 * fpDest[iDataLength], one element past the end of both arrays.  The loop
 * now stops before an incomplete pair and leaves the last sample to the
 * scalar tail, which was already present.
 *
 * NOTE(review): mm1 carries the scale from the first asm statement into the
 * loop; this relies on the compiler leaving it untouched in between.
 */
01913 void dsp_x86_3dnow_i16tof (float *fpDest, const short *ipSrc, int iDataLength,
01914     int iIntMax)
01915 {
01916     int iDataCntr;
01917     float fScale;
01918     
01919     X86_ASM (
01920         "movd %1, %%mm1\n\t" \
01921         "pswapd %%mm1, %%mm2\n\t" \
01922         "paddd %%mm2, %%mm1\n\t" \
01923         "pi2fd %%mm1, %%mm1\n\t" \
01924         "pfrcp %%mm1, %%mm2\n\t" \
01925         "pfrcpit1 %%mm2, %%mm1\n\t" \
01926         "pfrcpit2 %%mm2, %%mm1\n\t" \
01927         "movd %%mm1, %0\n\t"
01928         : "=m" (fScale)
01929         : "m" (iIntMax)
01930         : "mm1", "mm2", "memory");
          /* Complete pairs only; an odd last sample is handled below */
01931     for (iDataCntr = 0; iDataCntr < (iDataLength - 1); iDataCntr += 2)
01932     {
01933         X86_ASM (
01934             "movd %1, %%mm0\n\t" \
01935             "punpcklwd %%mm0, %%mm0\n\t" \
01936             "pi2fw %%mm0, %%mm0\n\t" \
01937             "pfmul %%mm1, %%mm0\n\t" \
01938             "movntq %%mm0, %0\n\t"
01939             : "=m" (fpDest[iDataCntr])
01940             : "m" (ipSrc[iDataCntr])
01941             : "mm0", "mm1", "memory");
01942     }
01943     X86_ASM (
01944         "femms\n\t" \
01945         "sfence\n\t");
01946     if ((iDataLength % 2) != 0)
01947     {
01948         fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01949     }
01950 }
01951 
01952 
01953 void dsp_x86_3dnow_i32tof (float *fpDest, const int *ipSrc, int iDataLength,
01954     int iIntMax)
01955 {
01956     int iDataCntr;
01957     float fScale;
01958     
01959     X86_ASM (
01960         "movd %1, %%mm1\n\t" \
01961         "pswapd %%mm1, %%mm2\n\t" \
01962         "paddd %%mm2, %%mm1\n\t" \
01963         "pi2fd %%mm1, %%mm1\n\t" \
01964         "pfrcp %%mm1, %%mm2\n\t" \
01965         "pfrcpit1 %%mm2, %%mm1\n\t" \
01966         "pfrcpit2 %%mm2, %%mm1\n\t" \
01967         "movd %%mm1, %0\n\t"
01968         : "=m" (fScale)
01969         : "m" (iIntMax)
01970         : "mm1", "mm2", "memory");
01971     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
01972     {
01973         X86_ASM (
01974             "movq %1, %%mm0\n\t" \
01975             "pi2fd %%mm0, %%mm0\n\t" \
01976             "pfmul %%mm1, %%mm0\n\t" \
01977             "movntq %%mm0, %0\n\t"
01978             : "=m" (fpDest[iDataCntr])
01979             : "m" (ipSrc[iDataCntr])
01980             : "mm0", "mm1", "memory");
01981     }
01982     X86_ASM (
01983         "femms\n\t" \
01984         "sfence\n\t");
01985     if ((iDataLength % 2) != 0)
01986     {
01987         fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01988     }
01989 }
01990 
01991 
01992 void dsp_x86_3dnow_firf (float *fpDest, const float *fpSrc, int iDataLength, 
01993     const float *fpCoeff, int iCoeffLength)
01994 {
01995     int iSrcCntr;
01996     int iDestCntr;
01997     int iCoeffCntr;
01998     int iSrcCount;
01999     stpm64 m64pDest = (stpm64) fpDest;
02000 
02001     iDestCntr = 0;
02002     iSrcCount = iDataLength + iCoeffLength;
02003     for (iSrcCntr = iCoeffLength; 
02004         iSrcCntr < iSrcCount; 
02005         iSrcCntr += 2)
02006     {
02007         X86_ASM (
02008             "pxor %%mm0, %%mm0\n\t" 
02009             :
02010             :
02011             : "mm0");
02012         for (iCoeffCntr = 0; 
02013             iCoeffCntr < iCoeffLength;
02014             iCoeffCntr++)
02015         {
02016             X86_ASM (
02017                 "movq %0, %%mm1\n\t" \
02018                 "movd %1, %%mm2\n\t" \
02019                 "pswapd %%mm2, %%mm3\n\t" \
02020                 "pfadd %%mm3, %%mm2\n\t" \
02021                 "pfmul %%mm2, %%mm1\n\t" \
02022                 "pfadd %%mm1, %%mm0\n\t" 
02023                 :
02024                 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
02025                   "m" (fpCoeff[iCoeffCntr])
02026                 : "mm0", "mm1", "mm2", "mm3", "memory");
02027         }
02028         X86_ASM (
02029             "movntq %%mm0, %0\n\t"
02030             : "=m" (m64pDest[iDestCntr++])
02031             :
02032             : "mm0", "memory");
02033     }
02034     if (iDataLength & 0x1)
02035     {
02036         X86_ASM (
02037             "pxor %%mm0, %%mm0\n\t" 
02038             :
02039             :
02040             : "mm0");
02041         for (iCoeffCntr = 0; 
02042             iCoeffCntr < iCoeffLength;
02043             iCoeffCntr++)
02044         {
02045             X86_ASM (
02046                 "movd %0, %%mm1\n\t" \
02047                 "movd %1, %%mm2\n\t" \
02048                 "pfmul %%mm2, %%mm1\n\t" \
02049                 "pfadd %%mm1, %%mm0\n\t" 
02050                 :
02051                 : "m" (fpSrc[iDataLength - 1 - iCoeffCntr]),
02052                   "m" (fpCoeff[iCoeffCntr])
02053                 : "mm0", "mm1", "mm2", "memory");
02054         }
02055         X86_ASM (
02056             "movd %%mm0, %0\n\t"
02057             : "=m" (fpDest[iDataLength - 1])
02058             :
02059             : "mm0", "memory");
02060     }
02061     X86_ASM (
02062         "femms\n\t" \
02063         "sfence\n\t");
02064 }
02065 
02066 
02067 void dsp_x86_sse_firf (float *fpDest, const float *fpSrc, int iDataLength, 
02068     const float *fpCoeff, int iCoeffLength)
02069 {
02070     int iDestCntr;
02071     int iSrcCntr;
02072     int iCoeffCntr;
02073     int iSrcCount;
02074 
02075     iDestCntr = 0;
02076     iSrcCount = iDataLength + iCoeffLength;
02077     for (iSrcCntr = iCoeffLength;
02078         iSrcCntr < iSrcCount;
02079         iSrcCntr++)
02080     {
02081         X86_ASM (
02082             "xorps %%xmm0, %%xmm0\n\t"
02083             :
02084             :
02085             : "xmm0");
02086         for (iCoeffCntr = 0;
02087             iCoeffCntr < iCoeffLength;
02088             iCoeffCntr++)
02089         {
02090             X86_ASM (
02091                 "movss %0, %%xmm1\n\t"
02092                 "mulss %1, %%xmm1\n\t"
02093                 "addss %%xmm1, %%xmm0\n\t"
02094                 :
02095                 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
02096                   "m" (fpCoeff[iCoeffCntr])
02097                 : "xmm0", "xmm1", "memory");
02098         }
02099         X86_ASM (
02100             "movss %%xmm0, %0\n\t"
02101             : "=m" (fpDest[iDestCntr++])
02102             :
02103             : "xmm0", "memory");
02104     }
02105 }
02106 
02107 
02108 void dsp_x86_sse_fir (double *dpDest, const double *dpSrc, int iDataLength, 
02109     const double *dpCoeff, int iCoeffLength)
02110 {
02111     int iDestCntr;
02112     int iSrcCntr;
02113     int iCoeffCntr;
02114     int iSrcCount;
02115 
02116     iDestCntr = 0;
02117     iSrcCount = iDataLength + iCoeffLength;
02118     for (iSrcCntr = iCoeffLength;
02119         iSrcCntr < iSrcCount;
02120         iSrcCntr++)
02121     {
02122         X86_ASM (
02123             "xorpd %%xmm0, %%xmm0\n\t"
02124             :
02125             :
02126             : "xmm0");
02127         for (iCoeffCntr = 0;
02128             iCoeffCntr < iCoeffLength;
02129             iCoeffCntr++)
02130         {
02131             X86_ASM (
02132                 "movsd %0, %%xmm1\n\t"
02133                 "mulsd %1, %%xmm1\n\t"
02134                 "addsd %%xmm1, %%xmm0\n\t"
02135                 :
02136                 : "m" (dpSrc[iSrcCntr - iCoeffCntr]),
02137                   "m" (dpCoeff[iCoeffCntr])
02138                 : "xmm0", "xmm1", "memory");
02139         }
02140         X86_ASM (
02141             "movsd %%xmm0, %0\n\t"
02142             : "=m" (dpDest[iDestCntr++])
02143             :
02144             : "xmm0", "memory");
02145     }
02146 }
02147 
02148 
02149 void dsp_x86_3dnow_iirf (float *fpVect, int iDataLength, const float *fpCoeff, 
02150     float *fpX, float *fpY)
02151 {
02152     int iDataCntr;
02153     stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
02154     stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
02155     stpm64 m64pX = (stpm64) fpX;
02156     stpm64 m64pY = (stpm64) fpY;
02157 
02158     X86_ASM (
02159         "movq %0, %%mm0\n\t" \
02160         "pswapd %%mm0, %%mm2\n\t" \
02161         "movd %1, %%mm3\n\t" \
02162         "movq %2, %%mm0\n\t" \
02163         "pswapd %%mm0, %%mm4\n\t" \
02164         "movq %3, %%mm5\n\t" \
02165         "movq %4, %%mm7\n\t" \
02166         :
02167         : "m" (*m64pCoeff),
02168           "m" (fpCoeff[0]),
02169           "m" (*m64pCoeff2),
02170           "m" (*m64pX),
02171           "m" (*m64pY)
02172         : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
02173     for (iDataCntr = 0; 
02174         iDataCntr < iDataLength; 
02175         iDataCntr++)
02176     {
02177         X86_ASM (
02178             "pxor %%mm0, %%mm0\n\t" \
02179             "movd %1, %%mm6\n\t" \
02180             "movq %%mm5, %%mm1\n\t" \
02181             "pfmul %%mm2, %%mm1\n\t" \
02182             "pfacc %%mm1, %%mm0\n\t" \
02183             "movq %%mm6, %%mm1\n\t" \
02184             "pfmul %%mm3, %%mm1\n\t" \
02185             "pfacc %%mm1, %%mm0\n\t" \
02186             "movq %%mm7, %%mm1\n\t" \
02187             "pfmul %%mm4, %%mm1\n\t" \
02188             "pfacc %%mm1, %%mm0\n\t" \
02189             "pfacc %%mm0, %%mm0\n\t" \
02190             \
02191             "pswapd %%mm7, %%mm1\n\t" \
02192             "movq %%mm1, %%mm7\n\t" \
02193             "punpckldq %%mm0, %%mm7\n\t" \
02194             \
02195             "pswapd %%mm5, %%mm1\n\t" \
02196             "movq %%mm1, %%mm5\n\t" \
02197             "movq %%mm6, %%mm1\n\t" \
02198             "punpckldq %%mm1, %%mm5\n\t" \
02199             \
02200             "movd %%mm0, %0\n\t"
02201             : "=m" (fpVect[iDataCntr])
02202             : "0" (fpVect[iDataCntr])
02203             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
02204     }
02205     X86_ASM (
02206         "movq %%mm5, %0\n\t" \
02207         "movd %%mm6, %1\n\t" \
02208         "movq %%mm7, %2\n\t"
02209         : "=m" (*m64pX),
02210           "=m" (fpX[2]),
02211           "=m" (*m64pY)
02212         :
02213         : "mm5", "mm6", "mm7", "memory");
02214     X86_ASM ("femms\n\t");
02215 }
02216 
02217 
02218 void dsp_x86_sse_iirf (float *fpVect, int iDataLength, const float *fpCoeff, 
02219     float *fpX, float *fpY)
02220 {
02221     int iDataCntr;
02222     
02223     X86_ASM (
02224         "movss %0, %%xmm1\n\t" \
02225         "movss %1, %%xmm2\n\t" \
02226         "movss %2, %%xmm3\n\t" \
02227         "movss %3, %%xmm4\n\t" \
02228         "prefetchnta %4\n\t"
02229         :
02230         : "m" (fpX[1]),
02231           "m" (fpX[2]),
02232           "m" (fpY[0]),
02233           "m" (fpY[1]),
02234           "m" (fpCoeff[0])
02235         : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02236     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02237     {
02238         X86_ASM (
02239             "movss %%xmm1, %%xmm0\n\t" \
02240             "movss %%xmm2, %%xmm1\n\t" \
02241             "movss %1, %%xmm2\n\t" \
02242             \
02243             "movss %2, %%xmm5\n\t" \
02244             "mulss %%xmm2, %%xmm5\n\t" \
02245             "movss %3, %%xmm6\n\t" \
02246             "mulss %%xmm1, %%xmm6\n\t" \
02247             "addss %%xmm6, %%xmm5\n\t" \
02248             "movss %4, %%xmm6\n\t" \
02249             "mulss %%xmm0, %%xmm6\n\t" \
02250             "addss %%xmm6, %%xmm5\n\t" \
02251             \
02252             "movss %5, %%xmm6\n\t" \
02253             "mulss %%xmm4, %%xmm6\n\t" \
02254             "movss %6, %%xmm7\n\t" \
02255             "mulss %%xmm3, %%xmm7\n\t" \
02256             "addss %%xmm7, %%xmm6\n\t" \
02257             \
02258             "addss %%xmm5, %%xmm6\n\t" \
02259             "movss %%xmm4, %%xmm3\n\t" \
02260             "movss %%xmm6, %%xmm4\n\t" \
02261             \
02262             "movss %%xmm6, %0\n\t"
02263             : "=m" (fpVect[iDataCntr])
02264             : "0" (fpVect[iDataCntr]),
02265               "m" (fpCoeff[0]),
02266               "m" (fpCoeff[1]),
02267               "m" (fpCoeff[2]),
02268               "m" (fpCoeff[3]),
02269               "m" (fpCoeff[4])
02270             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02271               "memory");
02272     }
02273     X86_ASM (
02274         "movss %%xmm0, %0\n\t" \
02275         "movss %%xmm1, %1\n\t" \
02276         "movss %%xmm2, %2\n\t" \
02277         "movss %%xmm3, %3\n\t" \
02278         "movss %%xmm4, %4\n\t"
02279         : "=m" (fpX[0]),
02280           "=m" (fpX[1]),
02281           "=m" (fpX[2]),
02282           "=m" (fpY[0]),
02283           "=m" (fpY[1])
02284         :
02285         : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02286 }
02287 
02288 
02289 void dsp_x86_sse_iir (double *dpVect, int iDataLength, const double *dpCoeff, 
02290     double *dpX, double *dpY)
02291 {
02292     int iDataCntr;
02293     
02294     X86_ASM (
02295         "movsd %0, %%xmm1\n\t" \
02296         "movsd %1, %%xmm2\n\t" \
02297         "movsd %2, %%xmm3\n\t" \
02298         "movsd %3, %%xmm4\n\t" \
02299         "prefetchnta %4\n\t" \
02300         "prefetchnta %5\n\t"
02301         :
02302         : "m" (dpX[1]),
02303           "m" (dpX[2]),
02304           "m" (dpY[0]),
02305           "m" (dpY[1]),
02306           "m" (dpCoeff[0]),
02307           "m" (dpCoeff[3])
02308         : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02309     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02310     {
02311         X86_ASM (
02312             "movsd %%xmm1, %%xmm0\n\t" \
02313             "movsd %%xmm2, %%xmm1\n\t" \
02314             "movsd %1, %%xmm2\n\t" \
02315             \
02316             "movsd %2, %%xmm5\n\t" \
02317             "mulsd %%xmm2, %%xmm5\n\t" \
02318             "movsd %3, %%xmm6\n\t" \
02319             "mulsd %%xmm1, %%xmm6\n\t" \
02320             "addsd %%xmm6, %%xmm5\n\t" \
02321             "movsd %4, %%xmm6\n\t" \
02322             "mulsd %%xmm0, %%xmm6\n\t" \
02323             "addsd %%xmm6, %%xmm5\n\t" \
02324             \
02325             "movsd %5, %%xmm6\n\t" \
02326             "mulsd %%xmm4, %%xmm6\n\t" \
02327             "movsd %6, %%xmm7\n\t" \
02328             "mulsd %%xmm3, %%xmm7\n\t" \
02329             "addsd %%xmm7, %%xmm6\n\t" \
02330             \
02331             "addsd %%xmm5, %%xmm6\n\t" \
02332             "movsd %%xmm4, %%xmm3\n\t" \
02333             "movsd %%xmm6, %%xmm4\n\t" \
02334             \
02335             "movsd %%xmm6, %0\n\t"
02336             : "=m" (dpVect[iDataCntr])
02337             : "0" (dpVect[iDataCntr]),
02338               "m" (dpCoeff[0]),
02339               "m" (dpCoeff[1]),
02340               "m" (dpCoeff[2]),
02341               "m" (dpCoeff[3]),
02342               "m" (dpCoeff[4])
02343             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02344               "memory");
02345     }
02346     X86_ASM (
02347         "movsd %%xmm0, %0\n\t" \
02348         "movsd %%xmm1, %1\n\t" \
02349         "movsd %%xmm2, %2\n\t" \
02350         "movsd %%xmm3, %3\n\t" \
02351         "movsd %%xmm4, %4\n\t"
02352         : "=m" (dpX[0]),
02353           "=m" (dpX[1]),
02354           "=m" (dpX[2]),
02355           "=m" (dpY[0]),
02356           "=m" (dpY[1])
02357         :
02358         : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02359 }
02360 
02361 
02362 void dsp_x86_3dnow_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength, 
02363     const float *fpCoeff, float *fpX, float *fpY)
02364 {
02365     int iDataCntr;
02366     stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
02367     stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
02368     stpm64 m64pX = (stpm64) fpX;
02369     stpm64 m64pY = (stpm64) fpY;
02370 
02371     X86_ASM (
02372         "movq %0, %%mm0\n\t" \
02373         "pswapd %%mm0, %%mm2\n\t" \
02374         "movd %1, %%mm3\n\t" \
02375         "movq %2, %%mm0\n\t" \
02376         "pswapd %%mm0, %%mm4\n\t" \
02377         "movq %3, %%mm5\n\t" \
02378         "movq %4, %%mm7\n\t" \
02379         :
02380         : "m" (*m64pCoeff),
02381           "m" (fpCoeff[0]),
02382           "m" (*m64pCoeff2),
02383           "m" (*m64pX),
02384           "m" (*m64pY)
02385         : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
02386     for (iDataCntr = 0; 
02387         iDataCntr < iDataLength; 
02388         iDataCntr++)
02389     {
02390         X86_ASM (
02391             "pxor %%mm0, %%mm0\n\t" \
02392             "movd %1, %%mm6\n\t" \
02393             "movq %%mm5, %%mm1\n\t" \
02394             "pfmul %%mm2, %%mm1\n\t" \
02395             "pfacc %%mm1, %%mm0\n\t" \
02396             "movq %%mm6, %%mm1\n\t" \
02397             "pfmul %%mm3, %%mm1\n\t" \
02398             "pfacc %%mm1, %%mm0\n\t" \
02399             "movq %%mm7, %%mm1\n\t" \
02400             "pfmul %%mm4, %%mm1\n\t" \
02401             "pfacc %%mm1, %%mm0\n\t" \
02402             "pfacc %%mm0, %%mm0\n\t" \
02403             \
02404             "pswapd %%mm7, %%mm1\n\t" \
02405             "movq %%mm1, %%mm7\n\t" \
02406             "punpckldq %%mm0, %%mm7\n\t" \
02407             \
02408             "pswapd %%mm5, %%mm1\n\t" \
02409             "movq %%mm1, %%mm5\n\t" \
02410             "movq %%mm6, %%mm1\n\t" \
02411             "punpckldq %%mm1, %%mm5\n\t" \
02412             \
02413             "movd %%mm0, %0\n\t"
02414             : "=m" (fpDest[iDataCntr])
02415             : "m" (fpSrc[iDataCntr])
02416             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
02417     }
02418     X86_ASM (
02419         "movq %%mm5, %0\n\t" \
02420         "movd %%mm6, %1\n\t" \
02421         "movq %%mm7, %2\n\t"
02422         : "=m" (*m64pX),
02423           "=m" (fpX[2]),
02424           "=m" (*m64pY)
02425         :
02426         : "mm5", "mm6", "mm7", "memory");
02427     X86_ASM ("femms\n\t");
02428 }
02429 
02430 
02431 void dsp_x86_sse_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength, 
02432     const float *fpCoeff, float *fpX, float *fpY)
02433 {
02434     int iDataCntr;
02435     
02436     X86_ASM (
02437         "movss %0, %%xmm1\n\t" \
02438         "movss %1, %%xmm2\n\t" \
02439         "movss %2, %%xmm3\n\t" \
02440         "movss %3, %%xmm4\n\t" \
02441         "prefetchnta %4\n\t"
02442         :
02443         : "m" (fpX[1]),
02444           "m" (fpX[2]),
02445           "m" (fpY[0]),
02446           "m" (fpY[1]),
02447           "m" (fpCoeff[0])
02448         : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02449     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02450     {
02451         X86_ASM (
02452             "movss %%xmm1, %%xmm0\n\t" \
02453             "movss %%xmm2, %%xmm1\n\t" \
02454             "movss %1, %%xmm2\n\t" \
02455             \
02456             "movss %2, %%xmm5\n\t" \
02457             "mulss %%xmm2, %%xmm5\n\t" \
02458             "movss %3, %%xmm6\n\t" \
02459             "mulss %%xmm1, %%xmm6\n\t" \
02460             "addss %%xmm6, %%xmm5\n\t" \
02461             "movss %4, %%xmm6\n\t" \
02462             "mulss %%xmm0, %%xmm6\n\t" \
02463             "addss %%xmm6, %%xmm5\n\t" \
02464             \
02465             "movss %5, %%xmm6\n\t" \
02466             "mulss %%xmm4, %%xmm6\n\t" \
02467             "movss %6, %%xmm7\n\t" \
02468             "mulss %%xmm3, %%xmm7\n\t" \
02469             "addss %%xmm7, %%xmm6\n\t" \
02470             \
02471             "addss %%xmm5, %%xmm6\n\t" \
02472             "movss %%xmm4, %%xmm3\n\t" \
02473             "movss %%xmm6, %%xmm4\n\t" \
02474             \
02475             "movss %%xmm6, %0\n\t"
02476             : "=m" (fpDest[iDataCntr])
02477             : "m" (fpSrc[iDataCntr]),
02478               "m" (fpCoeff[0]),
02479               "m" (fpCoeff[1]),
02480               "m" (fpCoeff[2]),
02481               "m" (fpCoeff[3]),
02482               "m" (fpCoeff[4])
02483             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02484               "memory");
02485     }
02486     X86_ASM (
02487         "movss %%xmm0, %0\n\t" \
02488         "movss %%xmm1, %1\n\t" \
02489         "movss %%xmm2, %2\n\t" \
02490         "movss %%xmm3, %3\n\t" \
02491         "movss %%xmm4, %4\n\t"
02492         : "=m" (fpX[0]),
02493           "=m" (fpX[1]),
02494           "=m" (fpX[2]),
02495           "=m" (fpY[0]),
02496           "=m" (fpY[1])
02497         :
02498         : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02499 }
02500 
02501 
02502 void dsp_x86_sse_iir_nip (double *dpDest, const double *dpSrc, int iDataLength, 
02503     const double *dpCoeff, double *dpX, double *dpY)
02504 {
02505     int iDataCntr;
02506     
02507     X86_ASM (
02508         "movsd %0, %%xmm1\n\t" \
02509         "movsd %1, %%xmm2\n\t" \
02510         "movsd %2, %%xmm3\n\t" \
02511         "movsd %3, %%xmm4\n\t" \
02512         "prefetchnta %4\n\t" \
02513         "prefetchnta %5\n\t"
02514         :
02515         : "m" (dpX[1]),
02516           "m" (dpX[2]),
02517           "m" (dpY[0]),
02518           "m" (dpY[1]),
02519           "m" (dpCoeff[0]),
02520           "m" (dpCoeff[3])
02521         : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02522     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02523     {
02524         X86_ASM (
02525             "movsd %%xmm1, %%xmm0\n\t" \
02526             "movsd %%xmm2, %%xmm1\n\t" \
02527             "movsd %1, %%xmm2\n\t" \
02528             \
02529             "movsd %2, %%xmm5\n\t" \
02530             "mulsd %%xmm2, %%xmm5\n\t" \
02531             "movsd %3, %%xmm6\n\t" \
02532             "mulsd %%xmm1, %%xmm6\n\t" \
02533             "addsd %%xmm6, %%xmm5\n\t" \
02534             "movsd %4, %%xmm6\n\t" \
02535             "mulsd %%xmm0, %%xmm6\n\t" \
02536             "addsd %%xmm6, %%xmm5\n\t" \
02537             \
02538             "movsd %5, %%xmm6\n\t" \
02539             "mulsd %%xmm4, %%xmm6\n\t" \
02540             "movsd %6, %%xmm7\n\t" \
02541             "mulsd %%xmm3, %%xmm7\n\t" \
02542             "addsd %%xmm7, %%xmm6\n\t" \
02543             \
02544             "addsd %%xmm5, %%xmm6\n\t" \
02545             "movsd %%xmm4, %%xmm3\n\t" \
02546             "movsd %%xmm6, %%xmm4\n\t" \
02547             \
02548             "movsd %%xmm6, %0\n\t"
02549             : "=m" (dpDest[iDataCntr])
02550             : "m" (dpSrc[iDataCntr]),
02551               "m" (dpCoeff[0]),
02552               "m" (dpCoeff[1]),
02553               "m" (dpCoeff[2]),
02554               "m" (dpCoeff[3]),
02555               "m" (dpCoeff[4])
02556             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02557               "memory");
02558     }
02559     X86_ASM (
02560         "movsd %%xmm0, %0\n\t" \
02561         "movsd %%xmm1, %1\n\t" \
02562         "movsd %%xmm2, %2\n\t" \
02563         "movsd %%xmm3, %3\n\t" \
02564         "movsd %%xmm4, %4\n\t"
02565         : "=m" (dpX[0]),
02566           "=m" (dpX[1]),
02567           "=m" (dpX[2]),
02568           "=m" (dpY[0]),
02569           "=m" (dpY[1])
02570         :
02571         : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02572 }
02573 
02574 
02575 #ifdef __cplusplus
02576 }
02577 #endif
02578 
02579 #endif

Generated on Tue Mar 2 19:46:46 2004 for libDSP by doxygen 1.3.6