00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifdef DSP_X86
00024
00025
00026 #include <stdio.h>
00027 #include <string.h>
00028 #include <limits.h>
00029 #include <math.h>
00030 #include <float.h>
00031
00032 #include "dsp/X86.h"
00033
00034
00035 #ifndef DSP_X86_64
00036 static char cpCPUid[13];
00037 #endif
00038
00039
00040 #ifdef __cplusplus
00041 extern "C"
00042 {
00043 #endif
00044
00045
00046 #ifndef DSP_X86_64
00047 const char *dsp_x86_cpuid ()
00048 {
00049 unsigned int *ipCPUid = (unsigned int *) cpCPUid;
00050
00051 X86_ASM (
00052 "pushl %%eax\n\t" \
00053 "pushl %%ebx\n\t" \
00054 "pushl %%ecx\n\t" \
00055 "pushl %%edx\n\t" \
00056 "xorl %%eax, %%eax\n\t" \
00057 "cpuid\n\t" \
00058 "movl %%ebx, %0\n\t" \
00059 "movl %%ecx, %2\n\t" \
00060 "movl %%edx, %1\n\t" \
00061 "popl %%edx\n\t" \
00062 "popl %%ecx\n\t" \
00063 "popl %%ebx\n\t" \
00064 "popl %%eax\n\t"
00065 : "=m" (ipCPUid[0]),
00066 "=m" (ipCPUid[1]),
00067 "=m" (ipCPUid[2])
00068 :
00069 : "eax", "ebx", "ecx", "edx", "memory");
00070 cpCPUid[12] = '\0';
00071
00072 return cpCPUid;
00073 }
00074
00075
00076 unsigned int dsp_x86_features ()
00077 {
00078 unsigned int uiFeatures = 0;
00079
00080 X86_ASM (
00081 "pushl %%eax\n\t" \
00082 "pushl %%ebx\n\t" \
00083 "pushl %%ecx\n\t" \
00084 "pushl %%edx\n\t" \
00085 "movl $1, %%eax\n\t" \
00086 "cpuid\n\t" \
00087 "movl %%edx, %0\n\t" \
00088 "popl %%edx\n\t" \
00089 "popl %%ecx\n\t" \
00090 "popl %%ebx\n\t" \
00091 "popl %%eax\n\t"
00092 : "=m" (uiFeatures)
00093 :
00094 : "eax", "ebx", "ecx", "edx", "memory");
00095
00096 return uiFeatures;
00097 }
00098
00099
00100 unsigned int dsp_x86_amd_features ()
00101 {
00102 unsigned int uiFunction = 0x80000001;
00103 unsigned int uiFeatures = 0;
00104
00105 X86_ASM (
00106 "pushl %%eax\n\t" \
00107 "pushl %%ebx\n\t" \
00108 "pushl %%ecx\n\t" \
00109 "pushl %%edx\n\t" \
00110 "movl %1, %%eax\n\t" \
00111 "cpuid\n\t" \
00112 "movl %%edx, %0\n\t" \
00113 "popl %%edx\n\t" \
00114 "popl %%ecx\n\t" \
00115 "popl %%ebx\n\t" \
00116 "popl %%eax\n\t"
00117 : "=m" (uiFeatures)
00118 : "m" (uiFunction)
00119 : "eax", "ebx", "ecx", "edx", "memory");
00120
00121 return uiFeatures;
00122 }
00123 #endif
00124
00125
/**
 * Tell whether the 3DNow! routines of this file may be used.
 *
 * On 32-bit builds the vendor string must be "AuthenticAMD" and bits 30
 * and 31 of dsp_x86_amd_features() must both be set.  When DSP_X86_64 is
 * defined the function returns 1 without querying the processor.
 * NOTE(review): the unconditional 1 on x86-64 also covers processors that
 * are not AMD - confirm that this is intended.
 *
 * @return 1 when the check passes, 0 otherwise.
 */
extern int dsp_x86_have_e3dnow ()
{
#ifndef DSP_X86_64
    /* Unsigned constants: 1 << 31 on a signed int is undefined behaviour. */
    const unsigned int uiRequired = (1u << 31) | (1u << 30);

    if (strcmp(dsp_x86_cpuid(), "AuthenticAMD") != 0)
        return 0;
    if ((dsp_x86_amd_features() & uiRequired) == uiRequired)
        return 1;
    return 0;
#else
    return 1;
#endif
}
00142
00143
/**
 * Tell whether the SSE/SSE2 routines of this file may be used.
 *
 * On 32-bit builds bits 25 and 26 of dsp_x86_features() must both be set.
 * When DSP_X86_64 is defined the function returns 1 unconditionally.
 *
 * @return 1 when both bits are set (or on DSP_X86_64 builds), 0 otherwise.
 */
extern int dsp_x86_have_sse2 ()
{
#ifdef DSP_X86_64
    return 1;
#else
    const unsigned int uiRequired = (1u << 25) | (1u << 26);

    return ((dsp_x86_features() & uiRequired) == uiRequired) ? 1 : 0;
#endif
}
00157
00158
00159 void dsp_x86_3dnow_copyf (float *fpDest, const float *fpSrc, int iDataLength)
00160 {
00161 int iStartIdx;
00162 int iDataCntr;
00163 int iDataCount;
00164 stpm64 m64pDest = (stpm64) fpDest;
00165 stpm64 m64pSrc = (stpm64) fpSrc;
00166
00167 iStartIdx = 0;
00168 X86_ASM (
00169 "prefetchnta %0\n\t" \
00170 "prefetchnta %1\n\t" \
00171 "prefetchnta %2\n\t" \
00172 "prefetchnta %3\n\t"
00173 :
00174 : "m" (m64pSrc[0]),
00175 "m" (m64pSrc[8]),
00176 "m" (m64pSrc[16]),
00177 "m" (m64pSrc[24]));
00178 iDataCount = ((iDataLength & 0xfffffff0) >> 1);
00179 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00180 {
00181 X86_ASM (
00182 "prefetchnta %16\n\t" \
00183 "movq %8, %%mm0\n\t" \
00184 "movq %9, %%mm1\n\t" \
00185 "movq %10, %%mm2\n\t" \
00186 "movq %11, %%mm3\n\t" \
00187 "movq %12, %%mm4\n\t" \
00188 "movq %13, %%mm5\n\t" \
00189 "movq %14, %%mm6\n\t" \
00190 "movq %15, %%mm7\n\t" \
00191 "movntq %%mm0, %0\n\t" \
00192 "movntq %%mm1, %1\n\t" \
00193 "movntq %%mm2, %2\n\t" \
00194 "movntq %%mm3, %3\n\t" \
00195 "movntq %%mm4, %4\n\t" \
00196 "movntq %%mm5, %5\n\t" \
00197 "movntq %%mm6, %6\n\t" \
00198 "movntq %%mm7, %7\n\t"
00199 : "=m" (m64pDest[iDataCntr]),
00200 "=m" (m64pDest[iDataCntr + 1]),
00201 "=m" (m64pDest[iDataCntr + 2]),
00202 "=m" (m64pDest[iDataCntr + 3]),
00203 "=m" (m64pDest[iDataCntr + 4]),
00204 "=m" (m64pDest[iDataCntr + 5]),
00205 "=m" (m64pDest[iDataCntr + 6]),
00206 "=m" (m64pDest[iDataCntr + 7])
00207 : "m" (m64pSrc[iDataCntr]),
00208 "m" (m64pSrc[iDataCntr + 1]),
00209 "m" (m64pSrc[iDataCntr + 2]),
00210 "m" (m64pSrc[iDataCntr + 3]),
00211 "m" (m64pSrc[iDataCntr + 4]),
00212 "m" (m64pSrc[iDataCntr + 5]),
00213 "m" (m64pSrc[iDataCntr + 6]),
00214 "m" (m64pSrc[iDataCntr + 7]),
00215 "m" (m64pSrc[iDataCntr + 32])
00216 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00217 }
00218 iStartIdx = iDataCount;
00219 iDataCount = ((iDataLength & 0xfffffffe) >> 1);
00220 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00221 {
00222 X86_ASM (
00223 "prefetchnta %2\n\t" \
00224 "movq %1, %%mm0\n\t" \
00225 "movntq %%mm0, %0\n\t"
00226 : "=m" (m64pDest[iDataCntr])
00227 : "m" (m64pSrc[iDataCntr]),
00228 "m" (m64pSrc[iDataCntr + 32])
00229 : "mm0", "memory");
00230 }
00231 if (iDataLength & 0x1)
00232 {
00233 X86_ASM (
00234 "movd %1, %%mm0\n\t" \
00235 "movd %%mm0, %0\n\t"
00236 : "=m" (fpDest[iDataLength - 1])
00237 : "m" (fpSrc[iDataLength - 1])
00238 : "mm0", "memory");
00239 }
00240 X86_ASM (
00241 "femms\n\t" \
00242 "sfence\n\t");
00243 }
00244
00245
00246 void dsp_x86_3dnow_copyd (double *dpDest, const double *dpSrc, int iDataLength)
00247 {
00248 int iStartIdx;
00249 int iDataCntr;
00250 int iDataCount;
00251
00252 iStartIdx = 0;
00253 X86_ASM (
00254 "prefetchnta %0\n\t" \
00255 "prefetchnta %1\n\t" \
00256 "prefetchnta %2\n\t" \
00257 "prefetchnta %3\n\t"
00258 :
00259 : "m" (dpSrc[0]),
00260 "m" (dpSrc[8]),
00261 "m" (dpSrc[16]),
00262 "m" (dpSrc[24]));
00263 iDataCount = (iDataLength & 0xfffffff8);
00264 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
00265 {
00266 X86_ASM (
00267 "prefetchnta %16\n\t" \
00268 "movq %8, %%mm0\n\t" \
00269 "movq %9, %%mm1\n\t" \
00270 "movq %10, %%mm2\n\t" \
00271 "movq %11, %%mm3\n\t" \
00272 "movq %12, %%mm4\n\t" \
00273 "movq %13, %%mm5\n\t" \
00274 "movq %14, %%mm6\n\t" \
00275 "movq %15, %%mm7\n\t" \
00276 "movntq %%mm0, %0\n\t" \
00277 "movntq %%mm1, %1\n\t" \
00278 "movntq %%mm2, %2\n\t" \
00279 "movntq %%mm3, %3\n\t" \
00280 "movntq %%mm4, %4\n\t" \
00281 "movntq %%mm5, %5\n\t" \
00282 "movntq %%mm6, %6\n\t" \
00283 "movntq %%mm7, %7\n\t"
00284 : "=m" (dpDest[iDataCntr]),
00285 "=m" (dpDest[iDataCntr + 1]),
00286 "=m" (dpDest[iDataCntr + 2]),
00287 "=m" (dpDest[iDataCntr + 3]),
00288 "=m" (dpDest[iDataCntr + 4]),
00289 "=m" (dpDest[iDataCntr + 5]),
00290 "=m" (dpDest[iDataCntr + 6]),
00291 "=m" (dpDest[iDataCntr + 7])
00292 : "m" (dpSrc[iDataCntr]),
00293 "m" (dpSrc[iDataCntr + 1]),
00294 "m" (dpSrc[iDataCntr + 2]),
00295 "m" (dpSrc[iDataCntr + 3]),
00296 "m" (dpSrc[iDataCntr + 4]),
00297 "m" (dpSrc[iDataCntr + 5]),
00298 "m" (dpSrc[iDataCntr + 6]),
00299 "m" (dpSrc[iDataCntr + 7]),
00300 "m" (dpSrc[iDataCntr + 32])
00301 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
00302 }
00303 iStartIdx = iDataCount;
00304 iDataCount = iDataLength;
00305 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
00306 {
00307 X86_ASM (
00308 "prefetchnta %2\n\t" \
00309 "movq %1, %%mm0\n\t" \
00310 "movntq %%mm0, %0\n\t"
00311 : "=m" (dpDest[iDataCntr])
00312 : "m" (dpSrc[iDataCntr]),
00313 "m" (dpSrc[iDataCntr + 32])
00314 : "mm0", "memory");
00315 }
00316 X86_ASM (
00317 "femms\n\t" \
00318 "sfence\n\t");
00319 }
00320
00321
00322 void dsp_x86_3dnow_addf (float *fpVect, float fSrc, int iDataLength)
00323 {
00324 int iDataCntr;
00325 int iDataCount;
00326 stpm64 m64pVect = (stpm64) fpVect;
00327 stm64 m64Src;
00328
00329 m64Src.f[0] = m64Src.f[1] = fSrc;
00330 iDataCount = (iDataLength >> 1);
00331 X86_ASM (
00332 "movq %0, %%mm1\n\t"
00333 :
00334 : "m" (m64Src)
00335 : "mm1", "memory");
00336 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00337 {
00338 X86_ASM (
00339 "movq %1, %%mm0\n\t" \
00340 "pfadd %%mm1, %%mm0\n\t" \
00341 "movntq %%mm0, %0\n\t"
00342 : "=m" (m64pVect[iDataCntr])
00343 : "0" (m64pVect[iDataCntr])
00344 : "mm0", "mm1", "memory");
00345 }
00346 if (iDataLength & 0x1)
00347 {
00348 X86_ASM (
00349 "movd %1, %%mm0\n\t" \
00350 "pfadd %%mm1, %%mm0\n\t" \
00351 "movd %%mm0, %0\n\t"
00352 : "=m" (fpVect[iDataLength - 1])
00353 : "0" (fpVect[iDataLength - 1])
00354 : "mm0", "mm1", "memory");
00355 }
00356 X86_ASM (
00357 "femms\n\t" \
00358 "sfence\n\t");
00359 }
00360
00361
00362 void dsp_x86_sse_addf (float *fpVect, float fSrc, int iDataLength)
00363 {
00364 int iDataCntr;
00365
00366 X86_ASM (
00367 "movss %0, %%xmm1\n\t"
00368 :
00369 : "m" (fSrc)
00370 : "xmm1", "memory");
00371 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00372 {
00373 X86_ASM (
00374 "movss %1, %%xmm0\n\t" \
00375 "addss %%xmm1, %%xmm0\n\t" \
00376 "movss %%xmm0, %0\n\t"
00377 : "=m" (fpVect[iDataCntr])
00378 : "0" (fpVect[iDataCntr])
00379 : "xmm0", "xmm1", "memory");
00380 }
00381 }
00382
00383
00384 void dsp_x86_sse_add (double *dpVect, double dSrc, int iDataLength)
00385 {
00386 int iDataCntr;
00387
00388 X86_ASM (
00389 "movsd %0, %%xmm1\n\t"
00390 :
00391 : "m" (dSrc)
00392 : "xmm1", "memory");
00393 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00394 {
00395 X86_ASM (
00396 "movsd %1, %%xmm0\n\t" \
00397 "addsd %%xmm1, %%xmm0\n\t" \
00398 "movsd %%xmm0, %0\n\t"
00399 : "=m" (dpVect[iDataCntr])
00400 : "0" (dpVect[iDataCntr])
00401 : "xmm0", "xmm1", "memory");
00402 }
00403 }
00404
00405
00406 void dsp_x86_3dnow_mulf (float *fpVect, float fSrc, int iDataLength)
00407 {
00408 int iDataCntr;
00409 int iDataCount;
00410 stpm64 m64pVect = (stpm64) fpVect;
00411 stm64 m64Src;
00412
00413 m64Src.f[0] = m64Src.f[1] = fSrc;
00414 iDataCount = (iDataLength >> 1);
00415 X86_ASM (
00416 "movq %0, %%mm1\n\t"
00417 :
00418 : "m" (m64Src)
00419 : "mm1", "memory");
00420 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00421 {
00422 X86_ASM (
00423 "movq %1, %%mm0\n\t" \
00424 "pfmul %%mm1, %%mm0\n\t" \
00425 "movntq %%mm0, %0\n\t"
00426 : "=m" (m64pVect[iDataCntr])
00427 : "0" (m64pVect[iDataCntr])
00428 : "mm0", "mm1", "memory");
00429 }
00430 if (iDataLength & 0x1)
00431 {
00432 X86_ASM (
00433 "movd %1, %%mm0\n\t" \
00434 "pfmul %%mm1, %%mm0\n\t" \
00435 "movd %%mm0, %0\n\t"
00436 : "=m" (fpVect[iDataLength - 1])
00437 : "0" (fpVect[iDataLength - 1])
00438 : "mm0", "mm1", "memory");
00439 }
00440 X86_ASM (
00441 "femms\n\t" \
00442 "sfence\n\t");
00443 }
00444
00445
00446 void dsp_x86_sse_mulf (float *fpVect, float fSrc, int iDataLength)
00447 {
00448 int iDataCntr;
00449
00450 X86_ASM (
00451 "movss %0, %%xmm1\n\t"
00452 :
00453 : "m" (fSrc)
00454 : "xmm1", "memory");
00455 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00456 {
00457 X86_ASM (
00458 "movss %1, %%xmm0\n\t" \
00459 "mulss %%xmm1, %%xmm0\n\t" \
00460 "movss %%xmm0, %0\n\t"
00461 : "=m" (fpVect[iDataCntr])
00462 : "0" (fpVect[iDataCntr])
00463 : "xmm0", "xmm1", "memory");
00464 }
00465 }
00466
00467
00468 void dsp_x86_sse_mul (double *dpVect, double dSrc, int iDataLength)
00469 {
00470 int iDataCntr;
00471
00472 X86_ASM (
00473 "movsd %0, %%xmm1\n\t"
00474 :
00475 : "m" (dSrc)
00476 : "xmm1", "memory");
00477 for (iDataCntr = 0; iDataCntr <iDataLength; iDataCntr++)
00478 {
00479 X86_ASM (
00480 "movsd %1, %%xmm0\n\t" \
00481 "mulsd %%xmm1, %%xmm0\n\t" \
00482 "movsd %%xmm0, %0\n\t"
00483 : "=m" (dpVect[iDataCntr])
00484 : "0" (dpVect[iDataCntr])
00485 : "xmm0", "xmm1", "memory");
00486 }
00487 }
00488
00489
00490 void dsp_x86_3dnow_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2,
00491 int iDataLength)
00492 {
00493 int iDataCntr;
00494 int iDataCount;
00495 stpm64 m64pDest = (stpm64) fpDest;
00496 stpm64 m64pSrc1 = (stpm64) fpSrc1;
00497 stm64 m64Src2;
00498
00499 m64Src2.f[0] = m64Src2.f[1] = fSrc2;
00500 iDataCount = (iDataLength >> 1);
00501 X86_ASM (
00502 "movq %0, %%mm1\n\t"
00503 :
00504 : "m" (m64Src2)
00505 : "mm1", "memory");
00506 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00507 {
00508 X86_ASM (
00509 "movq %1, %%mm0\n\t" \
00510 "pfmul %%mm1, %%mm0\n\t" \
00511 "movntq %%mm0, %0\n\t"
00512 : "=m" (m64pDest[iDataCntr])
00513 : "m" (m64pSrc1[iDataCntr])
00514 : "mm0", "mm1", "memory");
00515 }
00516 if (iDataLength & 0x1)
00517 {
00518 X86_ASM (
00519 "movd %1, %%mm0\n\t" \
00520 "pfmul %%mm1, %%mm0\n\t" \
00521 "movd %%mm0, %0\n\t"
00522 : "=m" (fpDest[iDataLength - 1])
00523 : "m" (fpSrc1[iDataLength - 1])
00524 : "mm0", "mm1", "memory");
00525 }
00526 X86_ASM (
00527 "femms\n\t" \
00528 "sfence\n\t");
00529 }
00530
00531
00532 void dsp_x86_sse_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2,
00533 int iDataLength)
00534 {
00535 int iDataCntr;
00536
00537 X86_ASM (
00538 "movss %0, %%xmm1\n\t"
00539 :
00540 : "m" (fSrc2)
00541 : "xmm1", "memory");
00542 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00543 {
00544 X86_ASM (
00545 "movss %1, %%xmm0\n\t" \
00546 "mulss %%xmm1, %%xmm0\n\t" \
00547 "movss %%xmm0, %0\n\t"
00548 : "=m" (fpDest[iDataCntr])
00549 : "m" (fpSrc1[iDataCntr])
00550 : "xmm0", "xmm1", "memory");
00551 }
00552 }
00553
00554
00555 void dsp_x86_sse_mul_nip (double *dpDest, const double *dpSrc1, double dSrc2,
00556 int iDataLength)
00557 {
00558 int iDataCntr;
00559
00560 X86_ASM (
00561 "movsd %0, %%xmm1\n\t"
00562 :
00563 : "m" (dSrc2)
00564 : "xmm1", "memory");
00565 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00566 {
00567 X86_ASM (
00568 "movsd %1, %%xmm0\n\t" \
00569 "mulsd %%xmm1, %%xmm0\n\t" \
00570 "movsd %%xmm0, %0\n\t"
00571 : "=m" (dpDest[iDataCntr])
00572 : "m" (dpSrc1[iDataCntr])
00573 : "xmm0", "xmm1", "memory");
00574 }
00575 }
00576
00577
00578 void dsp_x86_3dnow_add2f (float *fpDest, const float *fpSrc, int iDataLength)
00579 {
00580 int iDataCntr;
00581 int iDataCount;
00582 stpm64 m64pDest = (stpm64) fpDest;
00583 stpm64 m64pSrc = (stpm64) fpSrc;
00584
00585 iDataCount = (iDataLength >> 1);
00586 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00587 {
00588 X86_ASM (
00589 "movq %1, %%mm0\n\t" \
00590 "movq %2, %%mm1\n\t" \
00591 "pfadd %%mm1, %%mm0\n\t" \
00592 "movntq %%mm0, %0\n\t"
00593 : "=m" (m64pDest[iDataCntr])
00594 : "0" (m64pDest[iDataCntr]),
00595 "m" (m64pSrc[iDataCntr])
00596 : "mm0", "mm1", "memory");
00597 }
00598 if (iDataLength & 0x1)
00599 {
00600 X86_ASM (
00601 "movd %1, %%mm0\n\t" \
00602 "movd %2, %%mm1\n\t" \
00603 "pfadd %%mm1, %%mm0\n\t" \
00604 "movd %%mm0, %0\n\t"
00605 : "=m" (fpDest[iDataLength - 1])
00606 : "0" (fpDest[iDataLength - 1]),
00607 "m" (fpSrc[iDataLength - 1])
00608 : "mm0", "mm1", "memory");
00609 }
00610 X86_ASM (
00611 "femms\n\t" \
00612 "sfence\n\t");
00613 }
00614
00615
00616 void dsp_x86_sse_add2f (float *fpDest, const float *fpSrc, int iDataLength)
00617 {
00618 int iDataCntr;
00619
00620 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00621 {
00622 X86_ASM (
00623 "movss %1, %%xmm0\n\t" \
00624 "addss %2, %%xmm0\n\t" \
00625 "movss %%xmm0, %0\n\t"
00626 : "=m" (fpDest[iDataCntr])
00627 : "0" (fpDest[iDataCntr]),
00628 "m" (fpSrc[iDataCntr])
00629 : "xmm0", "memory");
00630 }
00631 }
00632
00633
00634 void dsp_x86_sse_add2 (double *dpDest, const double *dpSrc, int iDataLength)
00635 {
00636 int iDataCntr;
00637
00638 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00639 {
00640 X86_ASM (
00641 "movsd %1, %%xmm0\n\t" \
00642 "addsd %2, %%xmm0\n\t" \
00643 "movsd %%xmm0, %0\n\t"
00644 : "=m" (dpDest[iDataCntr])
00645 : "0" (dpDest[iDataCntr]),
00646 "m" (dpSrc[iDataCntr])
00647 : "xmm0", "memory");
00648 }
00649 }
00650
00651
00652 void dsp_x86_3dnow_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
00653 {
00654 int iDataCntr;
00655 int iDataCount;
00656 stpm64 m64pDest = (stpm64) fpDest;
00657 stpm64 m64pSrc = (stpm64) fpSrc;
00658
00659 iDataCount = (iDataLength >> 1);
00660 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00661 {
00662 X86_ASM (
00663 "movq %1, %%mm0\n\t" \
00664 "movq %2, %%mm1\n\t" \
00665 "pfmul %%mm1, %%mm0\n\t" \
00666 "movntq %%mm0, %0\n\t"
00667 : "=m" (m64pDest[iDataCntr])
00668 : "0" (m64pDest[iDataCntr]),
00669 "m" (m64pSrc[iDataCntr])
00670 : "mm0", "mm1", "memory");
00671 }
00672 if (iDataLength & 0x1)
00673 {
00674 X86_ASM (
00675 "movd %1, %%mm0\n\t" \
00676 "movd %2, %%mm1\n\t" \
00677 "pfmul %%mm1, %%mm0\n\t" \
00678 "movd %%mm0, %0\n\t"
00679 : "=m" (fpDest[iDataLength - 1])
00680 : "0" (fpDest[iDataLength - 1]),
00681 "m" (fpSrc[iDataLength - 1])
00682 : "mm0", "mm1", "memory");
00683 }
00684 X86_ASM (
00685 "femms\n\t" \
00686 "sfence\n\t");
00687 }
00688
00689
00690 void dsp_x86_sse_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
00691 {
00692 int iDataCntr;
00693
00694 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00695 {
00696 X86_ASM (
00697 "movss %1, %%xmm0\n\t" \
00698 "mulss %2, %%xmm0\n\t" \
00699 "movss %%xmm0, %0\n\t"
00700 : "=m" (fpDest[iDataCntr])
00701 : "0" (fpDest[iDataCntr]),
00702 "m" (fpSrc[iDataCntr])
00703 : "xmm0", "memory");
00704 }
00705 }
00706
00707
00708 void dsp_x86_sse_mul2 (double *dpDest, const double *dpSrc, int iDataLength)
00709 {
00710 int iDataCntr;
00711
00712 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00713 {
00714 X86_ASM (
00715 "movsd %1, %%xmm0\n\t" \
00716 "mulsd %2, %%xmm0\n\t" \
00717 "movsd %%xmm0, %0\n\t"
00718 : "=m" (dpDest[iDataCntr])
00719 : "0" (dpDest[iDataCntr]),
00720 "m" (dpSrc[iDataCntr])
00721 : "xmm0", "memory");
00722 }
00723 }
00724
00725
00726 void dsp_x86_3dnow_add3f (float *fpDest, const float *fpSrc1,
00727 const float *fpSrc2, int iDataLength)
00728 {
00729 int iDataCntr;
00730 int iDataCount;
00731 stpm64 m64pDest = (stpm64) fpDest;
00732 stpm64 m64pSrc1 = (stpm64) fpSrc1;
00733 stpm64 m64pSrc2 = (stpm64) fpSrc2;
00734
00735 iDataCount = (iDataLength >> 1);
00736 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00737 {
00738 X86_ASM (
00739 "movq %1, %%mm0\n\t" \
00740 "movq %2, %%mm1\n\t" \
00741 "pfadd %%mm1, %%mm0\n\t" \
00742 "movntq %%mm0, %0\n\t"
00743 : "=m" (m64pDest[iDataCntr])
00744 : "m" (m64pSrc1[iDataCntr]),
00745 "m" (m64pSrc2[iDataCntr])
00746 : "mm0", "mm1", "memory");
00747 }
00748 if (iDataLength & 0x1)
00749 {
00750 X86_ASM (
00751 "movd %1, %%mm0\n\t" \
00752 "movd %2, %%mm1\n\t" \
00753 "pfadd %%mm1, %%mm0\n\t" \
00754 "movd %%mm0, %0\n\t"
00755 : "=m" (fpDest[iDataLength - 1])
00756 : "m" (fpSrc1[iDataLength - 1]),
00757 "m" (fpSrc2[iDataLength - 1])
00758 : "mm0", "mm1", "memory");
00759 }
00760 X86_ASM (
00761 "femms\n\t" \
00762 "sfence\n\t");
00763 }
00764
00765
00766 void dsp_x86_sse_add3f (float *fpDest, const float *fpSrc1,
00767 const float *fpSrc2, int iDataLength)
00768 {
00769 int iDataCntr;
00770
00771 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00772 {
00773 X86_ASM (
00774 "movss %1, %%xmm0\n\t" \
00775 "addss %2, %%xmm0\n\t" \
00776 "movss %%xmm0, %0\n\t"
00777 : "=m" (fpDest[iDataCntr])
00778 : "m" (fpSrc1[iDataCntr]),
00779 "m" (fpSrc2[iDataCntr])
00780 : "xmm0", "memory");
00781 }
00782 }
00783
00784
00785 void dsp_x86_sse_add3 (double *dpDest, const double *dpSrc1,
00786 const double *dpSrc2, int iDataLength)
00787 {
00788 int iDataCntr;
00789
00790 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00791 {
00792 X86_ASM (
00793 "movsd %1, %%xmm0\n\t" \
00794 "addsd %2, %%xmm0\n\t" \
00795 "movsd %%xmm0, %0\n\t"
00796 : "=m" (dpDest[iDataCntr])
00797 : "m" (dpSrc1[iDataCntr]),
00798 "m" (dpSrc2[iDataCntr])
00799 : "xmm0", "memory");
00800 }
00801 }
00802
00803
00804 void dsp_x86_3dnow_mul3f (float *fpDest, const float *fpSrc1,
00805 const float *fpSrc2, int iDataLength)
00806 {
00807 int iDataCntr;
00808 int iDataCount;
00809 stpm64 m64pDest = (stpm64) fpDest;
00810 stpm64 m64pSrc1 = (stpm64) fpSrc1;
00811 stpm64 m64pSrc2 = (stpm64) fpSrc2;
00812
00813 iDataCount = (iDataLength >> 1);
00814 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
00815 {
00816 X86_ASM (
00817 "movq %1, %%mm0\n\t" \
00818 "movq %2, %%mm1\n\t" \
00819 "pfmul %%mm1, %%mm0\n\t" \
00820 "movntq %%mm0, %0\n\t"
00821 : "=m" (m64pDest[iDataCntr])
00822 : "m" (m64pSrc1[iDataCntr]),
00823 "m" (m64pSrc2[iDataCntr])
00824 : "mm0", "mm1", "memory");
00825 }
00826 if (iDataLength & 0x1)
00827 {
00828 X86_ASM (
00829 "movd %1, %%mm0\n\t" \
00830 "movd %2, %%mm1\n\t" \
00831 "pfmul %%mm1, %%mm0\n\t" \
00832 "movd %%mm0, %0\n\t"
00833 : "=m" (fpDest[iDataLength - 1])
00834 : "m" (fpSrc1[iDataLength - 1]),
00835 "m" (fpSrc2[iDataLength - 1])
00836 : "mm0", "mm1", "memory");
00837 }
00838 X86_ASM (
00839 "femms\n\t" \
00840 "sfence\n\t");
00841 }
00842
00843
00844 void dsp_x86_sse_mul3f (float *fpDest, const float *fpSrc1,
00845 const float *fpSrc2, int iDataLength)
00846 {
00847 int iDataCntr;
00848
00849 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00850 {
00851 X86_ASM (
00852 "movss %1, %%xmm0\n\t" \
00853 "mulss %2, %%xmm0\n\t" \
00854 "movss %%xmm0, %0\n\t"
00855 : "=m" (fpDest[iDataCntr])
00856 : "m" (fpSrc1[iDataCntr]),
00857 "m" (fpSrc2[iDataCntr])
00858 : "xmm0", "memory");
00859 }
00860 }
00861
00862
00863 void dsp_x86_sse_mul3 (double *dpDest, const double *dpSrc1,
00864 const double *dpSrc2, int iDataLength)
00865 {
00866 int iDataCntr;
00867
00868 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00869 {
00870 X86_ASM (
00871 "movsd %1, %%xmm0\n\t" \
00872 "mulsd %2, %%xmm0\n\t" \
00873 "movsd %%xmm0, %0\n\t"
00874 : "=m" (dpDest[iDataCntr])
00875 : "m" (dpSrc1[iDataCntr]),
00876 "m" (dpSrc2[iDataCntr])
00877 : "xmm0", "memory");
00878 }
00879 }
00880
00881
00882 void dsp_x86_3dnow_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
00883 {
00884 int iDataCntr;
00885 stpm64 m64pDest = (stpm64) fpDest;
00886
00887 X86_ASM (
00888 "movq %0, %%mm3\n\t"
00889 :
00890 : "m" (fpSrc[0])
00891 : "mm3", "memory");
00892 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
00893 {
00894 X86_ASM (
00895 "movq %1, %%mm0\n\t" \
00896 "movq %%mm3, %%mm1\n\t" \
00897 "pswapd %%mm1, %%mm2\n\t" \
00898 "pfmul %%mm0, %%mm1\n\t" \
00899 "pfmul %%mm0, %%mm2\n\t" \
00900 "pfpnacc %%mm2, %%mm1\n\t"
00901 "movntq %%mm1, %0\n\t"
00902 : "=m" (m64pDest[iDataCntr])
00903 : "0" (m64pDest[iDataCntr])
00904 : "mm0", "mm1", "mm2", "mm3", "memory");
00905 }
00906 X86_ASM (
00907 "femms\n\t" \
00908 "sfence\n\t");
00909 }
00910
00911
00912 void dsp_x86_sse_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
00913 {
00914 int iDataCntr;
00915 int iDataCount;
00916
00917 X86_ASM (
00918 "movss %0, %%xmm2\n\t" \
00919 "movss %1, %%xmm3\n\t"
00920 :
00921 : "m" (fpSrc[0]),
00922 "m" (fpSrc[1])
00923 : "xmm2", "xmm3", "memory");
00924 iDataCount = (iDataLength << 1);
00925 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
00926 {
00927 X86_ASM (
00928 "movss %2, %%xmm0\n\t" \
00929 "movss %%xmm0, %%xmm1\n\t" \
00930 "movss %3, %%xmm4\n\t" \
00931 \
00932 "mulss %%xmm2, %%xmm0\n\t" \
00933 "movss %%xmm4, %%xmm5\n\t" \
00934 "mulss %%xmm3, %%xmm5\n\t" \
00935 "subss %%xmm0, %%xmm5\n\t" \
00936 \
00937 "mulss %%xmm3, %%xmm1\n\t" \
00938 "movss %%xmm4, %%xmm5\n\t" \
00939 "mulss %%xmm2, %%xmm5\n\t" \
00940 "addss %%xmm5, %%xmm1\n\t" \
00941 \
00942 "movss %%xmm0, %0\n\t" \
00943 "movss %%xmm1, %1\n\t"
00944 : "=m" (fpDest[iDataCntr]),
00945 "=m" (fpDest[iDataCntr + 1])
00946 : "0" (fpDest[iDataCntr]),
00947 "1" (fpDest[iDataCntr + 1])
00948 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
00949 }
00950 }
00951
00952
00953 void dsp_x86_sse_cmul (double *dpDest, const double *dpSrc, int iDataLength)
00954 {
00955 int iDataCntr;
00956 int iDataCount;
00957
00958 X86_ASM (
00959 "movsd %0, %%xmm2\n\t" \
00960 "movsd %1, %%xmm3\n\t"
00961 :
00962 : "m" (dpSrc[0]),
00963 "m" (dpSrc[1])
00964 : "xmm2", "xmm3", "memory");
00965 iDataCount = (iDataLength << 1);
00966 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
00967 {
00968 X86_ASM (
00969 "movsd %2, %%xmm0\n\t" \
00970 "movsd %%xmm0, %%xmm1\n\t" \
00971 "movsd %3, %%xmm4\n\t" \
00972 \
00973 "mulsd %%xmm2, %%xmm0\n\t" \
00974 "movsd %%xmm4, %%xmm5\n\t" \
00975 "mulsd %%xmm3, %%xmm5\n\t" \
00976 "subsd %%xmm0, %%xmm5\n\t" \
00977 \
00978 "mulsd %%xmm3, %%xmm1\n\t" \
00979 "movsd %%xmm4, %%xmm5\n\t" \
00980 "mulsd %%xmm2, %%xmm5\n\t" \
00981 "addsd %%xmm5, %%xmm1\n\t" \
00982 \
00983 "movsd %%xmm0, %0\n\t" \
00984 "movsd %%xmm1, %1\n\t"
00985 : "=m" (dpDest[iDataCntr]),
00986 "=m" (dpDest[iDataCntr + 1])
00987 : "0" (dpDest[iDataCntr]),
00988 "1" (dpDest[iDataCntr + 1])
00989 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
00990 }
00991 }
00992
00993
00994 void dsp_x86_3dnow_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
00995 {
00996 int iDataCntr;
00997 stpm64 m64pDest = (stpm64) fpDest;
00998 stpm64 m64pSrc = (stpm64) fpSrc;
00999
01000 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01001 {
01002 X86_ASM (
01003 "movq %1, %%mm0\n\t" \
01004 "movq %2, %%mm1\n\t" \
01005 "pswapd %%mm1, %%mm2\n\t" \
01006 "pfmul %%mm0, %%mm1\n\t" \
01007 "pfmul %%mm0, %%mm2\n\t" \
01008 "pfpnacc %%mm2, %%mm1\n\t"
01009 "movntq %%mm1, %0\n\t"
01010 : "=m" (m64pDest[iDataCntr])
01011 : "0" (m64pDest[iDataCntr]),
01012 "m" (m64pSrc[iDataCntr])
01013 : "mm0", "mm1", "mm2", "memory");
01014 }
01015 X86_ASM (
01016 "femms\n\t" \
01017 "sfence\n\t");
01018 }
01019
01020
01021 void dsp_x86_sse_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
01022 {
01023 int iDataCntr;
01024 int iDataCount;
01025
01026 iDataCount = (iDataLength << 1);
01027 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01028 {
01029 X86_ASM (
01030 "movss %4, %%xmm2\n\t" \
01031 "movss %5, %%xmm3\n\t" \
01032 \
01033 "movss %2, %%xmm0\n\t" \
01034 "movss %%xmm0, %%xmm1\n\t" \
01035 "movss %3, %%xmm4\n\t" \
01036 \
01037 "mulss %%xmm2, %%xmm0\n\t" \
01038 "movss %%xmm4, %%xmm5\n\t" \
01039 "mulss %%xmm3, %%xmm5\n\t" \
01040 "subss %%xmm0, %%xmm5\n\t" \
01041 \
01042 "mulss %%xmm3, %%xmm1\n\t" \
01043 "movss %%xmm4, %%xmm5\n\t" \
01044 "mulss %%xmm2, %%xmm5\n\t" \
01045 "addss %%xmm5, %%xmm1\n\t" \
01046 \
01047 "movss %%xmm0, %0\n\t" \
01048 "movss %%xmm1, %1\n\t"
01049 : "=m" (fpDest[iDataCntr]),
01050 "=m" (fpDest[iDataCntr + 1])
01051 : "0" (fpDest[iDataCntr]),
01052 "1" (fpDest[iDataCntr + 1]),
01053 "m" (fpSrc[iDataCntr]),
01054 "m" (fpSrc[iDataCntr + 1])
01055 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01056 }
01057 }
01058
01059
01060 void dsp_x86_sse_cmul2 (double *dpDest, const double *dpSrc, int iDataLength)
01061 {
01062 int iDataCntr;
01063 int iDataCount;
01064
01065 iDataCount = (iDataLength << 1);
01066 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01067 {
01068 X86_ASM (
01069 "movsd %4, %%xmm2\n\t" \
01070 "movsd %5, %%xmm3\n\t" \
01071 \
01072 "movsd %2, %%xmm0\n\t" \
01073 "movsd %%xmm0, %%xmm1\n\t" \
01074 "movsd %3, %%xmm4\n\t" \
01075 \
01076 "mulsd %%xmm2, %%xmm0\n\t" \
01077 "movsd %%xmm4, %%xmm5\n\t" \
01078 "mulsd %%xmm3, %%xmm5\n\t" \
01079 "subsd %%xmm0, %%xmm5\n\t" \
01080 \
01081 "mulsd %%xmm3, %%xmm1\n\t" \
01082 "movsd %%xmm4, %%xmm5\n\t" \
01083 "mulsd %%xmm2, %%xmm5\n\t" \
01084 "addsd %%xmm5, %%xmm1\n\t" \
01085 \
01086 "movsd %%xmm0, %0\n\t" \
01087 "movsd %%xmm1, %1\n\t"
01088 : "=m" (dpDest[iDataCntr]),
01089 "=m" (dpDest[iDataCntr + 1])
01090 : "0" (dpDest[iDataCntr]),
01091 "1" (dpDest[iDataCntr + 1]),
01092 "m" (dpSrc[iDataCntr]),
01093 "m" (dpSrc[iDataCntr + 1])
01094 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01095 }
01096 }
01097
01098
01099 void dsp_x86_3dnow_cmul3f (float *fpDest, const float *fpSrc1,
01100 const float *fpSrc2, int iDataLength)
01101 {
01102 int iDataCntr;
01103 stpm64 m64pDest = (stpm64) fpDest;
01104 stpm64 m64pSrc1 = (stpm64) fpSrc1;
01105 stpm64 m64pSrc2 = (stpm64) fpSrc2;
01106
01107 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01108 {
01109 X86_ASM (
01110 "movq %1, %%mm0\n\t" \
01111 "movq %2, %%mm1\n\t" \
01112 "pswapd %%mm1, %%mm2\n\t" \
01113 "pfmul %%mm0, %%mm1\n\t" \
01114 "pfmul %%mm0, %%mm2\n\t" \
01115 "pfpnacc %%mm2, %%mm1\n\t"
01116 "movntq %%mm1, %0\n\t"
01117 : "=m" (m64pDest[iDataCntr])
01118 : "m" (m64pSrc1[iDataCntr]),
01119 "m" (m64pSrc2[iDataCntr])
01120 : "mm0", "mm1", "mm2", "memory");
01121 }
01122 X86_ASM (
01123 "femms\n\t" \
01124 "sfence\n\t");
01125 }
01126
01127
01128 void dsp_x86_sse_cmul3f (float *fpDest, const float *fpSrc1,
01129 const float *fpSrc2, int iDataLength)
01130 {
01131 int iDataCntr;
01132 int iDataCount;
01133
01134 iDataCount = (iDataLength << 1);
01135 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01136 {
01137 X86_ASM (
01138 "movss %4, %%xmm2\n\t" \
01139 "movss %5, %%xmm3\n\t" \
01140 \
01141 "movss %2, %%xmm0\n\t" \
01142 "movss %%xmm0, %%xmm1\n\t" \
01143 "movss %3, %%xmm4\n\t" \
01144 \
01145 "mulss %%xmm2, %%xmm0\n\t" \
01146 "movss %%xmm4, %%xmm5\n\t" \
01147 "mulss %%xmm3, %%xmm5\n\t" \
01148 "subss %%xmm0, %%xmm5\n\t" \
01149 \
01150 "mulss %%xmm3, %%xmm1\n\t" \
01151 "movss %%xmm4, %%xmm5\n\t" \
01152 "mulss %%xmm2, %%xmm5\n\t" \
01153 "addss %%xmm5, %%xmm1\n\t" \
01154 \
01155 "movss %%xmm0, %0\n\t" \
01156 "movss %%xmm1, %1\n\t"
01157 : "=m" (fpDest[iDataCntr]),
01158 "=m" (fpDest[iDataCntr + 1])
01159 : "m" (fpSrc1[iDataCntr]),
01160 "m" (fpSrc1[iDataCntr + 1]),
01161 "m" (fpSrc2[iDataCntr]),
01162 "m" (fpSrc2[iDataCntr + 1])
01163 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01164 }
01165 }
01166
01167
01168 void dsp_x86_sse_cmul3 (double *dpDest, const double *dpSrc1,
01169 const double *dpSrc2, int iDataLength)
01170 {
01171 int iDataCntr;
01172 int iDataCount;
01173
01174 iDataCount = (iDataLength << 1);
01175 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
01176 {
01177 X86_ASM (
01178 "movsd %4, %%xmm2\n\t" \
01179 "movsd %5, %%xmm3\n\t" \
01180 \
01181 "movsd %2, %%xmm0\n\t" \
01182 "movsd %%xmm0, %%xmm1\n\t" \
01183 "movsd %3, %%xmm4\n\t" \
01184 \
01185 "mulsd %%xmm2, %%xmm0\n\t" \
01186 "movsd %%xmm4, %%xmm5\n\t" \
01187 "mulsd %%xmm3, %%xmm5\n\t" \
01188 "subsd %%xmm0, %%xmm5\n\t" \
01189 \
01190 "mulsd %%xmm3, %%xmm1\n\t" \
01191 "movsd %%xmm4, %%xmm5\n\t" \
01192 "mulsd %%xmm2, %%xmm5\n\t" \
01193 "addsd %%xmm5, %%xmm1\n\t" \
01194 \
01195 "movsd %%xmm0, %0\n\t" \
01196 "movsd %%xmm1, %1\n\t"
01197 : "=m" (dpDest[iDataCntr]),
01198 "=m" (dpDest[iDataCntr + 1])
01199 : "m" (dpSrc1[iDataCntr]),
01200 "m" (dpSrc1[iDataCntr + 1]),
01201 "m" (dpSrc2[iDataCntr]),
01202 "m" (dpSrc2[iDataCntr + 1])
01203 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
01204 }
01205 }
01206
01207
01208 void dsp_x86_3dnow_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
01209 {
01210 int iDataCntr;
01211 int iDataCount;
01212 stpm64 m64pVect = (stpm64) fpVect;
01213 stm64 m64Mul;
01214 stm64 m64Add;
01215
01216 m64Mul.f[0] = m64Mul.f[1] = fMul;
01217 m64Add.f[0] = m64Add.f[1] = fAdd;
01218 iDataCount = (iDataLength >> 1);
01219 X86_ASM (
01220 "movq %0, %%mm1\n\t" \
01221 "movq %1, %%mm2\n\t"
01222 :
01223 : "m" (m64Mul),
01224 "m" (m64Add)
01225 : "mm1", "mm2", "memory");
01226 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01227 {
01228 X86_ASM (
01229 "movq %1, %%mm0\n\t" \
01230 "pfmul %%mm1, %%mm0\n\t" \
01231 "pfadd %%mm2, %%mm0\n\t" \
01232 "movntq %%mm0, %0\n\t"
01233 : "=m" (m64pVect[iDataCntr])
01234 : "0" (m64pVect[iDataCntr])
01235 : "mm0", "mm1", "mm2", "memory");
01236 }
01237 if (iDataLength & 0x1)
01238 {
01239 X86_ASM (
01240 "movd %1, %%mm0\n\t" \
01241 "pfmul %%mm1, %%mm0\n\t" \
01242 "pfadd %%mm2, %%mm0\n\t" \
01243 "movd %%mm0, %0\n\t"
01244 : "=m" (fpVect[iDataLength - 1])
01245 : "0" (fpVect[iDataLength - 1])
01246 : "mm0", "mm1", "mm2", "memory");
01247 }
01248 X86_ASM (
01249 "femms\n\t" \
01250 "sfence\n\t");
01251 }
01252
01253
01254 void dsp_x86_sse_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
01255 {
01256 int iDataCntr;
01257
01258 X86_ASM (
01259 "movss %0, %%xmm1\n\t" \
01260 "movss %1, %%xmm2\n\t"
01261 :
01262 : "m" (fMul),
01263 "m" (fAdd)
01264 : "xmm1", "xmm2", "memory");
01265 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01266 {
01267 X86_ASM (
01268 "movss %1, %%xmm0\n\t" \
01269 "mulss %%xmm1, %%xmm0\n\t" \
01270 "addss %%xmm2, %%xmm0\n\t" \
01271 "movss %%xmm0, %0\n\t"
01272 : "=m" (fpVect[iDataCntr])
01273 : "0" (fpVect[iDataCntr])
01274 : "xmm0", "xmm1", "xmm2", "memory");
01275 }
01276 }
01277
01278
01279 void dsp_x86_sse_ma (double *dpVect, double dMul, double dAdd, int iDataLength)
01280 {
01281 int iDataCntr;
01282
01283 X86_ASM (
01284 "movsd %0, %%xmm1\n\t" \
01285 "movsd %1, %%xmm2\n\t"
01286 :
01287 : "m" (dMul),
01288 "m" (dAdd)
01289 : "xmm1", "xmm2", "memory");
01290 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01291 {
01292 X86_ASM (
01293 "movsd %1, %%xmm0\n\t" \
01294 "mulsd %%xmm1, %%xmm0\n\t" \
01295 "addsd %%xmm2, %%xmm0\n\t" \
01296 "movsd %%xmm0, %0\n\t"
01297 : "=m" (dpVect[iDataCntr])
01298 : "0" (dpVect[iDataCntr])
01299 : "xmm0", "xmm1", "xmm2", "memory");
01300 }
01301 }
01302
01303
01304 void dsp_x86_3dnow_ma2f (float *fpDest, const float *fpSrc,
01305 float fMul, float fAdd, int iDataLength)
01306 {
01307 int iDataCntr;
01308 int iDataCount;
01309 stpm64 m64pDest = (stpm64) fpDest;
01310 stpm64 m64pSrc = (stpm64) fpSrc;
01311 stm64 m64Mul;
01312 stm64 m64Add;
01313
01314 m64Mul.f[0] = m64Mul.f[1] = fMul;
01315 m64Add.f[0] = m64Add.f[1] = fAdd;
01316 iDataCount = (iDataLength >> 1);
01317 X86_ASM (
01318 "movq %0, %%mm1\n\t" \
01319 "movq %1, %%mm2\n\t"
01320 :
01321 : "m" (m64Mul),
01322 "m" (m64Add)
01323 : "mm1", "mm2", "memory");
01324 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01325 {
01326 X86_ASM (
01327 "movq %1, %%mm0\n\t" \
01328 "pfmul %%mm1, %%mm0\n\t" \
01329 "pfadd %%mm2, %%mm0\n\t" \
01330 "movntq %%mm0, %0\n\t"
01331 : "=m" (m64pDest[iDataCntr])
01332 : "m" (m64pSrc[iDataCntr])
01333 : "mm0", "mm1", "mm2", "memory");
01334 }
01335 if (iDataLength & 0x1)
01336 {
01337 X86_ASM (
01338 "movd %1, %%mm0\n\t" \
01339 "pfmul %%mm1, %%mm0\n\t" \
01340 "pfadd %%mm2, %%mm0\n\t" \
01341 "movd %%mm0, %0\n\t"
01342 : "=m" (fpDest[iDataLength - 1])
01343 : "m" (fpSrc[iDataLength - 1])
01344 : "mm0", "mm1", "mm2", "memory");
01345 }
01346 X86_ASM (
01347 "femms\n\t" \
01348 "sfence\n\t");
01349 }
01350
01351
01352 void dsp_x86_sse_ma2f (float *fpDest, const float *fpSrc,
01353 float fMul, float fAdd, int iDataLength)
01354 {
01355 int iDataCntr;
01356
01357 X86_ASM (
01358 "movss %0, %%xmm1\n\t" \
01359 "movss %1, %%xmm2\n\t"
01360 :
01361 : "m" (fMul),
01362 "m" (fAdd)
01363 : "xmm1", "xmm2", "memory");
01364 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01365 {
01366 X86_ASM (
01367 "movss %1, %%xmm0\n\t" \
01368 "mulss %%xmm1, %%xmm0\n\t" \
01369 "addss %%xmm2, %%xmm0\n\t" \
01370 "movss %%xmm0, %0\n\t"
01371 : "=m" (fpDest[iDataCntr])
01372 : "m" (fpSrc[iDataCntr])
01373 : "xmm0", "xmm1", "xmm2", "memory");
01374 }
01375 }
01376
01377
01378 void dsp_x86_sse_ma2 (double *dpDest, const double *dpSrc,
01379 double dMul, double dAdd, int iDataLength)
01380 {
01381 int iDataCntr;
01382
01383 X86_ASM (
01384 "movsd %0, %%xmm1\n\t" \
01385 "movsd %1, %%xmm2\n\t"
01386 :
01387 : "m" (dMul),
01388 "m" (dAdd)
01389 : "xmm1", "xmm2", "memory");
01390 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01391 {
01392 X86_ASM (
01393 "movsd %1, %%xmm0\n\t" \
01394 "mulsd %%xmm1, %%xmm0\n\t" \
01395 "addsd %%xmm2, %%xmm0\n\t" \
01396 "movsd %%xmm0, %0\n\t"
01397 : "=m" (dpDest[iDataCntr])
01398 : "m" (dpSrc[iDataCntr])
01399 : "xmm0", "xmm1", "xmm2", "memory");
01400 }
01401 }
01402
01403
01404 void dsp_x86_3dnow_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
01405 {
01406 int iDataCntr;
01407 int iDataCount;
01408 stpm64 m64pVect = (stpm64) fpVect;
01409 stm64 m64Add;
01410 stm64 m64Mul;
01411
01412 m64Add.f[0] = m64Add.f[1] = fAdd;
01413 m64Mul.f[0] = m64Mul.f[1] = fMul;
01414 iDataCount = (iDataLength >> 1);
01415 X86_ASM (
01416 "movq %0, %%mm1\n\t" \
01417 "movq %1, %%mm2\n\t"
01418 :
01419 : "m" (m64Add),
01420 "m" (m64Mul)
01421 : "mm1", "mm2", "memory");
01422 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01423 {
01424 X86_ASM (
01425 "movq %1, %%mm0\n\t" \
01426 "pfadd %%mm1, %%mm0\n\t" \
01427 "pfmul %%mm2, %%mm0\n\t" \
01428 "movntq %%mm0, %0\n\t"
01429 : "=m" (m64pVect[iDataCntr])
01430 : "0" (m64pVect[iDataCntr])
01431 : "mm0", "mm1", "mm2", "memory");
01432 }
01433 if (iDataLength & 0x1)
01434 {
01435 X86_ASM (
01436 "movd %1, %%mm0\n\t" \
01437 "pfadd %%mm1, %%mm0\n\t" \
01438 "pfmul %%mm2, %%mm0\n\t" \
01439 "movd %%mm0, %0\n\t"
01440 : "=m" (fpVect[iDataLength - 1])
01441 : "0" (fpVect[iDataLength - 1])
01442 : "mm0", "mm1", "mm2", "memory");
01443 }
01444 X86_ASM (
01445 "femms\n\t" \
01446 "sfence\n\t");
01447 }
01448
01449
01450 void dsp_x86_sse_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
01451 {
01452 int iDataCntr;
01453
01454 X86_ASM (
01455 "movss %0, %%xmm1\n\t" \
01456 "movss %1, %%xmm2\n\t"
01457 :
01458 : "m" (fAdd),
01459 "m" (fMul)
01460 : "xmm1", "xmm2", "memory");
01461 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01462 {
01463 X86_ASM (
01464 "movss %1, %%xmm0\n\t" \
01465 "addss %%xmm1, %%xmm0\n\t" \
01466 "mulss %%xmm2, %%xmm0\n\t" \
01467 "movss %%xmm0, %0\n\t"
01468 : "=m" (fpVect[iDataCntr])
01469 : "0" (fpVect[iDataCntr])
01470 : "xmm0", "xmm1", "xmm2", "memory");
01471 }
01472 }
01473
01474
01475 float dsp_x86_3dnow_macf (const float *fpSrc1, const float *fpSrc2,
01476 int iDataLength)
01477 {
01478 int iDataCntr;
01479 int iDataCount;
01480 float fRes;
01481 stpm64 m64pSrc1 = (stpm64) fpSrc1;
01482 stpm64 m64pSrc2 = (stpm64) fpSrc2;
01483
01484 iDataCount = (iDataLength >> 1);
01485 X86_ASM (
01486 "pxor %%mm0, %%mm0\n\t"
01487 :
01488 :
01489 : "mm0");
01490 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01491 {
01492 X86_ASM (
01493 "movq %0, %%mm1\n\t" \
01494 "movq %1, %%mm2\n\t" \
01495 "pfmul %%mm2, %%mm1\n\t" \
01496 "pfacc %%mm1, %%mm0\n\t"
01497 :
01498 : "m" (m64pSrc1[iDataCntr]),
01499 "m" (m64pSrc2[iDataCntr])
01500 : "mm0", "mm1", "mm2", "memory");
01501 }
01502 if (iDataLength & 0x1)
01503 {
01504 X86_ASM (
01505 "movd %0, %%mm1\n\t" \
01506 "movd %1, %%mm2\n\t" \
01507 "pfmul %%mm2, %%mm1\n\t" \
01508 "pfacc %%mm1, %%mm0\n\t"
01509 :
01510 : "m" (fpSrc1[iDataLength - 1]),
01511 "m" (fpSrc2[iDataLength - 1])
01512 : "mm0", "mm1", "mm2", "memory");
01513 }
01514 X86_ASM (
01515 "pfacc %%mm0, %%mm0\n\t" \
01516 "movd %%mm0, %0\n\t"
01517 : "=m" (fRes)
01518 :
01519 : "mm0", "memory");
01520 X86_ASM ("femms\n\t");
01521
01522 return fRes;
01523 }
01524
01525
01526 float dsp_x86_sse_macf (const float *fpSrc1, const float *fpSrc2,
01527 int iDataLength)
01528 {
01529 int iDataCntr;
01530 float fRes;
01531
01532 X86_ASM (
01533 "xorps %%xmm0, %%xmm0\n\t"
01534 :
01535 :
01536 : "xmm0");
01537 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01538 {
01539 X86_ASM (
01540 "movss %0, %%xmm1\n\t" \
01541 "mulss %1, %%xmm1\n\t" \
01542 "addss %%xmm1, %%xmm0\n\t"
01543 :
01544 : "m" (fpSrc1[iDataCntr]),
01545 "m" (fpSrc2[iDataCntr])
01546 : "xmm0", "xmm1", "xmm2", "memory");
01547 }
01548 X86_ASM (
01549 "movss %%xmm0, %0\n\t"
01550 : "=m" (fRes)
01551 :
01552 : "xmm0");
01553
01554 return fRes;
01555 }
01556
01557
01558 double dsp_x86_sse_mac (const double *dpSrc1, const double *dpSrc2,
01559 int iDataLength)
01560 {
01561 int iDataCntr;
01562 double dRes;
01563
01564 X86_ASM (
01565 "xorpd %%xmm0, %%xmm0\n\t"
01566 :
01567 :
01568 : "xmm0");
01569 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01570 {
01571 X86_ASM (
01572 "movsd %0, %%xmm1\n\t" \
01573 "mulsd %1, %%xmm1\n\t" \
01574 "addsd %%xmm1, %%xmm0\n\t"
01575 :
01576 : "m" (dpSrc1[iDataCntr]),
01577 "m" (dpSrc2[iDataCntr])
01578 : "xmm0", "xmm1", "xmm2", "memory");
01579 }
01580 X86_ASM (
01581 "movsd %%xmm0, %0\n\t"
01582 : "=m" (dRes)
01583 :
01584 : "xmm0");
01585
01586 return dRes;
01587 }
01588
01589
01590 void dsp_x86_3dnow_minmaxf (float *fpMin, float *fpMax, const float *fpSrc,
01591 int iDataLength)
01592 {
01593 int iDataCntr;
01594 int iDataCount;
01595 stm64 m64Min;
01596 stm64 m64Max;
01597 stpm64 m64pSrc = (stpm64) fpSrc;
01598
01599 m64Min.f[0] = m64Min.f[1] = FLT_MAX;
01600 m64Max.f[0] = m64Max.f[1] = -FLT_MAX;
01601 iDataCount = (iDataLength >> 1);
01602 X86_ASM (
01603 "movq %0, %%mm1\n\t" \
01604 "movq %1, %%mm2\n\t"
01605 :
01606 : "m" (m64Min),
01607 "m" (m64Max)
01608 : "mm1", "mm2", "memory");
01609 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01610 {
01611 X86_ASM (
01612 "movq %0, %%mm0\n\t" \
01613 "pfmin %%mm0, %%mm1\n\t" \
01614 "pfmax %%mm0, %%mm2\n\t"
01615 :
01616 : "m" (m64pSrc[iDataCntr])
01617 : "mm0", "mm1", "mm2", "memory");
01618 }
01619 if (iDataLength & 0x1)
01620 {
01621 X86_ASM (
01622 "movd %0, %%mm0\n\t" \
01623 "pfmin %%mm0, %%mm1\n\t" \
01624 "pfmax %%mm0, %%mm2\n\t"
01625 :
01626 : "m" (fpSrc[iDataLength - 1])
01627 : "mm0", "mm1", "mm2", "memory");
01628 }
01629 X86_ASM (
01630 "pswapd %%mm1, %%mm3\n\t" \
01631 "pfmin %%mm3, %%mm1\n\t" \
01632 "pswapd %%mm2, %%mm3\n\t" \
01633 "pfmax %%mm3, %%mm2\n\t" \
01634 "movd %%mm1, %0\n\t" \
01635 "movd %%mm2, %1\n\t"
01636 : "=m" (*fpMin),
01637 "=m" (*fpMax)
01638 :
01639 : "mm1", "mm2", "mm3", "memory");
01640 X86_ASM ("femms\n\t");
01641 }
01642
01643
01644 void dsp_x86_sse_minmaxf (float *fpMin, float *fpMax, const float *fpSrc,
01645 int iDataLength)
01646 {
01647 int iDataCntr;
01648
01649 *fpMin = FLT_MAX;
01650 *fpMax = -FLT_MAX;
01651 X86_ASM (
01652 "movss %0, %%xmm0\n\t" \
01653 "movss %1, %%xmm1\n\t"
01654 :
01655 : "m" (*fpMin),
01656 "m" (*fpMax)
01657 : "xmm0", "xmm1", "memory");
01658 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01659 {
01660 X86_ASM (
01661 "movss %0, %%xmm2\n\t" \
01662 "minss %%xmm2, %%xmm0\n\t" \
01663 "maxss %%xmm2, %%xmm1\n\t"
01664 :
01665 : "m" (fpSrc[iDataCntr])
01666 : "xmm0", "xmm1", "xmm2", "memory");
01667 }
01668 X86_ASM (
01669 "movss %%xmm0, %0\n\t" \
01670 "movss %%xmm1, %1\n\t"
01671 : "=m" (*fpMin),
01672 "=m" (*fpMax)
01673 :
01674 : "xmm0", "xmm1", "memory");
01675 }
01676
01677
01678 void dsp_x86_sse_minmax (double *dpMin, double *dpMax, const double *dpSrc,
01679 int iDataLength)
01680 {
01681 int iDataCntr;
01682
01683 *dpMin = FLT_MAX;
01684 *dpMax = -FLT_MAX;
01685 X86_ASM (
01686 "movsd %0, %%xmm0\n\t" \
01687 "movsd %1, %%xmm1\n\t"
01688 :
01689 : "m" (*dpMin),
01690 "m" (*dpMax)
01691 : "xmm0", "xmm1", "memory");
01692 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01693 {
01694 X86_ASM (
01695 "movsd %0, %%xmm2\n\t" \
01696 "minsd %%xmm2, %%xmm0\n\t" \
01697 "maxsd %%xmm2, %%xmm1\n\t"
01698 :
01699 : "m" (dpSrc[iDataCntr])
01700 : "xmm0", "xmm1", "xmm2", "memory");
01701 }
01702 X86_ASM (
01703 "movss %%xmm0, %0\n\t" \
01704 "movss %%xmm1, %1\n\t"
01705 : "=m" (*dpMin),
01706 "=m" (*dpMax)
01707 :
01708 : "xmm0", "xmm1", "memory");
01709 }
01710
01711
01712 float dsp_x86_3dnow_crosscorrf (const float *fpSrc1, const float *fpSrc2,
01713 int iDataLength)
01714 {
01715 int iDataCntr;
01716 int iDataCount;
01717 float fRes;
01718 stpm64 m64pSrc1 = (stpm64) fpSrc1;
01719 stpm64 m64pSrc2 = (stpm64) fpSrc2;
01720
01721 iDataCount = (iDataLength >> 1);
01722 X86_ASM (
01723 "pxor %%mm3, %%mm3\n\t" \
01724 "pxor %%mm4, %%mm4\n\t" \
01725 "pxor %%mm5, %%mm5\n\t"
01726 :
01727 :
01728 : "mm3", "mm4", "mm5");
01729 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
01730 {
01731 X86_ASM (
01732 "movq %0, %%mm0\n\t" \
01733 "movq %1, %%mm1\n\t" \
01734 "movq %%mm1, %%mm2\n\t" \
01735 "pfmul %%mm0, %%mm2\n\t" \
01736 "pfacc %%mm2, %%mm5\n\t" \
01737 "pfmul %%mm0, %%mm0\n\t" \
01738 "pfacc %%mm0, %%mm3\n\t" \
01739 "pfmul %%mm1, %%mm1\n\t" \
01740 "pfacc %%mm1, %%mm4\n\t"
01741 :
01742 : "m" (m64pSrc1[iDataCntr]),
01743 "m" (m64pSrc2[iDataCntr])
01744 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01745 }
01746 if (iDataLength & 0x1)
01747 {
01748 X86_ASM (
01749 "movd %0, %%mm0\n\t" \
01750 "movd %1, %%mm1\n\t" \
01751 "movq %%mm1, %%mm2\n\t" \
01752 "pfmul %%mm0, %%mm2\n\t" \
01753 "pfacc %%mm2, %%mm5\n\t" \
01754 "pfmul %%mm0, %%mm0\n\t" \
01755 "pfacc %%mm0, %%mm3\n\t" \
01756 "pfmul %%mm1, %%mm1\n\t" \
01757 "pfacc %%mm1, %%mm4\n\t"
01758 :
01759 : "m" (fpSrc1[iDataLength - 1]),
01760 "m" (fpSrc2[iDataLength - 1])
01761 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
01762 }
01763 X86_ASM (
01764 "pfacc %%mm3, %%mm3\n\t" \
01765 "pfacc %%mm4, %%mm4\n\t" \
01766 "pfacc %%mm5, %%mm5\n\t" \
01767 \
01768 "movd %1, %%mm6\n\t" \
01769 "pswapd %%mm6, %%mm7\n\t" \
01770 "paddd %%mm7, %%mm6\n\t" \
01771 "pi2fd %%mm6, %%mm7\n\t" \
01772 \
01773 "pfrcp %%mm7, %%mm6\n\t" \
01774 "pfrcpit1 %%mm6, %%mm7\n\t" \
01775 "pfrcpit2 %%mm6, %%mm7\n\t" \
01776 \
01777 "pfmul %%mm3, %%mm4\n\t" \
01778 \
01779 "movq %%mm4, %%mm0\n\t" \
01780 "pfrsqrt %%mm4, %%mm1\n\t" \
01781 "movq %%mm1, %%mm2\n\t" \
01782 "pfmul %%mm1, %%mm1\n\t" \
01783 "pfrsqit1 %%mm4, %%mm1\n\t" \
01784 "pfrcpit2 %%mm2, %%mm1\n\t" \
01785 "pfmul %%mm1, %%mm4\n\t" \
01786 \
01787 "pfmul %%mm6, %%mm4\n\t" \
01788 \
01789 "pfrcp %%mm4, %%mm0\n\t" \
01790 "pfrcpit1 %%mm0, %%mm4\n\t" \
01791 "pfrcpit2 %%mm0, %%mm4\n\t" \
01792 \
01793 "pfmul %%mm6, %%mm5\n\t" \
01794 "pfmul %%mm4, %%mm5\n\t" \
01795 "movd %%mm5, %0\n\t"
01796 : "=m" (fRes)
01797 : "m" (iDataLength)
01798 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
01799 X86_ASM ("femms\n\t");
01800
01801 return fRes;
01802 }
01803
01804
01805 float dsp_x86_sse_crosscorrf (const float *fpSrc1, const float *fpSrc2,
01806 int iDataLength)
01807 {
01808 int iDataCntr;
01809 float fScale;
01810 float fNormFact;
01811 float fProdSum;
01812 float fSqSum1;
01813 float fSqSum2;
01814 float fRes;
01815
01816 X86_ASM (
01817 "xorps %%xmm0, %%xmm0\n\t" \
01818 "xorps %%xmm1, %%xmm1\n\t" \
01819 "xorps %%xmm2, %%xmm2\n\t"
01820 :
01821 :
01822 : "xmm0", "xmm1", "xmm2");
01823 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01824 {
01825 X86_ASM (
01826 "movss %3, %%xmm3\n\t" \
01827 "movss %4, %%xmm4\n\t" \
01828 \
01829 "movss %%xmm4, %%xmm5\n\t" \
01830 "mulss %%xmm3, %%xmm5\n\t" \
01831 "addss %%xmm5, %%xmm0\n\t" \
01832 \
01833 "movss %%xmm3, %%xmm5\n\t" \
01834 "mulss %%xmm3, %%xmm5\n\t" \
01835 "addss %%xmm5, %%xmm1\n\t" \
01836 \
01837 "movss %%xmm4, %%xmm5\n\t" \
01838 "mulss %%xmm4, %%xmm5\n\t" \
01839 "addss %%xmm5, %%xmm2\n\t" \
01840 \
01841 "movss %%xmm0, %0\n\t" \
01842 "movss %%xmm1, %1\n\t" \
01843 "movss %%xmm2, %2\n\t"
01844 : "=m" (fProdSum),
01845 "=m" (fSqSum1),
01846 "=m" (fSqSum2)
01847 : "m" (fpSrc1[iDataCntr]),
01848 "m" (fpSrc2[iDataCntr])
01849 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
01850 }
01851 fScale = 1.0F / iDataLength;
01852 fNormFact = sqrtf(fSqSum1 * fSqSum2) * fScale;
01853 fRes = (fProdSum * fScale) / fNormFact;
01854
01855 return fRes;
01856 }
01857
01858
01859 double dsp_x86_sse_crosscorr (const double *dpSrc1, const double *dpSrc2,
01860 int iDataLength)
01861 {
01862 int iDataCntr;
01863 double dScale;
01864 double dNormFact;
01865 double dProdSum;
01866 double dSqSum1;
01867 double dSqSum2;
01868 double dRes;
01869
01870 X86_ASM (
01871 "xorpd %%xmm0, %%xmm0\n\t" \
01872 "xorpd %%xmm1, %%xmm1\n\t" \
01873 "xorpd %%xmm2, %%xmm2\n\t"
01874 :
01875 :
01876 : "xmm0", "xmm1", "xmm2");
01877 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
01878 {
01879 X86_ASM (
01880 "movsd %3, %%xmm3\n\t" \
01881 "movsd %4, %%xmm4\n\t" \
01882 \
01883 "movsd %%xmm4, %%xmm5\n\t" \
01884 "mulsd %%xmm3, %%xmm5\n\t" \
01885 "addsd %%xmm5, %%xmm0\n\t" \
01886 \
01887 "movsd %%xmm3, %%xmm5\n\t" \
01888 "mulsd %%xmm3, %%xmm5\n\t" \
01889 "addsd %%xmm5, %%xmm1\n\t" \
01890 \
01891 "movsd %%xmm4, %%xmm5\n\t" \
01892 "mulsd %%xmm4, %%xmm5\n\t" \
01893 "addsd %%xmm5, %%xmm2\n\t" \
01894 \
01895 "movsd %%xmm0, %0\n\t" \
01896 "movsd %%xmm1, %1\n\t" \
01897 "movsd %%xmm2, %2\n\t"
01898 : "=m" (dProdSum),
01899 "=m" (dSqSum1),
01900 "=m" (dSqSum2)
01901 : "m" (dpSrc1[iDataCntr]),
01902 "m" (dpSrc2[iDataCntr])
01903 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
01904 }
01905 dScale = 1.0 / iDataLength;
01906 dNormFact = sqrt(dSqSum1 * dSqSum2) * dScale;
01907 dRes = (dProdSum * dScale) / dNormFact;
01908
01909 return dRes;
01910 }
01911
01912
01913 void dsp_x86_3dnow_i16tof (float *fpDest, const short *ipSrc, int iDataLength,
01914 int iIntMax)
01915 {
01916 int iDataCntr;
01917 float fScale;
01918
01919 X86_ASM (
01920 "movd %1, %%mm1\n\t" \
01921 "pswapd %%mm1, %%mm2\n\t" \
01922 "paddd %%mm2, %%mm1\n\t" \
01923 "pi2fd %%mm1, %%mm1\n\t" \
01924 "pfrcp %%mm1, %%mm2\n\t" \
01925 "pfrcpit1 %%mm2, %%mm1\n\t" \
01926 "pfrcpit2 %%mm2, %%mm1\n\t" \
01927 "movd %%mm1, %0\n\t"
01928 : "=m" (fScale)
01929 : "m" (iIntMax)
01930 : "mm1", "mm2", "memory");
01931 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
01932 {
01933 X86_ASM (
01934 "movd %1, %%mm0\n\t" \
01935 "punpcklwd %%mm0, %%mm0\n\t" \
01936 "pi2fw %%mm0, %%mm0\n\t" \
01937 "pfmul %%mm1, %%mm0\n\t" \
01938 "movntq %%mm0, %0\n\t"
01939 : "=m" (fpDest[iDataCntr])
01940 : "m" (ipSrc[iDataCntr])
01941 : "mm0", "mm1", "memory");
01942 }
01943 X86_ASM (
01944 "femms\n\t" \
01945 "sfence\n\t");
01946 if ((iDataLength % 2) != 0)
01947 {
01948 fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01949 }
01950 }
01951
01952
01953 void dsp_x86_3dnow_i32tof (float *fpDest, const int *ipSrc, int iDataLength,
01954 int iIntMax)
01955 {
01956 int iDataCntr;
01957 float fScale;
01958
01959 X86_ASM (
01960 "movd %1, %%mm1\n\t" \
01961 "pswapd %%mm1, %%mm2\n\t" \
01962 "paddd %%mm2, %%mm1\n\t" \
01963 "pi2fd %%mm1, %%mm1\n\t" \
01964 "pfrcp %%mm1, %%mm2\n\t" \
01965 "pfrcpit1 %%mm2, %%mm1\n\t" \
01966 "pfrcpit2 %%mm2, %%mm1\n\t" \
01967 "movd %%mm1, %0\n\t"
01968 : "=m" (fScale)
01969 : "m" (iIntMax)
01970 : "mm1", "mm2", "memory");
01971 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
01972 {
01973 X86_ASM (
01974 "movq %1, %%mm0\n\t" \
01975 "pi2fd %%mm0, %%mm0\n\t" \
01976 "pfmul %%mm1, %%mm0\n\t" \
01977 "movntq %%mm0, %0\n\t"
01978 : "=m" (fpDest[iDataCntr])
01979 : "m" (ipSrc[iDataCntr])
01980 : "mm0", "mm1", "memory");
01981 }
01982 X86_ASM (
01983 "femms\n\t" \
01984 "sfence\n\t");
01985 if ((iDataLength % 2) != 0)
01986 {
01987 fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
01988 }
01989 }
01990
01991
01992 void dsp_x86_3dnow_firf (float *fpDest, const float *fpSrc, int iDataLength,
01993 const float *fpCoeff, int iCoeffLength)
01994 {
01995 int iSrcCntr;
01996 int iDestCntr;
01997 int iCoeffCntr;
01998 int iSrcCount;
01999 stpm64 m64pDest = (stpm64) fpDest;
02000
02001 iDestCntr = 0;
02002 iSrcCount = iDataLength + iCoeffLength;
02003 for (iSrcCntr = iCoeffLength;
02004 iSrcCntr < iSrcCount;
02005 iSrcCntr += 2)
02006 {
02007 X86_ASM (
02008 "pxor %%mm0, %%mm0\n\t"
02009 :
02010 :
02011 : "mm0");
02012 for (iCoeffCntr = 0;
02013 iCoeffCntr < iCoeffLength;
02014 iCoeffCntr++)
02015 {
02016 X86_ASM (
02017 "movq %0, %%mm1\n\t" \
02018 "movd %1, %%mm2\n\t" \
02019 "pswapd %%mm2, %%mm3\n\t" \
02020 "pfadd %%mm3, %%mm2\n\t" \
02021 "pfmul %%mm2, %%mm1\n\t" \
02022 "pfadd %%mm1, %%mm0\n\t"
02023 :
02024 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
02025 "m" (fpCoeff[iCoeffCntr])
02026 : "mm0", "mm1", "mm2", "mm3", "memory");
02027 }
02028 X86_ASM (
02029 "movntq %%mm0, %0\n\t"
02030 : "=m" (m64pDest[iDestCntr++])
02031 :
02032 : "mm0", "memory");
02033 }
02034 if (iDataLength & 0x1)
02035 {
02036 X86_ASM (
02037 "pxor %%mm0, %%mm0\n\t"
02038 :
02039 :
02040 : "mm0");
02041 for (iCoeffCntr = 0;
02042 iCoeffCntr < iCoeffLength;
02043 iCoeffCntr++)
02044 {
02045 X86_ASM (
02046 "movd %0, %%mm1\n\t" \
02047 "movd %1, %%mm2\n\t" \
02048 "pfmul %%mm2, %%mm1\n\t" \
02049 "pfadd %%mm1, %%mm0\n\t"
02050 :
02051 : "m" (fpSrc[iDataLength - 1 - iCoeffCntr]),
02052 "m" (fpCoeff[iCoeffCntr])
02053 : "mm0", "mm1", "mm2", "memory");
02054 }
02055 X86_ASM (
02056 "movd %%mm0, %0\n\t"
02057 : "=m" (fpDest[iDataLength - 1])
02058 :
02059 : "mm0", "memory");
02060 }
02061 X86_ASM (
02062 "femms\n\t" \
02063 "sfence\n\t");
02064 }
02065
02066
02067 void dsp_x86_sse_firf (float *fpDest, const float *fpSrc, int iDataLength,
02068 const float *fpCoeff, int iCoeffLength)
02069 {
02070 int iDestCntr;
02071 int iSrcCntr;
02072 int iCoeffCntr;
02073 int iSrcCount;
02074
02075 iDestCntr = 0;
02076 iSrcCount = iDataLength + iCoeffLength;
02077 for (iSrcCntr = iCoeffLength;
02078 iSrcCntr < iSrcCount;
02079 iSrcCntr++)
02080 {
02081 X86_ASM (
02082 "xorps %%xmm0, %%xmm0\n\t"
02083 :
02084 :
02085 : "xmm0");
02086 for (iCoeffCntr = 0;
02087 iCoeffCntr < iCoeffLength;
02088 iCoeffCntr++)
02089 {
02090 X86_ASM (
02091 "movss %0, %%xmm1\n\t"
02092 "mulss %1, %%xmm1\n\t"
02093 "addss %%xmm1, %%xmm0\n\t"
02094 :
02095 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
02096 "m" (fpCoeff[iCoeffCntr])
02097 : "xmm0", "xmm1", "memory");
02098 }
02099 X86_ASM (
02100 "movss %%xmm0, %0\n\t"
02101 : "=m" (fpDest[iDestCntr++])
02102 :
02103 : "xmm0", "memory");
02104 }
02105 }
02106
02107
02108 void dsp_x86_sse_fir (double *dpDest, const double *dpSrc, int iDataLength,
02109 const double *dpCoeff, int iCoeffLength)
02110 {
02111 int iDestCntr;
02112 int iSrcCntr;
02113 int iCoeffCntr;
02114 int iSrcCount;
02115
02116 iDestCntr = 0;
02117 iSrcCount = iDataLength + iCoeffLength;
02118 for (iSrcCntr = iCoeffLength;
02119 iSrcCntr < iSrcCount;
02120 iSrcCntr++)
02121 {
02122 X86_ASM (
02123 "xorpd %%xmm0, %%xmm0\n\t"
02124 :
02125 :
02126 : "xmm0");
02127 for (iCoeffCntr = 0;
02128 iCoeffCntr < iCoeffLength;
02129 iCoeffCntr++)
02130 {
02131 X86_ASM (
02132 "movsd %0, %%xmm1\n\t"
02133 "mulsd %1, %%xmm1\n\t"
02134 "addsd %%xmm1, %%xmm0\n\t"
02135 :
02136 : "m" (dpSrc[iSrcCntr - iCoeffCntr]),
02137 "m" (dpCoeff[iCoeffCntr])
02138 : "xmm0", "xmm1", "memory");
02139 }
02140 X86_ASM (
02141 "movsd %%xmm0, %0\n\t"
02142 : "=m" (dpDest[iDestCntr++])
02143 :
02144 : "xmm0", "memory");
02145 }
02146 }
02147
02148
02149 void dsp_x86_3dnow_iirf (float *fpVect, int iDataLength, const float *fpCoeff,
02150 float *fpX, float *fpY)
02151 {
02152 int iDataCntr;
02153 stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
02154 stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
02155 stpm64 m64pX = (stpm64) fpX;
02156 stpm64 m64pY = (stpm64) fpY;
02157
02158 X86_ASM (
02159 "movq %0, %%mm0\n\t" \
02160 "pswapd %%mm0, %%mm2\n\t" \
02161 "movd %1, %%mm3\n\t" \
02162 "movq %2, %%mm0\n\t" \
02163 "pswapd %%mm0, %%mm4\n\t" \
02164 "movq %3, %%mm5\n\t" \
02165 "movq %4, %%mm7\n\t" \
02166 :
02167 : "m" (*m64pCoeff),
02168 "m" (fpCoeff[0]),
02169 "m" (*m64pCoeff2),
02170 "m" (*m64pX),
02171 "m" (*m64pY)
02172 : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
02173 for (iDataCntr = 0;
02174 iDataCntr < iDataLength;
02175 iDataCntr++)
02176 {
02177 X86_ASM (
02178 "pxor %%mm0, %%mm0\n\t" \
02179 "movd %1, %%mm6\n\t" \
02180 "movq %%mm5, %%mm1\n\t" \
02181 "pfmul %%mm2, %%mm1\n\t" \
02182 "pfacc %%mm1, %%mm0\n\t" \
02183 "movq %%mm6, %%mm1\n\t" \
02184 "pfmul %%mm3, %%mm1\n\t" \
02185 "pfacc %%mm1, %%mm0\n\t" \
02186 "movq %%mm7, %%mm1\n\t" \
02187 "pfmul %%mm4, %%mm1\n\t" \
02188 "pfacc %%mm1, %%mm0\n\t" \
02189 "pfacc %%mm0, %%mm0\n\t" \
02190 \
02191 "pswapd %%mm7, %%mm1\n\t" \
02192 "movq %%mm1, %%mm7\n\t" \
02193 "punpckldq %%mm0, %%mm7\n\t" \
02194 \
02195 "pswapd %%mm5, %%mm1\n\t" \
02196 "movq %%mm1, %%mm5\n\t" \
02197 "movq %%mm6, %%mm1\n\t" \
02198 "punpckldq %%mm1, %%mm5\n\t" \
02199 \
02200 "movd %%mm0, %0\n\t"
02201 : "=m" (fpVect[iDataCntr])
02202 : "0" (fpVect[iDataCntr])
02203 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
02204 }
02205 X86_ASM (
02206 "movq %%mm5, %0\n\t" \
02207 "movd %%mm6, %1\n\t" \
02208 "movq %%mm7, %2\n\t"
02209 : "=m" (*m64pX),
02210 "=m" (fpX[2]),
02211 "=m" (*m64pY)
02212 :
02213 : "mm5", "mm6", "mm7", "memory");
02214 X86_ASM ("femms\n\t");
02215 }
02216
02217
02218 void dsp_x86_sse_iirf (float *fpVect, int iDataLength, const float *fpCoeff,
02219 float *fpX, float *fpY)
02220 {
02221 int iDataCntr;
02222
02223 X86_ASM (
02224 "movss %0, %%xmm1\n\t" \
02225 "movss %1, %%xmm2\n\t" \
02226 "movss %2, %%xmm3\n\t" \
02227 "movss %3, %%xmm4\n\t" \
02228 "prefetchnta %4\n\t"
02229 :
02230 : "m" (fpX[1]),
02231 "m" (fpX[2]),
02232 "m" (fpY[0]),
02233 "m" (fpY[1]),
02234 "m" (fpCoeff[0])
02235 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02236 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02237 {
02238 X86_ASM (
02239 "movss %%xmm1, %%xmm0\n\t" \
02240 "movss %%xmm2, %%xmm1\n\t" \
02241 "movss %1, %%xmm2\n\t" \
02242 \
02243 "movss %2, %%xmm5\n\t" \
02244 "mulss %%xmm2, %%xmm5\n\t" \
02245 "movss %3, %%xmm6\n\t" \
02246 "mulss %%xmm1, %%xmm6\n\t" \
02247 "addss %%xmm6, %%xmm5\n\t" \
02248 "movss %4, %%xmm6\n\t" \
02249 "mulss %%xmm0, %%xmm6\n\t" \
02250 "addss %%xmm6, %%xmm5\n\t" \
02251 \
02252 "movss %5, %%xmm6\n\t" \
02253 "mulss %%xmm4, %%xmm6\n\t" \
02254 "movss %6, %%xmm7\n\t" \
02255 "mulss %%xmm3, %%xmm7\n\t" \
02256 "addss %%xmm7, %%xmm6\n\t" \
02257 \
02258 "addss %%xmm5, %%xmm6\n\t" \
02259 "movss %%xmm4, %%xmm3\n\t" \
02260 "movss %%xmm6, %%xmm4\n\t" \
02261 \
02262 "movss %%xmm6, %0\n\t"
02263 : "=m" (fpVect[iDataCntr])
02264 : "0" (fpVect[iDataCntr]),
02265 "m" (fpCoeff[0]),
02266 "m" (fpCoeff[1]),
02267 "m" (fpCoeff[2]),
02268 "m" (fpCoeff[3]),
02269 "m" (fpCoeff[4])
02270 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02271 "memory");
02272 }
02273 X86_ASM (
02274 "movss %%xmm0, %0\n\t" \
02275 "movss %%xmm1, %1\n\t" \
02276 "movss %%xmm2, %2\n\t" \
02277 "movss %%xmm3, %3\n\t" \
02278 "movss %%xmm4, %4\n\t"
02279 : "=m" (fpX[0]),
02280 "=m" (fpX[1]),
02281 "=m" (fpX[2]),
02282 "=m" (fpY[0]),
02283 "=m" (fpY[1])
02284 :
02285 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02286 }
02287
02288
02289 void dsp_x86_sse_iir (double *dpVect, int iDataLength, const double *dpCoeff,
02290 double *dpX, double *dpY)
02291 {
02292 int iDataCntr;
02293
02294 X86_ASM (
02295 "movsd %0, %%xmm1\n\t" \
02296 "movsd %1, %%xmm2\n\t" \
02297 "movsd %2, %%xmm3\n\t" \
02298 "movsd %3, %%xmm4\n\t" \
02299 "prefetchnta %4\n\t" \
02300 "prefetchnta %5\n\t"
02301 :
02302 : "m" (dpX[1]),
02303 "m" (dpX[2]),
02304 "m" (dpY[0]),
02305 "m" (dpY[1]),
02306 "m" (dpCoeff[0]),
02307 "m" (dpCoeff[3])
02308 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02309 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02310 {
02311 X86_ASM (
02312 "movsd %%xmm1, %%xmm0\n\t" \
02313 "movsd %%xmm2, %%xmm1\n\t" \
02314 "movsd %1, %%xmm2\n\t" \
02315 \
02316 "movsd %2, %%xmm5\n\t" \
02317 "mulsd %%xmm2, %%xmm5\n\t" \
02318 "movsd %3, %%xmm6\n\t" \
02319 "mulsd %%xmm1, %%xmm6\n\t" \
02320 "addsd %%xmm6, %%xmm5\n\t" \
02321 "movsd %4, %%xmm6\n\t" \
02322 "mulsd %%xmm0, %%xmm6\n\t" \
02323 "addsd %%xmm6, %%xmm5\n\t" \
02324 \
02325 "movsd %5, %%xmm6\n\t" \
02326 "mulsd %%xmm4, %%xmm6\n\t" \
02327 "movsd %6, %%xmm7\n\t" \
02328 "mulsd %%xmm3, %%xmm7\n\t" \
02329 "addsd %%xmm7, %%xmm6\n\t" \
02330 \
02331 "addsd %%xmm5, %%xmm6\n\t" \
02332 "movsd %%xmm4, %%xmm3\n\t" \
02333 "movsd %%xmm6, %%xmm4\n\t" \
02334 \
02335 "movsd %%xmm6, %0\n\t"
02336 : "=m" (dpVect[iDataCntr])
02337 : "0" (dpVect[iDataCntr]),
02338 "m" (dpCoeff[0]),
02339 "m" (dpCoeff[1]),
02340 "m" (dpCoeff[2]),
02341 "m" (dpCoeff[3]),
02342 "m" (dpCoeff[4])
02343 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02344 "memory");
02345 }
02346 X86_ASM (
02347 "movsd %%xmm0, %0\n\t" \
02348 "movsd %%xmm1, %1\n\t" \
02349 "movsd %%xmm2, %2\n\t" \
02350 "movsd %%xmm3, %3\n\t" \
02351 "movsd %%xmm4, %4\n\t"
02352 : "=m" (dpX[0]),
02353 "=m" (dpX[1]),
02354 "=m" (dpX[2]),
02355 "=m" (dpY[0]),
02356 "=m" (dpY[1])
02357 :
02358 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02359 }
02360
02361
02362 void dsp_x86_3dnow_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength,
02363 const float *fpCoeff, float *fpX, float *fpY)
02364 {
02365 int iDataCntr;
02366 stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
02367 stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
02368 stpm64 m64pX = (stpm64) fpX;
02369 stpm64 m64pY = (stpm64) fpY;
02370
02371 X86_ASM (
02372 "movq %0, %%mm0\n\t" \
02373 "pswapd %%mm0, %%mm2\n\t" \
02374 "movd %1, %%mm3\n\t" \
02375 "movq %2, %%mm0\n\t" \
02376 "pswapd %%mm0, %%mm4\n\t" \
02377 "movq %3, %%mm5\n\t" \
02378 "movq %4, %%mm7\n\t" \
02379 :
02380 : "m" (*m64pCoeff),
02381 "m" (fpCoeff[0]),
02382 "m" (*m64pCoeff2),
02383 "m" (*m64pX),
02384 "m" (*m64pY)
02385 : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
02386 for (iDataCntr = 0;
02387 iDataCntr < iDataLength;
02388 iDataCntr++)
02389 {
02390 X86_ASM (
02391 "pxor %%mm0, %%mm0\n\t" \
02392 "movd %1, %%mm6\n\t" \
02393 "movq %%mm5, %%mm1\n\t" \
02394 "pfmul %%mm2, %%mm1\n\t" \
02395 "pfacc %%mm1, %%mm0\n\t" \
02396 "movq %%mm6, %%mm1\n\t" \
02397 "pfmul %%mm3, %%mm1\n\t" \
02398 "pfacc %%mm1, %%mm0\n\t" \
02399 "movq %%mm7, %%mm1\n\t" \
02400 "pfmul %%mm4, %%mm1\n\t" \
02401 "pfacc %%mm1, %%mm0\n\t" \
02402 "pfacc %%mm0, %%mm0\n\t" \
02403 \
02404 "pswapd %%mm7, %%mm1\n\t" \
02405 "movq %%mm1, %%mm7\n\t" \
02406 "punpckldq %%mm0, %%mm7\n\t" \
02407 \
02408 "pswapd %%mm5, %%mm1\n\t" \
02409 "movq %%mm1, %%mm5\n\t" \
02410 "movq %%mm6, %%mm1\n\t" \
02411 "punpckldq %%mm1, %%mm5\n\t" \
02412 \
02413 "movd %%mm0, %0\n\t"
02414 : "=m" (fpDest[iDataCntr])
02415 : "m" (fpSrc[iDataCntr])
02416 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
02417 }
02418 X86_ASM (
02419 "movq %%mm5, %0\n\t" \
02420 "movd %%mm6, %1\n\t" \
02421 "movq %%mm7, %2\n\t"
02422 : "=m" (*m64pX),
02423 "=m" (fpX[2]),
02424 "=m" (*m64pY)
02425 :
02426 : "mm5", "mm6", "mm7", "memory");
02427 X86_ASM ("femms\n\t");
02428 }
02429
02430
02431 void dsp_x86_sse_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength,
02432 const float *fpCoeff, float *fpX, float *fpY)
02433 {
02434 int iDataCntr;
02435
02436 X86_ASM (
02437 "movss %0, %%xmm1\n\t" \
02438 "movss %1, %%xmm2\n\t" \
02439 "movss %2, %%xmm3\n\t" \
02440 "movss %3, %%xmm4\n\t" \
02441 "prefetchnta %4\n\t"
02442 :
02443 : "m" (fpX[1]),
02444 "m" (fpX[2]),
02445 "m" (fpY[0]),
02446 "m" (fpY[1]),
02447 "m" (fpCoeff[0])
02448 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02449 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02450 {
02451 X86_ASM (
02452 "movss %%xmm1, %%xmm0\n\t" \
02453 "movss %%xmm2, %%xmm1\n\t" \
02454 "movss %1, %%xmm2\n\t" \
02455 \
02456 "movss %2, %%xmm5\n\t" \
02457 "mulss %%xmm2, %%xmm5\n\t" \
02458 "movss %3, %%xmm6\n\t" \
02459 "mulss %%xmm1, %%xmm6\n\t" \
02460 "addss %%xmm6, %%xmm5\n\t" \
02461 "movss %4, %%xmm6\n\t" \
02462 "mulss %%xmm0, %%xmm6\n\t" \
02463 "addss %%xmm6, %%xmm5\n\t" \
02464 \
02465 "movss %5, %%xmm6\n\t" \
02466 "mulss %%xmm4, %%xmm6\n\t" \
02467 "movss %6, %%xmm7\n\t" \
02468 "mulss %%xmm3, %%xmm7\n\t" \
02469 "addss %%xmm7, %%xmm6\n\t" \
02470 \
02471 "addss %%xmm5, %%xmm6\n\t" \
02472 "movss %%xmm4, %%xmm3\n\t" \
02473 "movss %%xmm6, %%xmm4\n\t" \
02474 \
02475 "movss %%xmm6, %0\n\t"
02476 : "=m" (fpDest[iDataCntr])
02477 : "m" (fpSrc[iDataCntr]),
02478 "m" (fpCoeff[0]),
02479 "m" (fpCoeff[1]),
02480 "m" (fpCoeff[2]),
02481 "m" (fpCoeff[3]),
02482 "m" (fpCoeff[4])
02483 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02484 "memory");
02485 }
02486 X86_ASM (
02487 "movss %%xmm0, %0\n\t" \
02488 "movss %%xmm1, %1\n\t" \
02489 "movss %%xmm2, %2\n\t" \
02490 "movss %%xmm3, %3\n\t" \
02491 "movss %%xmm4, %4\n\t"
02492 : "=m" (fpX[0]),
02493 "=m" (fpX[1]),
02494 "=m" (fpX[2]),
02495 "=m" (fpY[0]),
02496 "=m" (fpY[1])
02497 :
02498 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02499 }
02500
02501
02502 void dsp_x86_sse_iir_nip (double *dpDest, const double *dpSrc, int iDataLength,
02503 const double *dpCoeff, double *dpX, double *dpY)
02504 {
02505 int iDataCntr;
02506
02507 X86_ASM (
02508 "movsd %0, %%xmm1\n\t" \
02509 "movsd %1, %%xmm2\n\t" \
02510 "movsd %2, %%xmm3\n\t" \
02511 "movsd %3, %%xmm4\n\t" \
02512 "prefetchnta %4\n\t" \
02513 "prefetchnta %5\n\t"
02514 :
02515 : "m" (dpX[1]),
02516 "m" (dpX[2]),
02517 "m" (dpY[0]),
02518 "m" (dpY[1]),
02519 "m" (dpCoeff[0]),
02520 "m" (dpCoeff[3])
02521 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02522 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
02523 {
02524 X86_ASM (
02525 "movsd %%xmm1, %%xmm0\n\t" \
02526 "movsd %%xmm2, %%xmm1\n\t" \
02527 "movsd %1, %%xmm2\n\t" \
02528 \
02529 "movsd %2, %%xmm5\n\t" \
02530 "mulsd %%xmm2, %%xmm5\n\t" \
02531 "movsd %3, %%xmm6\n\t" \
02532 "mulsd %%xmm1, %%xmm6\n\t" \
02533 "addsd %%xmm6, %%xmm5\n\t" \
02534 "movsd %4, %%xmm6\n\t" \
02535 "mulsd %%xmm0, %%xmm6\n\t" \
02536 "addsd %%xmm6, %%xmm5\n\t" \
02537 \
02538 "movsd %5, %%xmm6\n\t" \
02539 "mulsd %%xmm4, %%xmm6\n\t" \
02540 "movsd %6, %%xmm7\n\t" \
02541 "mulsd %%xmm3, %%xmm7\n\t" \
02542 "addsd %%xmm7, %%xmm6\n\t" \
02543 \
02544 "addsd %%xmm5, %%xmm6\n\t" \
02545 "movsd %%xmm4, %%xmm3\n\t" \
02546 "movsd %%xmm6, %%xmm4\n\t" \
02547 \
02548 "movsd %%xmm6, %0\n\t"
02549 : "=m" (dpDest[iDataCntr])
02550 : "m" (dpSrc[iDataCntr]),
02551 "m" (dpCoeff[0]),
02552 "m" (dpCoeff[1]),
02553 "m" (dpCoeff[2]),
02554 "m" (dpCoeff[3]),
02555 "m" (dpCoeff[4])
02556 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
02557 "memory");
02558 }
02559 X86_ASM (
02560 "movsd %%xmm0, %0\n\t" \
02561 "movsd %%xmm1, %1\n\t" \
02562 "movsd %%xmm2, %2\n\t" \
02563 "movsd %%xmm3, %3\n\t" \
02564 "movsd %%xmm4, %4\n\t"
02565 : "=m" (dpX[0]),
02566 "=m" (dpX[1]),
02567 "=m" (dpX[2]),
02568 "=m" (dpY[0]),
02569 "=m" (dpY[1])
02570 :
02571 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
02572 }
02573
02574
02575 #ifdef __cplusplus
02576 }
02577 #endif
02578
02579 #endif