00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
00033 Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
00034 const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
00035 int l, int lmax NJ1)
00036 {
00037 if (njobs>1)
00038 {
00039 while (l<lmax-2)
00040 {
00041 Tb lam_3, lam_4;
00042 Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
00043 for (int i=0; i<nvec; ++i)
00044 lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
00045 r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
00046 for (int i=0; i<nvec; ++i)
00047 lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
00048 r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
00049 for (int i=0; i<nvec; ++i)
00050 lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
00051 for (int j=0; j<njobs; ++j)
00052 {
00053 Tv ar2=vload(creal(alm[njobs*l+j])),
00054 ai2=vload(cimag(alm[njobs*l+j])),
00055 ar4=vload(creal(alm[njobs*(l+2)+j])),
00056 ai4=vload(cimag(alm[njobs*(l+2)+j]));
00057 for (int i=0; i<nvec; ++i)
00058 {
00059 vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
00060 vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
00061 }
00062 Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
00063 ai3=vload(cimag(alm[njobs*(l+1)+j])),
00064 ar1=vload(creal(alm[njobs*(l+3)+j])),
00065 ai1=vload(cimag(alm[njobs*(l+3)+j]));
00066 for (int i=0; i<nvec; ++i)
00067 {
00068 vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
00069 vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
00070 }
00071 }
00072 r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
00073 for (int i=0; i<nvec; ++i)
00074 lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
00075 l+=4;
00076 }
00077 }
00078 while (l<lmax)
00079 {
00080 Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
00081 for (int i=0; i<nvec; ++i)
00082 lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
00083 for (int j=0; j<njobs; ++j)
00084 {
00085 Tv ar=vload(creal(alm[njobs*l+j])),
00086 ai=vload(cimag(alm[njobs*l+j]));
00087 for (int i=0; i<nvec; ++i)
00088 {
00089 vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
00090 vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
00091 }
00092 ar=vload(creal(alm[njobs*(l+1)+j]));
00093 ai=vload(cimag(alm[njobs*(l+1)+j]));
00094 for (int i=0; i<nvec; ++i)
00095 {
00096 vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
00097 vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
00098 }
00099 }
00100 r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
00101 for (int i=0; i<nvec; ++i)
00102 lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
00103 l+=2;
00104 }
00105 if (l==lmax)
00106 {
00107 for (int j=0; j<njobs; ++j)
00108 {
00109 Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
00110 for (int i=0; i<nvec; ++i)
00111 {
00112 vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
00113 vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
00114 }
00115 }
00116 }
00117 }
00118
00119 static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
00120 const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
00121 const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax
00122 NJ1)
00123 {
00124 while (l<lmax)
00125 {
00126 Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
00127 for (int i=0; i<nvec; ++i)
00128 lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
00129 for (int j=0; j<njobs; ++j)
00130 {
00131 Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
00132 for (int i=0; i<nvec; ++i)
00133 {
00134 vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]);
00135 vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]);
00136 }
00137 for (int i=0; i<nvec; ++i)
00138 {
00139 vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
00140 vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
00141 }
00142 vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
00143 }
00144 r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
00145 for (int i=0; i<nvec; ++i)
00146 lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
00147 l+=2;
00148 }
00149 if (l==lmax)
00150 {
00151 for (int j=0; j<njobs; ++j)
00152 {
00153 Tv tre=vzero, tim=vzero;
00154 for (int i=0; i<nvec; ++i)
00155 {
00156 vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]);
00157 vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]);
00158 }
00159 alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
00160 }
00161 }
00162 }
00163
00164 static void Z(calc_alm2map) (const Tb cth, const Tb sth,
00165 const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
00166 Y(Tbri) * restrict p2 NJ1)
00167 {
00168 int l,lmax=gen->lmax;
00169 Tb lam_1,lam_2,scale;
00170 Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
00171 job->opcnt += (l-gen->m) * 4*VLEN*nvec;
00172 if (l>lmax) return;
00173 job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
00174
00175 Tb corfac;
00176 Y(getCorfac)(scale,&corfac,gen->cf);
00177 const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
00178 const dcmplx * restrict alm=job->almtmp;
00179 int full_ieee = Y(TballGe)(scale,sharp_minscale);
00180 while (!full_ieee)
00181 {
00182 for (int j=0; j<njobs; ++j)
00183 {
00184 Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
00185 for (int i=0; i<nvec; ++i)
00186 {
00187 Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
00188 vfmaeq(p1[j].r.v[i],tmp,ar);
00189 vfmaeq(p1[j].i.v[i],tmp,ai);
00190 }
00191 }
00192 if (++l>lmax) break;
00193 Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
00194 for (int i=0; i<nvec; ++i)
00195 lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
00196 for (int j=0; j<njobs; ++j)
00197 {
00198 Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
00199 for (int i=0; i<nvec; ++i)
00200 {
00201 Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
00202 vfmaeq(p2[j].r.v[i],tmp,ar);
00203 vfmaeq(p2[j].i.v[i],tmp,ai);
00204 }
00205 }
00206 if (++l>lmax) break;
00207 r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
00208 for (int i=0; i<nvec; ++i)
00209 lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
00210 if (Y(rescale)(&lam_1,&lam_2,&scale))
00211 {
00212 Y(getCorfac)(scale,&corfac,gen->cf);
00213 full_ieee = Y(TballGe)(scale,sharp_minscale);
00214 }
00215 }
00216 if (l>lmax) return;
00217
00218 Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
00219 Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
00220 }
00221
00222 static void Z(calc_map2alm) (const Tb cth, const Tb sth,
00223 const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
00224 const Y(Tbri) * restrict p2 NJ1)
00225 {
00226 int lmax=gen->lmax;
00227 Tb lam_1,lam_2,scale;
00228 int l=gen->m;
00229 Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
00230 job->opcnt += (l-gen->m) * 4*VLEN*nvec;
00231 if (l>lmax) return;
00232 job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
00233
00234 const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
00235 Tb corfac;
00236 Y(getCorfac)(scale,&corfac,gen->cf);
00237 dcmplx * restrict alm=job->almtmp;
00238 int full_ieee = Y(TballGe)(scale,sharp_minscale);
00239 while (!full_ieee)
00240 {
00241 for (int j=0; j<njobs; ++j)
00242 {
00243 Tv tre=vzero, tim=vzero;
00244 for (int i=0; i<nvec; ++i)
00245 {
00246 Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
00247 vfmaeq(tre,tmp,p1[j].r.v[i]);
00248 vfmaeq(tim,tmp,p1[j].i.v[i]);
00249 }
00250 alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
00251 }
00252 if (++l>lmax) return;
00253 Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
00254 for (int i=0; i<nvec; ++i)
00255 lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
00256 for (int j=0; j<njobs; ++j)
00257 {
00258 Tv tre=vzero, tim=vzero;
00259 for (int i=0; i<nvec; ++i)
00260 {
00261 Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
00262 vfmaeq(tre,tmp,p2[j].r.v[i]);
00263 vfmaeq(tim,tmp,p2[j].i.v[i]);
00264 }
00265 alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
00266 }
00267 if (++l>lmax) return;
00268 r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
00269 for (int i=0; i<nvec; ++i)
00270 lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
00271 if (Y(rescale)(&lam_1,&lam_2,&scale))
00272 {
00273 Y(getCorfac)(scale,&corfac,gen->cf);
00274 full_ieee = Y(TballGe)(scale,sharp_minscale);
00275 }
00276 }
00277
00278 Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
00279 Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
00280 }
00281
00282 static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
00283 const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
00284 {
00285 for (int j=0; j<njobs; ++j)
00286 {
00287 Tv agr=vload(creal(alm[2*j])), agi=vload(cimag(alm[2*j])),
00288 acr=vload(creal(alm[2*j+1])), aci=vload(cimag(alm[2*j+1]));
00289 for (int i=0; i<nvec; ++i)
00290 {
00291 Tv lw=vadd(rxp.v[i],rxm.v[i]);
00292 vfmaeq(px[j].qr.v[i],agr,lw);
00293 vfmaeq(px[j].qi.v[i],agi,lw);
00294 vfmaeq(px[j].ur.v[i],acr,lw);
00295 vfmaeq(px[j].ui.v[i],aci,lw);
00296 }
00297 for (int i=0; i<nvec; ++i)
00298 {
00299 Tv lx=vsub(rxm.v[i],rxp.v[i]);
00300 vfmseq(py[j].qr.v[i],aci,lx);
00301 vfmaeq(py[j].qi.v[i],acr,lx);
00302 vfmaeq(py[j].ur.v[i],agi,lx);
00303 vfmseq(py[j].ui.v[i],agr,lx);
00304 }
00305 }
00306 }
00307
00308 static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
00309 const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
00310 const dcmplx * restrict alm1, const dcmplx * restrict alm2 NJ1)
00311 {
00312 for (int j=0; j<njobs; ++j)
00313 {
00314 Tv agr1=vload(creal(alm1[2*j])), agi1=vload(cimag(alm1[2*j])),
00315 acr1=vload(creal(alm1[2*j+1])), aci1=vload(cimag(alm1[2*j+1]));
00316 Tv agr2=vload(creal(alm2[2*j])), agi2=vload(cimag(alm2[2*j])),
00317 acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
00318 for (int i=0; i<nvec; ++i)
00319 {
00320 Tv lw1=vadd(r2p.v[i],r2m.v[i]);
00321 Tv lx2=vsub(r1m.v[i],r1p.v[i]);
00322 vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
00323 vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
00324 vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
00325 vfmaseq(p1[j].ui.v[i],aci1,lw1,agr2,lx2);
00326 }
00327 for (int i=0; i<nvec; ++i)
00328 {
00329 Tv lx1=vsub(r2m.v[i],r2p.v[i]);
00330 Tv lw2=vadd(r1p.v[i],r1m.v[i]);
00331 vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
00332 vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
00333 vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
00334 vfmaseq(p2[j].ui.v[i],aci2,lw2,agr1,lx1);
00335 }
00336 }
00337 }
00338
00339 static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
00340 const Y(Tbqu) * restrict py, const Tb * restrict rxp,
00341 const Tb * restrict rxm, dcmplx * restrict alm NJ1)
00342 {
00343 for (int j=0; j<njobs; ++j)
00344 {
00345 Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
00346 for (int i=0; i<nvec; ++i)
00347 {
00348 Tv lw=vadd(rxp->v[i],rxm->v[i]);
00349 vfmaeq(agr,px[j].qr.v[i],lw);
00350 vfmaeq(agi,px[j].qi.v[i],lw);
00351 vfmaeq(acr,px[j].ur.v[i],lw);
00352 vfmaeq(aci,px[j].ui.v[i],lw);
00353 }
00354 for (int i=0; i<nvec; ++i)
00355 {
00356 Tv lx=vsub(rxm->v[i],rxp->v[i]);
00357 vfmseq(agr,py[j].ui.v[i],lx);
00358 vfmaeq(agi,py[j].ur.v[i],lx);
00359 vfmaeq(acr,py[j].qi.v[i],lx);
00360 vfmseq(aci,py[j].qr.v[i],lx);
00361 }
00362 vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]);
00363 }
00364 }
00365
00366 static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
00367 Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
00368 const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
00369 int lmax NJ1)
00370 {
00371 while (l<lmax)
00372 {
00373 Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
00374 fx2=vload(fx[l+1].f[2]);
00375 for (int i=0; i<nvec; ++i)
00376 {
00377 rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
00378 vmul(fx2,rec1p.v[i]));
00379 rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
00380 vmul(fx2,rec1m.v[i]));
00381 }
00382 Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
00383 &alm[2*njobs*(l+1)] NJ2);
00384 fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
00385 fx2=vload(fx[l+2].f[2]);
00386 for (int i=0; i<nvec; ++i)
00387 {
00388 rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
00389 vmul(fx2,rec2p.v[i]));
00390 rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
00391 vmul(fx2,rec2m.v[i]));
00392 }
00393 l+=2;
00394 }
00395 if (l==lmax)
00396 Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
00397 }
00398
00399 static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
00400 const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
00401 const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax
00402 NJ1)
00403 {
00404 while (l<lmax)
00405 {
00406 Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
00407 fx2=vload(fx[l+1].f[2]);
00408 for (int i=0; i<nvec; ++i)
00409 {
00410 rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
00411 vmul(fx2,rec1p.v[i]));
00412 rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
00413 vmul(fx2,rec1m.v[i]));
00414 }
00415 Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
00416 Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)] NJ2);
00417 fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
00418 fx2=vload(fx[l+2].f[2]);
00419 for (int i=0; i<nvec; ++i)
00420 {
00421 rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
00422 vmul(fx2,rec2p.v[i]));
00423 rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
00424 vmul(fx2,rec2m.v[i]));
00425 }
00426 l+=2;
00427 }
00428 if (l==lmax)
00429 Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
00430 }
00431
00432 static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
00433 const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
00434 Y(Tbqu) * restrict p2 NJ1)
00435 {
00436 int l, lmax=gen->lmax;
00437 Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
00438 Y(iter_to_ieee_spin)
00439 (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
00440 job->opcnt += (l-gen->m) * 10*VLEN*nvec;
00441 if (l>lmax) return;
00442 job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
00443
00444 const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
00445 Tb corfacp,corfacm;
00446 Y(getCorfac)(scalep,&corfacp,gen->cf);
00447 Y(getCorfac)(scalem,&corfacm,gen->cf);
00448 const dcmplx * restrict alm=job->almtmp;
00449 int full_ieee = Y(TballGe)(scalep,sharp_minscale)
00450 && Y(TballGe)(scalem,sharp_minscale);
00451 while (!full_ieee)
00452 {
00453 Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
00454 &alm[2*njobs*l] NJ2);
00455 if (++l>lmax) break;
00456 Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
00457 Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
00458 &alm[2*njobs*l] NJ2);
00459 if (++l>lmax) break;
00460 Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
00461 if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
00462 {
00463 Y(getCorfac)(scalep,&corfacp,gen->cf);
00464 Y(getCorfac)(scalem,&corfacm,gen->cf);
00465 full_ieee = Y(TballGe)(scalep,sharp_minscale)
00466 && Y(TballGe)(scalem,sharp_minscale);
00467 }
00468 }
00469
00470 if (l>lmax) return;
00471
00472 Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
00473 Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
00474 Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
00475 lmax NJ2);
00476 }
00477
00478 static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
00479 const sharp_Ylmgen_C * restrict gen, sharp_job *job,
00480 const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
00481 {
00482 int l, lmax=gen->lmax;
00483 Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
00484 Y(iter_to_ieee_spin)
00485 (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
00486 job->opcnt += (l-gen->m) * 10*VLEN*nvec;
00487 if (l>lmax) return;
00488 job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
00489
00490 const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
00491 Tb corfacp,corfacm;
00492 Y(getCorfac)(scalep,&corfacp,gen->cf);
00493 Y(getCorfac)(scalem,&corfacm,gen->cf);
00494 dcmplx * restrict alm=job->almtmp;
00495 int full_ieee = Y(TballGe)(scalep,sharp_minscale)
00496 && Y(TballGe)(scalem,sharp_minscale);
00497 while (!full_ieee)
00498 {
00499 Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
00500 Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l] NJ2);
00501 if (++l>lmax) return;
00502 Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
00503 t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
00504 Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l] NJ2);
00505 if (++l>lmax) return;
00506 Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
00507 if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
00508 {
00509 Y(getCorfac)(scalep,&corfacp,gen->cf);
00510 Y(getCorfac)(scalem,&corfacm,gen->cf);
00511 full_ieee = Y(TballGe)(scalep,sharp_minscale)
00512 && Y(TballGe)(scalem,sharp_minscale);
00513 }
00514 }
00515
00516 Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
00517 Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
00518 Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax NJ2);
00519 }
00520
00521 static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
00522 const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
00523 {
00524 for (int j=0; j<njobs; ++j)
00525 {
00526 Tv ar=vload(creal(alm[j])), ai=vload(cimag(alm[j]));
00527 for (int i=0; i<nvec; ++i)
00528 {
00529 Tv lw=vadd(rxp.v[i],rxm.v[i]);
00530 vfmaeq(px[j].qr.v[i],ar,lw);
00531 vfmaeq(px[j].qi.v[i],ai,lw);
00532 }
00533 for (int i=0; i<nvec; ++i)
00534 {
00535 Tv lx=vsub(rxm.v[i],rxp.v[i]);
00536 vfmaeq(py[j].ur.v[i],ai,lx);
00537 vfmseq(py[j].ui.v[i],ar,lx);
00538 }
00539 }
00540 }
00541
00542 static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
00543 Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
00544 const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
00545 int lmax NJ1)
00546 {
00547 while (l<lmax)
00548 {
00549 Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
00550 fx2=vload(fx[l+1].f[2]);
00551 for (int i=0; i<nvec; ++i)
00552 {
00553 rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
00554 vmul(fx2,rec1p.v[i]));
00555 rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
00556 vmul(fx2,rec1m.v[i]));
00557 }
00558 Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l] NJ2);
00559 Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)] NJ2);
00560 fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
00561 fx2=vload(fx[l+2].f[2]);
00562 for (int i=0; i<nvec; ++i)
00563 {
00564 rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
00565 vmul(fx2,rec2p.v[i]));
00566 rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
00567 vmul(fx2,rec2m.v[i]));
00568 }
00569 l+=2;
00570 }
00571 if (l==lmax)
00572 Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
00573 }
00574
00575 static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
00576 const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
00577 Y(Tbqu) * restrict p2 NJ1)
00578 {
00579 int l, lmax=gen->lmax;
00580 Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
00581 Y(iter_to_ieee_spin)
00582 (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
00583 job->opcnt += (l-gen->m) * 10*VLEN*nvec;
00584 if (l>lmax) return;
00585 job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
00586
00587 const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
00588 Tb corfacp,corfacm;
00589 Y(getCorfac)(scalep,&corfacp,gen->cf);
00590 Y(getCorfac)(scalem,&corfacm,gen->cf);
00591 const dcmplx * restrict alm=job->almtmp;
00592 int full_ieee = Y(TballGe)(scalep,sharp_minscale)
00593 && Y(TballGe)(scalem,sharp_minscale);
00594 while (!full_ieee)
00595 {
00596 Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
00597 &alm[njobs*l] NJ2);
00598 if (++l>lmax) break;
00599 Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
00600 Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
00601 &alm[njobs*l] NJ2);
00602 if (++l>lmax) break;
00603 Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
00604 if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
00605 {
00606 Y(getCorfac)(scalep,&corfacp,gen->cf);
00607 Y(getCorfac)(scalem,&corfacm,gen->cf);
00608 full_ieee = Y(TballGe)(scalep,sharp_minscale)
00609 && Y(TballGe)(scalem,sharp_minscale);
00610 }
00611 }
00612
00613 if (l>lmax) return;
00614
00615 Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
00616 Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
00617 Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
00618 lmax NJ2);
00619 }
00620
00621
00622 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
00623
00624 static void Z(inner_loop) (sharp_job *job, const int *ispair,
00625 const double *cth_, const double *sth_, int llim, int ulim,
00626 sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
00627 {
00628 const int nval=nvec*VLEN;
00629 const int m = job->ainfo->mval[mi];
00630 sharp_Ylmgen_prepare (gen, m);
00631
00632 switch (job->type)
00633 {
00634 case SHARP_ALM2MAP:
00635 case SHARP_ALM2MAP_DERIV1:
00636 {
00637 if (job->spin==0)
00638 {
00639 for (int ith=0; ith<ulim-llim; ith+=nval)
00640 {
00641 Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
00642 Y(Tbu) cth, sth;
00643
00644 int skip=1;
00645 for (int i=0; i<nval; ++i)
00646 {
00647 int itot=i+ith;
00648 if (itot>=ulim-llim) itot=ulim-llim-1;
00649 if (mlim[itot]>=m) skip=0;
00650 cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
00651 }
00652 if (!skip)
00653 Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
00654
00655 for (int i=0; i<nval; ++i)
00656 {
00657 int itot=i+ith;
00658 if (itot<ulim-llim)
00659 {
00660 for (int j=0; j<njobs; ++j)
00661 {
00662 int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
00663 complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
00664 r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
00665 job->phase[phas_idx] = r1+r2;
00666 if (ispair[itot])
00667 job->phase[phas_idx+1] = r1-r2;
00668 }
00669 }
00670 }
00671 }
00672 }
00673 else
00674 {
00675 for (int ith=0; ith<ulim-llim; ith+=nval)
00676 {
00677 Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
00678 Y(Tbu) cth, sth;
00679 int skip=1;
00680
00681 for (int i=0; i<nval; ++i)
00682 {
00683 int itot=i+ith;
00684 if (itot>=ulim-llim) itot=ulim-llim-1;
00685 if (mlim[itot]>=m) skip=0;
00686 cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
00687 }
00688 if (!skip)
00689 (job->type==SHARP_ALM2MAP) ?
00690 Z(calc_alm2map_spin )
00691 (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2) :
00692 Z(calc_alm2map_deriv1)
00693 (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
00694
00695 for (int i=0; i<nval; ++i)
00696 {
00697 int itot=i+ith;
00698 if (itot<ulim-llim)
00699 {
00700 for (int j=0; j<njobs; ++j)
00701 {
00702 int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
00703 complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
00704 q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
00705 u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
00706 u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
00707 job->phase[phas_idx] = q1+q2;
00708 job->phase[phas_idx+2] = u1+u2;
00709 if (ispair[itot])
00710 {
00711 dcmplx *phQ = &(job->phase[phas_idx+1]),
00712 *phU = &(job->phase[phas_idx+3]);
00713 *phQ = q1-q2;
00714 *phU = u1-u2;
00715 if ((gen->mhi-gen->m+gen->s)&1)
00716 { *phQ=-(*phQ); *phU=-(*phU); }
00717 }
00718 }
00719 }
00720 }
00721 }
00722 }
00723 break;
00724 }
00725 case SHARP_MAP2ALM:
00726 {
00727 if (job->spin==0)
00728 {
00729 for (int ith=0; ith<ulim-llim; ith+=nval)
00730 {
00731 Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
00732 Y(Tbu) cth, sth;
00733 int skip=1;
00734
00735 for (int i=0; i<nval; ++i)
00736 {
00737 int itot=i+ith;
00738 if (itot>=ulim-llim) itot=ulim-llim-1;
00739 if (mlim[itot]>=m) skip=0;
00740 cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
00741 if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
00742 {
00743 for (int j=0; j<njobs; ++j)
00744 {
00745 int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
00746 dcmplx ph1=job->phase[phas_idx];
00747 dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
00748 p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
00749 p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
00750 }
00751 }
00752 }
00753 if (!skip)
00754 Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
00755 }
00756 }
00757 else
00758 {
00759 for (int ith=0; ith<ulim-llim; ith+=nval)
00760 {
00761 Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
00762 Y(Tbu) cth, sth;
00763 int skip=1;
00764
00765 for (int i=0; i<nval; ++i)
00766 {
00767 int itot=i+ith;
00768 if (itot>=ulim-llim) itot=ulim-llim-1;
00769 if (mlim[itot]>=m) skip=0;
00770 cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
00771 if (i+ith<ulim-llim)
00772 {
00773 for (int j=0; j<njobs; ++j)
00774 {
00775 int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
00776 dcmplx p1Q=job->phase[phas_idx],
00777 p1U=job->phase[phas_idx+2],
00778 p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
00779 p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
00780 if ((gen->mhi-gen->m+gen->s)&1)
00781 { p2Q=-p2Q; p2U=-p2U; }
00782 p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
00783 p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
00784 p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
00785 p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
00786 }
00787 }
00788 }
00789 if (!skip)
00790 Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
00791 }
00792 }
00793 break;
00794 }
00795 default:
00796 {
00797 UTIL_FAIL("must not happen");
00798 break;
00799 }
00800 }
00801 }
00802
00803 #undef VZERO