00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 typedef struct
00033 { Tv v[nvec]; } Tb;
00034
00035 typedef union
00036 { Tb b; double s[VLEN*nvec]; } Y(Tbu);
00037
00038 typedef struct
00039 { Tb r, i; } Y(Tbri);
00040
00041 typedef struct
00042 { Tb qr, qi, ur, ui; } Y(Tbqu);
00043
00044 typedef struct
00045 { double r[VLEN*nvec], i[VLEN*nvec]; } Y(Tsri);
00046
00047 typedef struct
00048 { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Y(Tsqu);
00049
00050 typedef union
00051 { Y(Tbri) b; Y(Tsri)s; } Y(Tburi);
00052
00053 typedef union
00054 { Y(Tbqu) b; Y(Tsqu)s; } Y(Tbuqu);
00055
00056 static inline Tb Y(Tbconst)(double val)
00057 {
00058 Tv v=vload(val);
00059 Tb res;
00060 for (int i=0; i<nvec; ++i) res.v[i]=v;
00061 return res;
00062 }
00063
00064 static inline void Y(Tbmuleq1)(Tb * restrict a, double b)
00065 { Tv v=vload(b); for (int i=0; i<nvec; ++i) vmuleq(a->v[i],v); }
00066
00067 static inline Tb Y(Tbprod)(Tb a, Tb b)
00068 { Tb r; for (int i=0; i<nvec; ++i) r.v[i]=vmul(a.v[i],b.v[i]); return r; }
00069
00070 static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
00071 { for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
00072
00073 static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
00074 double maxval)
00075 {
00076 const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
00077 const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
00078 for (int i=0;i<nvec; ++i)
00079 {
00080 Tv mask = vgt(vabs(val->v[i]),vfmax);
00081 while (vanyTrue(mask))
00082 {
00083 vmuleq(val->v[i],vblend(mask,vfsmall,vone));
00084 vaddeq(scale->v[i],vblend(mask,vone,vzero));
00085 mask = vgt(vabs(val->v[i]),vfmax);
00086 }
00087 mask = vand(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
00088 while (vanyTrue(mask))
00089 {
00090 vmuleq(val->v[i],vblend(mask,vfbig,vone));
00091 vsubeq(scale->v[i],vblend(mask,vone,vzero));
00092 mask = vand(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
00093 }
00094 }
00095 }
00096
00097 static void Y(mypow) (Tb val, int npow, Tb * restrict resd,
00098 Tb * restrict ress)
00099 {
00100 Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
00101
00102 Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
00103
00104 do
00105 {
00106 if (npow&1)
00107 {
00108 for (int i=0; i<nvec; ++i)
00109 {
00110 vmuleq(res.v[i],val.v[i]);
00111 vaddeq(scale.v[i],scaleint.v[i]);
00112 }
00113 Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
00114 }
00115 for (int i=0; i<nvec; ++i)
00116 {
00117 vmuleq(val.v[i],val.v[i]);
00118 vaddeq(scaleint.v[i],scaleint.v[i]);
00119 }
00120 Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
00121 }
00122 while(npow>>=1);
00123
00124 *resd=res;
00125 *ress=scale;
00126 }
00127
00128 static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
00129 Tb * restrict scale)
00130 {
00131 int did_scale=0;
00132 for (int i=0;i<nvec; ++i)
00133 {
00134 Tv mask = vgt(vabs(lam2->v[i]),vload(sharp_ftol));
00135 if (vanyTrue(mask))
00136 {
00137 did_scale=1;
00138 Tv fact = vblend(mask,vload(sharp_fsmall),vone);
00139 vmuleq(lam1->v[i],fact); vmuleq(lam2->v[i],fact);
00140 vaddeq(scale->v[i],vblend(mask,vone,vzero));
00141 }
00142 }
00143 return did_scale;
00144 }
00145
00146 static inline int Y(TballLt)(Tb a,double b)
00147 {
00148 Tv vb=vload(b);
00149 Tv res=vlt(a.v[0],vb);
00150 for (int i=1; i<nvec; ++i)
00151 res=vand(res,vlt(a.v[i],vb));
00152 return vallTrue(res);
00153 }
00154 static inline int Y(TballGt)(Tb a,double b)
00155 {
00156 Tv vb=vload(b);
00157 Tv res=vgt(a.v[0],vb);
00158 for (int i=1; i<nvec; ++i)
00159 res=vand(res,vgt(a.v[i],vb));
00160 return vallTrue(res);
00161 }
00162 static inline int Y(TballGe)(Tb a,double b)
00163 {
00164 Tv vb=vload(b);
00165 Tv res=vge(a.v[0],vb);
00166 for (int i=1; i<nvec; ++i)
00167 res=vand(res,vge(a.v[i],vb));
00168 return vallTrue(res);
00169 }
00170
00171 static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
00172 const double * restrict cf)
00173 {
00174 Y(Tbu) sc, corf;
00175 sc.b=scale;
00176 for (int i=0; i<VLEN*nvec; ++i)
00177 corf.s[i] = (sc.s[i]<sharp_minscale) ?
00178 0. : cf[(int)(sc.s[i])-sharp_minscale];
00179 *corfac=corf.b;
00180 }
00181
00182 static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
00183 Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
00184 const sharp_Ylmgen_C * restrict gen)
00185 {
00186 int l=gen->m;
00187 Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
00188 Y(mypow) (sth,l,&lam_2,&scale);
00189 Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
00190 Y(Tbnormalize)(&lam_2,&scale,sharp_ftol);
00191
00192 int below_limit = Y(TballLt)(scale,sharp_limscale);
00193 while (below_limit)
00194 {
00195 if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
00196 Tv r0=vload(gen->rf[l].f[0]),r1=vload(gen->rf[l].f[1]);
00197 for (int i=0; i<nvec; ++i)
00198 lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
00199 r0=vload(gen->rf[l+1].f[0]); r1=vload(gen->rf[l+1].f[1]);
00200 for (int i=0; i<nvec; ++i)
00201 lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
00202 if (Y(rescale)(&lam_1,&lam_2,&scale))
00203 below_limit = Y(TballLt)(scale,sharp_limscale);
00204 l+=2;
00205 }
00206 *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
00207 }
00208
00209 static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
00210 Tb * restrict ryp, Tb * restrict rym, const Tb cth,
00211 const sharp_ylmgen_dbl3 fx)
00212 {
00213 Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
00214 for (int i=0; i<nvec; ++i)
00215 {
00216 rxp->v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,ryp->v[i])),
00217 vmul(fx2,rxp->v[i]));
00218 rxm->v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rym->v[i])),
00219 vmul(fx2,rxm->v[i]));
00220 }
00221 }
00222
00223 static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
00224 Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
00225 Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
00226 {
00227 const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
00228 Tb cth2, sth2;
00229 for (int i=0; i<nvec; ++i)
00230 {
00231 cth2.v[i]=vsqrt(vmul(vadd(vone,cth.v[i]),vload(0.5)));
00232 cth2.v[i]=vmax(cth2.v[i],vload(1e-15));
00233 sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5)));
00234 sth2.v[i]=vmax(sth2.v[i],vload(1e-15));
00235 Tv mask=vlt(sth.v[i],vzero);
00236 Tv cfct=vblend(vand(mask,vlt(cth.v[i],vzero)),vload(-1.),vone);
00237 cth2.v[i]=vmul(cth2.v[i],cfct);
00238 Tv sfct=vblend(vand(mask,vgt(cth.v[i],vzero)),vload(-1.),vone);
00239 sth2.v[i]=vmul(sth2.v[i],sfct);
00240 }
00241
00242 Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
00243 Y(mypow)(cth2,gen->cosPow,&ccp,&ccps); Y(mypow)(sth2,gen->sinPow,&ssp,&ssps);
00244 Y(mypow)(cth2,gen->sinPow,&csp,&csps); Y(mypow)(sth2,gen->cosPow,&scp,&scps);
00245
00246 Tb rec2p, rec2m, scalep, scalem;
00247 Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);
00248 Tv prefac=vload(gen->prefac[gen->m]),
00249 prescale=vload(gen->fscale[gen->m]);
00250 for (int i=0; i<nvec; ++i)
00251 {
00252 rec2p.v[i]=vmul(prefac,ccp.v[i]);
00253 scalep.v[i]=vadd(prescale,ccps.v[i]);
00254 rec2m.v[i]=vmul(prefac,csp.v[i]);
00255 scalem.v[i]=vadd(prescale,csps.v[i]);
00256 }
00257 Y(Tbnormalize)(&rec2m,&scalem,sharp_fbighalf);
00258 Y(Tbnormalize)(&rec2p,&scalep,sharp_fbighalf);
00259 for (int i=0; i<nvec; ++i)
00260 {
00261 rec2p.v[i]=vmul(rec2p.v[i],ssp.v[i]);
00262 scalep.v[i]=vadd(scalep.v[i],ssps.v[i]);
00263 rec2m.v[i]=vmul(rec2m.v[i],scp.v[i]);
00264 scalem.v[i]=vadd(scalem.v[i],scps.v[i]);
00265 if (gen->preMinus_p)
00266 rec2p.v[i]=vneg(rec2p.v[i]);
00267 if (gen->preMinus_m)
00268 rec2m.v[i]=vneg(rec2m.v[i]);
00269 if (gen->s&1)
00270 rec2p.v[i]=vneg(rec2p.v[i]);
00271 }
00272 Y(Tbnormalize)(&rec2m,&scalem,sharp_ftol);
00273 Y(Tbnormalize)(&rec2p,&scalep,sharp_ftol);
00274
00275 int l=gen->mhi;
00276
00277 int below_limit = Y(TballLt)(scalep,sharp_limscale)
00278 && Y(TballLt)(scalem,sharp_limscale);
00279 while (below_limit)
00280 {
00281 if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
00282 Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
00283 Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
00284 if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
00285 below_limit = Y(TballLt)(scalep,sharp_limscale)
00286 && Y(TballLt)(scalem,sharp_limscale);
00287 l+=2;
00288 }
00289
00290 *l_=l;
00291 *rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep;
00292 *rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem;
00293 }