![]() |
NFFT
3.3.2
|
00001 /* 00002 * Copyright (c) 2002, 2016 Jens Keiner, Stefan Kunis, Daniel Potts 00003 * 00004 * This program is free software; you can redistribute it and/or modify it under 00005 * the terms of the GNU General Public License as published by the Free Software 00006 * Foundation; either version 2 of the License, or (at your option) any later 00007 * version. 00008 * 00009 * This program is distributed in the hope that it will be useful, but WITHOUT 00010 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 00011 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 00012 * details. 00013 * 00014 * You should have received a copy of the GNU General Public License along with 00015 * this program; if not, write to the Free Software Foundation, Inc., 51 00016 * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 00017 */ 00018 00019 /* Nonequispaced FFT */ 00020 00021 /* Authors: D. Potts, S. Kunis 2002-2009, Jens Keiner 2009, Toni Volkmer 2012 */ 00022 00023 /* configure header */ 00024 #include "config.h" 00025 00026 /* complex datatype (maybe) */ 00027 #ifdef HAVE_COMPLEX_H 00028 #include<complex.h> 00029 #endif 00030 00031 /* NFFT headers */ 00032 #include "nfft3.h" 00033 #include "infft.h" 00034 00035 #ifdef _OPENMP 00036 #include <omp.h> 00037 #endif 00038 00039 #ifdef OMP_ASSERT 00040 #include <assert.h> 00041 #endif 00042 00043 #undef X 00044 #define X(name) NFFT(name) 00045 00047 static inline INT intprod(const INT *vec, const INT a, const INT d) 00048 { 00049 INT t, p; 00050 00051 p = 1; 00052 for (t = 0; t < d; t++) 00053 p *= vec[t] - a; 00054 00055 return p; 00056 } 00057 00058 /* handy shortcuts */ 00059 #define BASE(x) CEXP(x) 00060 00075 static inline void sort0(const INT d, const INT *n, const INT m, 00076 const INT local_x_num, const R *local_x, INT *ar_x) 00077 { 00078 INT u_j[d], i, j, help, rhigh; 00079 INT *ar_x_temp; 00080 INT nprod; 00081 00082 for (i = 0; i < local_x_num; i++) 00083 { 00084 ar_x[2 * i] = 0; 00085 ar_x[2 *i + 1] = i; 00086 for (j = 0; j < d; j++) 00087 { 00088 help = (INT) LRINT(FLOOR((R)(n[j]) * local_x[d * i + j] - (R)(m))); 00089 u_j[j] = (help % n[j] + n[j]) % n[j]; 00090 00091 ar_x[2 * i] += u_j[j]; 00092 if (j + 1 < d) 00093 ar_x[2 * i] *= n[j + 1]; 00094 } 00095 } 00096 00097 for (j = 0, nprod = 1; j < d; j++) 00098 nprod *= n[j]; 00099 00100 rhigh = (INT) LRINT(CEIL(LOG2((R)nprod))) - 1; 00101 00102 ar_x_temp = (INT*) Y(malloc)(2 * (size_t)(local_x_num) * sizeof(INT)); 00103 Y(sort_node_indices_radix_lsdf)(local_x_num, ar_x, ar_x_temp, rhigh); 00104 #ifdef OMP_ASSERT 00105 for (i = 1; i < local_x_num; i++) 00106 assert(ar_x[2 * (i - 1)] <= ar_x[2 * i]); 00107 #endif 00108 Y(free)(ar_x_temp); 00109 } 00110 00119 static inline void sort(const X(plan) *ths) 00120 { 00121 if (ths->flags & NFFT_SORT_NODES) 00122 sort0(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x); 00123 } 00124 00145 void X(trafo_direct)(const X(plan) *ths) 00146 { 00147 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f; 00148 00149 memset(f, 0, (size_t)(ths->M_total) * sizeof(C)); 00150 00151 if (ths->d == 1) 00152 { 00153 /* specialize for univariate case, rationale: faster */ 00154 INT j; 00155 #ifdef _OPENMP 00156 #pragma omp parallel for default(shared) private(j) 00157 #endif 00158 for (j = 0; j < ths->M_total; j++) 00159 { 00160 INT k_L; 00161 for (k_L = 0; k_L < ths->N_total; k_L++) 00162 { 00163 R omega = K2PI * ((R)(k_L - ths->N_total/2)) * ths->x[j]; 00164 f[j] += f_hat[k_L] * BASE(-II * omega); 00165 } 00166 } 00167 } 00168 else 00169 { 00170 /* multivariate case */ 00171 INT j; 00172 #ifdef _OPENMP 00173 #pragma omp parallel for default(shared) private(j) 00174 #endif 00175 for (j = 0; j < ths->M_total; j++) 00176 { 00177 R x[ths->d], omega, Omega[ths->d + 1]; 00178 INT t, t2, k_L, k[ths->d]; 00179 Omega[0] = K(0.0); 00180 for (t = 0; t < ths->d; t++) 00181 { 00182 k[t] = -ths->N[t]/2; 00183 x[t] = K2PI * ths->x[j * ths->d + t]; 00184 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t]; 00185 } 00186 omega = Omega[ths->d]; 00187 00188 for (k_L = 0; k_L < ths->N_total; k_L++) 00189 { 00190 f[j] += f_hat[k_L] * BASE(-II * omega); 00191 { 00192 for (t = ths->d - 1; (t >= 1) && (k[t] == ths->N[t]/2 - 1); t--) 00193 k[t]-= ths->N[t]-1; 00194 00195 k[t]++; 00196 00197 for (t2 = t; t2 < ths->d; t2++) 00198 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2]; 00199 00200 omega = Omega[ths->d]; 00201 } 00202 } 00203 } 00204 } 00205 } 00206 00207 void X(adjoint_direct)(const X(plan) *ths) 00208 { 00209 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f; 00210 00211 memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C)); 00212 00213 if (ths->d == 1) 00214 { 00215 /* specialize for univariate case, rationale: faster */ 00216 #ifdef _OPENMP 00217 INT k_L; 00218 #pragma omp parallel for default(shared) private(k_L) 00219 for (k_L = 0; k_L < ths->N_total; k_L++) 00220 { 00221 INT j; 00222 for (j = 0; j < ths->M_total; j++) 00223 { 00224 R omega = K2PI * ((R)(k_L - (ths->N_total/2))) * ths->x[j]; 00225 f_hat[k_L] += f[j] * BASE(II * omega); 00226 } 00227 } 00228 #else 00229 INT j; 00230 for (j = 0; j < ths->M_total; j++) 00231 { 00232 INT k_L; 00233 for (k_L = 0; k_L < ths->N_total; k_L++) 00234 { 00235 R omega = K2PI * ((R)(k_L - ths->N_total / 2)) * ths->x[j]; 00236 f_hat[k_L] += f[j] * BASE(II * omega); 00237 } 00238 } 00239 #endif 00240 } 00241 else 00242 { 00243 /* multivariate case */ 00244 INT j, k_L; 00245 #ifdef _OPENMP 00246 #pragma omp parallel for default(shared) private(j, k_L) 00247 for (k_L = 0; k_L < ths->N_total; k_L++) 00248 { 00249 INT k[ths->d], k_temp, t; 00250 00251 k_temp = k_L; 00252 00253 for (t = ths->d - 1; t >= 0; t--) 00254 { 00255 k[t] = k_temp % ths->N[t] - ths->N[t]/2; 00256 k_temp /= ths->N[t]; 00257 } 00258 00259 for (j = 0; j < ths->M_total; j++) 00260 { 00261 R omega = K(0.0); 00262 for (t = 0; t < ths->d; t++) 00263 omega += k[t] * K2PI * ths->x[j * ths->d + t]; 00264 f_hat[k_L] += f[j] * BASE(II * omega); 00265 } 00266 } 00267 #else 00268 for (j = 0; j < ths->M_total; j++) 00269 { 00270 R x[ths->d], omega, Omega[ths->d+1]; 00271 INT t, t2, k[ths->d]; 00272 Omega[0] = K(0.0); 00273 for (t = 0; t < ths->d; t++) 00274 { 00275 k[t] = -ths->N[t]/2; 00276 x[t] = K2PI * ths->x[j * ths->d + t]; 00277 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t]; 00278 } 00279 omega = Omega[ths->d]; 00280 for (k_L = 0; k_L < ths->N_total; k_L++) 00281 { 00282 f_hat[k_L] += f[j] * BASE(II * omega); 00283 00284 for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--) 00285 k[t]-= ths->N[t]-1; 00286 00287 k[t]++; 00288 00289 for (t2 = t; t2 < ths->d; t2++) 00290 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2]; 00291 00292 omega = Omega[ths->d]; 00293 } 00294 } 00295 #endif 00296 } 00297 } 00298 00324 static inline void uo(const X(plan) *ths, const INT j, INT *up, INT *op, 00325 const INT act_dim) 00326 { 00327 const R xj = ths->x[j * ths->d + act_dim]; 00328 INT c = LRINT(FLOOR(xj * (R)(ths->n[act_dim]))); 00329 00330 (*up) = c - (ths->m); 00331 (*op) = c + 1 + (ths->m); 00332 } 00333 00334 static inline void uo2(INT *u, INT *o, const R x, const INT n, const INT m) 00335 { 00336 INT c = LRINT(FLOOR(x * (R)(n))); 00337 00338 *u = (c - m + n) % n; 00339 *o = (c + 1 + m + n) % n; 00340 } 00341 00342 #define MACRO_D_compute_A \ 00343 { \ 00344 g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \ 00345 } 00346 00347 #define MACRO_D_compute_T \ 00348 { \ 00349 f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \ 00350 } 00351 00352 #define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(C)); 00353 00354 #define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C)); 00355 00356 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]]; 00357 00358 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ths->n[t2],ks[t2]-(ths->N[t2]/2),t2)); 00359 00360 #define MACRO_init_k_ks \ 00361 { \ 00362 for (t = ths->d-1; 0 <= t; t--) \ 00363 { \ 00364 kp[t] = k[t] = 0; \ 00365 ks[t] = ths->N[t]/2; \ 00366 } \ 00367 t++; \ 00368 } 00369 00370 #define MACRO_update_c_phi_inv_k(which_one) \ 00371 { \ 00372 for (t2 = t; t2 < ths->d; t2++) \ 00373 { \ 00374 c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \ 00375 ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \ 00376 k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \ 00377 } \ 00378 } 00379 00380 #define MACRO_count_k_ks \ 00381 { \ 00382 for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \ 00383 { \ 00384 kp[t] = k[t] = 0; \ 00385 ks[t]= ths->N[t]/2; \ 00386 } \ 00387 \ 00388 kp[t]++; k[t]++; ks[t]++; \ 00389 if(kp[t] == ths->N[t]/2) \ 00390 { \ 00391 k[t] = ths->n[t] - ths->N[t]/2; \ 00392 ks[t] = 0; \ 00393 } \ 00394 } \ 00395 00396 /* sub routines for the fast transforms matrix vector multiplication with D, D^T */ 00397 #define MACRO_D(which_one) \ 00398 static inline void D_serial_ ## which_one (X(plan) *ths) \ 00399 { \ 00400 C *f_hat, *g_hat; /* local copy */ \ 00401 R c_phi_inv_k[ths->d+1]; /* postfix product of PHI_HUT */ \ 00402 INT t, t2; /* index dimensions */ \ 00403 INT k_L; /* plain index */ \ 00404 INT kp[ths->d]; /* multi index (simple) */ \ 00405 INT k[ths->d]; /* multi index in g_hat */ \ 00406 INT ks[ths->d]; /* multi index in f_hat, c_phi_inv*/ \ 00407 INT k_plain[ths->d+1]; /* postfix plain index */ \ 00408 INT ks_plain[ths->d+1]; /* postfix plain index */ \ 00409 \ 00410 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \ 00411 MACRO_D_init_result_ ## which_one; \ 00412 \ 00413 c_phi_inv_k[0] = K(1.0); \ 00414 k_plain[0] = 0; \ 00415 ks_plain[0] = 0; \ 00416 \ 00417 MACRO_init_k_ks; \ 00418 \ 00419 if (ths->flags & PRE_PHI_HUT) \ 00420 { \ 00421 for (k_L = 0; k_L < ths->N_total; k_L++) \ 00422 { \ 00423 MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \ 00424 MACRO_D_compute_ ## which_one; \ 00425 MACRO_count_k_ks; \ 00426 } \ 00427 } \ 00428 else \ 00429 { \ 00430 for (k_L = 0; k_L < ths->N_total; k_L++) \ 00431 { \ 00432 MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \ 00433 MACRO_D_compute_ ## which_one; \ 00434 MACRO_count_k_ks; \ 00435 } \ 00436 } \ 00437 } 00438 00439 #ifdef _OPENMP 00440 static inline void D_openmp_A(X(plan) *ths) 00441 { 00442 C *f_hat, *g_hat; 00443 INT k_L; 00445 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; 00446 memset(g_hat, 0, ths->n_total * sizeof(C)); 00447 00448 if (ths->flags & PRE_PHI_HUT) 00449 { 00450 #pragma omp parallel for default(shared) private(k_L) 00451 for (k_L = 0; k_L < ths->N_total; k_L++) 00452 { 00453 INT kp[ths->d]; //0..N-1 00454 INT k[ths->d]; 00455 INT ks[ths->d]; 00456 R c_phi_inv_k_val = K(1.0); 00457 INT k_plain_val = 0; 00458 INT ks_plain_val = 0; 00459 INT t; 00460 INT k_temp = k_L; 00461 00462 for (t = ths->d-1; t >= 0; t--) 00463 { 00464 kp[t] = k_temp % ths->N[t]; 00465 if (kp[t] >= ths->N[t]/2) 00466 k[t] = ths->n[t] - ths->N[t] + kp[t]; 00467 else 00468 k[t] = kp[t]; 00469 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t]; 00470 k_temp /= ths->N[t]; 00471 } 00472 00473 for (t = 0; t < ths->d; t++) 00474 { 00475 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]]; 00476 ks_plain_val = ks_plain_val*ths->N[t] + ks[t]; 00477 k_plain_val = k_plain_val*ths->n[t] + k[t]; 00478 } 00479 00480 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val; 00481 } /* for(k_L) */ 00482 } /* if(PRE_PHI_HUT) */ 00483 else 00484 { 00485 #pragma omp parallel for default(shared) private(k_L) 00486 for (k_L = 0; k_L < ths->N_total; k_L++) 00487 { 00488 INT kp[ths->d]; //0..N-1 00489 INT k[ths->d]; 00490 INT ks[ths->d]; 00491 R c_phi_inv_k_val = K(1.0); 00492 INT k_plain_val = 0; 00493 INT ks_plain_val = 0; 00494 INT t; 00495 INT k_temp = k_L; 00496 00497 for (t = ths->d-1; t >= 0; t--) 00498 { 00499 kp[t] = k_temp % ths->N[t]; 00500 if (kp[t] >= ths->N[t]/2) 00501 k[t] = ths->n[t] - ths->N[t] + kp[t]; 00502 else 00503 k[t] = kp[t]; 00504 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t]; 00505 k_temp /= ths->N[t]; 00506 } 00507 00508 for (t = 0; t < ths->d; t++) 00509 { 00510 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t)); 00511 ks_plain_val = ks_plain_val*ths->N[t] + ks[t]; 00512 k_plain_val = k_plain_val*ths->n[t] + k[t]; 00513 } 00514 00515 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val; 00516 } /* for(k_L) */ 00517 } /* else(PRE_PHI_HUT) */ 00518 } 00519 #endif 00520 00521 #ifndef _OPENMP 00522 MACRO_D(A) 00523 #endif 00524 00525 static inline void D_A(X(plan) *ths) 00526 { 00527 #ifdef _OPENMP 00528 D_openmp_A(ths); 00529 #else 00530 D_serial_A(ths); 00531 #endif 00532 } 00533 00534 #ifdef _OPENMP 00535 static void D_openmp_T(X(plan) *ths) 00536 { 00537 C *f_hat, *g_hat; 00538 INT k_L; 00540 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; 00541 memset(f_hat, 0, ths->N_total * sizeof(C)); 00542 00543 if (ths->flags & PRE_PHI_HUT) 00544 { 00545 #pragma omp parallel for default(shared) private(k_L) 00546 for (k_L = 0; k_L < ths->N_total; k_L++) 00547 { 00548 INT kp[ths->d]; //0..N-1 00549 INT k[ths->d]; 00550 INT ks[ths->d]; 00551 R c_phi_inv_k_val = K(1.0); 00552 INT k_plain_val = 0; 00553 INT ks_plain_val = 0; 00554 INT t; 00555 INT k_temp = k_L; 00556 00557 for (t = ths->d - 1; t >= 0; t--) 00558 { 00559 kp[t] = k_temp % ths->N[t]; 00560 if (kp[t] >= ths->N[t]/2) 00561 k[t] = ths->n[t] - ths->N[t] + kp[t]; 00562 else 00563 k[t] = kp[t]; 00564 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t]; 00565 k_temp /= ths->N[t]; 00566 } 00567 00568 for (t = 0; t < ths->d; t++) 00569 { 00570 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]]; 00571 ks_plain_val = ks_plain_val*ths->N[t] + ks[t]; 00572 k_plain_val = k_plain_val*ths->n[t] + k[t]; 00573 } 00574 00575 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val; 00576 } /* for(k_L) */ 00577 } /* if(PRE_PHI_HUT) */ 00578 else 00579 { 00580 #pragma omp parallel for default(shared) private(k_L) 00581 for (k_L = 0; k_L < ths->N_total; k_L++) 00582 { 00583 INT kp[ths->d]; //0..N-1 00584 INT k[ths->d]; 00585 INT ks[ths->d]; 00586 R c_phi_inv_k_val = K(1.0); 00587 INT k_plain_val = 0; 00588 INT ks_plain_val = 0; 00589 INT t; 00590 INT k_temp = k_L; 00591 00592 for (t = ths->d-1; t >= 0; t--) 00593 { 00594 kp[t] = k_temp % ths->N[t]; 00595 if (kp[t] >= ths->N[t]/2) 00596 k[t] = ths->n[t] - ths->N[t] + kp[t]; 00597 else 00598 k[t] = kp[t]; 00599 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t]; 00600 k_temp /= ths->N[t]; 00601 } 00602 00603 for (t = 0; t < ths->d; t++) 00604 { 00605 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t)); 00606 ks_plain_val = ks_plain_val*ths->N[t] + ks[t]; 00607 k_plain_val = k_plain_val*ths->n[t] + k[t]; 00608 } 00609 00610 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val; 00611 } /* for(k_L) */ 00612 } /* else(PRE_PHI_HUT) */ 00613 } 00614 #endif 00615 00616 #ifndef _OPENMP 00617 MACRO_D(T) 00618 #endif 00619 00620 static void D_T(X(plan) *ths) 00621 { 00622 #ifdef _OPENMP 00623 D_openmp_T(ths); 00624 #else 00625 D_serial_T(ths); 00626 #endif 00627 } 00628 00629 /* sub routines for the fast transforms matrix vector multiplication with B, B^T */ 00630 #define MACRO_B_init_result_A memset(f, 0, (size_t)(ths->M_total) * sizeof(C)); 00631 #define MACRO_B_init_result_T memset(g, 0, (size_t)(ths->n_total) * sizeof(C)); 00632 00633 #define MACRO_B_PRE_FULL_PSI_compute_A \ 00634 { \ 00635 (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \ 00636 } 00637 00638 #define MACRO_B_PRE_FULL_PSI_compute_T \ 00639 { \ 00640 g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \ 00641 } 00642 00643 #define MACRO_B_compute_A \ 00644 { \ 00645 (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \ 00646 } 00647 00648 #define MACRO_B_compute_T \ 00649 { \ 00650 g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \ 00651 } 00652 00653 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]] 00654 00655 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]] 00656 00657 #define MACRO_without_PRE_PSI PHI(ths->n[t2], ths->x[j*ths->d+t2] \ 00658 - ((R)l[t2])/((R)ths->n[t2]), t2) 00659 00660 #define MACRO_init_uo_l_lj_t \ 00661 { \ 00662 for (t = ths->d-1; t >= 0; t--) \ 00663 { \ 00664 uo(ths,j,&u[t],&o[t],t); \ 00665 l[t] = u[t]; \ 00666 lj[t] = 0; \ 00667 } \ 00668 t++; \ 00669 } 00670 00671 #define MACRO_update_phi_prod_ll_plain(which_one) { \ 00672 for (t2 = t; t2 < ths->d; t2++) \ 00673 { \ 00674 phi_prod[t2+1] = phi_prod[t2] * MACRO_ ## which_one; \ 00675 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + (l[t2] + ths->n[t2]) % ths->n[t2]; \ 00676 } \ 00677 } 00678 00679 #define MACRO_count_uo_l_lj_t \ 00680 { \ 00681 for (t = ths->d-1; (t > 0) && (l[t] == o[t]); t--) \ 00682 { \ 00683 l[t] = u[t]; \ 00684 lj[t] = 0; \ 00685 } \ 00686 \ 00687 l[t]++; \ 00688 lj[t]++; \ 00689 } 00690 00691 #define MACRO_B(which_one) \ 00692 static inline void B_serial_ ## which_one (X(plan) *ths) \ 00693 { \ 00694 INT lprod; /* 'regular bandwidth' of matrix B */ \ 00695 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \ 00696 INT t, t2; /* index dimensions */ \ 00697 INT j; /* index nodes */ \ 00698 INT l_L, ix; /* index one row of B */ \ 00699 INT l[ths->d]; /* multi index u<=l<=o */ \ 00700 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \ 00701 INT ll_plain[ths->d+1]; /* postfix plain index in g */ \ 00702 R phi_prod[ths->d+1]; /* postfix product of PHI */ \ 00703 C *f, *g; /* local copy */ \ 00704 C *fj; /* local copy */ \ 00705 R y[ths->d]; \ 00706 R fg_psi[ths->d][2*ths->m+2]; \ 00707 R fg_exp_l[ths->d][2*ths->m+2]; \ 00708 INT l_fg,lj_fg; \ 00709 R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \ 00710 R ip_w; \ 00711 INT ip_u; \ 00712 INT ip_s = ths->K/(ths->m+2); \ 00713 \ 00714 f = (C*)ths->f; g = (C*)ths->g; \ 00715 \ 00716 MACRO_B_init_result_ ## which_one; \ 00717 \ 00718 if (ths->flags & PRE_FULL_PSI) \ 00719 { \ 00720 for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \ 00721 { \ 00722 for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \ 00723 { \ 00724 MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \ 00725 } \ 00726 } \ 00727 return; \ 00728 } \ 00729 \ 00730 phi_prod[0] = K(1.0); \ 00731 ll_plain[0] = 0; \ 00732 \ 00733 for (t = 0, lprod = 1; t < ths->d; t++) \ 00734 lprod *= (2 * ths->m + 2); \ 00735 \ 00736 if (ths->flags & PRE_PSI) \ 00737 { \ 00738 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \ 00739 { \ 00740 MACRO_init_uo_l_lj_t; \ 00741 \ 00742 for (l_L = 0; l_L < lprod; l_L++) \ 00743 { \ 00744 MACRO_update_phi_prod_ll_plain(with_PRE_PSI); \ 00745 \ 00746 MACRO_B_compute_ ## which_one; \ 00747 \ 00748 MACRO_count_uo_l_lj_t; \ 00749 } /* for(l_L) */ \ 00750 } /* for(j) */ \ 00751 return; \ 00752 } /* if(PRE_PSI) */ \ 00753 \ 00754 if (ths->flags & PRE_FG_PSI) \ 00755 { \ 00756 for(t2 = 0; t2 < ths->d; t2++) \ 00757 { \ 00758 tmpEXP2 = EXP(K(-1.0) / ths->b[t2]); \ 00759 tmpEXP2sq = tmpEXP2*tmpEXP2; \ 00760 tmp2 = K(1.0); \ 00761 tmp3 = K(1.0); \ 00762 fg_exp_l[t2][0] = K(1.0); \ 00763 for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \ 00764 { \ 00765 tmp3 = tmp2*tmpEXP2; \ 00766 tmp2 *= tmpEXP2sq; \ 00767 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1] * tmp3; \ 00768 } \ 00769 } \ 00770 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \ 00771 { \ 00772 MACRO_init_uo_l_lj_t; \ 00773 \ 00774 for (t2 = 0; t2 < ths->d; t2++) \ 00775 { \ 00776 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \ 00777 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \ 00778 tmp1 = K(1.0); \ 00779 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \ 00780 { \ 00781 tmp1 *= tmpEXP1; \ 00782 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \ 00783 } \ 00784 } \ 00785 \ 00786 for (l_L= 0; l_L < lprod; l_L++) \ 00787 { \ 00788 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \ 00789 \ 00790 MACRO_B_compute_ ## which_one; \ 00791 \ 00792 MACRO_count_uo_l_lj_t; \ 00793 } /* for(l_L) */ \ 00794 } /* for(j) */ \ 00795 return; \ 00796 } /* if(PRE_FG_PSI) */ \ 00797 \ 00798 if (ths->flags & FG_PSI) \ 00799 { \ 00800 for (t2 = 0; t2 < ths->d; t2++) \ 00801 { \ 00802 tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \ 00803 tmpEXP2sq = tmpEXP2*tmpEXP2; \ 00804 tmp2 = K(1.0); \ 00805 tmp3 = K(1.0); \ 00806 fg_exp_l[t2][0] = K(1.0); \ 00807 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \ 00808 { \ 00809 tmp3 = tmp2*tmpEXP2; \ 00810 tmp2 *= tmpEXP2sq; \ 00811 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \ 00812 } \ 00813 } \ 00814 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \ 00815 { \ 00816 MACRO_init_uo_l_lj_t; \ 00817 \ 00818 for (t2 = 0; t2 < ths->d; t2++) \ 00819 { \ 00820 fg_psi[t2][0] = (PHI(ths->n[t2], (ths->x[j*ths->d+t2] - ((R)u[t2])/((R)(ths->n[t2]))), t2));\ 00821 \ 00822 tmpEXP1 = EXP(K(2.0) * ((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \ 00823 /ths->b[t2]); \ 00824 tmp1 = K(1.0); \ 00825 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \ 00826 { \ 00827 tmp1 *= tmpEXP1; \ 00828 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \ 00829 } \ 00830 } \ 00831 \ 00832 for (l_L = 0; l_L < lprod; l_L++) \ 00833 { \ 00834 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \ 00835 \ 00836 MACRO_B_compute_ ## which_one; \ 00837 \ 00838 MACRO_count_uo_l_lj_t; \ 00839 } /* for(l_L) */ \ 00840 } /* for(j) */ \ 00841 return; \ 00842 } /* if(FG_PSI) */ \ 00843 \ 00844 if (ths->flags & PRE_LIN_PSI) \ 00845 { \ 00846 for (j = 0, fj=f; j<ths->M_total; j++, fj++) \ 00847 { \ 00848 MACRO_init_uo_l_lj_t; \ 00849 \ 00850 for (t2 = 0; t2 < ths->d; t2++) \ 00851 { \ 00852 y[t2] = (((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \ 00853 * ((R)(ths->K))) / (R)(ths->m + 2); \ 00854 ip_u = LRINT(FLOOR(y[t2])); \ 00855 ip_w = y[t2]-ip_u; \ 00856 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \ 00857 { \ 00858 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \ 00859 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \ 00860 * (ip_w); \ 00861 } \ 00862 } \ 00863 \ 00864 for (l_L = 0; l_L < lprod; l_L++) \ 00865 { \ 00866 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \ 00867 \ 00868 MACRO_B_compute_ ## which_one; \ 00869 \ 00870 MACRO_count_uo_l_lj_t; \ 00871 } /* for(l_L) */ \ 00872 } /* for(j) */ \ 00873 return; \ 00874 } /* if(PRE_LIN_PSI) */ \ 00875 \ 00876 /* no precomputed psi at all */ \ 00877 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \ 00878 { \ 00879 MACRO_init_uo_l_lj_t; \ 00880 \ 00881 for (l_L = 0; l_L < lprod; l_L++) \ 00882 { \ 00883 MACRO_update_phi_prod_ll_plain(without_PRE_PSI); \ 00884 \ 00885 MACRO_B_compute_ ## which_one; \ 00886 \ 00887 MACRO_count_uo_l_lj_t; \ 00888 } /* for(l_L) */ \ 00889 } /* for(j) */ \ 00890 } /* nfft_B */ \ 00891 00892 #ifndef _OPENMP 00893 MACRO_B(A) 00894 #endif 00895 00896 #ifdef _OPENMP 00897 static inline void B_openmp_A (X(plan) *ths) 00898 { 00899 INT lprod; /* 'regular bandwidth' of matrix B */ 00900 INT k; 00901 00902 memset(ths->f, 0, ths->M_total * sizeof(C)); 00903 00904 for (k = 0, lprod = 1; k < ths->d; k++) 00905 lprod *= (2*ths->m+2); 00906 00907 if (ths->flags & PRE_FULL_PSI) 00908 { 00909 #pragma omp parallel for default(shared) private(k) 00910 for (k = 0; k < ths->M_total; k++) 00911 { 00912 INT l; 00913 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 00914 ths->f[j] = K(0.0); 00915 for (l = 0; l < lprod; l++) 00916 ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]]; 00917 } 00918 return; 00919 } 00920 00921 if (ths->flags & PRE_PSI) 00922 { 00923 #pragma omp parallel for default(shared) private(k) 00924 for (k = 0; k < ths->M_total; k++) 00925 { 00926 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ 00927 INT t, t2; /* index dimensions */ 00928 INT l_L; /* index one row of B */ 00929 INT l[ths->d]; /* multi index u<=l<=o */ 00930 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ 00931 INT ll_plain[ths->d+1]; /* postfix plain index in g */ 00932 R phi_prod[ths->d+1]; /* postfix product of PHI */ 00933 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 00934 00935 phi_prod[0] = K(1.0); 00936 ll_plain[0] = 0; 00937 00938 MACRO_init_uo_l_lj_t; 00939 00940 for (l_L = 0; l_L < lprod; l_L++) 00941 { 00942 MACRO_update_phi_prod_ll_plain(with_PRE_PSI); 00943 00944 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; 00945 00946 MACRO_count_uo_l_lj_t; 00947 } /* for(l_L) */ 00948 } /* for(j) */ 00949 return; 00950 } /* if(PRE_PSI) */ 00951 00952 if (ths->flags & PRE_FG_PSI) 00953 { 00954 INT t, t2; /* index dimensions */ 00955 R fg_exp_l[ths->d][2*ths->m+2]; 00956 00957 for (t2 = 0; t2 < ths->d; t2++) 00958 { 00959 INT lj_fg; 00960 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); 00961 R tmpEXP2sq = tmpEXP2*tmpEXP2; 00962 R tmp2 = K(1.0); 00963 R tmp3 = K(1.0); 00964 fg_exp_l[t2][0] = K(1.0); 00965 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) 00966 { 00967 tmp3 = tmp2*tmpEXP2; 00968 tmp2 *= tmpEXP2sq; 00969 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; 00970 } 00971 } 00972 00973 #pragma omp parallel for default(shared) private(k,t,t2) 00974 for (k = 0; k < ths->M_total; k++) 00975 { 00976 INT ll_plain[ths->d+1]; /* postfix plain index in g */ 00977 R phi_prod[ths->d+1]; /* postfix product of PHI */ 00978 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ 00979 INT l[ths->d]; /* multi index u<=l<=o */ 00980 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ 00981 R fg_psi[ths->d][2*ths->m+2]; 00982 R tmpEXP1, tmp1; 00983 INT l_fg,lj_fg; 00984 INT l_L; 00985 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 00986 00987 phi_prod[0] = K(1.0); 00988 ll_plain[0] = 0; 00989 00990 MACRO_init_uo_l_lj_t; 00991 00992 for (t2 = 0; t2 < ths->d; t2++) 00993 { 00994 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; 00995 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; 00996 tmp1 = K(1.0); 00997 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) 00998 { 00999 tmp1 *= tmpEXP1; 01000 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; 01001 } 01002 } 01003 01004 for (l_L= 0; l_L < lprod; l_L++) 01005 { 01006 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 01007 01008 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; 01009 01010 MACRO_count_uo_l_lj_t; 01011 } /* for(l_L) */ 01012 } /* for(j) */ 01013 return; 01014 } /* if(PRE_FG_PSI) */ 01015 01016 if (ths->flags & FG_PSI) 01017 { 01018 INT t, t2; /* index dimensions */ 01019 R fg_exp_l[ths->d][2*ths->m+2]; 01020 01021 sort(ths); 01022 01023 for (t2 = 0; t2 < ths->d; t2++) 01024 { 01025 INT lj_fg; 01026 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); 01027 R tmpEXP2sq = tmpEXP2*tmpEXP2; 01028 R tmp2 = K(1.0); 01029 R tmp3 = K(1.0); 01030 fg_exp_l[t2][0] = K(1.0); 01031 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) 01032 { 01033 tmp3 = tmp2*tmpEXP2; 01034 tmp2 *= tmpEXP2sq; 01035 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; 01036 } 01037 } 01038 01039 #pragma omp parallel for default(shared) private(k,t,t2) 01040 for (k = 0; k < ths->M_total; k++) 01041 { 01042 INT ll_plain[ths->d+1]; /* postfix plain index in g */ 01043 R phi_prod[ths->d+1]; /* postfix product of PHI */ 01044 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ 01045 INT l[ths->d]; /* multi index u<=l<=o */ 01046 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ 01047 R fg_psi[ths->d][2*ths->m+2]; 01048 R tmpEXP1, tmp1; 01049 INT l_fg,lj_fg; 01050 INT l_L; 01051 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 01052 01053 phi_prod[0] = K(1.0); 01054 ll_plain[0] = 0; 01055 01056 MACRO_init_uo_l_lj_t; 01057 01058 for (t2 = 0; t2 < ths->d; t2++) 01059 { 01060 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2)); 01061 01062 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2]) 01063 /ths->b[t2]); 01064 tmp1 = K(1.0); 01065 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) 01066 { 01067 tmp1 *= tmpEXP1; 01068 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; 01069 } 01070 } 01071 01072 for (l_L = 0; l_L < lprod; l_L++) 01073 { 01074 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 01075 01076 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; 01077 01078 MACRO_count_uo_l_lj_t; 01079 } /* for(l_L) */ 01080 } /* for(j) */ 01081 return; 01082 } /* if(FG_PSI) */ 01083 01084 if (ths->flags & PRE_LIN_PSI) 01085 { 01086 sort(ths); 01087 01088 #pragma omp parallel for default(shared) private(k) 01089 for (k = 0; k<ths->M_total; k++) 01090 { 01091 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ 01092 INT t, t2; /* index dimensions */ 01093 INT l_L; /* index one row of B */ 01094 INT l[ths->d]; /* multi index u<=l<=o */ 01095 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ 01096 INT ll_plain[ths->d+1]; /* postfix plain index in g */ 01097 R phi_prod[ths->d+1]; /* postfix product of PHI */ 01098 R y[ths->d]; 01099 R fg_psi[ths->d][2*ths->m+2]; 01100 INT l_fg,lj_fg; 01101 R ip_w; 01102 INT ip_u; 01103 INT ip_s = ths->K/(ths->m+2); 01104 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 01105 01106 phi_prod[0] = K(1.0); 01107 ll_plain[0] = 0; 01108 01109 MACRO_init_uo_l_lj_t; 01110 01111 for (t2 = 0; t2 < ths->d; t2++) 01112 { 01113 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2]) 01114 * ((R)ths->K))/(ths->m+2); 01115 ip_u = LRINT(FLOOR(y[t2])); 01116 ip_w = y[t2]-ip_u; 01117 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) 01118 { 01119 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] 01120 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] 01121 * (ip_w); 01122 } 01123 } 01124 01125 for (l_L = 0; l_L < lprod; l_L++) 01126 { 01127 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 01128 01129 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; 01130 01131 MACRO_count_uo_l_lj_t; 01132 } /* for(l_L) */ 01133 } /* for(j) */ 01134 return; 01135 } /* if(PRE_LIN_PSI) */ 01136 01137 /* no precomputed psi at all */ 01138 sort(ths); 01139 01140 #pragma omp parallel for default(shared) private(k) 01141 for (k = 0; k < ths->M_total; k++) 01142 { 01143 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ 01144 INT t, t2; /* index dimensions */ 01145 INT l_L; /* index one row of B */ 01146 INT l[ths->d]; /* multi index u<=l<=o */ 01147 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ 01148 INT ll_plain[ths->d+1]; /* postfix plain index in g */ 01149 R phi_prod[ths->d+1]; /* postfix product of PHI */ 01150 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 01151 01152 phi_prod[0] = K(1.0); 01153 ll_plain[0] = 0; 01154 01155 MACRO_init_uo_l_lj_t; 01156 01157 for (l_L = 0; l_L < lprod; l_L++) 01158 { 01159 MACRO_update_phi_prod_ll_plain(without_PRE_PSI); 01160 01161 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; 01162 01163 MACRO_count_uo_l_lj_t; 01164 } /* for(l_L) */ 01165 } /* for(j) */ 01166 } 01167 #endif 01168 01169 static void B_A(X(plan) *ths) 01170 { 01171 #ifdef _OPENMP 01172 B_openmp_A(ths); 01173 #else 01174 B_serial_A(ths); 01175 #endif 01176 } 01177 01178 #ifdef _OPENMP 01179 01194 static inline INT index_x_binary_search(const INT *ar_x, const INT len, const INT key) 01195 { 01196 INT left = 0, right = len - 1; 01197 01198 if (len == 1) 01199 return 0; 01200 01201 while (left < right - 1) 01202 { 01203 INT i = (left + right) / 2; 01204 if (ar_x[2*i] >= key) 01205 right = i; 01206 else if (ar_x[2*i] < key) 01207 left = i; 01208 } 01209 01210 if (ar_x[2*left] < key && left != len-1) 01211 return left+1; 01212 01213 return left; 01214 } 01215 #endif 01216 01217 #ifdef _OPENMP 01218 01233 static void nfft_adjoint_B_omp_blockwise_init(INT *my_u0, INT *my_o0, 01234 INT *min_u_a, INT *max_u_a, INT *min_u_b, INT *max_u_b, const INT d, 01235 const INT *n, const INT m) 01236 { 01237 const INT n0 = n[0]; 01238 INT k; 01239 INT nthreads = omp_get_num_threads(); 01240 INT nthreads_used = MIN(nthreads, n0); 01241 INT size_per_thread = n0 / nthreads_used; 01242 INT size_left = n0 - size_per_thread * nthreads_used; 01243 INT size_g[nthreads_used]; 01244 INT offset_g[nthreads_used]; 01245 INT my_id = omp_get_thread_num(); 01246 INT n_prod_rest = 1; 01247 01248 for (k = 1; k < d; k++) 01249 n_prod_rest *= n[k]; 01250 01251 *min_u_a = -1; 01252 *max_u_a = -1; 01253 *min_u_b = -1; 01254 *max_u_b = -1; 01255 *my_u0 = -1; 01256 *my_o0 = -1; 01257 01258 if (my_id < nthreads_used) 01259 { 01260 const INT m22 = 2 * m + 2; 01261 01262 offset_g[0] = 0; 01263 for (k = 0; k < nthreads_used; k++) 01264 { 01265 if (k > 0) 01266 offset_g[k] = offset_g[k-1] + size_g[k-1]; 01267 size_g[k] = size_per_thread; 01268 if (size_left > 0) 01269 { 01270 size_g[k]++; 01271 size_left--; 01272 } 01273 } 01274 01275 *my_u0 = offset_g[my_id]; 01276 *my_o0 = offset_g[my_id] + size_g[my_id] - 1; 01277 01278 if (nthreads_used > 1) 01279 { 01280 *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1; 01281 *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1); 01282 } 01283 else 01284 { 01285 *min_u_a = 0; 01286 *max_u_a = n_prod_rest * n0 - 1; 01287 } 01288 01289 if (*min_u_a < 0) 01290 { 01291 *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0); 01292 *max_u_b = n_prod_rest * n0 - 1; 01293 *min_u_a = 0; 01294 } 01295 01296 if (*min_u_b != -1 && *min_u_b <= *max_u_a) 01297 { 01298 *max_u_a = *max_u_b; 01299 *min_u_b = -1; 01300 *max_u_b = -1; 01301 } 01302 #ifdef OMP_ASSERT 01303 assert(*min_u_a <= *max_u_a); 01304 assert(*min_u_b <= *max_u_b); 01305 assert(*min_u_b == -1 || *max_u_a < *min_u_b); 01306 #endif 01307 } 01308 } 01309 #endif 01310 01319 static void nfft_adjoint_B_compute_full_psi(C *g, const INT *psi_index_g, 01320 const R *psi, const C *f, const INT M, const INT d, const INT *n, 01321 const INT m, const unsigned flags, const INT *index_x) 01322 { 01323 INT k; 01324 INT lprod; 01325 #ifdef _OPENMP 01326 INT lprod_m1; 01327 #endif 01328 #ifndef _OPENMP 01329 UNUSED(n); 01330 #endif 01331 { 01332 INT t; 01333 for(t = 0, lprod = 1; t < d; t++) 01334 lprod *= 2 * m + 2; 01335 } 01336 #ifdef _OPENMP 01337 lprod_m1 = lprod / (2 * m + 2); 01338 #endif 01339 01340 #ifdef _OPENMP 01341 if (flags & NFFT_OMP_BLOCKWISE_ADJOINT) 01342 { 01343 #pragma omp parallel private(k) 01344 { 01345 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; 01346 const INT *ar_x = index_x; 01347 INT n_prod_rest = 1; 01348 01349 for (k = 1; k < d; k++) 01350 n_prod_rest *= n[k]; 01351 01352 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m); 01353 01354 if (min_u_a != -1) 01355 { 01356 k = index_x_binary_search(ar_x, M, min_u_a); 01357 #ifdef OMP_ASSERT 01358 assert(ar_x[2*k] >= min_u_a || k == M-1); 01359 if (k > 0) 01360 assert(ar_x[2*k-2] < min_u_a); 01361 #endif 01362 while (k < M) 01363 { 01364 INT l0, lrest; 01365 INT u_prod = ar_x[2*k]; 01366 INT j = ar_x[2*k+1]; 01367 01368 if (u_prod < min_u_a || u_prod > max_u_a) 01369 break; 01370 01371 for (l0 = 0; l0 < 2 * m + 2; l0++) 01372 { 01373 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1]; 01374 01375 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1) 01376 continue; 01377 01378 for (lrest = 0; lrest < lprod_m1; lrest++) 01379 { 01380 const INT l = l0 * lprod_m1 + lrest; 01381 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j]; 01382 } 01383 } 01384 01385 k++; 01386 } 01387 } 01388 01389 if (min_u_b != -1) 01390 { 01391 k = index_x_binary_search(ar_x, M, min_u_b); 01392 #ifdef OMP_ASSERT 01393 assert(ar_x[2*k] >= min_u_b || k == M-1); 01394 if (k > 0) 01395 assert(ar_x[2*k-2] < min_u_b); 01396 #endif 01397 while (k < M) 01398 { 01399 INT l0, lrest; 01400 INT u_prod = ar_x[2*k]; 01401 INT j = ar_x[2*k+1]; 01402 01403 if (u_prod < min_u_b || u_prod > max_u_b) 01404 break; 01405 01406 for (l0 = 0; l0 < 2 * m + 2; l0++) 01407 { 01408 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1]; 01409 01410 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1) 01411 continue; 01412 for (lrest = 0; lrest < lprod_m1; lrest++) 01413 { 01414 const INT l = l0 * lprod_m1 + lrest; 01415 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j]; 01416 } 01417 } 01418 01419 k++; 01420 } 01421 } 01422 } /* omp parallel */ 01423 return; 01424 } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ 01425 #endif 01426 01427 #ifdef _OPENMP 01428 #pragma omp parallel for default(shared) private(k) 01429 #endif 01430 for (k = 0; k < M; k++) 01431 { 01432 INT l; 01433 INT j = (flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k; 01434 01435 for (l = 0; l < lprod; l++) 01436 { 01437 #ifdef _OPENMP 01438 C val = psi[j * lprod + l] * f[j]; 01439 C *gref = g + psi_index_g[j * lprod + l]; 01440 R *gref_real = (R*) gref; 01441 01442 #pragma omp atomic 01443 gref_real[0] += CREAL(val); 01444 01445 #pragma omp atomic 01446 gref_real[1] += CIMAG(val); 01447 #else 01448 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j]; 01449 #endif 01450 } 01451 } 01452 } 01453 01454 #ifndef _OPENMP 01455 MACRO_B(T) 01456 #endif 01457 01458 #ifdef _OPENMP 01459 static inline void B_openmp_T(X(plan) *ths) 01460 { 01461 INT lprod; /* 'regular bandwidth' of matrix B */ 01462 INT k; 01463 01464 memset(ths->g, 0, (size_t)(ths->n_total) * sizeof(C)); 01465 01466 for (k = 0, lprod = 1; k < ths->d; k++) 01467 lprod *= (2*ths->m+2); 01468 01469 if (ths->flags & PRE_FULL_PSI) 01470 { 01471 nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f, 01472 ths->M_total, ths->d, ths->n, ths->m, ths->flags, ths->index_x); 01473 return; 01474 } 01475 01476 if (ths->flags & PRE_PSI) 01477 { 01478 #pragma omp parallel for default(shared) private(k) 01479 for (k = 0; k < ths->M_total; k++) 01480 { 01481 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ 01482 INT t, t2; /* index dimensions */ 01483 INT l_L; /* index one row of B */ 01484 INT l[ths->d]; /* multi index u<=l<=o */ 01485 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ 01486 INT ll_plain[ths->d+1]; /* postfix plain index in g */ 01487 R phi_prod[ths->d+1]; /* postfix product of PHI */ 01488 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 01489 01490 phi_prod[0] = K(1.0); 01491 ll_plain[0] = 0; 01492 01493 MACRO_init_uo_l_lj_t; 01494 01495 for (l_L = 0; l_L < lprod; l_L++) 01496 { 01497 C *lhs; 01498 R *lhs_real; 01499 C val; 01500 01501 MACRO_update_phi_prod_ll_plain(with_PRE_PSI); 01502 01503 lhs = ths->g + ll_plain[ths->d]; 01504 lhs_real = (R*)lhs; 01505 val = phi_prod[ths->d] * ths->f[j]; 01506 01507 #pragma omp atomic 01508 lhs_real[0] += CREAL(val); 01509 01510 #pragma omp atomic 01511 lhs_real[1] += CIMAG(val); 01512 01513 MACRO_count_uo_l_lj_t; 01514 } /* for(l_L) */ 01515 } /* for(j) */ 01516 return; 01517 } /* if(PRE_PSI) */ 01518 01519 if (ths->flags & PRE_FG_PSI) 01520 { 01521 INT t, t2; /* index dimensions */ 01522 R fg_exp_l[ths->d][2*ths->m+2]; 01523 for(t2 = 0; t2 < ths->d; t2++) 01524 { 01525 INT lj_fg; 01526 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); 01527 R tmpEXP2sq = tmpEXP2*tmpEXP2; 01528 R tmp2 = K(1.0); 01529 R tmp3 = K(1.0); 01530 fg_exp_l[t2][0] = K(1.0); 01531 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) 01532 { 01533 tmp3 = tmp2*tmpEXP2; 01534 tmp2 *= tmpEXP2sq; 01535 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; 01536 } 01537 } 01538 01539 #pragma omp parallel for default(shared) private(k,t,t2) 01540 for (k = 0; k < ths->M_total; k++) 01541 { 01542 INT ll_plain[ths->d+1]; /* postfix plain index in g */ 01543 R phi_prod[ths->d+1]; /* postfix product of PHI */ 01544 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ 01545 INT l[ths->d]; /* multi index u<=l<=o */ 01546 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ 01547 R fg_psi[ths->d][2*ths->m+2]; 01548 R tmpEXP1, tmp1; 01549 INT l_fg,lj_fg; 01550 INT l_L; 01551 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 01552 01553 phi_prod[0] = K(1.0); 01554 ll_plain[0] = 0; 01555 01556 MACRO_init_uo_l_lj_t; 01557 01558 for (t2 = 0; t2 < ths->d; t2++) 01559 { 01560 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; 01561 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; 01562 tmp1 = K(1.0); 01563 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) 01564 { 01565 tmp1 *= tmpEXP1; 01566 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; 01567 } 01568 } 01569 01570 for (l_L= 0; l_L < lprod; l_L++) 01571 { 01572 C *lhs; 01573 R *lhs_real; 01574 C val; 01575 01576 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 01577 01578 lhs = ths->g + ll_plain[ths->d]; 01579 lhs_real = (R*)lhs; 01580 val = phi_prod[ths->d] * ths->f[j]; 01581 01582 #pragma omp atomic 01583 lhs_real[0] += CREAL(val); 01584 01585 #pragma omp atomic 01586 lhs_real[1] += CIMAG(val); 01587 01588 MACRO_count_uo_l_lj_t; 01589 } /* for(l_L) */ 01590 } /* for(j) */ 01591 return; 01592 } /* if(PRE_FG_PSI) */ 01593 01594 if (ths->flags & FG_PSI) 01595 { 01596 INT t, t2; /* index dimensions */ 01597 R fg_exp_l[ths->d][2*ths->m+2]; 01598 01599 sort(ths); 01600 01601 for (t2 = 0; t2 < ths->d; t2++) 01602 { 01603 INT lj_fg; 01604 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); 01605 R tmpEXP2sq = tmpEXP2*tmpEXP2; 01606 R tmp2 = K(1.0); 01607 R tmp3 = K(1.0); 01608 fg_exp_l[t2][0] = K(1.0); 01609 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) 01610 { 01611 tmp3 = tmp2*tmpEXP2; 01612 tmp2 *= tmpEXP2sq; 01613 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; 01614 } 01615 } 01616 01617 #pragma omp parallel for default(shared) private(k,t,t2) 01618 for (k = 0; k < ths->M_total; k++) 01619 { 01620 INT ll_plain[ths->d+1]; /* postfix plain index in g */ 01621 R phi_prod[ths->d+1]; /* postfix product of PHI */ 01622 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ 01623 INT l[ths->d]; /* multi index u<=l<=o */ 01624 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ 01625 R fg_psi[ths->d][2*ths->m+2]; 01626 R tmpEXP1, tmp1; 01627 INT l_fg,lj_fg; 01628 INT l_L; 01629 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 01630 01631 phi_prod[0] = K(1.0); 01632 ll_plain[0] = 0; 01633 01634 MACRO_init_uo_l_lj_t; 01635 01636 for (t2 = 0; t2 < ths->d; t2++) 01637 { 01638 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2)); 01639 01640 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2]) 01641 /ths->b[t2]); 01642 tmp1 = K(1.0); 01643 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) 01644 { 01645 tmp1 *= tmpEXP1; 01646 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; 01647 } 01648 } 01649 01650 for (l_L = 0; l_L < lprod; l_L++) 01651 { 01652 C *lhs; 01653 R *lhs_real; 01654 C val; 01655 01656 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 01657 01658 lhs = ths->g + ll_plain[ths->d]; 01659 lhs_real = (R*)lhs; 01660 val = phi_prod[ths->d] * ths->f[j]; 01661 01662 #pragma omp atomic 01663 lhs_real[0] += CREAL(val); 01664 01665 #pragma omp atomic 01666 lhs_real[1] += CIMAG(val); 01667 01668 MACRO_count_uo_l_lj_t; 01669 } /* for(l_L) */ 01670 } /* for(j) */ 01671 return; 01672 } /* if(FG_PSI) */ 01673 01674 if (ths->flags & PRE_LIN_PSI) 01675 { 01676 sort(ths); 01677 01678 #pragma omp parallel for default(shared) private(k) 01679 for (k = 0; k<ths->M_total; k++) 01680 { 01681 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ 01682 INT t, t2; /* index dimensions */ 01683 INT l_L; /* index one row of B */ 01684 INT l[ths->d]; /* multi index u<=l<=o */ 01685 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ 01686 INT ll_plain[ths->d+1]; /* postfix plain index in g */ 01687 R phi_prod[ths->d+1]; /* postfix product of PHI */ 01688 R y[ths->d]; 01689 R fg_psi[ths->d][2*ths->m+2]; 01690 INT l_fg,lj_fg; 01691 R ip_w; 01692 INT ip_u; 01693 INT ip_s = ths->K/(ths->m+2); 01694 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 01695 01696 phi_prod[0] = K(1.0); 01697 ll_plain[0] = 0; 01698 01699 MACRO_init_uo_l_lj_t; 01700 01701 for (t2 = 0; t2 < ths->d; t2++) 01702 { 01703 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2]) 01704 * ((R)ths->K))/(ths->m+2); 01705 ip_u = LRINT(FLOOR(y[t2])); 01706 ip_w = y[t2]-ip_u; 01707 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) 01708 { 01709 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] 01710 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] 01711 * (ip_w); 01712 } 01713 } 01714 01715 for (l_L = 0; l_L < lprod; l_L++) 01716 { 01717 C *lhs; 01718 R *lhs_real; 01719 C val; 01720 01721 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 01722 01723 lhs = ths->g + ll_plain[ths->d]; 01724 lhs_real = (R*)lhs; 01725 val = phi_prod[ths->d] * ths->f[j]; 01726 01727 #pragma omp atomic 01728 lhs_real[0] += CREAL(val); 01729 01730 #pragma omp atomic 01731 lhs_real[1] += CIMAG(val); 01732 01733 MACRO_count_uo_l_lj_t; 01734 } /* for(l_L) */ 01735 } /* for(j) */ 01736 return; 01737 } /* if(PRE_LIN_PSI) */ 01738 01739 /* no precomputed psi at all */ 01740 sort(ths); 01741 01742 #pragma omp parallel for default(shared) private(k) 01743 for (k = 0; k < ths->M_total; k++) 01744 { 01745 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ 01746 INT t, t2; /* index dimensions */ 01747 INT l_L; /* index one row of B */ 01748 INT l[ths->d]; /* multi index u<=l<=o */ 01749 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ 01750 INT ll_plain[ths->d+1]; /* postfix plain index in g */ 01751 R phi_prod[ths->d+1]; /* postfix product of PHI */ 01752 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 01753 01754 phi_prod[0] = K(1.0); 01755 ll_plain[0] = 0; 01756 01757 MACRO_init_uo_l_lj_t; 01758 01759 for (l_L = 0; l_L < lprod; l_L++) 01760 { 01761 C *lhs; 01762 R *lhs_real; 01763 C val; 01764 01765 MACRO_update_phi_prod_ll_plain(without_PRE_PSI); 01766 01767 lhs = ths->g + ll_plain[ths->d]; 01768 lhs_real = (R*)lhs; 01769 val = phi_prod[ths->d] * ths->f[j]; 01770 01771 #pragma omp atomic 01772 lhs_real[0] += CREAL(val); 01773 01774 #pragma omp atomic 01775 lhs_real[1] += CIMAG(val); 01776 01777 MACRO_count_uo_l_lj_t; 01778 } /* for(l_L) */ 01779 } /* for(j) */ 01780 } 01781 #endif 01782 01783 static void B_T(X(plan) *ths) 01784 { 01785 #ifdef _OPENMP 01786 B_openmp_T(ths); 01787 #else 01788 B_serial_T(ths); 01789 #endif 01790 } 01791 01792 /* ## specialized version for d=1 ########################################### */ 01793 01794 static void nfft_1d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b) 01795 { 01796 const INT tmp2 = 2*m+2; 01797 INT l; 01798 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq; 01799 01800 fg_exp_b0 = EXP(K(-1.0)/b); 01801 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0; 01802 fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0); 01803 01804 for (l = 1; l < tmp2; l++) 01805 { 01806 fg_exp_b2 = fg_exp_b1*fg_exp_b0; 01807 fg_exp_b1 *= fg_exp_b0_sq; 01808 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2; 01809 } 01810 } 01811 01812 01813 static void nfft_trafo_1d_compute(C *fj, const C *g,const R *psij_const, 01814 const R *xj, const INT n, const INT m) 01815 { 01816 INT u, o, l; 01817 const C *gj; 01818 const R *psij; 01819 psij = psij_const; 01820 01821 uo2(&u, &o, *xj, n, m); 01822 01823 if (u < o) 01824 { 01825 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++) 01826 (*fj) += (*psij++) * (*gj++); 01827 } 01828 else 01829 { 01830 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++) 01831 (*fj) += (*psij++) * (*gj++); 01832 for (l = 0, gj = g; l <= o; l++) 01833 (*fj) += (*psij++) * (*gj++); 01834 } 01835 } 01836 01837 #ifndef _OPENMP 01838 static void nfft_adjoint_1d_compute_serial(const C *fj, C *g, 01839 const R *psij_const, const R *xj, const INT n, const INT m) 01840 { 01841 INT u,o,l; 01842 C *gj; 01843 const R *psij; 01844 psij = psij_const; 01845 01846 uo2(&u,&o,*xj, n, m); 01847 01848 if (u < o) 01849 { 01850 for (l = 0, gj = g+u; l <= 2*m+1; l++) 01851 (*gj++) += (*psij++) * (*fj); 01852 } 01853 else 01854 { 01855 for (l = 0, gj = g+u; l < 2*m+1-o; l++) 01856 (*gj++) += (*psij++) * (*fj); 01857 for (l = 0, gj = g; l <= o; l++) 01858 (*gj++) += (*psij++) * (*fj); 01859 } 01860 } 01861 #endif 01862 01863 #ifdef _OPENMP 01864 /* adjoint NFFT one-dimensional case with OpenMP atomic operations */ 01865 static void nfft_adjoint_1d_compute_omp_atomic(const C f, C *g, 01866 const R *psij_const, const R *xj, const INT n, const INT m) 01867 { 01868 INT u,o,l; 01869 C *gj; 01870 INT index_temp[2*m+2]; 01871 01872 uo2(&u,&o,*xj, n, m); 01873 01874 for (l=0; l<=2*m+1; l++) 01875 index_temp[l] = (l+u)%n; 01876 01877 for (l = 0, gj = g+u; l <= 2*m+1; l++) 01878 { 01879 INT i = index_temp[l]; 01880 C *lhs = g+i; 01881 R *lhs_real = (R*)lhs; 01882 C val = psij_const[l] * f; 01883 #pragma omp atomic 01884 lhs_real[0] += CREAL(val); 01885 01886 #pragma omp atomic 01887 lhs_real[1] += CIMAG(val); 01888 } 01889 } 01890 #endif 01891 01892 #ifdef _OPENMP 01893 01908 static void nfft_adjoint_1d_compute_omp_blockwise(const C f, C *g, 01909 const R *psij_const, const R *xj, const INT n, const INT m, 01910 const INT my_u0, const INT my_o0) 01911 { 01912 INT ar_u,ar_o,l; 01913 01914 uo2(&ar_u,&ar_o,*xj, n, m); 01915 01916 if (ar_u < ar_o) 01917 { 01918 INT u = MAX(my_u0,ar_u); 01919 INT o = MIN(my_o0,ar_o); 01920 INT offset_psij = u-ar_u; 01921 #ifdef OMP_ASSERT 01922 assert(offset_psij >= 0); 01923 assert(o-u <= 2*m+1); 01924 assert(offset_psij+o-u <= 2*m+1); 01925 #endif 01926 01927 for (l = 0; l <= o-u; l++) 01928 g[u+l] += psij_const[offset_psij+l] * f; 01929 } 01930 else 01931 { 01932 INT u = MAX(my_u0,ar_u); 01933 INT o = my_o0; 01934 INT offset_psij = u-ar_u; 01935 #ifdef OMP_ASSERT 01936 assert(offset_psij >= 0); 01937 assert(o-u <= 2*m+1); 01938 assert(offset_psij+o-u <= 2*m+1); 01939 #endif 01940 01941 for (l = 0; l <= o-u; l++) 01942 g[u+l] += psij_const[offset_psij+l] * f; 01943 01944 u = my_u0; 01945 o = MIN(my_o0,ar_o); 01946 offset_psij += my_u0-ar_u+n; 01947 01948 #ifdef OMP_ASSERT 01949 if (u <= o) 01950 { 01951 assert(o-u <= 2*m+1); 01952 if (offset_psij+o-u > 2*m+1) 01953 { 01954 fprintf(stderr, "ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij); 01955 } 01956 assert(offset_psij+o-u <= 2*m+1); 01957 } 01958 #endif 01959 for (l = 0; l <= o-u; l++) 01960 g[u+l] += psij_const[offset_psij+l] * f; 01961 } 01962 } 01963 #endif 01964 01965 static void nfft_trafo_1d_B(X(plan) *ths) 01966 { 01967 const INT n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2; 01968 const C *g = (C*)ths->g; 01969 01970 if (ths->flags & PRE_FULL_PSI) 01971 { 01972 INT k; 01973 #ifdef _OPENMP 01974 #pragma omp parallel for default(shared) private(k) 01975 #endif 01976 for (k = 0; k < M; k++) 01977 { 01978 INT l; 01979 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 01980 ths->f[j] = K(0.0); 01981 for (l = 0; l < m2p2; l++) 01982 ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]]; 01983 } 01984 return; 01985 } /* if(PRE_FULL_PSI) */ 01986 01987 if (ths->flags & PRE_PSI) 01988 { 01989 INT k; 01990 #ifdef _OPENMP 01991 #pragma omp parallel for default(shared) private(k) 01992 #endif 01993 for (k = 0; k < M; k++) 01994 { 01995 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 01996 nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2), 01997 &ths->x[j], n, m); 01998 } 01999 return; 02000 } /* if(PRE_PSI) */ 02001 02002 if (ths->flags & PRE_FG_PSI) 02003 { 02004 INT k; 02005 R fg_exp_l[m2p2]; 02006 02007 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 02008 02009 #ifdef _OPENMP 02010 #pragma omp parallel for default(shared) private(k) 02011 #endif 02012 for (k = 0; k < M; k++) 02013 { 02014 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02015 const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1]; 02016 R fg_psij2 = K(1.0); 02017 R psij_const[m2p2]; 02018 INT l; 02019 02020 psij_const[0] = fg_psij0; 02021 02022 for (l = 1; l < m2p2; l++) 02023 { 02024 fg_psij2 *= fg_psij1; 02025 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; 02026 } 02027 02028 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m); 02029 } 02030 02031 return; 02032 } /* if(PRE_FG_PSI) */ 02033 02034 if (ths->flags & FG_PSI) 02035 { 02036 INT k; 02037 R fg_exp_l[m2p2]; 02038 02039 sort(ths); 02040 02041 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 02042 02043 #ifdef _OPENMP 02044 #pragma omp parallel for default(shared) private(k) 02045 #endif 02046 for (k = 0; k < M; k++) 02047 { 02048 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02049 INT u, o, l; 02050 R fg_psij0, fg_psij1, fg_psij2; 02051 R psij_const[m2p2]; 02052 02053 uo(ths, (INT)j, &u, &o, (INT)0); 02054 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)(u))/(R)(n), 0)); 02055 fg_psij1 = EXP(K(2.0) * ((R)(n) * ths->x[j] - (R)(u)) / ths->b[0]); 02056 fg_psij2 = K(1.0); 02057 02058 psij_const[0] = fg_psij0; 02059 02060 for (l = 1; l < m2p2; l++) 02061 { 02062 fg_psij2 *= fg_psij1; 02063 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; 02064 } 02065 02066 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m); 02067 } 02068 return; 02069 } /* if(FG_PSI) */ 02070 02071 if (ths->flags & PRE_LIN_PSI) 02072 { 02073 const INT K = ths->K, ip_s = K / (m + 2); 02074 INT k; 02075 02076 sort(ths); 02077 02078 #ifdef _OPENMP 02079 #pragma omp parallel for default(shared) private(k) 02080 #endif 02081 for (k = 0; k < M; k++) 02082 { 02083 INT u, o, l; 02084 R ip_y, ip_w; 02085 INT ip_u; 02086 R psij_const[m2p2]; 02087 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02088 02089 uo(ths, (INT)j, &u, &o, (INT)0); 02090 02091 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s); 02092 ip_u = (INT)(LRINT(FLOOR(ip_y))); 02093 ip_w = ip_y - (R)(ip_u); 02094 02095 for (l = 0; l < m2p2; l++) 02096 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) 02097 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); 02098 02099 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m); 02100 } 02101 return; 02102 } /* if(PRE_LIN_PSI) */ 02103 else 02104 { 02105 /* no precomputed psi at all */ 02106 INT k; 02107 02108 sort(ths); 02109 02110 #ifdef _OPENMP 02111 #pragma omp parallel for default(shared) private(k) 02112 #endif 02113 for (k = 0; k < M; k++) 02114 { 02115 R psij_const[m2p2]; 02116 INT u, o, l; 02117 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02118 02119 uo(ths, (INT)j, &u, &o, (INT)0); 02120 02121 for (l = 0; l < m2p2; l++) 02122 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n), 0)); 02123 02124 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m); 02125 } 02126 } 02127 } 02128 02129 02130 #ifdef OMP_ASSERT 02131 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \ 02132 { \ 02133 assert(ar_x[2*k] >= min_u_a || k == M-1); \ 02134 if (k > 0) \ 02135 assert(ar_x[2*k-2] < min_u_a); \ 02136 } 02137 #else 02138 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A 02139 #endif 02140 02141 #ifdef OMP_ASSERT 02142 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \ 02143 { \ 02144 assert(ar_x[2*k] >= min_u_b || k == M-1); \ 02145 if (k > 0) \ 02146 assert(ar_x[2*k-2] < min_u_b); \ 02147 } 02148 #else 02149 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B 02150 #endif 02151 02152 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \ 02153 { \ 02154 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \ 02155 ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \ 02156 } 02157 02158 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \ 02159 { \ 02160 R psij_const[2 * m + 2]; \ 02161 INT u, o, l; \ 02162 R fg_psij0 = ths->psi[2 * j]; \ 02163 R fg_psij1 = ths->psi[2 * j + 1]; \ 02164 R fg_psij2 = K(1.0); \ 02165 \ 02166 psij_const[0] = fg_psij0; \ 02167 for (l = 1; l <= 2 * m + 1; l++) \ 02168 { \ 02169 fg_psij2 *= fg_psij1; \ 02170 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \ 02171 } \ 02172 \ 02173 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \ 02174 ths->x + j, n, m, my_u0, my_o0); \ 02175 } 02176 02177 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \ 02178 { \ 02179 R psij_const[2 * m + 2]; \ 02180 R fg_psij0, fg_psij1, fg_psij2; \ 02181 INT u, o, l; \ 02182 \ 02183 uo(ths, j, &u, &o, (INT)0); \ 02184 fg_psij0 = (PHI(ths->n[0],ths->x[j]-((R)u)/n,0)); \ 02185 fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]); \ 02186 fg_psij2 = K(1.0); \ 02187 psij_const[0] = fg_psij0; \ 02188 for (l = 1; l <= 2 * m + 1; l++) \ 02189 { \ 02190 fg_psij2 *= fg_psij1; \ 02191 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \ 02192 } \ 02193 \ 02194 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \ 02195 ths->x + j, n, m, my_u0, my_o0); \ 02196 } 02197 02198 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \ 02199 { \ 02200 R psij_const[2 * m + 2]; \ 02201 INT ip_u; \ 02202 R ip_y, ip_w; \ 02203 INT u, o, l; \ 02204 \ 02205 uo(ths, j, &u, &o, (INT)0); \ 02206 \ 02207 ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s); \ 02208 ip_u = LRINT(FLOOR(ip_y)); \ 02209 ip_w = ip_y - ip_u; \ 02210 for (l = 0; l < 2 * m + 2; l++) \ 02211 psij_const[l] \ 02212 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \ 02213 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \ 02214 \ 02215 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \ 02216 ths->x + j, n, m, my_u0, my_o0); \ 02217 } 02218 02219 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \ 02220 { \ 02221 R psij_const[2 * m + 2]; \ 02222 INT u, o, l; \ 02223 \ 02224 uo(ths, j, &u, &o, (INT)0); \ 02225 \ 02226 for (l = 0; l <= 2 * m + 1; l++) \ 02227 psij_const[l] = (PHI(ths->n[0],ths->x[j]-((R)((u+l)))/n,0)); \ 02228 \ 02229 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \ 02230 ths->x + j, n, m, my_u0, my_o0); \ 02231 } 02232 02233 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE(whichone) \ 02234 { \ 02235 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \ 02236 { \ 02237 _Pragma("omp parallel private(k)") \ 02238 { \ 02239 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \ 02240 INT *ar_x = ths->index_x; \ 02241 \ 02242 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \ 02243 &min_u_b, &max_u_b, 1, &n, m); \ 02244 \ 02245 if (min_u_a != -1) \ 02246 { \ 02247 k = index_x_binary_search(ar_x, M, min_u_a); \ 02248 \ 02249 MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \ 02250 \ 02251 while (k < M) \ 02252 { \ 02253 INT u_prod = ar_x[2*k]; \ 02254 INT j = ar_x[2*k+1]; \ 02255 \ 02256 if (u_prod < min_u_a || u_prod > max_u_a) \ 02257 break; \ 02258 \ 02259 MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 02260 \ 02261 k++; \ 02262 } \ 02263 } \ 02264 \ 02265 if (min_u_b != -1) \ 02266 { \ 02267 k = index_x_binary_search(ar_x, M, min_u_b); \ 02268 \ 02269 MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \ 02270 \ 02271 while (k < M) \ 02272 { \ 02273 INT u_prod = ar_x[2*k]; \ 02274 INT j = ar_x[2*k+1]; \ 02275 \ 02276 if (u_prod < min_u_b || u_prod > max_u_b) \ 02277 break; \ 02278 \ 02279 MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 02280 \ 02281 k++; \ 02282 } \ 02283 } \ 02284 } /* omp parallel */ \ 02285 return; \ 02286 } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \ 02287 } 02288 02289 static void nfft_adjoint_1d_B(X(plan) *ths) 02290 { 02291 const INT n = ths->n[0], M = ths->M_total, m = ths->m; 02292 INT k; 02293 C *g = (C*)ths->g; 02294 02295 memset(g, 0, (size_t)(ths->n_total) * sizeof(C)); 02296 02297 if (ths->flags & PRE_FULL_PSI) 02298 { 02299 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M, 02300 (INT)1, ths->n, m, ths->flags, ths->index_x); 02301 return; 02302 } /* if(PRE_FULL_PSI) */ 02303 02304 if (ths->flags & PRE_PSI) 02305 { 02306 #ifdef _OPENMP 02307 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI) 02308 #endif 02309 02310 #ifdef _OPENMP 02311 #pragma omp parallel for default(shared) private(k) 02312 #endif 02313 for (k = 0; k < M; k++) 02314 { 02315 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02316 #ifdef _OPENMP 02317 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m); 02318 #else 02319 nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m); 02320 #endif 02321 } 02322 02323 return; 02324 } /* if(PRE_PSI) */ 02325 02326 if (ths->flags & PRE_FG_PSI) 02327 { 02328 R fg_exp_l[2 * m + 2]; 02329 02330 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 02331 02332 #ifdef _OPENMP 02333 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI) 02334 #endif 02335 02336 02337 #ifdef _OPENMP 02338 #pragma omp parallel for default(shared) private(k) 02339 #endif 02340 for (k = 0; k < M; k++) 02341 { 02342 R psij_const[2 * m + 2]; 02343 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02344 INT l; 02345 R fg_psij0 = ths->psi[2 * j]; 02346 R fg_psij1 = ths->psi[2 * j + 1]; 02347 R fg_psij2 = K(1.0); 02348 02349 psij_const[0] = fg_psij0; 02350 for (l = 1; l <= 2 * m + 1; l++) 02351 { 02352 fg_psij2 *= fg_psij1; 02353 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; 02354 } 02355 02356 #ifdef _OPENMP 02357 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m); 02358 #else 02359 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m); 02360 #endif 02361 } 02362 02363 return; 02364 } /* if(PRE_FG_PSI) */ 02365 02366 if (ths->flags & FG_PSI) 02367 { 02368 R fg_exp_l[2 * m + 2]; 02369 02370 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 02371 02372 sort(ths); 02373 02374 #ifdef _OPENMP 02375 MACRO_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI) 02376 #endif 02377 02378 #ifdef _OPENMP 02379 #pragma omp parallel for default(shared) private(k) 02380 #endif 02381 for (k = 0; k < M; k++) 02382 { 02383 INT u,o,l; 02384 R psij_const[2 * m + 2]; 02385 R fg_psij0, fg_psij1, fg_psij2; 02386 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02387 02388 uo(ths, j, &u, &o, (INT)0); 02389 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)u) / (R)(n),0)); 02390 fg_psij1 = EXP(K(2.0) * ((R)(n) * (ths->x[j]) - (R)(u)) / ths->b[0]); 02391 fg_psij2 = K(1.0); 02392 psij_const[0] = fg_psij0; 02393 for (l = 1; l <= 2 * m + 1; l++) 02394 { 02395 fg_psij2 *= fg_psij1; 02396 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; 02397 } 02398 02399 #ifdef _OPENMP 02400 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m); 02401 #else 02402 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m); 02403 #endif 02404 } 02405 02406 return; 02407 } /* if(FG_PSI) */ 02408 02409 if (ths->flags & PRE_LIN_PSI) 02410 { 02411 const INT K = ths->K; 02412 const INT ip_s = K / (m + 2); 02413 02414 sort(ths); 02415 02416 #ifdef _OPENMP 02417 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI) 02418 #endif 02419 02420 #ifdef _OPENMP 02421 #pragma omp parallel for default(shared) private(k) 02422 #endif 02423 for (k = 0; k < M; k++) 02424 { 02425 INT u,o,l; 02426 INT ip_u; 02427 R ip_y, ip_w; 02428 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02429 R psij_const[2 * m + 2]; 02430 02431 uo(ths, j, &u, &o, (INT)0); 02432 02433 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s); 02434 ip_u = (INT)(LRINT(FLOOR(ip_y))); 02435 ip_w = ip_y - (R)(ip_u); 02436 for (l = 0; l < 2 * m + 2; l++) 02437 psij_const[l] 02438 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) 02439 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); 02440 02441 #ifdef _OPENMP 02442 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m); 02443 #else 02444 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m); 02445 #endif 02446 } 02447 return; 02448 } /* if(PRE_LIN_PSI) */ 02449 02450 /* no precomputed psi at all */ 02451 sort(ths); 02452 02453 #ifdef _OPENMP 02454 MACRO_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI) 02455 #endif 02456 02457 #ifdef _OPENMP 02458 #pragma omp parallel for default(shared) private(k) 02459 #endif 02460 for (k = 0; k < M; k++) 02461 { 02462 INT u,o,l; 02463 R psij_const[2 * m + 2]; 02464 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02465 02466 uo(ths, j, &u, &o, (INT)0); 02467 02468 for (l = 0; l <= 2 * m + 1; l++) 02469 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n),0)); 02470 02471 #ifdef _OPENMP 02472 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m); 02473 #else 02474 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m); 02475 #endif 02476 } 02477 } 02478 02479 void X(trafo_1d)(X(plan) *ths) 02480 { 02481 const INT N = ths->N[0], N2 = N/2, n = ths->n[0]; 02482 C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2]; 02483 02484 ths->g_hat = ths->g1; 02485 ths->g = ths->g2; 02486 02487 { 02488 C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat; 02489 R *c_phi_inv1, *c_phi_inv2; 02490 02491 TIC(0) 02492 #ifdef _OPENMP 02493 { 02494 INT k; 02495 #pragma omp parallel for default(shared) private(k) 02496 for (k = 0; k < ths->n_total; k++) 02497 ths->g_hat[k] = 0.0; 02498 } 02499 #else 02500 memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C)); 02501 #endif 02502 if(ths->flags & PRE_PHI_HUT) 02503 { 02504 INT k; 02505 c_phi_inv1 = ths->c_phi_inv[0]; 02506 c_phi_inv2 = &ths->c_phi_inv[0][N2]; 02507 02508 #ifdef _OPENMP 02509 #pragma omp parallel for default(shared) private(k) 02510 #endif 02511 for (k = 0; k < N2; k++) 02512 { 02513 g_hat1[k] = f_hat1[k] * c_phi_inv1[k]; 02514 g_hat2[k] = f_hat2[k] * c_phi_inv2[k]; 02515 } 02516 } 02517 else 02518 { 02519 INT k; 02520 #ifdef _OPENMP 02521 #pragma omp parallel for default(shared) private(k) 02522 #endif 02523 for (k = 0; k < N2; k++) 02524 { 02525 g_hat1[k] = f_hat1[k] / (PHI_HUT(ths->n[0],k-N2,0)); 02526 g_hat2[k] = f_hat2[k] / (PHI_HUT(ths->n[0],k,0)); 02527 } 02528 } 02529 TOC(0) 02530 02531 TIC_FFTW(1) 02532 FFTW(execute)(ths->my_fftw_plan1); 02533 TOC_FFTW(1); 02534 02535 TIC(2); 02536 nfft_trafo_1d_B(ths); 02537 TOC(2); 02538 } 02539 } 02540 02541 void X(adjoint_1d)(X(plan) *ths) 02542 { 02543 INT n,N; 02544 C *g_hat1,*g_hat2,*f_hat1,*f_hat2; 02545 R *c_phi_inv1, *c_phi_inv2; 02546 02547 N=ths->N[0]; 02548 n=ths->n[0]; 02549 02550 ths->g_hat=ths->g1; 02551 ths->g=ths->g2; 02552 02553 f_hat1=(C*)ths->f_hat; 02554 f_hat2=(C*)&ths->f_hat[N/2]; 02555 g_hat1=(C*)&ths->g_hat[n-N/2]; 02556 g_hat2=(C*)ths->g_hat; 02557 02558 TIC(2) 02559 nfft_adjoint_1d_B(ths); 02560 TOC(2) 02561 02562 TIC_FFTW(1) 02563 FFTW(execute)(ths->my_fftw_plan2); 02564 TOC_FFTW(1); 02565 02566 TIC(0) 02567 if(ths->flags & PRE_PHI_HUT) 02568 { 02569 INT k; 02570 c_phi_inv1=ths->c_phi_inv[0]; 02571 c_phi_inv2=&ths->c_phi_inv[0][N/2]; 02572 02573 #ifdef _OPENMP 02574 #pragma omp parallel for default(shared) private(k) 02575 #endif 02576 for (k = 0; k < N/2; k++) 02577 { 02578 f_hat1[k] = g_hat1[k] * c_phi_inv1[k]; 02579 f_hat2[k] = g_hat2[k] * c_phi_inv2[k]; 02580 } 02581 } 02582 else 02583 { 02584 INT k; 02585 02586 #ifdef _OPENMP 02587 #pragma omp parallel for default(shared) private(k) 02588 #endif 02589 for (k = 0; k < N/2; k++) 02590 { 02591 f_hat1[k] = g_hat1[k] / (PHI_HUT(ths->n[0],k-N/2,0)); 02592 f_hat2[k] = g_hat2[k] / (PHI_HUT(ths->n[0],k,0)); 02593 } 02594 } 02595 TOC(0) 02596 } 02597 02598 02599 /* ################################################ SPECIFIC VERSIONS FOR d=2 */ 02600 02601 static void nfft_2d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b) 02602 { 02603 INT l; 02604 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq; 02605 02606 fg_exp_b0 = EXP(K(-1.0)/b); 02607 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0; 02608 fg_exp_b1 = K(1.0); 02609 fg_exp_b2 = K(1.0); 02610 fg_exp_l[0] = K(1.0); 02611 for(l=1; l <= 2*m+1; l++) 02612 { 02613 fg_exp_b2 = fg_exp_b1*fg_exp_b0; 02614 fg_exp_b1 *= fg_exp_b0_sq; 02615 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2; 02616 } 02617 } 02618 02619 static void nfft_trafo_2d_compute(C *fj, const C *g, const R *psij_const0, 02620 const R *psij_const1, const R *xj0, const R *xj1, const INT n0, 02621 const INT n1, const INT m) 02622 { 02623 INT u0,o0,l0,u1,o1,l1; 02624 const C *gj; 02625 const R *psij0,*psij1; 02626 02627 psij0=psij_const0; 02628 psij1=psij_const1; 02629 02630 uo2(&u0,&o0,*xj0, n0, m); 02631 uo2(&u1,&o1,*xj1, n1, m); 02632 02633 *fj=0; 02634 02635 if (u0 < o0) 02636 if(u1 < o1) 02637 for(l0=0; l0<=2*m+1; l0++,psij0++) 02638 { 02639 psij1=psij_const1; 02640 gj=g+(u0+l0)*n1+u1; 02641 for(l1=0; l1<=2*m+1; l1++) 02642 (*fj) += (*psij0) * (*psij1++) * (*gj++); 02643 } 02644 else 02645 for(l0=0; l0<=2*m+1; l0++,psij0++) 02646 { 02647 psij1=psij_const1; 02648 gj=g+(u0+l0)*n1+u1; 02649 for(l1=0; l1<2*m+1-o1; l1++) 02650 (*fj) += (*psij0) * (*psij1++) * (*gj++); 02651 gj=g+(u0+l0)*n1; 02652 for(l1=0; l1<=o1; l1++) 02653 (*fj) += (*psij0) * (*psij1++) * (*gj++); 02654 } 02655 else 02656 if(u1<o1) 02657 { 02658 for(l0=0; l0<2*m+1-o0; l0++,psij0++) 02659 { 02660 psij1=psij_const1; 02661 gj=g+(u0+l0)*n1+u1; 02662 for(l1=0; l1<=2*m+1; l1++) 02663 (*fj) += (*psij0) * (*psij1++) * (*gj++); 02664 } 02665 for(l0=0; l0<=o0; l0++,psij0++) 02666 { 02667 psij1=psij_const1; 02668 gj=g+l0*n1+u1; 02669 for(l1=0; l1<=2*m+1; l1++) 02670 (*fj) += (*psij0) * (*psij1++) * (*gj++); 02671 } 02672 } 02673 else 02674 { 02675 for(l0=0; l0<2*m+1-o0; l0++,psij0++) 02676 { 02677 psij1=psij_const1; 02678 gj=g+(u0+l0)*n1+u1; 02679 for(l1=0; l1<2*m+1-o1; l1++) 02680 (*fj) += (*psij0) * (*psij1++) * (*gj++); 02681 gj=g+(u0+l0)*n1; 02682 for(l1=0; l1<=o1; l1++) 02683 (*fj) += (*psij0) * (*psij1++) * (*gj++); 02684 } 02685 for(l0=0; l0<=o0; l0++,psij0++) 02686 { 02687 psij1=psij_const1; 02688 gj=g+l0*n1+u1; 02689 for(l1=0; l1<2*m+1-o1; l1++) 02690 (*fj) += (*psij0) * (*psij1++) * (*gj++); 02691 gj=g+l0*n1; 02692 for(l1=0; l1<=o1; l1++) 02693 (*fj) += (*psij0) * (*psij1++) * (*gj++); 02694 } 02695 } 02696 } 02697 02698 #ifdef _OPENMP 02699 /* adjoint NFFT two-dimensional case with OpenMP atomic operations */ 02700 static void nfft_adjoint_2d_compute_omp_atomic(const C f, C *g, 02701 const R *psij_const0, const R *psij_const1, const R *xj0, 02702 const R *xj1, const INT n0, const INT n1, const INT m) 02703 { 02704 INT u0,o0,l0,u1,o1,l1; 02705 const INT lprod = (2*m+2) * (2*m+2); 02706 02707 INT index_temp0[2*m+2]; 02708 INT index_temp1[2*m+2]; 02709 02710 uo2(&u0,&o0,*xj0, n0, m); 02711 uo2(&u1,&o1,*xj1, n1, m); 02712 02713 for (l0=0; l0<=2*m+1; l0++) 02714 index_temp0[l0] = (u0+l0)%n0; 02715 02716 for (l1=0; l1<=2*m+1; l1++) 02717 index_temp1[l1] = (u1+l1)%n1; 02718 02719 for(l0=0; l0<=2*m+1; l0++) 02720 { 02721 for(l1=0; l1<=2*m+1; l1++) 02722 { 02723 INT i = index_temp0[l0] * n1 + index_temp1[l1]; 02724 C *lhs = g+i; 02725 R *lhs_real = (R*)lhs; 02726 C val = psij_const0[l0] * psij_const1[l1] * f; 02727 02728 #pragma omp atomic 02729 lhs_real[0] += CREAL(val); 02730 02731 #pragma omp atomic 02732 lhs_real[1] += CIMAG(val); 02733 } 02734 } 02735 } 02736 #endif 02737 02738 #ifdef _OPENMP 02739 02757 static void nfft_adjoint_2d_compute_omp_blockwise(const C f, C *g, 02758 const R *psij_const0, const R *psij_const1, const R *xj0, 02759 const R *xj1, const INT n0, const INT n1, const INT m, 02760 const INT my_u0, const INT my_o0) 02761 { 02762 INT ar_u0,ar_o0,l0,u1,o1,l1; 02763 const INT lprod = (2*m+2) * (2*m+2); 02764 INT index_temp1[2*m+2]; 02765 02766 uo2(&ar_u0,&ar_o0,*xj0, n0, m); 02767 uo2(&u1,&o1,*xj1, n1, m); 02768 02769 for (l1 = 0; l1 <= 2*m+1; l1++) 02770 index_temp1[l1] = (u1+l1)%n1; 02771 02772 if(ar_u0 < ar_o0) 02773 { 02774 INT u0 = MAX(my_u0,ar_u0); 02775 INT o0 = MIN(my_o0,ar_o0); 02776 INT offset_psij = u0-ar_u0; 02777 #ifdef OMP_ASSERT 02778 assert(offset_psij >= 0); 02779 assert(o0-u0 <= 2*m+1); 02780 assert(offset_psij+o0-u0 <= 2*m+1); 02781 #endif 02782 02783 for (l0 = 0; l0 <= o0-u0; l0++) 02784 { 02785 INT i0 = (u0+l0) * n1; 02786 const C val0 = psij_const0[offset_psij+l0]; 02787 02788 for(l1=0; l1<=2*m+1; l1++) 02789 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f; 02790 } 02791 } 02792 else 02793 { 02794 INT u0 = MAX(my_u0,ar_u0); 02795 INT o0 = my_o0; 02796 INT offset_psij = u0-ar_u0; 02797 #ifdef OMP_ASSERT 02798 assert(offset_psij >= 0); 02799 assert(o0-u0 <= 2*m+1); 02800 assert(offset_psij+o0-u0 <= 2*m+1); 02801 #endif 02802 02803 for (l0 = 0; l0 <= o0-u0; l0++) 02804 { 02805 INT i0 = (u0+l0) * n1; 02806 const C val0 = psij_const0[offset_psij+l0]; 02807 02808 for(l1=0; l1<=2*m+1; l1++) 02809 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f; 02810 } 02811 02812 u0 = my_u0; 02813 o0 = MIN(my_o0,ar_o0); 02814 offset_psij += my_u0-ar_u0+n0; 02815 02816 #ifdef OMP_ASSERT 02817 if (u0<=o0) 02818 { 02819 assert(o0-u0 <= 2*m+1); 02820 assert(offset_psij+o0-u0 <= 2*m+1); 02821 } 02822 #endif 02823 02824 for (l0 = 0; l0 <= o0-u0; l0++) 02825 { 02826 INT i0 = (u0+l0) * n1; 02827 const C val0 = psij_const0[offset_psij+l0]; 02828 02829 for(l1=0; l1<=2*m+1; l1++) 02830 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f; 02831 } 02832 } 02833 } 02834 #endif 02835 02836 #ifndef _OPENMP 02837 static void nfft_adjoint_2d_compute_serial(const C *fj, C *g, 02838 const R *psij_const0, const R *psij_const1, const R *xj0, 02839 const R *xj1, const INT n0, const INT n1, const INT m) 02840 { 02841 INT u0,o0,l0,u1,o1,l1; 02842 C *gj; 02843 const R *psij0,*psij1; 02844 02845 psij0=psij_const0; 02846 psij1=psij_const1; 02847 02848 uo2(&u0,&o0,*xj0, n0, m); 02849 uo2(&u1,&o1,*xj1, n1, m); 02850 02851 if(u0<o0) 02852 if(u1<o1) 02853 for(l0=0; l0<=2*m+1; l0++,psij0++) 02854 { 02855 psij1=psij_const1; 02856 gj=g+(u0+l0)*n1+u1; 02857 for(l1=0; l1<=2*m+1; l1++) 02858 (*gj++) += (*psij0) * (*psij1++) * (*fj); 02859 } 02860 else 02861 for(l0=0; l0<=2*m+1; l0++,psij0++) 02862 { 02863 psij1=psij_const1; 02864 gj=g+(u0+l0)*n1+u1; 02865 for(l1=0; l1<2*m+1-o1; l1++) 02866 (*gj++) += (*psij0) * (*psij1++) * (*fj); 02867 gj=g+(u0+l0)*n1; 02868 for(l1=0; l1<=o1; l1++) 02869 (*gj++) += (*psij0) * (*psij1++) * (*fj); 02870 } 02871 else 02872 if(u1<o1) 02873 { 02874 for(l0=0; l0<2*m+1-o0; l0++,psij0++) 02875 { 02876 psij1=psij_const1; 02877 gj=g+(u0+l0)*n1+u1; 02878 for(l1=0; l1<=2*m+1; l1++) 02879 (*gj++) += (*psij0) * (*psij1++) * (*fj); 02880 } 02881 for(l0=0; l0<=o0; l0++,psij0++) 02882 { 02883 psij1=psij_const1; 02884 gj=g+l0*n1+u1; 02885 for(l1=0; l1<=2*m+1; l1++) 02886 (*gj++) += (*psij0) * (*psij1++) * (*fj); 02887 } 02888 } 02889 else 02890 { 02891 for(l0=0; l0<2*m+1-o0; l0++,psij0++) 02892 { 02893 psij1=psij_const1; 02894 gj=g+(u0+l0)*n1+u1; 02895 for(l1=0; l1<2*m+1-o1; l1++) 02896 (*gj++) += (*psij0) * (*psij1++) * (*fj); 02897 gj=g+(u0+l0)*n1; 02898 for(l1=0; l1<=o1; l1++) 02899 (*gj++) += (*psij0) * (*psij1++) * (*fj); 02900 } 02901 for(l0=0; l0<=o0; l0++,psij0++) 02902 { 02903 psij1=psij_const1; 02904 gj=g+l0*n1+u1; 02905 for(l1=0; l1<2*m+1-o1; l1++) 02906 (*gj++) += (*psij0) * (*psij1++) * (*fj); 02907 gj=g+l0*n1; 02908 for(l1=0; l1<=o1; l1++) 02909 (*gj++) += (*psij0) * (*psij1++) * (*fj); 02910 } 02911 } 02912 } 02913 #endif 02914 02915 static void nfft_trafo_2d_B(X(plan) *ths) 02916 { 02917 const C *g = (C*)ths->g; 02918 const INT n0 = ths->n[0]; 02919 const INT n1 = ths->n[1]; 02920 const INT M = ths->M_total; 02921 const INT m = ths->m; 02922 02923 INT k; 02924 02925 if(ths->flags & PRE_FULL_PSI) 02926 { 02927 const INT lprod = (2*m+2) * (2*m+2); 02928 #ifdef _OPENMP 02929 #pragma omp parallel for default(shared) private(k) 02930 #endif 02931 for (k = 0; k < M; k++) 02932 { 02933 INT l; 02934 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02935 ths->f[j] = K(0.0); 02936 for (l = 0; l < lprod; l++) 02937 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]]; 02938 } 02939 return; 02940 } /* if(PRE_FULL_PSI) */ 02941 02942 if(ths->flags & PRE_PSI) 02943 { 02944 #ifdef _OPENMP 02945 #pragma omp parallel for default(shared) private(k) 02946 #endif 02947 for (k = 0; k < M; k++) 02948 { 02949 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02950 nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m); 02951 } 02952 02953 return; 02954 } /* if(PRE_PSI) */ 02955 02956 if(ths->flags & PRE_FG_PSI) 02957 { 02958 R fg_exp_l[2*(2*m+2)]; 02959 02960 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 02961 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]); 02962 02963 #ifdef _OPENMP 02964 #pragma omp parallel for default(shared) private(k) 02965 #endif 02966 for (k = 0; k < M; k++) 02967 { 02968 R psij_const[2*(2*m+2)]; 02969 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 02970 INT l; 02971 R fg_psij0 = ths->psi[2*j*2]; 02972 R fg_psij1 = ths->psi[2*j*2+1]; 02973 R fg_psij2 = K(1.0); 02974 02975 psij_const[0] = fg_psij0; 02976 for (l = 1; l <= 2*m+1; l++) 02977 { 02978 fg_psij2 *= fg_psij1; 02979 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; 02980 } 02981 02982 fg_psij0 = ths->psi[2*(j*2+1)]; 02983 fg_psij1 = ths->psi[2*(j*2+1)+1]; 02984 fg_psij2 = K(1.0); 02985 psij_const[2*m+2] = fg_psij0; 02986 for (l = 1; l <= 2*m+1; l++) 02987 { 02988 fg_psij2 *= fg_psij1; 02989 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; 02990 } 02991 02992 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 02993 } 02994 02995 return; 02996 } /* if(PRE_FG_PSI) */ 02997 02998 if(ths->flags & FG_PSI) 02999 { 03000 R fg_exp_l[2*(2*m+2)]; 03001 03002 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 03003 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]); 03004 03005 sort(ths); 03006 03007 #ifdef _OPENMP 03008 #pragma omp parallel for default(shared) private(k) 03009 #endif 03010 for (k = 0; k < M; k++) 03011 { 03012 INT u, o, l; 03013 R fg_psij0, fg_psij1, fg_psij2; 03014 R psij_const[2*(2*m+2)]; 03015 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 03016 03017 uo(ths, j, &u, &o, (INT)0); 03018 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u) / (R)(n0),0)); 03019 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]); 03020 fg_psij2 = K(1.0); 03021 psij_const[0] = fg_psij0; 03022 for (l = 1; l <= 2*m+1; l++) 03023 { 03024 fg_psij2 *= fg_psij1; 03025 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; 03026 } 03027 03028 uo(ths,j,&u,&o, (INT)1); 03029 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1)); 03030 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]); 03031 fg_psij2 = K(1.0); 03032 psij_const[2*m+2] = fg_psij0; 03033 for(l=1; l<=2*m+1; l++) 03034 { 03035 fg_psij2 *= fg_psij1; 03036 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; 03037 } 03038 03039 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03040 } 03041 03042 return; 03043 } /* if(FG_PSI) */ 03044 03045 if(ths->flags & PRE_LIN_PSI) 03046 { 03047 const INT K = ths->K, ip_s = K / (m + 2); 03048 03049 sort(ths); 03050 03051 #ifdef _OPENMP 03052 #pragma omp parallel for default(shared) private(k) 03053 #endif 03054 for (k = 0; k < M; k++) 03055 { 03056 INT u, o, l; 03057 R ip_y, ip_w; 03058 INT ip_u; 03059 R psij_const[2*(2*m+2)]; 03060 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 03061 03062 uo(ths,j,&u,&o,(INT)0); 03063 ip_y = FABS((R)(n0) * ths->x[2*j] - (R)(u)) * ((R)ip_s); 03064 ip_u = (INT)LRINT(FLOOR(ip_y)); 03065 ip_w = ip_y - (R)(ip_u); 03066 for (l = 0; l < 2*m+2; l++) 03067 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); 03068 03069 uo(ths,j,&u,&o,(INT)1); 03070 ip_y = FABS((R)(n1) * ths->x[2*j+1] - (R)(u)) * ((R)ip_s); 03071 ip_u = (INT)(LRINT(FLOOR(ip_y))); 03072 ip_w = ip_y - (R)(ip_u); 03073 for (l = 0; l < 2*m+2; l++) 03074 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); 03075 03076 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03077 } 03078 return; 03079 } /* if(PRE_LIN_PSI) */ 03080 03081 /* no precomputed psi at all */ 03082 03083 sort(ths); 03084 03085 #ifdef _OPENMP 03086 #pragma omp parallel for default(shared) private(k) 03087 #endif 03088 for (k = 0; k < M; k++) 03089 { 03090 R psij_const[2*(2*m+2)]; 03091 INT u, o, l; 03092 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 03093 03094 uo(ths,j,&u,&o,(INT)0); 03095 for (l = 0; l <= 2*m+1; l++) 03096 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0)); 03097 03098 uo(ths,j,&u,&o,(INT)1); 03099 for (l = 0; l <= 2*m+1; l++) 03100 psij_const[2*m+2+l] = (PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l)))/(R)(n1),1)); 03101 03102 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03103 } 03104 } 03105 03106 #ifdef OMP_ASSERT 03107 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \ 03108 { \ 03109 assert(ar_x[2*k] >= min_u_a || k == M-1); \ 03110 if (k > 0) \ 03111 assert(ar_x[2*k-2] < min_u_a); \ 03112 } 03113 #else 03114 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A 03115 #endif 03116 03117 #ifdef OMP_ASSERT 03118 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \ 03119 { \ 03120 assert(ar_x[2*k] >= min_u_b || k == M-1); \ 03121 if (k > 0) \ 03122 assert(ar_x[2*k-2] < min_u_b); \ 03123 } 03124 #else 03125 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B 03126 #endif 03127 03128 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \ 03129 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \ 03130 ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \ 03131 ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0); 03132 03133 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \ 03134 { \ 03135 R psij_const[2*(2*m+2)]; \ 03136 INT u, o, l; \ 03137 R fg_psij0 = ths->psi[2*j*2]; \ 03138 R fg_psij1 = ths->psi[2*j*2+1]; \ 03139 R fg_psij2 = K(1.0); \ 03140 \ 03141 psij_const[0] = fg_psij0; \ 03142 for(l=1; l<=2*m+1; l++) \ 03143 { \ 03144 fg_psij2 *= fg_psij1; \ 03145 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \ 03146 } \ 03147 \ 03148 fg_psij0 = ths->psi[2*(j*2+1)]; \ 03149 fg_psij1 = ths->psi[2*(j*2+1)+1]; \ 03150 fg_psij2 = K(1.0); \ 03151 psij_const[2*m+2] = fg_psij0; \ 03152 for(l=1; l<=2*m+1; l++) \ 03153 { \ 03154 fg_psij2 *= fg_psij1; \ 03155 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \ 03156 } \ 03157 \ 03158 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \ 03159 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \ 03160 n0, n1, m, my_u0, my_o0); \ 03161 } 03162 03163 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \ 03164 { \ 03165 R psij_const[2*(2*m+2)]; \ 03166 R fg_psij0, fg_psij1, fg_psij2; \ 03167 INT u, o, l; \ 03168 \ 03169 uo(ths,j,&u,&o,(INT)0); \ 03170 fg_psij0 = (PHI(ths->n[0],ths->x[2*j]-((R)u)/n0,0)); \ 03171 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]); \ 03172 fg_psij2 = K(1.0); \ 03173 psij_const[0] = fg_psij0; \ 03174 for(l=1; l<=2*m+1; l++) \ 03175 { \ 03176 fg_psij2 *= fg_psij1; \ 03177 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \ 03178 } \ 03179 \ 03180 uo(ths,j,&u,&o,(INT)1); \ 03181 fg_psij0 = (PHI(ths->n[1],ths->x[2*j+1]-((R)u)/n1,1)); \ 03182 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]); \ 03183 fg_psij2 = K(1.0); \ 03184 psij_const[2*m+2] = fg_psij0; \ 03185 for(l=1; l<=2*m+1; l++) \ 03186 { \ 03187 fg_psij2 *= fg_psij1; \ 03188 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \ 03189 } \ 03190 \ 03191 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \ 03192 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \ 03193 n0, n1, m, my_u0, my_o0); \ 03194 } 03195 03196 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \ 03197 { \ 03198 R psij_const[2*(2*m+2)]; \ 03199 INT u, o, l; \ 03200 INT ip_u; \ 03201 R ip_y, ip_w; \ 03202 \ 03203 uo(ths,j,&u,&o,(INT)0); \ 03204 ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s); \ 03205 ip_u = LRINT(FLOOR(ip_y)); \ 03206 ip_w = ip_y-ip_u; \ 03207 for(l=0; l < 2*m+2; l++) \ 03208 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \ 03209 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \ 03210 \ 03211 uo(ths,j,&u,&o,(INT)1); \ 03212 ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s); \ 03213 ip_u = LRINT(FLOOR(ip_y)); \ 03214 ip_w = ip_y-ip_u; \ 03215 for(l=0; l < 2*m+2; l++) \ 03216 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \ 03217 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \ 03218 \ 03219 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \ 03220 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \ 03221 n0, n1, m, my_u0, my_o0); \ 03222 } 03223 03224 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \ 03225 { \ 03226 R psij_const[2*(2*m+2)]; \ 03227 INT u, o, l; \ 03228 \ 03229 uo(ths,j,&u,&o,(INT)0); \ 03230 for(l=0;l<=2*m+1;l++) \ 03231 psij_const[l]=(PHI(ths->n[0],ths->x[2*j]-((R)((u+l)))/n0,0)); \ 03232 \ 03233 uo(ths,j,&u,&o,(INT)1); \ 03234 for(l=0;l<=2*m+1;l++) \ 03235 psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[2*j+1]-((R)((u+l)))/n1,1)); \ 03236 \ 03237 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \ 03238 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \ 03239 n0, n1, m, my_u0, my_o0); \ 03240 } 03241 03242 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE(whichone) \ 03243 { \ 03244 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \ 03245 { \ 03246 _Pragma("omp parallel private(k)") \ 03247 { \ 03248 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \ 03249 INT *ar_x = ths->index_x; \ 03250 \ 03251 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \ 03252 &min_u_b, &max_u_b, 2, ths->n, m); \ 03253 \ 03254 if (min_u_a != -1) \ 03255 { \ 03256 k = index_x_binary_search(ar_x, M, min_u_a); \ 03257 \ 03258 MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \ 03259 \ 03260 while (k < M) \ 03261 { \ 03262 INT u_prod = ar_x[2*k]; \ 03263 INT j = ar_x[2*k+1]; \ 03264 \ 03265 if (u_prod < min_u_a || u_prod > max_u_a) \ 03266 break; \ 03267 \ 03268 MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 03269 \ 03270 k++; \ 03271 } \ 03272 } \ 03273 \ 03274 if (min_u_b != -1) \ 03275 { \ 03276 INT k = index_x_binary_search(ar_x, M, min_u_b); \ 03277 \ 03278 MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \ 03279 \ 03280 while (k < M) \ 03281 { \ 03282 INT u_prod = ar_x[2*k]; \ 03283 INT j = ar_x[2*k+1]; \ 03284 \ 03285 if (u_prod < min_u_b || u_prod > max_u_b) \ 03286 break; \ 03287 \ 03288 MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 03289 \ 03290 k++; \ 03291 } \ 03292 } \ 03293 } /* omp parallel */ \ 03294 return; \ 03295 } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \ 03296 } 03297 03298 03299 static void nfft_adjoint_2d_B(X(plan) *ths) 03300 { 03301 const INT n0 = ths->n[0]; 03302 const INT n1 = ths->n[1]; 03303 const INT M = ths->M_total; 03304 const INT m = ths->m; 03305 C* g = (C*) ths->g; 03306 INT k; 03307 03308 memset(g, 0, (size_t)(ths->n_total) * sizeof(C)); 03309 03310 if(ths->flags & PRE_FULL_PSI) 03311 { 03312 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M, 03313 (INT)2, ths->n, m, ths->flags, ths->index_x); 03314 return; 03315 } /* if(PRE_FULL_PSI) */ 03316 03317 if(ths->flags & PRE_PSI) 03318 { 03319 #ifdef _OPENMP 03320 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI) 03321 #endif 03322 03323 #ifdef _OPENMP 03324 #pragma omp parallel for default(shared) private(k) 03325 #endif 03326 for (k = 0; k < M; k++) 03327 { 03328 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 03329 #ifdef _OPENMP 03330 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03331 #else 03332 nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03333 #endif 03334 } 03335 return; 03336 } /* if(PRE_PSI) */ 03337 03338 if(ths->flags & PRE_FG_PSI) 03339 { 03340 R fg_exp_l[2*(2*m+2)]; 03341 03342 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 03343 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]); 03344 03345 #ifdef _OPENMP 03346 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI) 03347 #endif 03348 03349 03350 #ifdef _OPENMP 03351 #pragma omp parallel for default(shared) private(k) 03352 #endif 03353 for (k = 0; k < M; k++) 03354 { 03355 R psij_const[2*(2*m+2)]; 03356 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 03357 INT l; 03358 R fg_psij0 = ths->psi[2*j*2]; 03359 R fg_psij1 = ths->psi[2*j*2+1]; 03360 R fg_psij2 = K(1.0); 03361 03362 psij_const[0] = fg_psij0; 03363 for(l=1; l<=2*m+1; l++) 03364 { 03365 fg_psij2 *= fg_psij1; 03366 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; 03367 } 03368 03369 fg_psij0 = ths->psi[2*(j*2+1)]; 03370 fg_psij1 = ths->psi[2*(j*2+1)+1]; 03371 fg_psij2 = K(1.0); 03372 psij_const[2*m+2] = fg_psij0; 03373 for(l=1; l<=2*m+1; l++) 03374 { 03375 fg_psij2 *= fg_psij1; 03376 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; 03377 } 03378 03379 #ifdef _OPENMP 03380 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03381 #else 03382 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03383 #endif 03384 } 03385 03386 return; 03387 } /* if(PRE_FG_PSI) */ 03388 03389 if(ths->flags & FG_PSI) 03390 { 03391 R fg_exp_l[2*(2*m+2)]; 03392 03393 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 03394 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]); 03395 03396 sort(ths); 03397 03398 #ifdef _OPENMP 03399 MACRO_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI) 03400 #endif 03401 03402 #ifdef _OPENMP 03403 #pragma omp parallel for default(shared) private(k) 03404 #endif 03405 for (k = 0; k < M; k++) 03406 { 03407 INT u, o, l; 03408 R fg_psij0, fg_psij1, fg_psij2; 03409 R psij_const[2*(2*m+2)]; 03410 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 03411 03412 uo(ths,j,&u,&o,(INT)0); 03413 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u)/(R)(n0),0)); 03414 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]); 03415 fg_psij2 = K(1.0); 03416 psij_const[0] = fg_psij0; 03417 for(l=1; l<=2*m+1; l++) 03418 { 03419 fg_psij2 *= fg_psij1; 03420 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; 03421 } 03422 03423 uo(ths,j,&u,&o,(INT)1); 03424 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1)); 03425 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]); 03426 fg_psij2 = K(1.0); 03427 psij_const[2*m+2] = fg_psij0; 03428 for(l=1; l<=2*m+1; l++) 03429 { 03430 fg_psij2 *= fg_psij1; 03431 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; 03432 } 03433 03434 #ifdef _OPENMP 03435 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03436 #else 03437 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03438 #endif 03439 } 03440 03441 return; 03442 } /* if(FG_PSI) */ 03443 03444 if(ths->flags & PRE_LIN_PSI) 03445 { 03446 const INT K = ths->K; 03447 const INT ip_s = K / (m + 2); 03448 03449 sort(ths); 03450 03451 #ifdef _OPENMP 03452 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI) 03453 #endif 03454 03455 #ifdef _OPENMP 03456 #pragma omp parallel for default(shared) private(k) 03457 #endif 03458 for (k = 0; k < M; k++) 03459 { 03460 INT u,o,l; 03461 INT ip_u; 03462 R ip_y, ip_w; 03463 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 03464 R psij_const[2*(2*m+2)]; 03465 03466 uo(ths,j,&u,&o,(INT)0); 03467 ip_y = FABS((R)(n0) * (ths->x[2*j]) - (R)(u)) * ((R)ip_s); 03468 ip_u = (INT)(LRINT(FLOOR(ip_y))); 03469 ip_w = ip_y - (R)(ip_u); 03470 for(l=0; l < 2*m+2; l++) 03471 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + 03472 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); 03473 03474 uo(ths,j,&u,&o,(INT)1); 03475 ip_y = FABS((R)(n1) * (ths->x[2*j+1]) - (R)(u)) * ((R)ip_s); 03476 ip_u = (INT)(LRINT(FLOOR(ip_y))); 03477 ip_w = ip_y - (R)(ip_u); 03478 for(l=0; l < 2*m+2; l++) 03479 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + 03480 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); 03481 03482 #ifdef _OPENMP 03483 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03484 #else 03485 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03486 #endif 03487 } 03488 return; 03489 } /* if(PRE_LIN_PSI) */ 03490 03491 /* no precomputed psi at all */ 03492 sort(ths); 03493 03494 #ifdef _OPENMP 03495 MACRO_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI) 03496 #endif 03497 03498 #ifdef _OPENMP 03499 #pragma omp parallel for default(shared) private(k) 03500 #endif 03501 for (k = 0; k < M; k++) 03502 { 03503 INT u,o,l; 03504 R psij_const[2*(2*m+2)]; 03505 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 03506 03507 uo(ths,j,&u,&o,(INT)0); 03508 for(l=0;l<=2*m+1;l++) 03509 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0)); 03510 03511 uo(ths,j,&u,&o,(INT)1); 03512 for(l=0;l<=2*m+1;l++) 03513 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l))) / (R)(n1),1)); 03514 03515 #ifdef _OPENMP 03516 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03517 #else 03518 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m); 03519 #endif 03520 } 03521 } 03522 03523 03524 void X(trafo_2d)(X(plan) *ths) 03525 { 03526 INT k0,k1,n0,n1,N0,N1; 03527 C *g_hat,*f_hat; 03528 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12; 03529 R ck01, ck02, ck11, ck12; 03530 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22; 03531 03532 ths->g_hat=ths->g1; 03533 ths->g=ths->g2; 03534 03535 N0=ths->N[0]; 03536 N1=ths->N[1]; 03537 n0=ths->n[0]; 03538 n1=ths->n[1]; 03539 03540 f_hat=(C*)ths->f_hat; 03541 g_hat=(C*)ths->g_hat; 03542 03543 TIC(0) 03544 #ifdef _OPENMP 03545 #pragma omp parallel for default(shared) private(k0) 03546 for (k0 = 0; k0 < ths->n_total; k0++) 03547 ths->g_hat[k0] = 0.0; 03548 #else 03549 memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C)); 03550 #endif 03551 if(ths->flags & PRE_PHI_HUT) 03552 { 03553 c_phi_inv01=ths->c_phi_inv[0]; 03554 c_phi_inv02=&ths->c_phi_inv[0][N0/2]; 03555 03556 #ifdef _OPENMP 03557 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12) 03558 #endif 03559 for(k0=0;k0<N0/2;k0++) 03560 { 03561 ck01=c_phi_inv01[k0]; 03562 ck02=c_phi_inv02[k0]; 03563 03564 c_phi_inv11=ths->c_phi_inv[1]; 03565 c_phi_inv12=&ths->c_phi_inv[1][N1/2]; 03566 03567 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2); 03568 f_hat11=f_hat + k0*N1; 03569 g_hat21=g_hat + k0*n1+n1-(N1/2); 03570 f_hat21=f_hat + ((N0/2)+k0)*N1; 03571 g_hat12=g_hat + (n0-(N0/2)+k0)*n1; 03572 f_hat12=f_hat + k0*N1+(N1/2); 03573 g_hat22=g_hat + k0*n1; 03574 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2); 03575 03576 for(k1=0;k1<N1/2;k1++) 03577 { 03578 ck11=c_phi_inv11[k1]; 03579 ck12=c_phi_inv12[k1]; 03580 03581 g_hat11[k1] = f_hat11[k1] * ck01 * ck11; 03582 g_hat21[k1] = f_hat21[k1] * ck02 * ck11; 03583 g_hat12[k1] = f_hat12[k1] * ck01 * ck12; 03584 g_hat22[k1] = f_hat22[k1] * ck02 * ck12; 03585 } 03586 } 03587 } 03588 else 03589 #ifdef _OPENMP 03590 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12) 03591 #endif 03592 for(k0=0;k0<N0/2;k0++) 03593 { 03594 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0)); 03595 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0)); 03596 for(k1=0;k1<N1/2;k1++) 03597 { 03598 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1)); 03599 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1)); 03600 g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11; 03601 g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11; 03602 g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12; 03603 g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12; 03604 } 03605 } 03606 03607 TOC(0) 03608 03609 TIC_FFTW(1) 03610 FFTW(execute)(ths->my_fftw_plan1); 03611 TOC_FFTW(1); 03612 03613 TIC(2); 03614 nfft_trafo_2d_B(ths); 03615 TOC(2); 03616 } 03617 03618 void X(adjoint_2d)(X(plan) *ths) 03619 { 03620 INT k0,k1,n0,n1,N0,N1; 03621 C *g_hat,*f_hat; 03622 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12; 03623 R ck01, ck02, ck11, ck12; 03624 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22; 03625 03626 ths->g_hat=ths->g1; 03627 ths->g=ths->g2; 03628 03629 N0=ths->N[0]; 03630 N1=ths->N[1]; 03631 n0=ths->n[0]; 03632 n1=ths->n[1]; 03633 03634 f_hat=(C*)ths->f_hat; 03635 g_hat=(C*)ths->g_hat; 03636 03637 TIC(2); 03638 nfft_adjoint_2d_B(ths); 03639 TOC(2); 03640 03641 TIC_FFTW(1) 03642 FFTW(execute)(ths->my_fftw_plan2); 03643 TOC_FFTW(1); 03644 03645 TIC(0) 03646 if(ths->flags & PRE_PHI_HUT) 03647 { 03648 c_phi_inv01=ths->c_phi_inv[0]; 03649 c_phi_inv02=&ths->c_phi_inv[0][N0/2]; 03650 03651 #ifdef _OPENMP 03652 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12) 03653 #endif 03654 for(k0=0;k0<N0/2;k0++) 03655 { 03656 ck01=c_phi_inv01[k0]; 03657 ck02=c_phi_inv02[k0]; 03658 03659 c_phi_inv11=ths->c_phi_inv[1]; 03660 c_phi_inv12=&ths->c_phi_inv[1][N1/2]; 03661 03662 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2); 03663 f_hat11=f_hat + k0*N1; 03664 g_hat21=g_hat + k0*n1+n1-(N1/2); 03665 f_hat21=f_hat + ((N0/2)+k0)*N1; 03666 g_hat12=g_hat + (n0-(N0/2)+k0)*n1; 03667 f_hat12=f_hat + k0*N1+(N1/2); 03668 g_hat22=g_hat + k0*n1; 03669 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2); 03670 03671 for(k1=0;k1<N1/2;k1++) 03672 { 03673 ck11=c_phi_inv11[k1]; 03674 ck12=c_phi_inv12[k1]; 03675 03676 f_hat11[k1] = g_hat11[k1] * ck01 * ck11; 03677 f_hat21[k1] = g_hat21[k1] * ck02 * ck11; 03678 f_hat12[k1] = g_hat12[k1] * ck01 * ck12; 03679 f_hat22[k1] = g_hat22[k1] * ck02 * ck12; 03680 } 03681 } 03682 } 03683 else 03684 #ifdef _OPENMP 03685 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12) 03686 #endif 03687 for(k0=0;k0<N0/2;k0++) 03688 { 03689 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0)); 03690 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0)); 03691 for(k1=0;k1<N1/2;k1++) 03692 { 03693 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1)); 03694 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1)); 03695 f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11; 03696 f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11; 03697 f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12; 03698 f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12; 03699 } 03700 } 03701 TOC(0) 03702 } 03703 03704 /* ################################################ SPECIFIC VERSIONS FOR d=3 */ 03705 03706 static void nfft_3d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b) 03707 { 03708 INT l; 03709 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq; 03710 03711 fg_exp_b0 = EXP(-K(1.0) / b); 03712 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0; 03713 fg_exp_b1 = K(1.0); 03714 fg_exp_b2 = K(1.0); 03715 fg_exp_l[0] = K(1.0); 03716 for(l=1; l <= 2*m+1; l++) 03717 { 03718 fg_exp_b2 = fg_exp_b1*fg_exp_b0; 03719 fg_exp_b1 *= fg_exp_b0_sq; 03720 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2; 03721 } 03722 } 03723 03724 static void nfft_trafo_3d_compute(C *fj, const C *g, const R *psij_const0, 03725 const R *psij_const1, const R *psij_const2, const R *xj0, const R *xj1, 03726 const R *xj2, const INT n0, const INT n1, const INT n2, const INT m) 03727 { 03728 INT u0, o0, l0, u1, o1, l1, u2, o2, l2; 03729 const C *gj; 03730 const R *psij0, *psij1, *psij2; 03731 03732 psij0 = psij_const0; 03733 psij1 = psij_const1; 03734 psij2 = psij_const2; 03735 03736 uo2(&u0, &o0, *xj0, n0, m); 03737 uo2(&u1, &o1, *xj1, n1, m); 03738 uo2(&u2, &o2, *xj2, n2, m); 03739 03740 *fj = 0; 03741 03742 if (u0 < o0) 03743 if (u1 < o1) 03744 if (u2 < o2) 03745 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++) 03746 { 03747 psij1 = psij_const1; 03748 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 03749 { 03750 psij2 = psij_const2; 03751 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 03752 for (l2 = 0; l2 <= 2 * m + 1; l2++) 03753 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03754 } 03755 } 03756 else 03757 /* asserts (u2>o2)*/ 03758 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++) 03759 { 03760 psij1 = psij_const1; 03761 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 03762 { 03763 psij2 = psij_const2; 03764 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 03765 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 03766 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03767 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2; 03768 for (l2 = 0; l2 <= o2; l2++) 03769 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03770 } 03771 } 03772 else /* asserts (u1>o1)*/ 03773 if (u2 < o2) 03774 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++) 03775 { 03776 psij1 = psij_const1; 03777 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 03778 { 03779 psij2 = psij_const2; 03780 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 03781 for (l2 = 0; l2 <= 2 * m + 1; l2++) 03782 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03783 } 03784 for (l1 = 0; l1 <= o1; l1++, psij1++) 03785 { 03786 psij2 = psij_const2; 03787 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2; 03788 for (l2 = 0; l2 <= 2 * m + 1; l2++) 03789 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03790 } 03791 } 03792 else/* asserts (u2>o2) */ 03793 { 03794 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++) 03795 { 03796 psij1 = psij_const1; 03797 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 03798 { 03799 psij2 = psij_const2; 03800 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 03801 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 03802 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03803 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2; 03804 for (l2 = 0; l2 <= o2; l2++) 03805 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03806 } 03807 for (l1 = 0; l1 <= o1; l1++, psij1++) 03808 { 03809 psij2 = psij_const2; 03810 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2; 03811 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 03812 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03813 gj = g + ((u0 + l0) * n1 + l1) * n2; 03814 for (l2 = 0; l2 <= o2; l2++) 03815 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03816 } 03817 } 03818 } 03819 else /* asserts (u0>o0) */ 03820 if (u1 < o1) 03821 if (u2 < o2) 03822 { 03823 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++) 03824 { 03825 psij1 = psij_const1; 03826 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 03827 { 03828 psij2 = psij_const2; 03829 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 03830 for (l2 = 0; l2 <= 2 * m + 1; l2++) 03831 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03832 } 03833 } 03834 03835 for (l0 = 0; l0 <= o0; l0++, psij0++) 03836 { 03837 psij1 = psij_const1; 03838 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 03839 { 03840 psij2 = psij_const2; 03841 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2; 03842 for (l2 = 0; l2 <= 2 * m + 1; l2++) 03843 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03844 } 03845 } 03846 } else/* asserts (u2>o2) */ 03847 { 03848 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++) 03849 { 03850 psij1 = psij_const1; 03851 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 03852 { 03853 psij2 = psij_const2; 03854 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 03855 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 03856 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03857 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2; 03858 for (l2 = 0; l2 <= o2; l2++) 03859 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03860 } 03861 } 03862 03863 for (l0 = 0; l0 <= o0; l0++, psij0++) 03864 { 03865 psij1 = psij_const1; 03866 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 03867 { 03868 psij2 = psij_const2; 03869 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2; 03870 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 03871 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03872 gj = g + (l0 * n1 + (u1 + l1)) * n2; 03873 for (l2 = 0; l2 <= o2; l2++) 03874 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03875 } 03876 } 03877 } 03878 else /* asserts (u1>o1) */ 03879 if (u2 < o2) 03880 { 03881 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++) 03882 { 03883 psij1 = psij_const1; 03884 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 03885 { 03886 psij2 = psij_const2; 03887 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 03888 for (l2 = 0; l2 <= 2 * m + 1; l2++) 03889 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03890 } 03891 for (l1 = 0; l1 <= o1; l1++, psij1++) 03892 { 03893 psij2 = psij_const2; 03894 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2; 03895 for (l2 = 0; l2 <= 2 * m + 1; l2++) 03896 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03897 } 03898 } 03899 for (l0 = 0; l0 <= o0; l0++, psij0++) 03900 { 03901 psij1 = psij_const1; 03902 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 03903 { 03904 psij2 = psij_const2; 03905 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2; 03906 for (l2 = 0; l2 <= 2 * m + 1; l2++) 03907 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03908 } 03909 for (l1 = 0; l1 <= o1; l1++, psij1++) 03910 { 03911 psij2 = psij_const2; 03912 gj = g + (l0 * n1 + l1) * n2 + u2; 03913 for (l2 = 0; l2 <= 2 * m + 1; l2++) 03914 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03915 } 03916 } 03917 } else/* asserts (u2>o2) */ 03918 { 03919 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++) 03920 { 03921 psij1 = psij_const1; 03922 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 03923 { 03924 psij2 = psij_const2; 03925 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 03926 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 03927 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03928 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2; 03929 for (l2 = 0; l2 <= o2; l2++) 03930 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03931 } 03932 for (l1 = 0; l1 <= o1; l1++, psij1++) 03933 { 03934 psij2 = psij_const2; 03935 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2; 03936 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 03937 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03938 gj = g + ((u0 + l0) * n1 + l1) * n2; 03939 for (l2 = 0; l2 <= o2; l2++) 03940 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03941 } 03942 } 03943 03944 for (l0 = 0; l0 <= o0; l0++, psij0++) 03945 { 03946 psij1 = psij_const1; 03947 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 03948 { 03949 psij2 = psij_const2; 03950 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2; 03951 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 03952 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03953 gj = g + (l0 * n1 + (u1 + l1)) * n2; 03954 for (l2 = 0; l2 <= o2; l2++) 03955 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03956 } 03957 for (l1 = 0; l1 <= o1; l1++, psij1++) 03958 { 03959 psij2 = psij_const2; 03960 gj = g + (l0 * n1 + l1) * n2 + u2; 03961 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 03962 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03963 gj = g + (l0 * n1 + l1) * n2; 03964 for (l2 = 0; l2 <= o2; l2++) 03965 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++); 03966 } 03967 } 03968 } 03969 } 03970 03971 #ifdef _OPENMP 03972 03993 static void nfft_adjoint_3d_compute_omp_blockwise(const C f, C *g, 03994 const R *psij_const0, const R *psij_const1, const R *psij_const2, 03995 const R *xj0, const R *xj1, const R *xj2, 03996 const INT n0, const INT n1, const INT n2, const INT m, 03997 const INT my_u0, const INT my_o0) 03998 { 03999 INT ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2; 04000 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2); 04001 04002 INT index_temp1[2*m+2]; 04003 INT index_temp2[2*m+2]; 04004 04005 uo2(&ar_u0,&ar_o0,*xj0, n0, m); 04006 uo2(&u1,&o1,*xj1, n1, m); 04007 uo2(&u2,&o2,*xj2, n2, m); 04008 04009 for (l1=0; l1<=2*m+1; l1++) 04010 index_temp1[l1] = (u1+l1)%n1; 04011 04012 for (l2=0; l2<=2*m+1; l2++) 04013 index_temp2[l2] = (u2+l2)%n2; 04014 04015 if(ar_u0<ar_o0) 04016 { 04017 INT u0 = MAX(my_u0,ar_u0); 04018 INT o0 = MIN(my_o0,ar_o0); 04019 INT offset_psij = u0-ar_u0; 04020 #ifdef OMP_ASSERT 04021 assert(offset_psij >= 0); 04022 assert(o0-u0 <= 2*m+1); 04023 assert(offset_psij+o0-u0 <= 2*m+1); 04024 #endif 04025 04026 for (l0 = 0; l0 <= o0-u0; l0++) 04027 { 04028 const INT i0 = (u0+l0) * n1; 04029 const C val0 = psij_const0[offset_psij+l0]; 04030 04031 for(l1=0; l1<=2*m+1; l1++) 04032 { 04033 const INT i1 = (i0 + index_temp1[l1]) * n2; 04034 const C val1 = psij_const1[l1]; 04035 04036 for(l2=0; l2<=2*m+1; l2++) 04037 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f; 04038 } 04039 } 04040 } 04041 else 04042 { 04043 INT u0 = MAX(my_u0,ar_u0); 04044 INT o0 = my_o0; 04045 INT offset_psij = u0-ar_u0; 04046 #ifdef OMP_ASSERT 04047 assert(offset_psij >= 0); 04048 assert(o0-u0 <= 2*m+1); 04049 assert(offset_psij+o0-u0 <= 2*m+1); 04050 #endif 04051 04052 for (l0 = 0; l0 <= o0-u0; l0++) 04053 { 04054 INT i0 = (u0+l0) * n1; 04055 const C val0 = psij_const0[offset_psij+l0]; 04056 04057 for(l1=0; l1<=2*m+1; l1++) 04058 { 04059 const INT i1 = (i0 + index_temp1[l1]) * n2; 04060 const C val1 = psij_const1[l1]; 04061 04062 for(l2=0; l2<=2*m+1; l2++) 04063 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f; 04064 } 04065 } 04066 04067 u0 = my_u0; 04068 o0 = MIN(my_o0,ar_o0); 04069 offset_psij += my_u0-ar_u0+n0; 04070 04071 #ifdef OMP_ASSERT 04072 if (u0<=o0) 04073 { 04074 assert(o0-u0 <= 2*m+1); 04075 assert(offset_psij+o0-u0 <= 2*m+1); 04076 } 04077 #endif 04078 for (l0 = 0; l0 <= o0-u0; l0++) 04079 { 04080 INT i0 = (u0+l0) * n1; 04081 const C val0 = psij_const0[offset_psij+l0]; 04082 04083 for(l1=0; l1<=2*m+1; l1++) 04084 { 04085 const INT i1 = (i0 + index_temp1[l1]) * n2; 04086 const C val1 = psij_const1[l1]; 04087 04088 for(l2=0; l2<=2*m+1; l2++) 04089 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f; 04090 } 04091 } 04092 } 04093 } 04094 #endif 04095 04096 #ifdef _OPENMP 04097 /* adjoint NFFT three-dimensional case with OpenMP atomic operations */ 04098 static void nfft_adjoint_3d_compute_omp_atomic(const C f, C *g, 04099 const R *psij_const0, const R *psij_const1, const R *psij_const2, 04100 const R *xj0, const R *xj1, const R *xj2, 04101 const INT n0, const INT n1, const INT n2, const INT m) 04102 { 04103 INT u0,o0,l0,u1,o1,l1,u2,o2,l2; 04104 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2); 04105 04106 INT index_temp0[2*m+2]; 04107 INT index_temp1[2*m+2]; 04108 INT index_temp2[2*m+2]; 04109 04110 uo2(&u0,&o0,*xj0, n0, m); 04111 uo2(&u1,&o1,*xj1, n1, m); 04112 uo2(&u2,&o2,*xj2, n2, m); 04113 04114 for (l0=0; l0<=2*m+1; l0++) 04115 index_temp0[l0] = (u0+l0)%n0; 04116 04117 for (l1=0; l1<=2*m+1; l1++) 04118 index_temp1[l1] = (u1+l1)%n1; 04119 04120 for (l2=0; l2<=2*m+1; l2++) 04121 index_temp2[l2] = (u2+l2)%n2; 04122 04123 for(l0=0; l0<=2*m+1; l0++) 04124 { 04125 for(l1=0; l1<=2*m+1; l1++) 04126 { 04127 for(l2=0; l2<=2*m+1; l2++) 04128 { 04129 INT i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2]; 04130 C *lhs = g+i; 04131 R *lhs_real = (R*)lhs; 04132 C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f; 04133 04134 #pragma omp atomic 04135 lhs_real[0] += CREAL(val); 04136 04137 #pragma omp atomic 04138 lhs_real[1] += CIMAG(val); 04139 } 04140 } 04141 } 04142 } 04143 #endif 04144 04145 #ifndef _OPENMP 04146 static void nfft_adjoint_3d_compute_serial(const C *fj, C *g, 04147 const R *psij_const0, const R *psij_const1, const R *psij_const2, const R *xj0, 04148 const R *xj1, const R *xj2, const INT n0, const INT n1, const INT n2, 04149 const INT m) 04150 { 04151 INT u0, o0, l0, u1, o1, l1, u2, o2, l2; 04152 C *gj; 04153 const R *psij0, *psij1, *psij2; 04154 04155 psij0 = psij_const0; 04156 psij1 = psij_const1; 04157 psij2 = psij_const2; 04158 04159 uo2(&u0, &o0, *xj0, n0, m); 04160 uo2(&u1, &o1, *xj1, n1, m); 04161 uo2(&u2, &o2, *xj2, n2, m); 04162 04163 if (u0 < o0) 04164 if (u1 < o1) 04165 if (u2 < o2) 04166 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++) 04167 { 04168 psij1 = psij_const1; 04169 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 04170 { 04171 psij2 = psij_const2; 04172 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 04173 for (l2 = 0; l2 <= 2 * m + 1; l2++) 04174 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04175 } 04176 } 04177 else 04178 /* asserts (u2>o2)*/ 04179 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++) 04180 { 04181 psij1 = psij_const1; 04182 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 04183 { 04184 psij2 = psij_const2; 04185 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 04186 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 04187 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04188 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2; 04189 for (l2 = 0; l2 <= o2; l2++) 04190 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04191 } 04192 } 04193 else /* asserts (u1>o1)*/ 04194 if (u2 < o2) 04195 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++) 04196 { 04197 psij1 = psij_const1; 04198 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 04199 { 04200 psij2 = psij_const2; 04201 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 04202 for (l2 = 0; l2 <= 2 * m + 1; l2++) 04203 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04204 } 04205 for (l1 = 0; l1 <= o1; l1++, psij1++) 04206 { 04207 psij2 = psij_const2; 04208 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2; 04209 for (l2 = 0; l2 <= 2 * m + 1; l2++) 04210 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04211 } 04212 } 04213 else/* asserts (u2>o2) */ 04214 { 04215 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++) 04216 { 04217 psij1 = psij_const1; 04218 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 04219 { 04220 psij2 = psij_const2; 04221 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 04222 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 04223 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04224 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2; 04225 for (l2 = 0; l2 <= o2; l2++) 04226 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04227 } 04228 for (l1 = 0; l1 <= o1; l1++, psij1++) 04229 { 04230 psij2 = psij_const2; 04231 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2; 04232 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 04233 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04234 gj = g + ((u0 + l0) * n1 + l1) * n2; 04235 for (l2 = 0; l2 <= o2; l2++) 04236 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04237 } 04238 } 04239 } 04240 else /* asserts (u0>o0) */ 04241 if (u1 < o1) 04242 if (u2 < o2) 04243 { 04244 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++) 04245 { 04246 psij1 = psij_const1; 04247 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 04248 { 04249 psij2 = psij_const2; 04250 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 04251 for (l2 = 0; l2 <= 2 * m + 1; l2++) 04252 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04253 } 04254 } 04255 04256 for (l0 = 0; l0 <= o0; l0++, psij0++) 04257 { 04258 psij1 = psij_const1; 04259 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 04260 { 04261 psij2 = psij_const2; 04262 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2; 04263 for (l2 = 0; l2 <= 2 * m + 1; l2++) 04264 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04265 } 04266 } 04267 } else/* asserts (u2>o2) */ 04268 { 04269 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++) 04270 { 04271 psij1 = psij_const1; 04272 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 04273 { 04274 psij2 = psij_const2; 04275 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 04276 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 04277 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04278 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2; 04279 for (l2 = 0; l2 <= o2; l2++) 04280 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04281 } 04282 } 04283 04284 for (l0 = 0; l0 <= o0; l0++, psij0++) 04285 { 04286 psij1 = psij_const1; 04287 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++) 04288 { 04289 psij2 = psij_const2; 04290 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2; 04291 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 04292 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04293 gj = g + (l0 * n1 + (u1 + l1)) * n2; 04294 for (l2 = 0; l2 <= o2; l2++) 04295 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04296 } 04297 } 04298 } 04299 else /* asserts (u1>o1) */ 04300 if (u2 < o2) 04301 { 04302 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++) 04303 { 04304 psij1 = psij_const1; 04305 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 04306 { 04307 psij2 = psij_const2; 04308 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 04309 for (l2 = 0; l2 <= 2 * m + 1; l2++) 04310 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04311 } 04312 for (l1 = 0; l1 <= o1; l1++, psij1++) 04313 { 04314 psij2 = psij_const2; 04315 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2; 04316 for (l2 = 0; l2 <= 2 * m + 1; l2++) 04317 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04318 } 04319 } 04320 for (l0 = 0; l0 <= o0; l0++, psij0++) 04321 { 04322 psij1 = psij_const1; 04323 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 04324 { 04325 psij2 = psij_const2; 04326 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2; 04327 for (l2 = 0; l2 <= 2 * m + 1; l2++) 04328 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04329 } 04330 for (l1 = 0; l1 <= o1; l1++, psij1++) 04331 { 04332 psij2 = psij_const2; 04333 gj = g + (l0 * n1 + l1) * n2 + u2; 04334 for (l2 = 0; l2 <= 2 * m + 1; l2++) 04335 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04336 } 04337 } 04338 } else/* asserts (u2>o2) */ 04339 { 04340 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++) 04341 { 04342 psij1 = psij_const1; 04343 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 04344 { 04345 psij2 = psij_const2; 04346 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2; 04347 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 04348 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04349 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2; 04350 for (l2 = 0; l2 <= o2; l2++) 04351 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04352 } 04353 for (l1 = 0; l1 <= o1; l1++, psij1++) 04354 { 04355 psij2 = psij_const2; 04356 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2; 04357 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 04358 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04359 gj = g + ((u0 + l0) * n1 + l1) * n2; 04360 for (l2 = 0; l2 <= o2; l2++) 04361 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04362 } 04363 } 04364 04365 for (l0 = 0; l0 <= o0; l0++, psij0++) 04366 { 04367 psij1 = psij_const1; 04368 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++) 04369 { 04370 psij2 = psij_const2; 04371 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2; 04372 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 04373 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04374 gj = g + (l0 * n1 + (u1 + l1)) * n2; 04375 for (l2 = 0; l2 <= o2; l2++) 04376 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04377 } 04378 for (l1 = 0; l1 <= o1; l1++, psij1++) 04379 { 04380 psij2 = psij_const2; 04381 gj = g + (l0 * n1 + l1) * n2 + u2; 04382 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++) 04383 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04384 gj = g + (l0 * n1 + l1) * n2; 04385 for (l2 = 0; l2 <= o2; l2++) 04386 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj); 04387 } 04388 } 04389 } 04390 } 04391 #endif 04392 04393 static void nfft_trafo_3d_B(X(plan) *ths) 04394 { 04395 const INT n0 = ths->n[0]; 04396 const INT n1 = ths->n[1]; 04397 const INT n2 = ths->n[2]; 04398 const INT M = ths->M_total; 04399 const INT m = ths->m; 04400 04401 const C* g = (C*) ths->g; 04402 04403 INT k; 04404 04405 if(ths->flags & PRE_FULL_PSI) 04406 { 04407 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2); 04408 #ifdef _OPENMP 04409 #pragma omp parallel for default(shared) private(k) 04410 #endif 04411 for (k = 0; k < M; k++) 04412 { 04413 INT l; 04414 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 04415 ths->f[j] = K(0.0); 04416 for (l = 0; l < lprod; l++) 04417 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]]; 04418 } 04419 return; 04420 } /* if(PRE_FULL_PSI) */ 04421 04422 if(ths->flags & PRE_PSI) 04423 { 04424 #ifdef _OPENMP 04425 #pragma omp parallel for default(shared) private(k) 04426 #endif 04427 for (k = 0; k < M; k++) 04428 { 04429 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 04430 nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 04431 } 04432 return; 04433 } /* if(PRE_PSI) */ 04434 04435 if(ths->flags & PRE_FG_PSI) 04436 { 04437 R fg_exp_l[3*(2*m+2)]; 04438 04439 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 04440 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]); 04441 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]); 04442 04443 #ifdef _OPENMP 04444 #pragma omp parallel for default(shared) private(k) 04445 #endif 04446 for (k = 0; k < M; k++) 04447 { 04448 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 04449 INT l; 04450 R psij_const[3*(2*m+2)]; 04451 R fg_psij0 = ths->psi[2*j*3]; 04452 R fg_psij1 = ths->psi[2*j*3+1]; 04453 R fg_psij2 = K(1.0); 04454 04455 psij_const[0] = fg_psij0; 04456 for(l=1; l<=2*m+1; l++) 04457 { 04458 fg_psij2 *= fg_psij1; 04459 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; 04460 } 04461 04462 fg_psij0 = ths->psi[2*(j*3+1)]; 04463 fg_psij1 = ths->psi[2*(j*3+1)+1]; 04464 fg_psij2 = K(1.0); 04465 psij_const[2*m+2] = fg_psij0; 04466 for(l=1; l<=2*m+1; l++) 04467 { 04468 fg_psij2 *= fg_psij1; 04469 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; 04470 } 04471 04472 fg_psij0 = ths->psi[2*(j*3+2)]; 04473 fg_psij1 = ths->psi[2*(j*3+2)+1]; 04474 fg_psij2 = K(1.0); 04475 psij_const[2*(2*m+2)] = fg_psij0; 04476 for(l=1; l<=2*m+1; l++) 04477 { 04478 fg_psij2 *= fg_psij1; 04479 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; 04480 } 04481 04482 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 04483 } 04484 04485 return; 04486 } /* if(PRE_FG_PSI) */ 04487 04488 if(ths->flags & FG_PSI) 04489 { 04490 R fg_exp_l[3*(2*m+2)]; 04491 04492 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 04493 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]); 04494 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]); 04495 04496 sort(ths); 04497 04498 #ifdef _OPENMP 04499 #pragma omp parallel for default(shared) private(k) 04500 #endif 04501 for (k = 0; k < M; k++) 04502 { 04503 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 04504 INT u, o, l; 04505 R psij_const[3*(2*m+2)]; 04506 R fg_psij0, fg_psij1, fg_psij2; 04507 04508 uo(ths,j,&u,&o,(INT)0); 04509 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0)); 04510 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u)) / ths->b[0]); 04511 fg_psij2 = K(1.0); 04512 psij_const[0] = fg_psij0; 04513 for(l=1; l<=2*m+1; l++) 04514 { 04515 fg_psij2 *= fg_psij1; 04516 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; 04517 } 04518 04519 uo(ths,j,&u,&o,(INT)1); 04520 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1)); 04521 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u)) / ths->b[1]); 04522 fg_psij2 = K(1.0); 04523 psij_const[2*m+2] = fg_psij0; 04524 for(l=1; l<=2*m+1; l++) 04525 { 04526 fg_psij2 *= fg_psij1; 04527 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; 04528 } 04529 04530 uo(ths,j,&u,&o,(INT)2); 04531 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2)); 04532 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u)) / ths->b[2]); 04533 fg_psij2 = K(1.0); 04534 psij_const[2*(2*m+2)] = fg_psij0; 04535 for(l=1; l<=2*m+1; l++) 04536 { 04537 fg_psij2 *= fg_psij1; 04538 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; 04539 } 04540 04541 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 04542 } 04543 04544 return; 04545 } /* if(FG_PSI) */ 04546 04547 if(ths->flags & PRE_LIN_PSI) 04548 { 04549 const INT K = ths->K, ip_s = K / (m + 2); 04550 04551 sort(ths); 04552 04553 #ifdef _OPENMP 04554 #pragma omp parallel for default(shared) private(k) 04555 #endif 04556 for (k = 0; k < M; k++) 04557 { 04558 INT u, o, l; 04559 R ip_y, ip_w; 04560 INT ip_u; 04561 R psij_const[3*(2*m+2)]; 04562 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 04563 04564 uo(ths,j,&u,&o,(INT)0); 04565 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s); 04566 ip_u = (INT)(LRINT(FLOOR(ip_y))); 04567 ip_w = ip_y - (R)(ip_u); 04568 for(l=0; l < 2*m+2; l++) 04569 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + 04570 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); 04571 04572 uo(ths,j,&u,&o,(INT)1); 04573 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s); 04574 ip_u = (INT)(LRINT(FLOOR(ip_y))); 04575 ip_w = ip_y - (R)(ip_u); 04576 for(l=0; l < 2*m+2; l++) 04577 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + 04578 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); 04579 04580 uo(ths,j,&u,&o,(INT)2); 04581 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u)) * ((R)ip_s); 04582 ip_u = (INT)(LRINT(FLOOR(ip_y))); 04583 ip_w = ip_y - (R)(ip_u); 04584 for(l=0; l < 2*m+2; l++) 04585 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + 04586 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); 04587 04588 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 04589 } 04590 return; 04591 } /* if(PRE_LIN_PSI) */ 04592 04593 /* no precomputed psi at all */ 04594 04595 sort(ths); 04596 04597 #ifdef _OPENMP 04598 #pragma omp parallel for default(shared) private(k) 04599 #endif 04600 for (k = 0; k < M; k++) 04601 { 04602 R psij_const[3*(2*m+2)]; 04603 INT u, o, l; 04604 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 04605 04606 uo(ths,j,&u,&o,(INT)0); 04607 for(l=0;l<=2*m+1;l++) 04608 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0)); 04609 04610 uo(ths,j,&u,&o,(INT)1); 04611 for(l=0;l<=2*m+1;l++) 04612 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1)); 04613 04614 uo(ths,j,&u,&o,(INT)2); 04615 for(l=0;l<=2*m+1;l++) 04616 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2)); 04617 04618 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 04619 } 04620 } 04621 04622 #ifdef OMP_ASSERT 04623 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \ 04624 { \ 04625 assert(ar_x[2*k] >= min_u_a || k == M-1); \ 04626 if (k > 0) \ 04627 assert(ar_x[2*k-2] < min_u_a); \ 04628 } 04629 #else 04630 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A 04631 #endif 04632 04633 #ifdef OMP_ASSERT 04634 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \ 04635 { \ 04636 assert(ar_x[2*k] >= min_u_b || k == M-1); \ 04637 if (k > 0) \ 04638 assert(ar_x[2*k-2] < min_u_b); \ 04639 } 04640 #else 04641 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B 04642 #endif 04643 04644 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \ 04645 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \ 04646 ths->psi+j*3*(2*m+2), \ 04647 ths->psi+(j*3+1)*(2*m+2), \ 04648 ths->psi+(j*3+2)*(2*m+2), \ 04649 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \ 04650 n0, n1, n2, m, my_u0, my_o0); 04651 04652 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \ 04653 { \ 04654 INT u, o, l; \ 04655 R psij_const[3*(2*m+2)]; \ 04656 R fg_psij0 = ths->psi[2*j*3]; \ 04657 R fg_psij1 = ths->psi[2*j*3+1]; \ 04658 R fg_psij2 = K(1.0); \ 04659 \ 04660 psij_const[0] = fg_psij0; \ 04661 for(l=1; l<=2*m+1; l++) \ 04662 { \ 04663 fg_psij2 *= fg_psij1; \ 04664 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \ 04665 } \ 04666 \ 04667 fg_psij0 = ths->psi[2*(j*3+1)]; \ 04668 fg_psij1 = ths->psi[2*(j*3+1)+1]; \ 04669 fg_psij2 = K(1.0); \ 04670 psij_const[2*m+2] = fg_psij0; \ 04671 for(l=1; l<=2*m+1; l++) \ 04672 { \ 04673 fg_psij2 *= fg_psij1; \ 04674 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \ 04675 } \ 04676 \ 04677 fg_psij0 = ths->psi[2*(j*3+2)]; \ 04678 fg_psij1 = ths->psi[2*(j*3+2)+1]; \ 04679 fg_psij2 = K(1.0); \ 04680 psij_const[2*(2*m+2)] = fg_psij0; \ 04681 for(l=1; l<=2*m+1; l++) \ 04682 { \ 04683 fg_psij2 *= fg_psij1; \ 04684 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \ 04685 } \ 04686 \ 04687 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \ 04688 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \ 04689 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \ 04690 n0, n1, n2, m, my_u0, my_o0); \ 04691 } 04692 04693 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \ 04694 { \ 04695 INT u, o, l; \ 04696 R psij_const[3*(2*m+2)]; \ 04697 R fg_psij0, fg_psij1, fg_psij2; \ 04698 \ 04699 uo(ths,j,&u,&o,(INT)0); \ 04700 fg_psij0 = (PHI(ths->n[0],ths->x[3*j]-((R)u)/n0,0)); \ 04701 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]); \ 04702 fg_psij2 = K(1.0); \ 04703 psij_const[0] = fg_psij0; \ 04704 for(l=1; l<=2*m+1; l++) \ 04705 { \ 04706 fg_psij2 *= fg_psij1; \ 04707 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \ 04708 } \ 04709 \ 04710 uo(ths,j,&u,&o,(INT)1); \ 04711 fg_psij0 = (PHI(ths->n[1],ths->x[3*j+1]-((R)u)/n1,1)); \ 04712 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]); \ 04713 fg_psij2 = K(1.0); \ 04714 psij_const[2*m+2] = fg_psij0; \ 04715 for(l=1; l<=2*m+1; l++) \ 04716 { \ 04717 fg_psij2 *= fg_psij1; \ 04718 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \ 04719 } \ 04720 \ 04721 uo(ths,j,&u,&o,(INT)2); \ 04722 fg_psij0 = (PHI(ths->n[2],ths->x[3*j+2]-((R)u)/n2,2)); \ 04723 fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]); \ 04724 fg_psij2 = K(1.0); \ 04725 psij_const[2*(2*m+2)] = fg_psij0; \ 04726 for(l=1; l<=2*m+1; l++) \ 04727 { \ 04728 fg_psij2 *= fg_psij1; \ 04729 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \ 04730 } \ 04731 \ 04732 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \ 04733 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \ 04734 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \ 04735 n0, n1, n2, m, my_u0, my_o0); \ 04736 } 04737 04738 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \ 04739 { \ 04740 INT u, o, l; \ 04741 R psij_const[3*(2*m+2)]; \ 04742 INT ip_u; \ 04743 R ip_y, ip_w; \ 04744 \ 04745 uo(ths,j,&u,&o,(INT)0); \ 04746 ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s); \ 04747 ip_u = LRINT(FLOOR(ip_y)); \ 04748 ip_w = ip_y-ip_u; \ 04749 for(l=0; l < 2*m+2; l++) \ 04750 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \ 04751 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \ 04752 \ 04753 uo(ths,j,&u,&o,(INT)1); \ 04754 ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s); \ 04755 ip_u = LRINT(FLOOR(ip_y)); \ 04756 ip_w = ip_y-ip_u; \ 04757 for(l=0; l < 2*m+2; l++) \ 04758 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \ 04759 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \ 04760 \ 04761 uo(ths,j,&u,&o,(INT)2); \ 04762 ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s); \ 04763 ip_u = LRINT(FLOOR(ip_y)); \ 04764 ip_w = ip_y-ip_u; \ 04765 for(l=0; l < 2*m+2; l++) \ 04766 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \ 04767 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \ 04768 \ 04769 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \ 04770 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \ 04771 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \ 04772 n0, n1, n2, m, my_u0, my_o0); \ 04773 } 04774 04775 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \ 04776 { \ 04777 INT u, o, l; \ 04778 R psij_const[3*(2*m+2)]; \ 04779 \ 04780 uo(ths,j,&u,&o,(INT)0); \ 04781 for(l=0;l<=2*m+1;l++) \ 04782 psij_const[l]=(PHI(ths->n[0],ths->x[3*j]-((R)((u+l)))/n0,0)); \ 04783 \ 04784 uo(ths,j,&u,&o,(INT)1); \ 04785 for(l=0;l<=2*m+1;l++) \ 04786 psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[3*j+1]-((R)((u+l)))/n1,1)); \ 04787 \ 04788 uo(ths,j,&u,&o,(INT)2); \ 04789 for(l=0;l<=2*m+1;l++) \ 04790 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2],ths->x[3*j+2]-((R)((u+l)))/n2,2)); \ 04791 \ 04792 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \ 04793 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \ 04794 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \ 04795 n0, n1, n2, m, my_u0, my_o0); \ 04796 } 04797 04798 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE(whichone) \ 04799 { \ 04800 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \ 04801 { \ 04802 _Pragma("omp parallel private(k)") \ 04803 { \ 04804 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \ 04805 INT *ar_x = ths->index_x; \ 04806 \ 04807 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \ 04808 &min_u_b, &max_u_b, 3, ths->n, m); \ 04809 \ 04810 if (min_u_a != -1) \ 04811 { \ 04812 k = index_x_binary_search(ar_x, M, min_u_a); \ 04813 \ 04814 MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \ 04815 \ 04816 while (k < M) \ 04817 { \ 04818 INT u_prod = ar_x[2*k]; \ 04819 INT j = ar_x[2*k+1]; \ 04820 \ 04821 if (u_prod < min_u_a || u_prod > max_u_a) \ 04822 break; \ 04823 \ 04824 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 04825 \ 04826 k++; \ 04827 } \ 04828 } \ 04829 \ 04830 if (min_u_b != -1) \ 04831 { \ 04832 INT k = index_x_binary_search(ar_x, M, min_u_b); \ 04833 \ 04834 MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \ 04835 \ 04836 while (k < M) \ 04837 { \ 04838 INT u_prod = ar_x[2*k]; \ 04839 INT j = ar_x[2*k+1]; \ 04840 \ 04841 if (u_prod < min_u_b || u_prod > max_u_b) \ 04842 break; \ 04843 \ 04844 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 04845 \ 04846 k++; \ 04847 } \ 04848 } \ 04849 } /* omp parallel */ \ 04850 return; \ 04851 } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \ 04852 } 04853 04854 static void nfft_adjoint_3d_B(X(plan) *ths) 04855 { 04856 INT k; 04857 const INT n0 = ths->n[0]; 04858 const INT n1 = ths->n[1]; 04859 const INT n2 = ths->n[2]; 04860 const INT M = ths->M_total; 04861 const INT m = ths->m; 04862 04863 C* g = (C*) ths->g; 04864 04865 memset(g, 0, (size_t)(ths->n_total) * sizeof(C)); 04866 04867 if(ths->flags & PRE_FULL_PSI) 04868 { 04869 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M, 04870 (INT)3, ths->n, m, ths->flags, ths->index_x); 04871 return; 04872 } /* if(PRE_FULL_PSI) */ 04873 04874 if(ths->flags & PRE_PSI) 04875 { 04876 #ifdef _OPENMP 04877 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI) 04878 #endif 04879 04880 #ifdef _OPENMP 04881 #pragma omp parallel for default(shared) private(k) 04882 #endif 04883 for (k = 0; k < M; k++) 04884 { 04885 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 04886 #ifdef _OPENMP 04887 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 04888 #else 04889 nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 04890 #endif 04891 } 04892 return; 04893 } /* if(PRE_PSI) */ 04894 04895 if(ths->flags & PRE_FG_PSI) 04896 { 04897 R fg_exp_l[3*(2*m+2)]; 04898 04899 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 04900 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]); 04901 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]); 04902 04903 #ifdef _OPENMP 04904 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI) 04905 #endif 04906 04907 #ifdef _OPENMP 04908 #pragma omp parallel for default(shared) private(k) 04909 #endif 04910 for (k = 0; k < M; k++) 04911 { 04912 R psij_const[3*(2*m+2)]; 04913 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 04914 INT l; 04915 R fg_psij0 = ths->psi[2*j*3]; 04916 R fg_psij1 = ths->psi[2*j*3+1]; 04917 R fg_psij2 = K(1.0); 04918 04919 psij_const[0] = fg_psij0; 04920 for(l=1; l<=2*m+1; l++) 04921 { 04922 fg_psij2 *= fg_psij1; 04923 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; 04924 } 04925 04926 fg_psij0 = ths->psi[2*(j*3+1)]; 04927 fg_psij1 = ths->psi[2*(j*3+1)+1]; 04928 fg_psij2 = K(1.0); 04929 psij_const[2*m+2] = fg_psij0; 04930 for(l=1; l<=2*m+1; l++) 04931 { 04932 fg_psij2 *= fg_psij1; 04933 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; 04934 } 04935 04936 fg_psij0 = ths->psi[2*(j*3+2)]; 04937 fg_psij1 = ths->psi[2*(j*3+2)+1]; 04938 fg_psij2 = K(1.0); 04939 psij_const[2*(2*m+2)] = fg_psij0; 04940 for(l=1; l<=2*m+1; l++) 04941 { 04942 fg_psij2 *= fg_psij1; 04943 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; 04944 } 04945 04946 #ifdef _OPENMP 04947 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 04948 #else 04949 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 04950 #endif 04951 } 04952 04953 return; 04954 } /* if(PRE_FG_PSI) */ 04955 04956 if(ths->flags & FG_PSI) 04957 { 04958 R fg_exp_l[3*(2*m+2)]; 04959 04960 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]); 04961 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]); 04962 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]); 04963 04964 sort(ths); 04965 04966 #ifdef _OPENMP 04967 MACRO_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI) 04968 #endif 04969 04970 #ifdef _OPENMP 04971 #pragma omp parallel for default(shared) private(k) 04972 #endif 04973 for (k = 0; k < M; k++) 04974 { 04975 INT u,o,l; 04976 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 04977 R psij_const[3*(2*m+2)]; 04978 R fg_psij0, fg_psij1, fg_psij2; 04979 04980 uo(ths,j,&u,&o,(INT)0); 04981 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0)); 04982 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u))/ths->b[0]); 04983 fg_psij2 = K(1.0); 04984 psij_const[0] = fg_psij0; 04985 for(l=1; l<=2*m+1; l++) 04986 { 04987 fg_psij2 *= fg_psij1; 04988 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; 04989 } 04990 04991 uo(ths,j,&u,&o,(INT)1); 04992 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1)); 04993 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u))/ths->b[1]); 04994 fg_psij2 = K(1.0); 04995 psij_const[2*m+2] = fg_psij0; 04996 for(l=1; l<=2*m+1; l++) 04997 { 04998 fg_psij2 *= fg_psij1; 04999 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; 05000 } 05001 05002 uo(ths,j,&u,&o,(INT)2); 05003 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2)); 05004 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u))/ths->b[2]); 05005 fg_psij2 = K(1.0); 05006 psij_const[2*(2*m+2)] = fg_psij0; 05007 for(l=1; l<=2*m+1; l++) 05008 { 05009 fg_psij2 *= fg_psij1; 05010 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; 05011 } 05012 05013 #ifdef _OPENMP 05014 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 05015 #else 05016 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 05017 #endif 05018 } 05019 05020 return; 05021 } /* if(FG_PSI) */ 05022 05023 if(ths->flags & PRE_LIN_PSI) 05024 { 05025 const INT K = ths->K; 05026 const INT ip_s = K / (m + 2); 05027 05028 sort(ths); 05029 05030 #ifdef _OPENMP 05031 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI) 05032 #endif 05033 05034 #ifdef _OPENMP 05035 #pragma omp parallel for default(shared) private(k) 05036 #endif 05037 for (k = 0; k < M; k++) 05038 { 05039 INT u,o,l; 05040 INT ip_u; 05041 R ip_y, ip_w; 05042 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 05043 R psij_const[3*(2*m+2)]; 05044 05045 uo(ths,j,&u,&o,(INT)0); 05046 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s); 05047 ip_u = (INT)(LRINT(FLOOR(ip_y))); 05048 ip_w = ip_y - (R)(ip_u); 05049 for(l=0; l < 2*m+2; l++) 05050 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + 05051 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); 05052 05053 uo(ths,j,&u,&o,(INT)1); 05054 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s); 05055 ip_u = (INT)(LRINT(FLOOR(ip_y))); 05056 ip_w = ip_y - (R)(ip_u); 05057 for(l=0; l < 2*m+2; l++) 05058 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + 05059 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); 05060 05061 uo(ths,j,&u,&o,(INT)2); 05062 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u))*((R)ip_s); 05063 ip_u = (INT)(LRINT(FLOOR(ip_y))); 05064 ip_w = ip_y - (R)(ip_u); 05065 for(l=0; l < 2*m+2; l++) 05066 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + 05067 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); 05068 05069 #ifdef _OPENMP 05070 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 05071 #else 05072 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 05073 #endif 05074 } 05075 return; 05076 } /* if(PRE_LIN_PSI) */ 05077 05078 /* no precomputed psi at all */ 05079 sort(ths); 05080 05081 #ifdef _OPENMP 05082 MACRO_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI) 05083 #endif 05084 05085 #ifdef _OPENMP 05086 #pragma omp parallel for default(shared) private(k) 05087 #endif 05088 for (k = 0; k < M; k++) 05089 { 05090 INT u,o,l; 05091 R psij_const[3*(2*m+2)]; 05092 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; 05093 05094 uo(ths,j,&u,&o,(INT)0); 05095 for(l=0;l<=2*m+1;l++) 05096 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0)); 05097 05098 uo(ths,j,&u,&o,(INT)1); 05099 for(l=0;l<=2*m+1;l++) 05100 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1)); 05101 05102 uo(ths,j,&u,&o,(INT)2); 05103 for(l=0;l<=2*m+1;l++) 05104 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2)); 05105 05106 #ifdef _OPENMP 05107 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 05108 #else 05109 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m); 05110 #endif 05111 } 05112 } 05113 05114 05115 void X(trafo_3d)(X(plan) *ths) 05116 { 05117 INT k0,k1,k2,n0,n1,n2,N0,N1,N2; 05118 C *g_hat,*f_hat; 05119 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22; 05120 R ck01, ck02, ck11, ck12, ck21, ck22; 05121 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221; 05122 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222; 05123 05124 ths->g_hat=ths->g1; 05125 ths->g=ths->g2; 05126 05127 N0=ths->N[0]; 05128 N1=ths->N[1]; 05129 N2=ths->N[2]; 05130 n0=ths->n[0]; 05131 n1=ths->n[1]; 05132 n2=ths->n[2]; 05133 05134 f_hat=(C*)ths->f_hat; 05135 g_hat=(C*)ths->g_hat; 05136 05137 TIC(0) 05138 #ifdef _OPENMP 05139 #pragma omp parallel for default(shared) private(k0) 05140 for (k0 = 0; k0 < ths->n_total; k0++) 05141 ths->g_hat[k0] = 0.0; 05142 #else 05143 memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C)); 05144 #endif 05145 05146 if(ths->flags & PRE_PHI_HUT) 05147 { 05148 c_phi_inv01=ths->c_phi_inv[0]; 05149 c_phi_inv02=&ths->c_phi_inv[0][N0/2]; 05150 05151 #ifdef _OPENMP 05152 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22) 05153 #endif 05154 for(k0=0;k0<N0/2;k0++) 05155 { 05156 ck01=c_phi_inv01[k0]; 05157 ck02=c_phi_inv02[k0]; 05158 c_phi_inv11=ths->c_phi_inv[1]; 05159 c_phi_inv12=&ths->c_phi_inv[1][N1/2]; 05160 05161 for(k1=0;k1<N1/2;k1++) 05162 { 05163 ck11=c_phi_inv11[k1]; 05164 ck12=c_phi_inv12[k1]; 05165 c_phi_inv21=ths->c_phi_inv[2]; 05166 c_phi_inv22=&ths->c_phi_inv[2][N2/2]; 05167 05168 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2); 05169 f_hat111=f_hat + (k0*N1+k1)*N2; 05170 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2); 05171 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2; 05172 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2); 05173 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2; 05174 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2); 05175 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2; 05176 05177 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2; 05178 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2); 05179 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2; 05180 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2); 05181 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2; 05182 f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2); 05183 g_hat222=g_hat + (k0*n1+k1)*n2; 05184 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2); 05185 05186 for(k2=0;k2<N2/2;k2++) 05187 { 05188 ck21=c_phi_inv21[k2]; 05189 ck22=c_phi_inv22[k2]; 05190 05191 g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21; 05192 g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21; 05193 g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21; 05194 g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21; 05195 05196 g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22; 05197 g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22; 05198 g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22; 05199 g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22; 05200 } 05201 } 05202 } 05203 } 05204 else 05205 #ifdef _OPENMP 05206 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22) 05207 #endif 05208 for(k0=0;k0<N0/2;k0++) 05209 { 05210 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0)); 05211 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0)); 05212 for(k1=0;k1<N1/2;k1++) 05213 { 05214 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1)); 05215 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1)); 05216 05217 for(k2=0;k2<N2/2;k2++) 05218 { 05219 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2)); 05220 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2)); 05221 05222 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21; 05223 g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21; 05224 g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21; 05225 g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21; 05226 05227 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22; 05228 g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22; 05229 g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22; 05230 g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22; 05231 } 05232 } 05233 } 05234 05235 TOC(0) 05236 05237 TIC_FFTW(1) 05238 FFTW(execute)(ths->my_fftw_plan1); 05239 TOC_FFTW(1); 05240 05241 TIC(2); 05242 nfft_trafo_3d_B(ths); 05243 TOC(2); 05244 } 05245 05246 void X(adjoint_3d)(X(plan) *ths) 05247 { 05248 INT k0,k1,k2,n0,n1,n2,N0,N1,N2; 05249 C *g_hat,*f_hat; 05250 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22; 05251 R ck01, ck02, ck11, ck12, ck21, ck22; 05252 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221; 05253 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222; 05254 05255 ths->g_hat=ths->g1; 05256 ths->g=ths->g2; 05257 05258 N0=ths->N[0]; 05259 N1=ths->N[1]; 05260 N2=ths->N[2]; 05261 n0=ths->n[0]; 05262 n1=ths->n[1]; 05263 n2=ths->n[2]; 05264 05265 f_hat=(C*)ths->f_hat; 05266 g_hat=(C*)ths->g_hat; 05267 05268 TIC(2); 05269 nfft_adjoint_3d_B(ths); 05270 TOC(2); 05271 05272 TIC_FFTW(1) 05273 FFTW(execute)(ths->my_fftw_plan2); 05274 TOC_FFTW(1); 05275 05276 TIC(0) 05277 if(ths->flags & PRE_PHI_HUT) 05278 { 05279 c_phi_inv01=ths->c_phi_inv[0]; 05280 c_phi_inv02=&ths->c_phi_inv[0][N0/2]; 05281 05282 #ifdef _OPENMP 05283 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22) 05284 #endif 05285 for(k0=0;k0<N0/2;k0++) 05286 { 05287 ck01=c_phi_inv01[k0]; 05288 ck02=c_phi_inv02[k0]; 05289 c_phi_inv11=ths->c_phi_inv[1]; 05290 c_phi_inv12=&ths->c_phi_inv[1][N1/2]; 05291 05292 for(k1=0;k1<N1/2;k1++) 05293 { 05294 ck11=c_phi_inv11[k1]; 05295 ck12=c_phi_inv12[k1]; 05296 c_phi_inv21=ths->c_phi_inv[2]; 05297 c_phi_inv22=&ths->c_phi_inv[2][N2/2]; 05298 05299 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2); 05300 f_hat111=f_hat + (k0*N1+k1)*N2; 05301 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2); 05302 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2; 05303 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2); 05304 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2; 05305 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2); 05306 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2; 05307 05308 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2; 05309 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2); 05310 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2; 05311 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2); 05312 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2; 05313 f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2); 05314 g_hat222=g_hat + (k0*n1+k1)*n2; 05315 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2); 05316 05317 for(k2=0;k2<N2/2;k2++) 05318 { 05319 ck21=c_phi_inv21[k2]; 05320 ck22=c_phi_inv22[k2]; 05321 05322 f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21; 05323 f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21; 05324 f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21; 05325 f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21; 05326 05327 f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22; 05328 f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22; 05329 f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22; 05330 f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22; 05331 } 05332 } 05333 } 05334 } 05335 else 05336 #ifdef _OPENMP 05337 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22) 05338 #endif 05339 for(k0=0;k0<N0/2;k0++) 05340 { 05341 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0)); 05342 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0)); 05343 for(k1=0;k1<N1/2;k1++) 05344 { 05345 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1)); 05346 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1)); 05347 05348 for(k2=0;k2<N2/2;k2++) 05349 { 05350 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2)); 05351 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2)); 05352 05353 f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21; 05354 f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21; 05355 f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21; 05356 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21; 05357 05358 f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22; 05359 f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22; 05360 f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22; 05361 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22; 05362 } 05363 } 05364 } 05365 05366 TOC(0) 05367 } 05368 05371 void X(trafo)(X(plan) *ths) 05372 { 05373 switch(ths->d) 05374 { 05375 case 1: X(trafo_1d)(ths); break; 05376 case 2: X(trafo_2d)(ths); break; 05377 case 3: X(trafo_3d)(ths); break; 05378 default: 05379 { 05380 /* use ths->my_fftw_plan1 */ 05381 ths->g_hat = ths->g1; 05382 ths->g = ths->g2; 05383 05387 TIC(0) 05388 D_A(ths); 05389 TOC(0) 05390 05395 TIC_FFTW(1) 05396 FFTW(execute)(ths->my_fftw_plan1); 05397 TOC_FFTW(1) 05398 05402 TIC(2) 05403 B_A(ths); 05404 TOC(2) 05405 } 05406 } 05407 } /* nfft_trafo */ 05408 05409 void X(adjoint)(X(plan) *ths) 05410 { 05411 switch(ths->d) 05412 { 05413 case 1: X(adjoint_1d)(ths); break; 05414 case 2: X(adjoint_2d)(ths); break; 05415 case 3: X(adjoint_3d)(ths); break; 05416 default: 05417 { 05418 /* use ths->my_fftw_plan2 */ 05419 ths->g_hat=ths->g1; 05420 ths->g=ths->g2; 05421 05425 TIC(2) 05426 B_T(ths); 05427 TOC(2) 05428 05433 TIC_FFTW(1) 05434 FFTW(execute)(ths->my_fftw_plan2); 05435 TOC_FFTW(1) 05436 05440 TIC(0) 05441 D_T(ths); 05442 TOC(0) 05443 } 05444 } 05445 } /* nfft_adjoint */ 05446 05447 05450 static void precompute_phi_hut(X(plan) *ths) 05451 { 05452 INT ks[ths->d]; /* index over all frequencies */ 05453 INT t; /* index over all dimensions */ 05454 05455 ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) * sizeof(R*)); 05456 05457 for (t = 0; t < ths->d; t++) 05458 { 05459 ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t]) * sizeof(R)); 05460 05461 for (ks[t] = 0; ks[t] < ths->N[t]; ks[t]++) 05462 { 05463 ths->c_phi_inv[t][ks[t]]= K(1.0) / (PHI_HUT(ths->n[t], ks[t] - ths->N[t] / 2,t)); 05464 } 05465 } 05466 } /* nfft_phi_hut */ 05467 05472 void X(precompute_lin_psi)(X(plan) *ths) 05473 { 05474 INT t; 05475 INT j; 05476 R step; 05478 for (t=0; t<ths->d; t++) 05479 { 05480 step = ((R)(ths->m+2)) / ((R)(ths->K * ths->n[t])); 05481 for(j = 0;j <= ths->K; j++) 05482 { 05483 ths->psi[(ths->K+1)*t + j] = PHI(ths->n[t], (R)(j) * step,t); 05484 } /* for(j) */ 05485 } /* for(t) */ 05486 } 05487 05488 void X(precompute_fg_psi)(X(plan) *ths) 05489 { 05490 INT t; 05491 INT u, o; 05493 sort(ths); 05494 05495 for (t=0; t<ths->d; t++) 05496 { 05497 INT j; 05498 #ifdef _OPENMP 05499 #pragma omp parallel for default(shared) private(j,u,o) 05500 #endif 05501 for (j = 0; j < ths->M_total; j++) 05502 { 05503 uo(ths,j,&u,&o,t); 05504 05505 ths->psi[2*(j*ths->d+t)]= 05506 (PHI(ths->n[t] ,(ths->x[j*ths->d+t] - ((R)u) / (R)(ths->n[t])),t)); 05507 05508 ths->psi[2*(j*ths->d+t)+1]= 05509 EXP(K(2.0) * ((R)(ths->n[t]) * ths->x[j*ths->d+t] - (R)(u)) / ths->b[t]); 05510 } /* for(j) */ 05511 } 05512 /* for(t) */ 05513 } /* nfft_precompute_fg_psi */ 05514 05515 void X(precompute_psi)(X(plan) *ths) 05516 { 05517 INT t; /* index over all dimensions */ 05518 INT l; /* index u<=l<=o */ 05519 INT lj; /* index 0<=lj<u+o+1 */ 05520 INT u, o; /* depends on x_j */ 05521 05522 sort(ths); 05523 05524 for (t=0; t<ths->d; t++) 05525 { 05526 INT j; 05527 #ifdef _OPENMP 05528 #pragma omp parallel for default(shared) private(j,l,lj,u,o) 05529 #endif 05530 for (j = 0; j < ths->M_total; j++) 05531 { 05532 uo(ths,j,&u,&o,t); 05533 05534 for(l = u, lj = 0; l <= o; l++, lj++) 05535 ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] = 05536 (PHI(ths->n[t], (ths->x[j*ths->d+t] - ((R)l) / (R)(ths->n[t])), t)); 05537 } /* for(j) */ 05538 } 05539 /* for(t) */ 05540 } /* nfft_precompute_psi */ 05541 05542 #ifdef _OPENMP 05543 static void nfft_precompute_full_psi_omp(X(plan) *ths) 05544 { 05545 INT j; 05546 INT lprod; 05548 { 05549 INT t; 05550 for(t=0,lprod = 1; t<ths->d; t++) 05551 lprod *= 2*ths->m+2; 05552 } 05553 05554 #pragma omp parallel for default(shared) private(j) 05555 for(j=0; j<ths->M_total; j++) 05556 { 05557 INT t,t2; 05558 INT l_L; 05559 INT l[ths->d]; 05560 INT lj[ths->d]; 05561 INT ll_plain[ths->d+1]; 05563 INT u[ths->d], o[ths->d]; 05565 R phi_prod[ths->d+1]; 05566 INT ix = j*lprod; 05567 05568 phi_prod[0]=1; 05569 ll_plain[0]=0; 05570 05571 MACRO_init_uo_l_lj_t; 05572 05573 for(l_L=0; l_L<lprod; l_L++, ix++) 05574 { 05575 MACRO_update_phi_prod_ll_plain(without_PRE_PSI); 05576 05577 ths->psi_index_g[ix]=ll_plain[ths->d]; 05578 ths->psi[ix]=phi_prod[ths->d]; 05579 05580 MACRO_count_uo_l_lj_t; 05581 } /* for(l_L) */ 05582 05583 ths->psi_index_f[j]=lprod; 05584 } /* for(j) */ 05585 } 05586 #endif 05587 05588 void X(precompute_full_psi)(X(plan) *ths) 05589 { 05590 #ifdef _OPENMP 05591 sort(ths); 05592 05593 nfft_precompute_full_psi_omp(ths); 05594 #else 05595 INT t, t2; /* index over all dimensions */ 05596 INT j; /* index over all nodes */ 05597 INT l_L; /* plain index 0 <= l_L < lprod */ 05598 INT l[ths->d]; /* multi index u<=l<=o */ 05599 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ 05600 INT ll_plain[ths->d+1]; /* postfix plain index */ 05601 INT lprod; /* 'bandwidth' of matrix B */ 05602 INT u[ths->d], o[ths->d]; /* depends on x_j */ 05603 05604 R phi_prod[ths->d+1]; 05605 05606 INT ix, ix_old; 05607 05608 sort(ths); 05609 05610 phi_prod[0] = K(1.0); 05611 ll_plain[0] = 0; 05612 05613 for (t = 0, lprod = 1; t < ths->d; t++) 05614 lprod *= 2 * ths->m + 2; 05615 05616 for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++) 05617 { 05618 MACRO_init_uo_l_lj_t; 05619 05620 for (l_L = 0; l_L < lprod; l_L++, ix++) 05621 { 05622 MACRO_update_phi_prod_ll_plain(without_PRE_PSI); 05623 05624 ths->psi_index_g[ix] = ll_plain[ths->d]; 05625 ths->psi[ix] = phi_prod[ths->d]; 05626 05627 MACRO_count_uo_l_lj_t; 05628 } /* for(l_L) */ 05629 05630 ths->psi_index_f[j] = ix - ix_old; 05631 ix_old = ix; 05632 } /* for(j) */ 05633 #endif 05634 } 05635 05636 void X(precompute_one_psi)(X(plan) *ths) 05637 { 05638 if(ths->flags & PRE_LIN_PSI) 05639 X(precompute_lin_psi)(ths); 05640 if(ths->flags & PRE_FG_PSI) 05641 X(precompute_fg_psi)(ths); 05642 if(ths->flags & PRE_PSI) 05643 X(precompute_psi)(ths); 05644 if(ths->flags & PRE_FULL_PSI) 05645 X(precompute_full_psi)(ths); 05646 } 05647 05648 static void init_help(X(plan) *ths) 05649 { 05650 INT t; /* index over all dimensions */ 05651 INT lprod; /* 'bandwidth' of matrix B */ 05652 05653 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) 05654 ths->flags |= NFFT_SORT_NODES; 05655 05656 ths->N_total = intprod(ths->N, 0, ths->d); 05657 ths->n_total = intprod(ths->n, 0, ths->d); 05658 05659 ths->sigma = (R*) Y(malloc)((size_t)(ths->d) * sizeof(R)); 05660 05661 for(t = 0;t < ths->d; t++) 05662 ths->sigma[t] = ((R)ths->n[t]) / (R)(ths->N[t]); 05663 05664 WINDOW_HELP_INIT; 05665 05666 if(ths->flags & MALLOC_X) 05667 ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) * sizeof(R)); 05668 05669 if(ths->flags & MALLOC_F_HAT) 05670 ths->f_hat = (C*)Y(malloc)((size_t)(ths->N_total) * sizeof(C)); 05671 05672 if(ths->flags & MALLOC_F) 05673 ths->f = (C*)Y(malloc)((size_t)(ths->M_total) * sizeof(C)); 05674 05675 if(ths->flags & PRE_PHI_HUT) 05676 precompute_phi_hut(ths); 05677 05678 if (ths->flags & PRE_LIN_PSI) 05679 { 05680 if (ths->K == 0) 05681 { 05682 ths->K = Y(m2K)(ths->m); 05683 } 05684 ths->psi = (R*) Y(malloc)((size_t)((ths->K+1) * ths->d) * sizeof(R)); 05685 } 05686 05687 if(ths->flags & PRE_FG_PSI) 05688 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) * sizeof(R)); 05689 05690 if(ths->flags & PRE_PSI) 05691 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2)) * sizeof(R)); 05692 05693 if(ths->flags & PRE_FULL_PSI) 05694 { 05695 for (t = 0, lprod = 1; t < ths->d; t++) 05696 lprod *= 2 * ths->m + 2; 05697 05698 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(R)); 05699 05700 ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) * sizeof(INT)); 05701 ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(INT)); 05702 } 05703 05704 if(ths->flags & FFTW_INIT) 05705 { 05706 #ifdef _OPENMP 05707 INT nthreads = Y(get_num_threads)(); 05708 #endif 05709 05710 ths->g1 = (C*)Y(malloc)((size_t)(ths->n_total) * sizeof(C)); 05711 05712 if(ths->flags & FFT_OUT_OF_PLACE) 05713 ths->g2 = (C*) Y(malloc)((size_t)(ths->n_total) * sizeof(C)); 05714 else 05715 ths->g2 = ths->g1; 05716 05717 #ifdef _OPENMP 05718 #pragma omp critical (nfft_omp_critical_fftw_plan) 05719 { 05720 FFTW(plan_with_nthreads)(nthreads); 05721 #endif 05722 { 05723 int *_n = Y(malloc)((size_t)(ths->d) * sizeof(int)); 05724 05725 for (t = 0; t < ths->d; t++) 05726 _n[t] = (int)(ths->n[t]); 05727 05728 ths->my_fftw_plan1 = FFTW(plan_dft)((int)ths->d, _n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags); 05729 ths->my_fftw_plan2 = FFTW(plan_dft)((int)ths->d, _n, ths->g2, ths->g1, FFTW_BACKWARD, ths->fftw_flags); 05730 Y(free)(_n); 05731 } 05732 #ifdef _OPENMP 05733 } 05734 #endif 05735 } 05736 05737 if(ths->flags & NFFT_SORT_NODES) 05738 ths->index_x = (INT*) Y(malloc)(sizeof(INT) * 2U * (size_t)(ths->M_total)); 05739 else 05740 ths->index_x = NULL; 05741 05742 ths->mv_trafo = (void (*) (void* ))X(trafo); 05743 ths->mv_adjoint = (void (*) (void* ))X(adjoint); 05744 } 05745 05746 void X(init)(X(plan) *ths, int d, int *N, int M_total) 05747 { 05748 INT t; /* index over all dimensions */ 05749 05750 ths->d = (INT)d; 05751 05752 ths->N = (INT*) Y(malloc)((size_t)(d) * sizeof(INT)); 05753 05754 for (t = 0; t < d; t++) 05755 ths->N[t] = (INT)N[t]; 05756 05757 ths->M_total = (INT)M_total; 05758 05759 ths->n = (INT*) Y(malloc)((size_t)(d) * sizeof(INT)); 05760 05761 for (t = 0; t < d; t++) 05762 ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t])); 05763 05764 ths->m = WINDOW_HELP_ESTIMATE_m; 05765 05766 if (d > 1) 05767 { 05768 #ifdef _OPENMP 05769 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F | 05770 FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES | 05771 NFFT_OMP_BLOCKWISE_ADJOINT; 05772 #else 05773 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F | 05774 FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES; 05775 #endif 05776 } 05777 else 05778 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F | 05779 FFTW_INIT | FFT_OUT_OF_PLACE; 05780 05781 ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT; 05782 05783 ths->K = 0; 05784 init_help(ths); 05785 } 05786 05787 void X(init_guru)(X(plan) *ths, int d, int *N, int M_total, int *n, int m, 05788 unsigned flags, unsigned fftw_flags) 05789 { 05790 INT t; /* index over all dimensions */ 05791 05792 ths->d = (INT)d; 05793 ths->M_total = (INT)M_total; 05794 ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT)); 05795 05796 for (t = 0; t < d; t++) 05797 ths->N[t] = (INT)N[t]; 05798 05799 ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT)); 05800 05801 for (t = 0; t < d; t++) 05802 ths->n[t] = (INT)n[t]; 05803 05804 ths->m = (INT)m; 05805 05806 ths->flags = flags; 05807 ths->fftw_flags = fftw_flags; 05808 05809 ths->K = 0; 05810 init_help(ths); 05811 } 05812 05813 void X(init_lin)(X(plan) *ths, int d, int *N, int M_total, int *n, int m, int K, 05814 unsigned flags, unsigned fftw_flags) 05815 { 05816 INT t; /* index over all dimensions */ 05817 05818 ths->d = (INT)d; 05819 ths->M_total = (INT)M_total; 05820 ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT)); 05821 05822 for (t = 0; t < d; t++) 05823 ths->N[t] = (INT)N[t]; 05824 05825 ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT)); 05826 05827 for (t = 0; t < d; t++) 05828 ths->n[t] = (INT)n[t]; 05829 05830 ths->m = (INT)m; 05831 05832 ths->flags = flags; 05833 ths->fftw_flags = fftw_flags; 05834 05835 ths->K = K; 05836 init_help(ths); 05837 } 05838 05839 void X(init_1d)(X(plan) *ths, int N1, int M_total) 05840 { 05841 int N[1]; 05842 05843 N[0] = N1; 05844 05845 X(init)(ths, 1, N, M_total); 05846 } 05847 05848 void X(init_2d)(X(plan) *ths, int N1, int N2, int M_total) 05849 { 05850 int N[2]; 05851 05852 N[0] = N1; 05853 N[1] = N2; 05854 X(init)(ths, 2, N, M_total); 05855 } 05856 05857 void X(init_3d)(X(plan) *ths, int N1, int N2, int N3, int M_total) 05858 { 05859 int N[3]; 05860 05861 N[0] = N1; 05862 N[1] = N2; 05863 N[2] = N3; 05864 X(init)(ths, 3, N, M_total); 05865 } 05866 05867 const char* X(check)(X(plan) *ths) 05868 { 05869 INT j; 05870 05871 if (!ths->f) 05872 return "Member f not initialized."; 05873 05874 if (!ths->x) 05875 return "Member x not initialized."; 05876 05877 if (!ths->f_hat) 05878 return "Member f_hat not initialized."; 05879 05880 if ((ths->flags & PRE_LIN_PSI) && ths->K < ths->M_total) 05881 return "Number of nodes too small to use PRE_LIN_PSI."; 05882 05883 for (j = 0; j < ths->M_total * ths->d; j++) 05884 { 05885 if ((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5))) 05886 { 05887 return "ths->x out of range [-0.5,0.5)"; 05888 } 05889 } 05890 05891 for (j = 0; j < ths->d; j++) 05892 { 05893 if (ths->sigma[j] <= 1) 05894 return "Oversampling factor too small"; 05895 05896 if(ths->N[j] <= ths->m) 05897 return "Polynomial degree N is smaller than cut-off m"; 05898 05899 if(ths->N[j]%2 == 1) 05900 return "polynomial degree N has to be even"; 05901 } 05902 return 0; 05903 } 05904 05905 void X(finalize)(X(plan) *ths) 05906 { 05907 INT t; /* index over dimensions */ 05908 05909 if(ths->flags & NFFT_SORT_NODES) 05910 Y(free)(ths->index_x); 05911 05912 if(ths->flags & FFTW_INIT) 05913 { 05914 #ifdef _OPENMP 05915 #pragma omp critical (nfft_omp_critical_fftw_plan) 05916 #endif 05917 FFTW(destroy_plan)(ths->my_fftw_plan2); 05918 #ifdef _OPENMP 05919 #pragma omp critical (nfft_omp_critical_fftw_plan) 05920 #endif 05921 FFTW(destroy_plan)(ths->my_fftw_plan1); 05922 05923 if(ths->flags & FFT_OUT_OF_PLACE) 05924 Y(free)(ths->g2); 05925 05926 Y(free)(ths->g1); 05927 } 05928 05929 if(ths->flags & PRE_FULL_PSI) 05930 { 05931 Y(free)(ths->psi_index_g); 05932 Y(free)(ths->psi_index_f); 05933 Y(free)(ths->psi); 05934 } 05935 05936 if(ths->flags & PRE_PSI) 05937 Y(free)(ths->psi); 05938 05939 if(ths->flags & PRE_FG_PSI) 05940 Y(free)(ths->psi); 05941 05942 if(ths->flags & PRE_LIN_PSI) 05943 Y(free)(ths->psi); 05944 05945 if(ths->flags & PRE_PHI_HUT) 05946 { 05947 for (t = 0; t < ths->d; t++) 05948 Y(free)(ths->c_phi_inv[t]); 05949 Y(free)(ths->c_phi_inv); 05950 } 05951 05952 if(ths->flags & MALLOC_F) 05953 Y(free)(ths->f); 05954 05955 if(ths->flags & MALLOC_F_HAT) 05956 Y(free)(ths->f_hat); 05957 05958 if(ths->flags & MALLOC_X) 05959 Y(free)(ths->x); 05960 05961 WINDOW_HELP_FINALIZE; 05962 05963 Y(free)(ths->sigma); 05964 Y(free)(ths->n); 05965 Y(free)(ths->N); 05966 }