All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
threefry.h
Go to the documentation of this file.
1 /*
2 Copyright 2010-2011, D. E. Shaw Research.
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
7 met:
8 
9 * Redistributions of source code must retain the above copyright
10  notice, this list of conditions, and the following disclaimer.
11 
12 * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions, and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16 * Neither the name of D. E. Shaw Research nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32 #ifndef _threefry_dot_h_
33 #define _threefry_dot_h_
35 #include "array.h"
36 
38 /* Significant parts of this file were copied
39  from:
40  Skein_FinalRnd/ReferenceImplementation/skein.h
41  Skein_FinalRnd/ReferenceImplementation/skein_block.c
42 
43  in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
44 
45  This file has been modified so that it may no longer perform its originally
46  intended function. If you're looking for Skein or Threefish source code,
47  please consult the original file.
48 
49  The original file had the following header:
50 **************************************************************************
51 **
52 ** Interface declarations and internal definitions for Skein hashing.
53 **
54 ** Source code author: Doug Whiting, 2008.
55 **
56 ** This algorithm and source code is released to the public domain.
57 **
58 ***************************************************************************
59 
60 */
61 
62 /* See comment at the top of philox.h for the macro pre-process
63  strategy. */
64 
65 /* Rotation constants: */
/* Rotation amounts for the 4-word, 64-bit variant.  R_64x4_<d>_<j> is the
   left-rotate count that _threefry4x_tpl(64) uses in round d (round index
   modulo 8) for the first (j=0) or second (j=1) add-rotate-xor step of that
   round. */
66 enum r123_enum_threefry64x4 {
67  /* These are the R_256 constants from the Threefish reference sources
68  with names changed to R_64x4... */
69  R_64x4_0_0=14, R_64x4_0_1=16,
70  R_64x4_1_0=52, R_64x4_1_1=57,
71  R_64x4_2_0=23, R_64x4_2_1=40,
72  R_64x4_3_0= 5, R_64x4_3_1=37,
73  R_64x4_4_0=25, R_64x4_4_1=33,
74  R_64x4_5_0=46, R_64x4_5_1=12,
75  R_64x4_6_0=58, R_64x4_6_1=22,
76  R_64x4_7_0=32, R_64x4_7_1=32
77 };
78 
/* Rotation amounts for the 2-word, 64-bit variant.  R_64x2_<d>_0 is the
   left-rotate count that _threefry2x_tpl(64) uses in round d (round index
   modulo 8).  The 2-word round has a single add-rotate-xor step, so only the
   _0 suffix exists.  The surrounding comments are the search-tool output the
   constants were taken from. */
79 enum r123_enum_threefry64x2 {
80  /*
81  // Output from skein_rot_search: (srs64_B64-X1000)
82  // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
83  // Start: Tue Mar 1 10:07:48 2011
84  // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
85  */
86  R_64x2_0_0=16,
87  R_64x2_1_0=42,
88  R_64x2_2_0=12,
89  R_64x2_3_0=31,
90  R_64x2_4_0=16,
91  R_64x2_5_0=32,
92  R_64x2_6_0=24,
93  R_64x2_7_0=21
94  /* 4 rounds: minHW = 4 [ 4 4 4 4 ]
95  // 5 rounds: minHW = 8 [ 8 8 8 8 ]
96  // 6 rounds: minHW = 16 [ 16 16 16 16 ]
97  // 7 rounds: minHW = 32 [ 32 32 32 32 ]
98  // 8 rounds: minHW = 64 [ 64 64 64 64 ]
99  // 9 rounds: minHW = 64 [ 64 64 64 64 ]
100  //10 rounds: minHW = 64 [ 64 64 64 64 ]
101  //11 rounds: minHW = 64 [ 64 64 64 64 ] */
102 };
103 
/* Rotation amounts for the 4-word, 32-bit variant.  R_32x4_<d>_<j> is the
   left-rotate count selected when _threefry4x_tpl is expanded with W=32:
   d is the round index modulo 8, j=0 is the first and j=1 the second
   add-rotate-xor step of the round.  The surrounding comments are the
   search-tool output the constants were taken from. */
104 enum r123_enum_threefry32x4 {
105  /* Output from skein_rot_search: (srs-B128-X5000.out)
106  // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
107  // Start: Mon Aug 24 22:41:36 2009
108  // ...
109  // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */
110  R_32x4_0_0=10, R_32x4_0_1=26,
111  R_32x4_1_0=11, R_32x4_1_1=21,
112  R_32x4_2_0=13, R_32x4_2_1=27,
113  R_32x4_3_0=23, R_32x4_3_1= 5,
114  R_32x4_4_0= 6, R_32x4_4_1=20,
115  R_32x4_5_0=17, R_32x4_5_1=11,
116  R_32x4_6_0=25, R_32x4_6_1=10,
117  R_32x4_7_0=18, R_32x4_7_1=20
118 
119  /* 4 rounds: minHW = 3 [ 3 3 3 3 ]
120  // 5 rounds: minHW = 7 [ 7 7 7 7 ]
121  // 6 rounds: minHW = 12 [ 13 12 13 12 ]
122  // 7 rounds: minHW = 22 [ 22 23 22 23 ]
123  // 8 rounds: minHW = 31 [ 31 31 31 31 ]
124  // 9 rounds: minHW = 32 [ 32 32 32 32 ]
125  //10 rounds: minHW = 32 [ 32 32 32 32 ]
126  //11 rounds: minHW = 32 [ 32 32 32 32 ] */
127 
128 };
129 
/* Rotation amounts for the 2-word, 32-bit variant.  R_32x2_<d>_0 is the
   left-rotate count selected when _threefry2x_tpl is expanded with W=32,
   for round d (round index modulo 8).  The surrounding comments are the
   search-tool output the constants were taken from. */
130 enum r123_enum_threefry32x2 {
131  /* Output from skein_rot_search (srs32x2-X5000.out)
132  // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
133  // Start: Tue Jul 12 11:11:33 2011
134  // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format */
135  R_32x2_0_0=13,
136  R_32x2_1_0=15,
137  R_32x2_2_0=26,
138  R_32x2_3_0= 6,
139  R_32x2_4_0=17,
140  R_32x2_5_0=29,
141  R_32x2_6_0=16,
142  R_32x2_7_0=24
143 
144  /* 4 rounds: minHW = 4 [ 4 4 4 4 ]
145  // 5 rounds: minHW = 6 [ 6 8 6 8 ]
146  // 6 rounds: minHW = 9 [ 9 12 9 12 ]
147  // 7 rounds: minHW = 16 [ 16 24 16 24 ]
148  // 8 rounds: minHW = 32 [ 32 32 32 32 ]
149  // 9 rounds: minHW = 32 [ 32 32 32 32 ]
150  //10 rounds: minHW = 32 [ 32 32 32 32 ]
151  //11 rounds: minHW = 32 [ 32 32 32 32 ] */
152  };
153 
/* Word counts of the two block layouts (2 words and 4 words).  In the part
   of the file visible here these names occur only in comments such as
   "X.v[WCNT4-1] += r"; the generated functions spell the counts as literals
   (2-1, 4-1). */
154 enum r123_enum_threefry_wcnt {
155  WCNT2=2,
156  WCNT4=4
157 };
158 
159 #if R123_USE_64BIT
/* RotL_64: rotate the 64-bit value x left by N bit positions and return the
   result.  Both shift counts are masked with 63, so N is effectively taken
   modulo 64 and neither shift count ever reaches the operand width (N == 0
   yields x unchanged).  Only compiled when R123_USE_64BIT is nonzero.  The
   first line routes the prototype through R123_FORCE_INLINE, which is
   defined outside this file. */
160 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
161 R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
162 {
163  return (x << (N & 63)) | (x >> ((64-N) & 63));
164 }
165 #endif
166 
/* RotL_32: rotate the 32-bit value x left by N bit positions and return the
   result.  Both shift counts are masked with 31, so N is effectively taken
   modulo 32 and neither shift count ever reaches the operand width (N == 0
   yields x unchanged).  The first line routes the prototype through
   R123_FORCE_INLINE, which is defined outside this file. */
167 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
168 R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
169 {
170  return (x << (N & 31)) | (x >> ((32-N) & 31));
171 }
172 
/* SKEIN_MK_64(hi32,lo32) assembles a 64-bit value from two 32-bit halves.
   SKEIN_KS_PARITY64 / SKEIN_KS_PARITY32 are the constants from which the
   _tpl macros start the extra key-schedule word (ks[N] = parity, then XORed
   with every key word).  The 32-bit constant equals the high half of the
   64-bit one. */
173 #define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
174 #define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
175 #define SKEIN_KS_PARITY32 0x1BD11BDA
176 
/* Default round counts, one per variant.  Each is only defined if the
   includer has not already defined it, so it can be overridden before this
   header is included.  The _tpl macros turn each value into the enumerator
   threefryNxW_rounds, which the threefryNxW(in, k) entry points pass as
   Nrounds to threefryNxW_R. */
179 #ifndef THREEFRY2x32_DEFAULT_ROUNDS
180 #define THREEFRY2x32_DEFAULT_ROUNDS 20
181 #endif
182 
183 #ifndef THREEFRY2x64_DEFAULT_ROUNDS
184 #define THREEFRY2x64_DEFAULT_ROUNDS 20
185 #endif
186 
187 #ifndef THREEFRY4x32_DEFAULT_ROUNDS
188 #define THREEFRY4x32_DEFAULT_ROUNDS 20
189 #endif
190 
191 #ifndef THREEFRY4x64_DEFAULT_ROUNDS
192 #define THREEFRY4x64_DEFAULT_ROUNDS 20
193 #endif
194 
/* _threefry2x_tpl(W): generate the 2-word Threefry API for word width W.
 
   Expanding the macro defines:
     - threefry2x<W>_ctr_t, _key_t, _ukey_t: all typedefs of
       struct r123array2x<W>;
     - threefry2x<W>keyinit(uk): returns uk unchanged;
     - threefry2x<W>_R(Nrounds, in, k): the round function described below;
     - enum constant threefry2x<W>_rounds = THREEFRY2x<W>_DEFAULT_ROUNDS;
     - threefry2x<W>(in, k): calls threefry2x<W>_R with threefry2x<W>_rounds.
 
   threefry2x<W>_R:
     - Key schedule: ks[0], ks[1] are the key words; ks[2] is
       SKEIN_KS_PARITY<W> XORed with both key words.
     - The counter words are copied to X and ks[0], ks[1] are added.
     - Rounds are fully unrolled, each guarded by "if(Nrounds>r)".  Round r
       does X0 += X1; X1 = RotL(X1, R_<W>x2_<r mod 8>_0); X1 ^= X0.
     - After every fourth round (guards Nrounds>3, >7, ...) key injection
       number r adds ks[r mod 3] to X0 and ks[(r+1) mod 3] + r to X1.
     - Only 32 rounds are unrolled.  R123_ASSERT(Nrounds<=32) guards this;
       R123_ASSERT is defined outside this file, and if it expands to
       nothing a larger Nrounds simply yields the 32-round result.
     - Returns X. */
195 #define _threefry2x_tpl(W) \
196 typedef struct r123array2x##W threefry2x##W##_ctr_t; \
197 typedef struct r123array2x##W threefry2x##W##_key_t; \
198 typedef struct r123array2x##W threefry2x##W##_ukey_t; \
199 R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
200 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
201 R123_CUDA_DEVICE R123_STATIC_INLINE \
202 threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
203  threefry2x##W##_ctr_t X; \
204  uint##W##_t ks[2+1]; \
205  int i; /* avoid size_t to avoid need for stddef.h */ \
206  R123_ASSERT(Nrounds<=32); \
207  ks[2] = SKEIN_KS_PARITY##W; \
208  for (i=0;i < 2; i++) \
209  { \
210  ks[i] = k.v[i]; \
211  X.v[i] = in.v[i]; \
212  ks[2] ^= k.v[i]; \
213  } \
214  \
215  /* Insert initial key before round 0 */ \
216  X.v[0] += ks[0]; X.v[1] += ks[1]; \
217  \
218  if(Nrounds>0){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
219  if(Nrounds>1){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
220  if(Nrounds>2){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
221  if(Nrounds>3){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
222  if(Nrounds>3){ \
223  /* InjectKey(r=1) */ \
224  X.v[0] += ks[1]; X.v[1] += ks[2]; \
225  X.v[1] += 1; /* X.v[2-1] += r */ \
226  } \
227  if(Nrounds>4){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
228  if(Nrounds>5){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
229  if(Nrounds>6){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
230  if(Nrounds>7){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
231  if(Nrounds>7){ \
232  /* InjectKey(r=2) */ \
233  X.v[0] += ks[2]; X.v[1] += ks[0]; \
234  X.v[1] += 2; \
235  } \
236  if(Nrounds>8){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
237  if(Nrounds>9){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
238  if(Nrounds>10){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
239  if(Nrounds>11){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
240  if(Nrounds>11){ \
241  /* InjectKey(r=3) */ \
242  X.v[0] += ks[0]; X.v[1] += ks[1]; \
243  X.v[1] += 3; \
244  } \
245  if(Nrounds>12){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
246  if(Nrounds>13){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
247  if(Nrounds>14){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
248  if(Nrounds>15){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
249  if(Nrounds>15){ \
250  /* InjectKey(r=4) */ \
251  X.v[0] += ks[1]; X.v[1] += ks[2]; \
252  X.v[1] += 4; \
253  } \
254  if(Nrounds>16){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
255  if(Nrounds>17){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
256  if(Nrounds>18){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
257  if(Nrounds>19){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
258  if(Nrounds>19){ \
259  /* InjectKey(r=5) */ \
260  X.v[0] += ks[2]; X.v[1] += ks[0]; \
261  X.v[1] += 5; \
262  } \
263  if(Nrounds>20){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
264  if(Nrounds>21){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
265  if(Nrounds>22){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
266  if(Nrounds>23){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
267  if(Nrounds>23){ \
268  /* InjectKey(r=6) */ \
269  X.v[0] += ks[0]; X.v[1] += ks[1]; \
270  X.v[1] += 6; \
271  } \
272  if(Nrounds>24){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
273  if(Nrounds>25){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
274  if(Nrounds>26){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
275  if(Nrounds>27){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
276  if(Nrounds>27){ \
277  /* InjectKey(r=7) */ \
278  X.v[0] += ks[1]; X.v[1] += ks[2]; \
279  X.v[1] += 7; \
280  } \
281  if(Nrounds>28){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
282  if(Nrounds>29){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
283  if(Nrounds>30){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
284  if(Nrounds>31){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
285  if(Nrounds>31){ \
286  /* InjectKey(r=8) */ \
287  X.v[0] += ks[2]; X.v[1] += ks[0]; \
288  X.v[1] += 8; \
289  } \
290  return X; \
291 } \
292  \
293 enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
294 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
295 R123_CUDA_DEVICE R123_STATIC_INLINE \
296 threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
297  return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
298 }
299 
300 
/* _threefry4x_tpl(W): generate the 4-word Threefry API for word width W.
 
   Expanding the macro defines:
     - threefry4x<W>_ctr_t, _key_t, _ukey_t: all typedefs of
       struct r123array4x<W>;
     - threefry4x<W>keyinit(uk): returns uk unchanged;
     - threefry4x<W>_R(Nrounds, in, k): the round function described below;
     - enum constant threefry4x<W>_rounds = THREEFRY4x<W>_DEFAULT_ROUNDS;
     - threefry4x<W>(in, k): calls threefry4x<W>_R with threefry4x<W>_rounds.
 
   threefry4x<W>_R:
     - Key schedule: ks[0..3] are the key words; ks[4] is SKEIN_KS_PARITY<W>
       XORed with all four key words.
     - The counter words are copied to X and ks[0..3] are added.
     - Rounds are fully unrolled, each guarded by "if(Nrounds>r)".  With
       d = r mod 8, even rounds mix the pairs (X0,X1) and (X2,X3); odd rounds
       mix (X0,X3) and (X2,X1).  Each mix is a += b; b = RotL(b, R); b ^= a,
       using R_<W>x4_<d>_0 for the first pair and R_<W>x4_<d>_1 for the
       second.
     - After every fourth round (guards Nrounds>3, >7, ...) key injection
       number r adds ks[(r+j) mod 5] to X.v[j] for j = 0..3 and then adds r
       to X.v[3].
     - Only 72 rounds (18 key injections) are unrolled.
       R123_ASSERT(Nrounds<=72) guards this; R123_ASSERT is defined outside
       this file, and if it expands to nothing a larger Nrounds simply
       yields the 72-round result.
     - Returns X. */
301 #define _threefry4x_tpl(W) \
302 typedef struct r123array4x##W threefry4x##W##_ctr_t; \
303 typedef struct r123array4x##W threefry4x##W##_key_t; \
304 typedef struct r123array4x##W threefry4x##W##_ukey_t; \
305 R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
306 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
307 R123_CUDA_DEVICE R123_STATIC_INLINE \
308 threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
309  threefry4x##W##_ctr_t X; \
310  uint##W##_t ks[4+1]; \
311  int i; /* avoid size_t to avoid need for stddef.h */ \
312  R123_ASSERT(Nrounds<=72); \
313  ks[4] = SKEIN_KS_PARITY##W; \
314  for (i=0;i < 4; i++) \
315  { \
316  ks[i] = k.v[i]; \
317  X.v[i] = in.v[i]; \
318  ks[4] ^= k.v[i]; \
319  } \
320  \
321  /* Insert initial key before round 0 */ \
322  X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
323  \
324  if(Nrounds>0){ \
325  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
326  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
327  } \
328  if(Nrounds>1){ \
329  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
330  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
331  } \
332  if(Nrounds>2){ \
333  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
334  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
335  } \
336  if(Nrounds>3){ \
337  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
338  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
339  } \
340  if(Nrounds>3){ \
341  /* InjectKey(r=1) */ \
342  X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
343  X.v[4-1] += 1; /* X.v[WCNT4-1] += r */ \
344  } \
345  \
346  if(Nrounds>4){ \
347  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
348  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
349  } \
350  if(Nrounds>5){ \
351  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
352  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
353  } \
354  if(Nrounds>6){ \
355  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
356  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
357  } \
358  if(Nrounds>7){ \
359  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
360  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
361  } \
362  if(Nrounds>7){ \
363  /* InjectKey(r=2) */ \
364  X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
365  X.v[4-1] += 2; /* X.v[WCNT4-1] += r */ \
366  } \
367  \
368  if(Nrounds>8){ \
369  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
370  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
371  } \
372  if(Nrounds>9){ \
373  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
374  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
375  } \
376  if(Nrounds>10){ \
377  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
378  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
379  } \
380  if(Nrounds>11){ \
381  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
382  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
383  } \
384  if(Nrounds>11){ \
385  /* InjectKey(r=3) */ \
386  X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
387  X.v[4-1] += 3; /* X.v[WCNT4-1] += r */ \
388  } \
389  \
390  if(Nrounds>12){ \
391  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
392  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
393  } \
394  if(Nrounds>13){ \
395  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
396  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
397  } \
398  if(Nrounds>14){ \
399  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
400  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
401  } \
402  if(Nrounds>15){ \
403  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
404  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
405  } \
406  if(Nrounds>15){ \
407  /* InjectKey(r=4) */ \
408  X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
409  X.v[4-1] += 4; /* X.v[WCNT4-1] += r */ \
410  } \
411  \
412  if(Nrounds>16){ \
413  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
414  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
415  } \
416  if(Nrounds>17){ \
417  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
418  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
419  } \
420  if(Nrounds>18){ \
421  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
422  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
423  } \
424  if(Nrounds>19){ \
425  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
426  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
427  } \
428  if(Nrounds>19){ \
429  /* InjectKey(r=5) */ \
430  X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
431  X.v[4-1] += 5; /* X.v[WCNT4-1] += r */ \
432  } \
433  \
434  if(Nrounds>20){ \
435  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
436  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
437  } \
438  if(Nrounds>21){ \
439  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
440  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
441  } \
442  if(Nrounds>22){ \
443  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
444  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
445  } \
446  if(Nrounds>23){ \
447  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
448  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
449  } \
450  if(Nrounds>23){ \
451  /* InjectKey(r=6) */ \
452  X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
453  X.v[4-1] += 6; /* X.v[WCNT4-1] += r */ \
454  } \
455  \
456  if(Nrounds>24){ \
457  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
458  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
459  } \
460  if(Nrounds>25){ \
461  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
462  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
463  } \
464  if(Nrounds>26){ \
465  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
466  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
467  } \
468  if(Nrounds>27){ \
469  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
470  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
471  } \
472  if(Nrounds>27){ \
473  /* InjectKey(r=7) */ \
474  X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
475  X.v[4-1] += 7; /* X.v[WCNT4-1] += r */ \
476  } \
477  \
478  if(Nrounds>28){ \
479  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
480  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
481  } \
482  if(Nrounds>29){ \
483  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
484  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
485  } \
486  if(Nrounds>30){ \
487  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
488  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
489  } \
490  if(Nrounds>31){ \
491  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
492  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
493  } \
494  if(Nrounds>31){ \
495  /* InjectKey(r=8) */ \
496  X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
497  X.v[4-1] += 8; /* X.v[WCNT4-1] += r */ \
498  } \
499  \
500  if(Nrounds>32){ \
501  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
502  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
503  } \
504  if(Nrounds>33){ \
505  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
506  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
507  } \
508  if(Nrounds>34){ \
509  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
510  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
511  } \
512  if(Nrounds>35){ \
513  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
514  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
515  } \
516  if(Nrounds>35){ \
517  /* InjectKey(r=9) */ \
518  X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
519  X.v[4-1] += 9; /* X.v[WCNT4-1] += r */ \
520  } \
521  \
522  if(Nrounds>36){ \
523  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
524  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
525  } \
526  if(Nrounds>37){ \
527  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
528  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
529  } \
530  if(Nrounds>38){ \
531  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
532  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
533  } \
534  if(Nrounds>39){ \
535  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
536  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
537  } \
538  if(Nrounds>39){ \
539  /* InjectKey(r=10) */ \
540  X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
541  X.v[4-1] += 10; /* X.v[WCNT4-1] += r */ \
542  } \
543  \
544  if(Nrounds>40){ \
545  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
546  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
547  } \
548  if(Nrounds>41){ \
549  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
550  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
551  } \
552  if(Nrounds>42){ \
553  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
554  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
555  } \
556  if(Nrounds>43){ \
557  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
558  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
559  } \
560  if(Nrounds>43){ \
561  /* InjectKey(r=11) */ \
562  X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
563  X.v[4-1] += 11; /* X.v[WCNT4-1] += r */ \
564  } \
565  \
566  if(Nrounds>44){ \
567  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
568  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
569  } \
570  if(Nrounds>45){ \
571  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
572  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
573  } \
574  if(Nrounds>46){ \
575  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
576  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
577  } \
578  if(Nrounds>47){ \
579  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
580  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
581  } \
582  if(Nrounds>47){ \
583  /* InjectKey(r=12) */ \
584  X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
585  X.v[4-1] += 12; /* X.v[WCNT4-1] += r */ \
586  } \
587  \
588  if(Nrounds>48){ \
589  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
590  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
591  } \
592  if(Nrounds>49){ \
593  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
594  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
595  } \
596  if(Nrounds>50){ \
597  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
598  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
599  } \
600  if(Nrounds>51){ \
601  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
602  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
603  } \
604  if(Nrounds>51){ \
605  /* InjectKey(r=13) */ \
606  X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
607  X.v[4-1] += 13; /* X.v[WCNT4-1] += r */ \
608  } \
609  \
610  if(Nrounds>52){ \
611  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
612  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
613  } \
614  if(Nrounds>53){ \
615  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
616  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
617  } \
618  if(Nrounds>54){ \
619  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
620  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
621  } \
622  if(Nrounds>55){ \
623  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
624  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
625  } \
626  if(Nrounds>55){ \
627  /* InjectKey(r=14) */ \
628  X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
629  X.v[4-1] += 14; /* X.v[WCNT4-1] += r */ \
630  } \
631  \
632  if(Nrounds>56){ \
633  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
634  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
635  } \
636  if(Nrounds>57){ \
637  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
638  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
639  } \
640  if(Nrounds>58){ \
641  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
642  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
643  } \
644  if(Nrounds>59){ \
645  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
646  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
647  } \
648  if(Nrounds>59){ \
649  /* InjectKey(r=15) */ \
650  X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
651  X.v[4-1] += 15; /* X.v[WCNT4-1] += r */ \
652  } \
653  \
654  if(Nrounds>60){ \
655  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
656  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
657  } \
658  if(Nrounds>61){ \
659  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
660  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
661  } \
662  if(Nrounds>62){ \
663  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
664  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
665  } \
666  if(Nrounds>63){ \
667  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
668  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
669  } \
670  if(Nrounds>63){ \
671  /* InjectKey(r=16) */ \
672  X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
673  X.v[4-1] += 16; /* X.v[WCNT4-1] += r */ \
674  } \
675  \
676  if(Nrounds>64){ \
677  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
678  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
679  } \
680  if(Nrounds>65){ \
681  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
682  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
683  } \
684  if(Nrounds>66){ \
685  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
686  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
687  } \
688  if(Nrounds>67){ \
689  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
690  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
691  } \
692  if(Nrounds>67){ \
693  /* InjectKey(r=17) */ \
694  X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
695  X.v[4-1] += 17; /* X.v[WCNT4-1] += r */ \
696  } \
697  \
698  if(Nrounds>68){ \
699  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
700  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
701  } \
702  if(Nrounds>69){ \
703  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
704  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
705  } \
706  if(Nrounds>70){ \
707  X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
708  X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
709  } \
710  if(Nrounds>71){ \
711  X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
712  X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
713  } \
714  if(Nrounds>71){ \
715  /* InjectKey(r=18) */ \
716  X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
717  X.v[4-1] += 18; /* X.v[WCNT4-1] += r */ \
718  } \
719  \
720  return X; \
721 } \
722  \
723  \
724 enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
725 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
726 R123_CUDA_DEVICE R123_STATIC_INLINE \
727 threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
728  return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
729 }
730 
/* Instantiate the 64-bit 2-word and 4-word variants, but only when
   R123_USE_64BIT is nonzero (the same condition that guards RotL_64).
   NOTE(review): no 32-bit instantiation is visible in this listing, whose
   numbering skips lines here; presumably it follows in the original file --
   confirm against the original header. */
731 #if R123_USE_64BIT
732 _threefry2x_tpl(64)
733 _threefry4x_tpl(64)
734 #endif
737 
738 /* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
739  than a static inline function. Why? */
/* Each function-like macro below has the same name as the threefryNxW(in, k)
   inline function generated by the _tpl macros.  From this point on, a call
   written as threefryNxW(c, k) expands directly to
   threefryNxW_R(threefryNxW_rounds, c, k) instead of calling that function. */
740 #define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
741 #define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
742 #define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
743 #define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
744 
745 #if defined(__cplusplus)
/* _threefryNxWclass_tpl(NxW): generate the C++ functor wrappers for one
   variant (NxW is a token such as 2x32).
 
   Inside namespace r123 it defines:
     - template<unsigned int ROUNDS> struct Threefry<NxW>_R with member
       typedefs ctr_type, key_type and ukey_type (ukey_type is the same
       threefry<NxW>_key_t as key_type), a static constant rounds == ROUNDS,
       and operator()(ctr, key), which returns
       threefry<NxW>_R(ROUNDS, ctr, key);
     - typedef Threefry<NxW>, the instantiation with the default round count
       threefry<NxW>_rounds.
 
   NOTE(review): the static assertion accepts ROUNDS <= 72 for every NxW,
   but the 2-word functions generated by _threefry2x_tpl only unroll 32
   rounds and assert Nrounds <= 32 at run time.  Threefry2x<W>_R<ROUNDS>
   with 32 < ROUNDS <= 72 therefore passes this compile-time check and only
   fails (or silently computes 32 rounds) inside threefry2x<W>_R. */
746 #define _threefryNxWclass_tpl(NxW) \
747 namespace r123{ \
748 template<unsigned int ROUNDS> \
749  struct Threefry##NxW##_R{ \
750  typedef threefry##NxW##_ctr_t ctr_type; \
751  typedef threefry##NxW##_key_t key_type; \
752  typedef threefry##NxW##_key_t ukey_type; \
753  static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
754  inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
755  R123_STATIC_ASSERT(ROUNDS<=72, "threefry is only unrolled up to 72 rounds\n"); \
756  return threefry##NxW##_R(ROUNDS, ctr, key); \
757  } \
758 }; \
759  typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
760 } // namespace r123
761 
764 #if R123_USE_64BIT
767 #endif
768 
769 /* The _tpl macros don't quite work to do string-pasting inside comments,
770  so we just write out the boilerplate documentation four times... */
771 
868 #endif
869 
870 #endif
#define _threefry4x_tpl(W)
Definition: threefry.h:301
#define _threefry2x_tpl(W)
Definition: threefry.h:195
#define _threefryNxWclass_tpl(NxW)
Definition: threefry.h:746