1
2 """Encapsulates all necessary things for a cluster-job, like setting up, running, restarting"""
3
4 import os,sys
5 from os import path,unlink
6 from threading import Thread,Lock,Timer
7
8 from PyFoam.Applications.Decomposer import Decomposer
9 from PyFoam.Applications.Runner import Runner
10 from PyFoam.Applications.SteadyRunner import SteadyRunner
11 from PyFoam.Applications.CloneCase import CloneCase
12 from PyFoam.FoamInformation import changeFoamVersion
13 from PyFoam.Error import error,warning
14 from PyFoam import configuration as config
15 from PyFoam.FoamInformation import oldAppConvention as oldApp
16
30
31
33 """ All Cluster-jobs are to be derived from this base-class
34
35 The actual jobs are implemented by overriding methods
36
37 There is a number of variables in this class that are used to
38 'communicate' information between the various stages"""
39
    def __init__(self,basename,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=True,
                 foamVersion=None,
                 useFoamMPI=False,
                 multiRegion=False):
        """Initializes the Job
        @param basename: Basis name of the job
        @param arrayJob: this job is a parameter variation. The tasks
        are identified by their task-id
        @param hardRestart: treat the job as restarted
        @param autoParallel: Parallelization is handled by the base-class
        @param doAutoReconstruct: Automatically reconstruct the case if autoParallel is set
        @param foamVersion: The foam-Version that is to be used
        @param useFoamMPI: Use the OpenMPI supplied with OpenFOAM
        @param multiRegion: This job consists of multiple regions"""

        # SGE sets JOB_ID for every job it starts; without it we are not
        # running under the Grid Engine and cannot continue
        if not os.environ.has_key("JOB_ID"):
            error("Not an SGE-job. Environment variable JOB_ID is missing")
        self.jobID=int(os.environ["JOB_ID"])
        self.jobName=os.environ["JOB_NAME"]

        # absolute path of the case (base name for array jobs)
        self.basename=path.join(path.abspath(path.curdir),basename)

        # SGE sets RESTARTED to a non-zero value for restarted jobs
        sgeRestarted=False
        if os.environ.has_key("RESTARTED"):
            sgeRestarted=(int(os.environ["RESTARTED"])!=0)

        # either the scheduler or the caller may declare this a restart
        if sgeRestarted or hardRestart:
            self.restarted=True
        else:
            self.restarted=False

        # fall back to the OpenFOAM-version from the configuration
        if foamVersion==None:
            foamVersion=config().get("OpenFOAM","Version")

        # modifies the process environment so that the requested
        # OpenFOAM-version is used by everything started from here
        changeFoamVersion(foamVersion)

        # changeFoamVersion should have set this; if not, no usable
        # OpenFOAM-installation was found
        if not os.environ.has_key("WM_PROJECT_VERSION"):
            error("No OpenFOAM-Version seems to be configured. Set the foamVersion-parameter")

        self.autoParallel=autoParallel
        self.doAutoReconstruct=doAutoReconstruct
        self.multiRegion=multiRegion

        # serial defaults; overridden below if SGE granted more slots
        self.hostfile=None
        self.nproc=1

        # NSLOTS is set by SGE to the number of slots granted to this job
        if os.environ.has_key("NSLOTS"):
            self.nproc=int(os.environ["NSLOTS"])
            self.message("Running on",self.nproc,"CPUs")
            if self.nproc>1:
                # NOTE(review): assumes the parallel environment writes a
                # machine-file to $TMP/machines -- confirm for the local PE
                self.hostfile=path.join(os.environ["TMP"],"machines")
                self.message("Using the machinefile",self.hostfile)
                self.message("Contents of the machinefile:",open(self.hostfile).readlines())

        self.ordinaryEnd=True
        self.listenToTimer=False

        self.taskID=None
        self.arrayJob=arrayJob

        # array jobs get their task-id from SGE
        if self.arrayJob:
            self.taskID=int(os.environ["SGE_TASK_ID"])

        # unless the OpenFOAM-bundled MPI is to be used (either requested or
        # configured for this foamVersion), prepend the cluster-specific
        # MPI paths from the configuration
        if not useFoamMPI and not foamVersion in eval(config().get("ClusterJob","useFoamMPI",default='[]')):

            self.message("Adding Cluster-specific paths")
            os.environ["PATH"]=config().get("ClusterJob","path")+":"+os.environ["PATH"]
            os.environ["LD_LIBRARY_PATH"]=config().get("ClusterJob","ldpath")+":"+os.environ["LD_LIBRARY_PATH"]

        # set to True while the case is decomposed (drives parallel runs)
        self.isDecomposed=False
119 """Return a string with the full job-ID"""
120 result=str(self.jobID)
121 if self.arrayJob:
122 result+=":"+str(self.taskID)
123 return result
124
126 print "=== CLUSTERJOB: ",
127 for t in txt:
128 print t,
129 print " ==="
130 sys.stdout.flush()
131
133 self.message("Setting Job state to",txt)
134 fName=path.join(self.casedir(),"ClusterJobState")
135 f=open(fName,"w")
136 f.write(txt+"\n")
137 f.close()
138
140 """The file with the job information"""
141 jobfile="%s.%d" % (self.jobName,self.jobID)
142 if self.arrayJob:
143 jobfile+=".%d" % self.taskID
144 jobfile+=".pyFoam.clusterjob"
145 jobfile=path.join(path.dirname(self.basename),jobfile)
146
147 return jobfile
148
150 """The file that makes the job write a checkpoint"""
151 return self.jobFile()+".checkpoint"
152
154 """The file that makes the job write a checkpoint and end"""
155 return self.jobFile()+".stop"
156
223
225 """Returns the actual directory of the case
226 To be overridden if appropriate"""
227 if self.arrayJob:
228 return "%s.%05d" % (self.basename,self.taskID)
229 else:
230 return self.basename
231
233 """Returns just the name of the case"""
234 return path.basename(self.casedir())
235
236 - def foamRun(self,application,
237 args=[],
238 foamArgs=[],
239 steady=False,
240 multiRegion=None,
241 progress=False,
242 noLog=False):
243 """Runs a foam utility on the case.
244 If it is a parallel job and the grid has
245 already been decomposed (and not yet reconstructed) it is run in
246 parallel
247 @param application: the Foam-Application that is to be run
248 @param foamArgs: A list if with the additional arguments for the
249 Foam-Application
250 @param args: A list with additional arguments for the Runner-object
251 @param steady: Use the steady-runner
252 @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this)
253 @param progress: Only output the time and nothing else
254 @param noLog: Do not generate a logfile"""
255
256 arglist=args[:]
257 arglist+=["--job-id=%s" % self.fullJobId()]
258
259 if self.isDecomposed and self.nproc>1:
260 arglist+=["--procnr=%d" % self.nproc,
261 "--machinefile=%s" % self.hostfile]
262 if progress:
263 arglist+=["--progress"]
264 if noLog:
265 arglist+=["--no-log"]
266
267 if self.multiRegion:
268 if multiRegion==None or multiRegion==True:
269 arglist+=["--all-regions"]
270 elif multiRegion and not self.multiRegion:
271 warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good")
272
273 if self.restarted:
274 arglist+=["--restart"]
275
276 arglist+=[application]
277 if oldApp():
278 arglist+=[".",self.casename()]
279 else:
280 arglist+=["-case",self.casename()]
281
282 arglist+=foamArgs
283
284 self.message("Executing",arglist)
285
286 if steady:
287 self.message("Running Steady")
288 runner=SteadyRunner(args=arglist)
289 else:
290 runner=Runner(args=arglist)
291
293 """Automatically decomposes the grid with a metis-algorithm"""
294
295 if path.isdir(path.join(self.casedir(),"processor0")):
296 warning("A processor directory already exists. There might be a problem")
297 args=["--method=metis",
298 "--clear",
299 self.casename(),
300 self.nproc,
301 "--job-id=%s" % self.fullJobId()]
302
303 if self.multiRegion:
304 args.append("--all-regions")
305
306 deco=Decomposer(args=args)
307
309 """Default reconstruction of a parallel run"""
310
311 if self.doAutoReconstruct:
312 self.foamRun("reconstructPar",
313 args=["--logname=ReconstructPar"])
314 else:
315 self.message("No reconstruction (because asked to)")
316
317 - def setup(self,parameters):
318 """Set up the job. Called in the beginning if the
319 job has not been restarted
320
321 Usual tasks include grid conversion/setup, mesh decomposition etc
322
323 @param parameters: a dictionary with parameters"""
324
325 pass
326
327 - def postDecomposeSetup(self,parameters):
328 """Additional setup, to be executed when the grid is already decomposed
329
330 Usually for tasks that can be done on a decomposed grid
331
332 @param parameters: a dictionary with parameters"""
333
334 pass
335
336 - def run(self,parameters):
337 """Run the actual job. Usually the solver.
338 @param parameters: a dictionary with parameters"""
339
340 pass
341
343 """Additional cleanup, to be executed when the grid is still decomposed
344
345 Usually for tasks that can be done on a decomposed grid
346
347 @param parameters: a dictionary with parameters"""
348
349 pass
350
352 """Clean up after a job
353 @param parameters: a dictionary with parameters"""
354
355 pass
356
358 """Additional reconstruction of parallel runs (Stuff that the
359 OpenFOAM-reconstructPar doesn't do
360 @param parameters: a dictionary with parameters"""
361
362 pass
363
365 """Parameters for a specific task
366 @param id: the id of the task
367 @return: a dictionary with parameters for this task"""
368
369 error("taskParameter not implemented. Not a parameterized job")
370
371 return {}
372
383
385 if self.listenToTimer:
386 self.ordinaryEnd=False
387 f=open(path.join(self.basename,"stop"),"w")
388 f.write("Geh z'haus")
389 f.close()
390 unlink(self.stopFile())
391 else:
392 warning("I'm not listening to your callbacks")
393
395 """A Cluster-Job that executes a solver. It implements the run-function.
396 If a template-case is specified, the case is copied"""
397
398 - def __init__(self,basename,solver,
399 template=None,
400 cloneParameters=[],
401 arrayJob=False,
402 hardRestart=False,
403 autoParallel=True,
404 doAutoReconstruct=True,
405 foamVersion=None,
406 useFoamMPI=False,
407 steady=False,
408 multiRegion=False,
409 progress=False,
410 solverProgress=False,
411 solverNoLog=False):
412 """@param template: Name of the template-case. It is assumed that
413 it resides in the same directory as the actual case
414 @param cloneParameters: a list with additional parameters for the
415 CloneCase-object that copies the template
416 @param solverProgress: Only writes the current time of the solver"""
417
418 ClusterJob.__init__(self,basename,
419 arrayJob=arrayJob,
420 hardRestart=hardRestart,
421 autoParallel=autoParallel,
422 doAutoReconstruct=doAutoReconstruct,
423 foamVersion=foamVersion,
424 useFoamMPI=useFoamMPI,
425 multiRegion=multiRegion)
426 self.solver=solver
427 self.steady=steady
428 if template!=None and not self.restarted:
429 template=path.join(path.dirname(self.casedir()),template)
430 if path.abspath(basename)==path.abspath(template):
431 error("The basename",basename,"and the template",template,"are the same directory")
432 clone=CloneCase(
433 args=cloneParameters+[template,self.casedir(),"--follow-symlinks"])
434 self.solverProgress=solverProgress
435 self.solverNoLog=solverNoLog
436
437 - def run(self,parameters):
438 self.foamRun(self.solver,
439 steady=self.steady,
440 multiRegion=False,
441 progress=self.solverProgress,
442 noLog=self.solverNoLog)
443