Package PyFoam :: Package Infrastructure :: Module ClusterJob
[hide private]
[frames] | [no frames]

Source Code for Module PyFoam.Infrastructure.ClusterJob

  1  #  ICE Revision: $Id: /local/openfoam/Python/PyFoam/PyFoam/Infrastructure/Logging.py 1906 2007-08-28T16:16:19.392553Z bgschaid  $  
  2  """Encapsulates all necessary things for a cluster-job, like setting up, running, restarting""" 
  3   
from __future__ import print_function

import os,sys
from os import path,unlink
from threading import Thread,Lock,Timer

from PyFoam.Applications.Decomposer import Decomposer
from PyFoam.Applications.Runner import Runner
from PyFoam.Applications.SteadyRunner import SteadyRunner
from PyFoam.Applications.CloneCase import CloneCase
from PyFoam.FoamInformation import changeFoamVersion
from PyFoam.FoamInformation import oldAppConvention as oldApp
from PyFoam.Error import error,warning
from PyFoam import configuration as config
 16   
def checkForMessageFromAbove(job):
    """Poll once for the control-files of a job and re-arm the timer.

    If the stop-file exists the job is stopped and polling ends; if the
    checkpoint-file exists a checkpoint is written.  While the job is
    still listening a new one-second Timer re-schedules this check.
    @param job: the ClusterJob instance that is being watched"""
    if not job.listenToTimer:
        return

    if path.exists(job.stopFile()):
        # stopping the job ends the polling loop for good
        job.stopJob()
        return

    if path.exists(job.checkpointFile()):
        job.writeCheckpoint()

    rearm=Timer(1.,checkForMessageFromAbove,args=[job])
    job.timer=rearm
    rearm.start()
class ClusterJob:
    """All Cluster-jobs are to be derived from this base-class

    The actual jobs are implemented by overriding methods

    There is a number of variables in this class that are used to
    'communicate' information between the various stages"""

    def __init__(self,basename,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=True,
                 foamVersion=None,
                 useFoamMPI=False,
                 multiRegion=False):
        """Initializes the Job
        @param basename: Basis name of the job
        @param arrayJob: this job is a parameter variation. The tasks
        are identified by their task-id
        @param hardRestart: treat the job as restarted
        @param autoParallel: Parallelization is handled by the base-class
        @param doAutoReconstruct: Automatically reconstruct the case if
        autoParallel is set
        @param foamVersion: The foam-Version that is to be used
        @param useFoamMPI: Use the OpenMPI supplied with OpenFOAM
        @param multiRegion: This job consists of multiple regions"""

        # abort early if we are not running under SGE at all
        # ('in' instead of the Python-2-only has_key)
        if "JOB_ID" not in os.environ:
            error("Not an SGE-job. Environment variable JOB_ID is missing")
        self.jobID=int(os.environ["JOB_ID"])
        self.jobName=os.environ["JOB_NAME"]

        self.basename=path.join(path.abspath(path.curdir),basename)

        # SGE sets RESTARTED to a non-zero value if the scheduler restarted the job
        sgeRestarted=False
        if "RESTARTED" in os.environ:
            sgeRestarted=(int(os.environ["RESTARTED"])!=0)

        self.restarted=bool(sgeRestarted or hardRestart)

        if foamVersion is None:
            foamVersion=config().get("OpenFOAM","Version")

        changeFoamVersion(foamVersion)

        if "WM_PROJECT_VERSION" not in os.environ:
            error("No OpenFOAM-Version seems to be configured. Set the foamVersion-parameter")

        self.autoParallel=autoParallel
        self.doAutoReconstruct=doAutoReconstruct
        self.multiRegion=multiRegion

        self.hostfile=None
        self.nproc=1

        if "NSLOTS" in os.environ:
            self.nproc=int(os.environ["NSLOTS"])
            self.message("Running on",self.nproc,"CPUs")
            if self.nproc>1:
                # the parallel environment writes a machine-file into $TMP
                self.hostfile=path.join(os.environ["TMP"],"machines")
                self.message("Using the machinefile",self.hostfile)
                self.message("Contents of the machinefile:",open(self.hostfile).readlines())

        self.ordinaryEnd=True
        self.listenToTimer=False

        self.taskID=None
        self.arrayJob=arrayJob

        if self.arrayJob:
            self.taskID=int(os.environ["SGE_TASK_ID"])

        # NOTE(review): eval() on a configuration value - only acceptable
        # because the configuration files are trusted input
        if not useFoamMPI and not foamVersion in eval(config().get("ClusterJob","useFoamMPI",default='[]')):
            ## prepend special paths for the cluster
            self.message("Adding Cluster-specific paths")
            os.environ["PATH"]=config().get("ClusterJob","path")+":"+os.environ["PATH"]
            os.environ["LD_LIBRARY_PATH"]=config().get("ClusterJob","ldpath")+":"+os.environ["LD_LIBRARY_PATH"]

        self.isDecomposed=False
118 - def fullJobId(self):
119 """Return a string with the full job-ID""" 120 result=str(self.jobID) 121 if self.arrayJob: 122 result+=":"+str(self.taskID) 123 return result
124
125 - def message(self,*txt):
126 print "=== CLUSTERJOB: ", 127 for t in txt: 128 print t, 129 print " ===" 130 sys.stdout.flush()
131
132 - def setState(self,txt):
133 self.message("Setting Job state to",txt) 134 fName=path.join(self.casedir(),"ClusterJobState") 135 f=open(fName,"w") 136 f.write(txt+"\n") 137 f.close()
138
139 - def jobFile(self):
140 """The file with the job information""" 141 jobfile="%s.%d" % (self.jobName,self.jobID) 142 if self.arrayJob: 143 jobfile+=".%d" % self.taskID 144 jobfile+=".pyFoam.clusterjob" 145 jobfile=path.join(path.dirname(self.basename),jobfile) 146 147 return jobfile
148
149 - def checkpointFile(self):
150 """The file that makes the job write a checkpoint""" 151 return self.jobFile()+".checkpoint"
152
153 - def stopFile(self):
154 """The file that makes the job write a checkpoint and end""" 155 return self.jobFile()+".stop"
156
    def doIt(self):
        """The central logic. Runs the job, sets it up etc"""

        # announce the job by writing the case name into the job-file
        f=open(self.jobFile(),"w")
        f.write(path.basename(self.basename)+"\n")
        f.close()

        self.message()
        self.message("Running on directory",self.casename())
        self.message()
        self.setState("Starting up")

        parameters=None
        if self.arrayJob:
            # array jobs get their parameter set from the task-id
            parameters=self.taskParameters(self.taskID)
            self.message("Parameters:",parameters)
        if not self.restarted:
            # fresh start: set up, decompose (if parallel), then second setup pass
            self.setState("Setting up")
            self.setup(parameters)
            if self.autoParallel and self.nproc>1:
                self.setState("Decomposing")
                self.autoDecompose()

            self.isDecomposed=True

            self.setState("Setting up 2")
            self.postDecomposeSetup(parameters)
        else:
            # restarted jobs are assumed to be already set up
            self.setState("Restarting")

        self.isDecomposed=True

        self.setState("Running")
        # start listening for stop/checkpoint control-files while running
        self.listenToTimer=True
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
        self.timer.start()

        self.run(parameters)
        self.listenToTimer=False

        if path.exists(self.jobFile()):
            unlink(self.jobFile())

        if self.ordinaryEnd:
            # normal termination: cleanup, reconstruct, final cleanup
            self.setState("Post Running")
            self.preReconstructCleanup(parameters)

            self.isDecomposed=False

            if self.autoParallel and self.nproc>1:
                self.setState("Reconstructing")
                self.autoReconstruct()

            # NOTE(review): nproc>0 is always true here (it defaults to 1),
            # so additionalReconstruct runs unconditionally - possibly
            # nproc>1 was intended; confirm before changing
            if self.nproc>0:
                self.additionalReconstruct(parameters)

            self.setState("Cleaning")
            self.cleanup(parameters)
            self.setState("Finished")
        else:
            # stopJob() was triggered from the outside
            self.setState("Suspended")

        # remove leftover control-files so a later run starts clean
        if path.exists(self.stopFile()):
            unlink(self.stopFile())
        if path.exists(self.checkpointFile()):
            unlink(self.checkpointFile())
224 - def casedir(self):
225 """Returns the actual directory of the case 226 To be overridden if appropriate""" 227 if self.arrayJob: 228 return "%s.%05d" % (self.basename,self.taskID) 229 else: 230 return self.basename
231
232 - def casename(self):
233 """Returns just the name of the case""" 234 return path.basename(self.casedir())
235
236 - def foamRun(self,application, 237 args=[], 238 foamArgs=[], 239 steady=False, 240 multiRegion=None, 241 progress=False, 242 noLog=False):
243 """Runs a foam utility on the case. 244 If it is a parallel job and the grid has 245 already been decomposed (and not yet reconstructed) it is run in 246 parallel 247 @param application: the Foam-Application that is to be run 248 @param foamArgs: A list if with the additional arguments for the 249 Foam-Application 250 @param args: A list with additional arguments for the Runner-object 251 @param steady: Use the steady-runner 252 @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this) 253 @param progress: Only output the time and nothing else 254 @param noLog: Do not generate a logfile""" 255 256 arglist=args[:] 257 arglist+=["--job-id=%s" % self.fullJobId()] 258 259 if self.isDecomposed and self.nproc>1: 260 arglist+=["--procnr=%d" % self.nproc, 261 "--machinefile=%s" % self.hostfile] 262 if progress: 263 arglist+=["--progress"] 264 if noLog: 265 arglist+=["--no-log"] 266 267 if self.multiRegion: 268 if multiRegion==None or multiRegion==True: 269 arglist+=["--all-regions"] 270 elif multiRegion and not self.multiRegion: 271 warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good") 272 273 if self.restarted: 274 arglist+=["--restart"] 275 276 arglist+=[application] 277 if oldApp(): 278 arglist+=[".",self.casename()] 279 else: 280 arglist+=["-case",self.casename()] 281 282 arglist+=foamArgs 283 284 self.message("Executing",arglist) 285 286 if steady: 287 self.message("Running Steady") 288 runner=SteadyRunner(args=arglist) 289 else: 290 runner=Runner(args=arglist)
291
292 - def autoDecompose(self):
293 """Automatically decomposes the grid with a metis-algorithm""" 294 295 if path.isdir(path.join(self.casedir(),"processor0")): 296 warning("A processor directory already exists. There might be a problem") 297 args=["--method=metis", 298 "--clear", 299 self.casename(), 300 self.nproc, 301 "--job-id=%s" % self.fullJobId()] 302 303 if self.multiRegion: 304 args.append("--all-regions") 305 306 deco=Decomposer(args=args)
307
308 - def autoReconstruct(self):
309 """Default reconstruction of a parallel run""" 310 311 if self.doAutoReconstruct: 312 self.foamRun("reconstructPar", 313 args=["--logname=ReconstructPar"]) 314 else: 315 self.message("No reconstruction (because asked to)")
316
317 - def setup(self,parameters):
318 """Set up the job. Called in the beginning if the 319 job has not been restarted 320 321 Usual tasks include grid conversion/setup, mesh decomposition etc 322 323 @param parameters: a dictionary with parameters""" 324 325 pass
326
327 - def postDecomposeSetup(self,parameters):
328 """Additional setup, to be executed when the grid is already decomposed 329 330 Usually for tasks that can be done on a decomposed grid 331 332 @param parameters: a dictionary with parameters""" 333 334 pass
335
336 - def run(self,parameters):
337 """Run the actual job. Usually the solver. 338 @param parameters: a dictionary with parameters""" 339 340 pass
341
342 - def preReconstructCleanup(self,parameters):
343 """Additional cleanup, to be executed when the grid is still decomposed 344 345 Usually for tasks that can be done on a decomposed grid 346 347 @param parameters: a dictionary with parameters""" 348 349 pass
350
351 - def cleanup(self,parameters):
352 """Clean up after a job 353 @param parameters: a dictionary with parameters""" 354 355 pass
356
357 - def additionalReconstruct(self,parameters):
358 """Additional reconstruction of parallel runs (Stuff that the 359 OpenFOAM-reconstructPar doesn't do 360 @param parameters: a dictionary with parameters""" 361 362 pass
363
    def taskParameters(self,id):
        """Parameters for a specific task.

        Must be overridden by parameterized (array) jobs.
        @param id: the id of the task
        @return: a dictionary with parameters for this task"""

        error("taskParameter not implemented. Not a parameterized job")

        # NOTE(review): presumably error() aborts the job and this return
        # is only a fallback - confirm against PyFoam.Error
        return {}
    def writeCheckpoint(self):
        # Make the running solver write its current data: create a 'write'
        # file in the case, then remove the checkpoint control-file
        if self.listenToTimer:
            # NOTE(review): uses self.basename, not casedir() - for array
            # jobs these differ (casedir carries a task-suffix); confirm
            # which directory the runner actually watches
            f=open(path.join(self.basename,"write"),"w")
            f.write("Jetzt will ich's wissen")  # content irrelevant; only the file's existence counts
            f.close()
            unlink(self.checkpointFile())
        else:
            warning("I'm not listening to your callbacks")

        # NOTE(review): this Timer is never start()ed and is immediately
        # replaced by checkForMessageFromAbove after this method returns -
        # looks like dead/leftover code; confirm before removing
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
    def stopJob(self):
        # End the job prematurely: flag the non-ordinary end (doIt() will
        # go to state "Suspended") and create a 'stop' file in the case,
        # then remove the stop control-file
        if self.listenToTimer:
            self.ordinaryEnd=False
            # NOTE(review): uses self.basename, not casedir() - for array
            # jobs these differ; confirm which directory the runner watches
            f=open(path.join(self.basename,"stop"),"w")
            f.write("Geh z'haus")  # content irrelevant; only the file's existence counts
            f.close()
            unlink(self.stopFile())
        else:
            warning("I'm not listening to your callbacks")
class SolverJob(ClusterJob):
    """A Cluster-Job that executes a solver. It implements the run-function.
    If a template-case is specified, the case is copied"""

    def __init__(self,basename,solver,
                 template=None,
                 cloneParameters=None,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=True,
                 foamVersion=None,
                 useFoamMPI=False,
                 steady=False,
                 multiRegion=False,
                 progress=False,
                 solverProgress=False,
                 solverNoLog=False):
        """@param solver: name of the solver-application to run
        @param template: Name of the template-case. It is assumed that
        it resides in the same directory as the actual case
        @param cloneParameters: a list with additional parameters for the
        CloneCase-object that copies the template
        @param steady: use the steady-runner for the solver
        @param solverProgress: Only writes the current time of the solver
        @param solverNoLog: do not generate a logfile for the solver"""

        # None-default avoids the shared mutable-default-argument pitfall
        if cloneParameters is None:
            cloneParameters=[]

        ClusterJob.__init__(self,basename,
                            arrayJob=arrayJob,
                            hardRestart=hardRestart,
                            autoParallel=autoParallel,
                            doAutoReconstruct=doAutoReconstruct,
                            foamVersion=foamVersion,
                            useFoamMPI=useFoamMPI,
                            multiRegion=multiRegion)
        self.solver=solver
        self.steady=steady
        if template is not None and not self.restarted:
            # the template lives next to the actual case directory
            template=path.join(path.dirname(self.casedir()),template)
            if path.abspath(basename)==path.abspath(template):
                error("The basename",basename,"and the template",template,"are the same directory")
            # constructing the CloneCase-object performs the copy
            clone=CloneCase(
                args=cloneParameters+[template,self.casedir(),"--follow-symlinks"])
        self.solverProgress=solverProgress
        self.solverNoLog=solverNoLog
437 - def run(self,parameters):
438 self.foamRun(self.solver, 439 steady=self.steady, 440 multiRegion=False, 441 progress=self.solverProgress, 442 noLog=self.solverNoLog)
443