Module gridmap

Source Code for Module gridmap

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Written (W) 2008-2012 Christian Widmer
# Written (W) 2008-2010 Cheng Soon Ong
# Written (W) 2012-2013 Daniel Blanchard, dblanchard@ets.org
# Copyright (C) 2008-2012 Max-Planck-Society, 2012-2013 ETS

# This file is part of Grid Map.

# Grid Map is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# Grid Map is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with Grid Map.  If not, see <http://www.gnu.org/licenses/>.

"""
gridmap provides a high-level front-end to DRMAA-python.

This module provides wrappers that simplify the submission and collection of
jobs in a more 'pythonic' fashion.

@author: Christian Widmer
@author: Cheng Soon Ong
@author: Dan Blanchard (dblanchard@ets.org)
"""

from __future__ import print_function, unicode_literals

import argparse
import bz2
try:
    import cPickle as pickle  # For Python 2.x
except ImportError:
    import pickle
import inspect
import os
import re
import subprocess
import sys
import traceback
import uuid
from socket import gethostname
from time import sleep

import drmaa
from redis import StrictRedis
from redis.exceptions import ConnectionError as RedisConnectionError

# Python 2.x backward compatibility
if sys.version_info < (3, 0):
    range = xrange


#### Global settings ####
# Redis settings
REDIS_DB = 2
REDIS_PORT = 7272
MAX_TRIES = 50
SLEEP_TIME = 3

# Is mem_free configured properly on the cluster?
USE_MEM_FREE = False

# Which queue should we use by default?
DEFAULT_QUEUE = 'all.q'
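
# These module-level settings can be adjusted by client code before any jobs
# are submitted; a hedged sketch ('long.q' is a hypothetical queue name):
#
#     import gridmap
#     gridmap.USE_MEM_FREE = True       # only if mem_free is set up on your cluster
#     gridmap.DEFAULT_QUEUE = 'long.q'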


class Job(object):
    """
    Central entity that wraps a function and its data. Basically, a job
    consists of a function, its argument list, its keyword list, and a field
    "ret", which is filled when the execute method is called.

    @note: This can only be used to wrap picklable functions (i.e., those that
           are defined at the module or class level).
    """

    __slots__ = ('_f', 'args', 'jobid', 'kwlist', 'cleanup', 'ret', 'exception',
                 'environment', 'replace_env', 'working_dir', 'num_slots',
                 'mem_free', 'white_list', 'path', 'uniq_id', 'name', 'queue')

    def __init__(self, f, args, kwlist=None, cleanup=True, mem_free="1G",
                 name='gridmap_job', num_slots=1, queue=DEFAULT_QUEUE):
        """
        Initializes a new Job.

        @param f: a function, which should be executed.
        @type f: function
        @param args: argument list of function f
        @type args: list
        @param kwlist: dictionary of keyword arguments for f
        @type kwlist: dict
        @param cleanup: flag that determines the cleanup of input and log file
        @type cleanup: boolean
        @param mem_free: Estimate of how much memory this job will need (for
                         scheduling)
        @type mem_free: C{basestring}
        @param name: Name to give this job
        @type name: C{basestring}
        @param num_slots: Number of slots this job should use.
        @type num_slots: C{int}
        @param queue: SGE queue to schedule job on.
        @type queue: C{basestring}
        """

        self.path = None
        self._f = None
        self.function = f
        self.args = args
        self.jobid = -1
        self.kwlist = kwlist if kwlist is not None else {}
        self.cleanup = cleanup
        self.ret = None
        self.environment = None
        self.replace_env = False
        self.working_dir = os.getcwd()
        self.num_slots = num_slots
        self.mem_free = mem_free
        self.white_list = []
        self.uniq_id = None
        self.name = name
        self.queue = queue

    @property
    def function(self):
        ''' Function this job will execute. '''
        return self._f

    @function.setter
    def function(self, f):
        """
        Setter for function that carefully takes care of the namespace,
        avoiding __main__ as a module.
        """

        m = inspect.getmodule(f)
        try:
            self.path = _clean_path(os.path.dirname(os.path.abspath(
                inspect.getsourcefile(f))))
        except TypeError:
            self.path = ''

        # if the module is not __main__, all is good
        if m.__name__ != "__main__":
            self._f = f

        else:
            # determine the real module name
            mn = os.path.splitext(os.path.basename(m.__file__))[0]

            # make sure the module is present
            __import__(mn)

            # get the module
            mod = sys.modules[mn]

            # set the function from the module
            self._f = getattr(mod, f.__name__)

    def execute(self):
        """
        Executes function f with given arguments and writes the return value
        to field ret. If an exception is encountered during execution, ret
        will contain a pickled version of it. Input data is removed after
        execution to save space.
        """
        try:
            self.ret = self.function(*self.args, **self.kwlist)
        except Exception as exception:
            self.ret = exception
            traceback.print_exc()
        del self.args
        del self.kwlist

    @property
    def native_specification(self):
        """
        The SGE native specification string built from this job's settings.
        """

        ret = ""

        if self.name:
            ret += " -N {0}".format(self.name)
        if self.mem_free and USE_MEM_FREE:
            ret += " -l mem_free={0}".format(self.mem_free)
        if self.num_slots and self.num_slots > 1:
            ret += " -pe smp {0}".format(self.num_slots)
        if self.white_list:
            ret += " -l h={0}".format('|'.join(self.white_list))
        if self.queue:
            ret += " -q {0}".format(self.queue)

        return ret
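
# A hedged sketch of building a Job by hand and the native specification it
# produces (grid_map normally constructs Jobs for you; `add` is a
# hypothetical module-level function):
#
#     def add(x, y):
#         return x + y
#
#     job = Job(add, [2, 3], name='add_job', num_slots=2)
#     job.native_specification
#     # => ' -N add_job -pe smp 2 -q all.q'
#     #    (mem_free is omitted while USE_MEM_FREE is False)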


def _submit_jobs(jobs, uniq_id, temp_dir='/scratch', white_list=None,
                 quiet=True):
    """
    Method used to send a list of jobs onto the cluster.

    @param jobs: list of jobs to be executed
    @type jobs: C{list} of L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: List of acceptable nodes to use for scheduling job. If
                       None, all are used.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """

    session = drmaa.Session()
    session.initialize()
    jobids = []

    for job_num, job in enumerate(jobs):
        # set job white list
        job.white_list = white_list

        # append the job to the session
        jobid = _append_job_to_session(session, job, uniq_id, job_num,
                                       temp_dir=temp_dir, quiet=quiet)
        jobids.append(jobid)

    sid = session.contact
    session.exit()

    return (sid, jobids)


def _append_job_to_session(session, job, uniq_id, job_num,
                           temp_dir='/scratch/', quiet=True):
    """
    For an active session, append a new job based on information stored in the
    job object. Also sets job.jobid to the ID of the job on the grid.

    @param session: The current DRMAA session with the grid engine.
    @type session: C{drmaa.Session}
    @param job: The Job to add to the queue.
    @type job: L{Job}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param job_num: The row in the table to store/retrieve data on. This is
                    only non-zero for jobs created via grid_map.
    @type job_num: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """

    jt = session.createJobTemplate()

    # fetch env vars from the shell
    shell_env = os.environ

    if job.environment and job.replace_env:
        # only consider defined env vars
        jt.jobEnvironment = job.environment

    elif job.environment and not job.replace_env:
        # merge the defined env vars into a copy of the shell's environment,
        # so that we don't accidentally mutate os.environ itself
        env = dict(shell_env)
        env.update(job.environment)
        jt.jobEnvironment = env

    else:
        # only consider env vars from the shell
        jt.jobEnvironment = shell_env

    # Make sure to use the .py and not the .pyc version of the module.
    jt.remoteCommand = re.sub(r'\.pyc$', '.py',
                              _clean_path(os.path.abspath(__file__)))
    jt.args = ['{0}'.format(uniq_id), '{0}'.format(job_num), job.path,
               temp_dir, gethostname()]
    jt.nativeSpecification = job.native_specification
    jt.outputPath = ":" + temp_dir
    jt.errorPath = ":" + temp_dir

    jobid = session.runJob(jt)

    # set job fields that depend on the jobid assigned by the grid engine
    job.jobid = jobid

    if not quiet:
        print('Your job {0} has been submitted with id {1}'.format(job.name,
                                                                   jobid),
              file=sys.stderr)

    session.deleteJobTemplate(jt)

    return jobid


def _collect_jobs(sid, jobids, joblist, redis_server, uniq_id,
                  temp_dir='/scratch/', wait=True):
    """
    Collect the results from the jobids; returns a list of job outputs.

    @param sid: session identifier
    @type sid: string returned by cluster
    @param jobids: list of job identifiers returned by the cluster
    @type jobids: list of strings
    @param joblist: list of Job objects that were submitted
    @type joblist: C{list} of L{Job}
    @param redis_server: Open connection to the database where the results will
                         be stored.
    @type redis_server: L{StrictRedis}
    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Wait for jobs to finish?
    @type wait: Boolean, defaults to True
    """

    for ix in range(len(jobids)):
        assert(jobids[ix] == joblist[ix].jobid)

    s = drmaa.Session()
    s.initialize(sid)

    if wait:
        drmaa_wait = drmaa.Session.TIMEOUT_WAIT_FOREVER
    else:
        drmaa_wait = drmaa.Session.TIMEOUT_NO_WAIT

    s.synchronize(jobids, drmaa_wait, True)
    s.exit()

    # attempt to collect results
    job_output_list = []
    for ix, job in enumerate(joblist):

        log_stdout_fn = os.path.join(temp_dir, job.name + '.o' + jobids[ix])
        log_stderr_fn = os.path.join(temp_dir, job.name + '.e' + jobids[ix])

        try:
            job_output = _zload_db(redis_server, 'output{0}'.format(uniq_id),
                                   ix)
        except Exception as detail:
            print(("Error while unpickling output for gridmap job {1} " +
                   "stored with key output{0}_{1}").format(uniq_id, ix),
                  file=sys.stderr)
            print("This could be caused by a problem with the cluster " +
                  "environment, imports, or environment variables.",
                  file=sys.stderr)
            print(("Try running `gridmap.py {0} {1} {2} {3} {4}` to see " +
                   "if your job crashed before writing its " +
                   "output.").format(uniq_id,
                                     ix,
                                     job.path,
                                     temp_dir,
                                     gethostname()),
                  file=sys.stderr)
            print("Check log files for more information: ", file=sys.stderr)
            print("stdout:", log_stdout_fn, file=sys.stderr)
            print("stderr:", log_stderr_fn, file=sys.stderr)
            print("Exception: {0}".format(detail), file=sys.stderr)
            sys.exit(2)

        # print exceptions
        if isinstance(job_output, Exception):
            print("Exception encountered in job with log file:",
                  file=sys.stderr)
            print(log_stdout_fn, file=sys.stderr)
            print(job_output, file=sys.stderr)
            print(file=sys.stderr)

        job_output_list.append(job_output)

    return job_output_list


def process_jobs(jobs, temp_dir='/scratch/', wait=True, white_list=None,
                 quiet=True):
    """
    Take a list of jobs and process them on the cluster.

    @param jobs: The jobs to run.
    @type jobs: C{list} of L{Job}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param wait: Should we wait for jobs to finish? (Should only be false if
                 the function you're running doesn't return anything.)
    @type wait: C{bool}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """
    # Create a new connection to the Redis database that holds the pickled jobs
    redis_server = StrictRedis(host=gethostname(), db=REDIS_DB, port=REDIS_PORT)

    # Check if the Redis server is running, and spawn it if not.
    try:
        redis_server.set('connection_test', True)
    except RedisConnectionError:
        with open('/dev/null', 'w') as null_file:
            redis_process = subprocess.Popen(['redis-server', '-'],
                                             stdout=null_file,
                                             stdin=subprocess.PIPE,
                                             stderr=null_file)
            redis_process.stdin.write('''daemonize yes
pidfile {0}
port {1}
'''.format(os.path.join(temp_dir,
                        'redis{0}.pid'.format(REDIS_PORT)),
           REDIS_PORT))
            redis_process.stdin.close()
        # Wait for things to get started
        sleep(5)

    # Generate a random name for the keys
    uniq_id = uuid.uuid4()

    # Save jobs to the database
    for job_id, job in enumerate(jobs):
        _zsave_db(job, redis_server, 'job{0}'.format(uniq_id), job_id)

    # Submit jobs to the cluster
    sid, jobids = _submit_jobs(jobs, uniq_id, white_list=white_list,
                               temp_dir=temp_dir, quiet=quiet)

    # Reconnect and retrieve outputs
    job_outputs = _collect_jobs(sid, jobids, jobs, redis_server, uniq_id,
                                temp_dir=temp_dir, wait=wait)

    # Make sure we have enough output
    assert(len(jobs) == len(job_outputs))

    # Delete this run's job and output keys from the Redis server
    redis_server.delete(*redis_server.keys('job{0}_*'.format(uniq_id)))
    redis_server.delete(*redis_server.keys('output{0}_*'.format(uniq_id)))
    return job_outputs


#####################################################################
# MapReduce Interface
#####################################################################
def grid_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
             num_slots=1, temp_dir='/scratch/', white_list=None,
             queue=DEFAULT_QUEUE, quiet=True):
    """
    Maps a function onto the cluster.

    @note: This can only be used with picklable functions (i.e., those that are
           defined at the module or class level).

    @param f: The function to map on args_list
    @type f: C{function}
    @param args_list: List of arguments to pass to f
    @type args_list: C{list}
    @param cleanup: Should we remove the stdout and stderr temporary files for
                    each job when we're done? (They are left in place if
                    there's an error.)
    @type cleanup: C{bool}
    @param mem_free: Estimate of how much memory each job will need (for
                     scheduling). (Not currently used, because our cluster does
                     not have that setting enabled.)
    @type mem_free: C{basestring}
    @param name: Base name to give each job (will have a number appended to
                 the end)
    @type name: C{basestring}
    @param num_slots: Number of slots each job should use.
    @type num_slots: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param white_list: If specified, limit nodes used to only those in list.
    @type white_list: C{list} of C{basestring}
    @param queue: The SGE queue to use for scheduling.
    @type queue: C{basestring}
    @param quiet: When true, do not output information about the jobs that have
                  been submitted.
    @type quiet: C{bool}
    """

    # construct jobs
    jobs = [Job(f, [args] if not isinstance(args, list) else args,
                cleanup=cleanup, mem_free=mem_free,
                name='{0}{1}'.format(name, job_num), num_slots=num_slots,
                queue=queue)
            for job_num, args in enumerate(args_list)]

    # process jobs
    job_results = process_jobs(jobs, temp_dir=temp_dir, white_list=white_list,
                               quiet=quiet)

    return job_results


def pg_map(f, args_list, cleanup=True, mem_free="1G", name='gridmap_job',
           num_slots=1, temp_dir='/scratch/', white_list=None,
           queue=DEFAULT_QUEUE, quiet=True):
    """
    @deprecated: This function has been renamed grid_map.

    @see: L{grid_map} for a description of the parameters.
    """
    return grid_map(f, args_list, cleanup=cleanup, mem_free=mem_free, name=name,
                    num_slots=num_slots, temp_dir=temp_dir,
                    white_list=white_list, queue=queue, quiet=quiet)


#####################################################################
# Data persistence
#####################################################################
def _clean_path(path):
    ''' Replace all weird SAN paths with normal paths '''

    path = re.sub(r'/\.automount/\w+/SAN/NLP/(\w+)-(dynamic|static)',
                  r'/home/nlp-\1/\2', path)
    path = re.sub(r'/\.automount/[^/]+/SAN/Research/HomeResearch',
                  '/home/research', path)
    return path
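
# A sketch of the rewriting performed above ('host1' and 'speech' are
# made-up names):
#
#     _clean_path('/.automount/host1/SAN/NLP/speech-dynamic/code')
#     # => '/home/nlp-speech/dynamic/code'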


def _zsave_db(obj, redis_server, prefix, job_num):
    """
    Saves an object/function as bz2-compressed pickled data in a Redis
    database.

    @param obj: The object/function to store.
    @type obj: C{object} or C{function}
    @param redis_server: An open connection to the database
    @type redis_server: L{StrictRedis}
    @param prefix: The prefix to use for the key for this data.
    @type prefix: C{basestring}
    @param job_num: The ID of the job this data is for.
    @type job_num: C{int}
    """

    # Pickle the object and compress the result at the highest bz2 level
    pickled_data = bz2.compress(pickle.dumps(obj, pickle.HIGHEST_PROTOCOL), 9)

    # Insert the pickled data into the database
    redis_server.set('{0}_{1}'.format(prefix, job_num), pickled_data)


def _zload_db(redis_server, prefix, job_num):
    """
    Loads a bz2-compressed pickled object from a Redis database, retrying
    (with a delay) until the data shows up or MAX_TRIES is exhausted.

    @param redis_server: An open connection to the database
    @type redis_server: L{StrictRedis}
    @param prefix: The prefix to use for the key for this data.
    @type prefix: C{basestring}
    @param job_num: The ID of the job this data is for.
    @type job_num: C{int}
    """
    attempt = 0
    pickled_data = None
    while pickled_data is None and attempt < MAX_TRIES:
        pickled_data = redis_server.get('{0}_{1}'.format(prefix, job_num))
        attempt += 1
        # Only sleep if we still have nothing and are going to retry
        if pickled_data is None:
            sleep(SLEEP_TIME)
    return pickle.loads(bz2.decompress(pickled_data))


################################################################
# The following code will be executed on the cluster           #
################################################################
def _run_job(uniq_id, job_num, temp_dir, redis_host):
    """
    Execute the pickled job and produce pickled output.

    @param uniq_id: The unique suffix for the tables corresponding to this job
                    in the database.
    @type uniq_id: C{basestring}
    @param job_num: The index for this job's content in the job and output
                    tables.
    @type job_num: C{int}
    @param temp_dir: Local temporary directory for storing output for an
                     individual job.
    @type temp_dir: C{basestring}
    @param redis_host: Hostname of the database to connect to get the job data.
    @type redis_host: C{basestring}
    """
    # Connect to the database
    redis_server = StrictRedis(host=redis_host, port=REDIS_PORT, db=REDIS_DB)

    print("Loading job...", end="", file=sys.stderr)
    sys.stderr.flush()
    job = _zload_db(redis_server, 'job{0}'.format(uniq_id), job_num)
    print("done", file=sys.stderr)

    print("Running job...", end="", file=sys.stderr)
    sys.stderr.flush()
    job.execute()
    print("done", file=sys.stderr)

    print("Writing output to database for job {0}...".format(job_num), end="",
          file=sys.stderr)
    sys.stderr.flush()
    _zsave_db(job.ret, redis_server, 'output{0}'.format(uniq_id), job_num)
    print("done", file=sys.stderr)

    # remove log files if the job asked for cleanup
    if job.cleanup:
        log_stdout_fn = os.path.join(temp_dir, '{0}.o{1}'.format(job.name,
                                                                 job.jobid))
        log_stderr_fn = os.path.join(temp_dir, '{0}.e{1}'.format(job.name,
                                                                 job.jobid))

        try:
            os.remove(log_stdout_fn)
            os.remove(log_stderr_fn)
        except OSError:
            pass


def _main():
    """
    Parse the command line inputs and call _run_job.
    """

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="This wrapper script will run a pickled Python function "
                    "on some pickled data in a Redis database, and write the "
                    "results back to the database. You almost never want to "
                    "run this yourself.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('uniq_id',
                        help='The unique suffix for the tables corresponding '
                             'to this job in the database.')
    parser.add_argument('job_number',
                        help='Which job number should be run. Dictates which '
                             'input data is read from the database and where '
                             'output data is stored.',
                        type=int)
    parser.add_argument('module_dir',
                        help='Directory that contains the module containing '
                             'the pickled function. This will get added to '
                             'PYTHONPATH temporarily.')
    parser.add_argument('temp_dir',
                        help='Directory that temporary output will be stored '
                             'in.')
    parser.add_argument('redis_host',
                        help='The hostname of the server where the Redis '
                             'database is.')
    args = parser.parse_args()

    print("Appended {0} to PYTHONPATH".format(args.module_dir),
          file=sys.stderr)
    sys.path.append(_clean_path(args.module_dir))

    # Process the database and get the job started
    _run_job(args.uniq_id, args.job_number, _clean_path(args.temp_dir),
             args.redis_host)


if __name__ == "__main__":
    _main()