Package mmLib :: Module StructureBuilder
[hide private]
[frames] | no frames]

Source Code for Module mmLib.StructureBuilder

## Copyright 2002-2010 by PyMMLib Development Group (see AUTHORS file)
## This code is part of the PyMMLib distribution and governed by
## its license.  Please see the LICENSE file that should have been
## included as part of this package.
"""Classes for building a mmLib.Structure representation of biological
macromolecules.
"""
  8  import ConsoleOutput 
  9  import Library 
 10  import Structure 
 11  import UnitCell 
 12   
 13   
class StructureBuilderError(Exception):
    """Base class of errors raised while building Structure objects.

    The error text is available as the ``message`` attribute, as the
    first element of ``args``, and through ``str()``.
    """
    def __init__(self, message):
        ## forward the message to Exception so that args, repr() and
        ## pickling carry it too, instead of an empty argument tuple
        Exception.__init__(self, message)
        self.message = message

    def __str__(self):
        return self.message
class StructureBuilder(object):
    """Builder class for the mmLib.Structure object hierarchy.

    StructureBuilder must be subclassed to implement a working builder.
    The constructor drives the whole build by calling, in this order:
    read_start, read_start_finalize, read_atoms, read_atoms_finalize,
    read_metadata, read_metadata_finalize, read_end and read_end_finalize.
    A subclass reimplements read_start, read_atoms, read_metadata and
    (optionally) read_end, and hands its data to the builder through the
    load_* methods.
    """

    ## candidate chain IDs handed out by the name service, in order
    ## TODO: Add the following alphanumeric string to Constants.py, 2010-09-21
    _CHAIN_ID_CANDIDATES = (
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789")

    def __init__(self,
                 sequence_from_structure = False,
                 library_bonds = False,
                 distance_bonds = False,
                 auto_sort = True,
                 **args):
        """Builds a Structure immediately.

        sequence_from_structure: stored as self.calc_sequence; when True,
            read_end_finalize fills empty chain sequences from fragments
        library_bonds: when True, read_end_finalize adds library bonds
        distance_bonds: when True, read_end_finalize adds covalent
            distance bonds
        auto_sort: when true, read_atoms_finalize sorts the structure

        Keyword arguments read from args:
        structure / struct: an existing Structure to build into
        structure_id: assigned to the structure's structure_id
        fil: the source object passed to read_start; required, a
            KeyError is raised if it is missing
        """
        ## allocate a new Structure object for building if one was not
        ## passed to the StructureBuilder
        if "structure" in args:
            self.struct = args["structure"]
        elif "struct" in args:
            self.struct = args["struct"]
        else:
            self.struct = Structure.Structure()

        ## set structure_id
        if "structure_id" in args:
            self.struct.structure_id = args["structure_id"]

        ## options
        self.calc_sequence = sequence_from_structure
        self.library_bonds = library_bonds
        self.distance_bonds = distance_bonds
        self.auto_sort = auto_sort

        ## caches used while building
        self.cache_chain = None
        self.cache_frag = None

        ## atoms waiting for the name service; read_start_finalize resets
        ## this, but it is created here as well so load_atom still works
        ## when a subclass overrides read_start_finalize
        self.name_service_list = []

        ## if anything goes wrong, setting self.halt=True will stop the madness
        self.halt = False

        ## build the structure by executing this fixed sequence of methods
        self.read_start(args["fil"])

        if not self.halt: self.read_start_finalize()
        if not self.halt: self.read_atoms()
        if not self.halt: self.read_atoms_finalize()
        if not self.halt: self.read_metadata()
        if not self.halt: self.read_metadata_finalize()
        if not self.halt: self.read_end()
        if not self.halt: self.read_end_finalize()
        ## self.struct is now built and ready for use

        ## use the same truth test as the guards above, so that every
        ## halt value which skipped steps is also reported
        if self.halt:
            ConsoleOutput.fatal("self.halt == True")

    def read_start(self, fil):
        """This method needs to be reimplemented in a functional subclass.
        This function is called with the file object (or any other object
        passed in to build a Structure from) to begin the reading process.
        This is usually used to open the source file.
        """
        pass

    def read_start_finalize(self):
        """Called after the read_start method.  Resets the list of atoms
        which are waiting to be handled by the name service.
        """
        self.name_service_list = []

    def read_atoms(self):
        """This method needs to be reimplemented in a functional subclass.
        The subclassed read_atoms method should call load_atom once for
        every atom in the structure, and should not call any other
        load_* methods.
        """
        pass

    def load_atom(self, atm_map):
        """Called repeatedly by the implementation of read_atoms to load all
        the data for a single atom.  The atm_map argument is passed as
        keyword arguments to Structure.Atom.  Returns the new Atom.

        Atoms without a fragment_id or chain_id, and atoms which the
        structure rejects with FragmentOverwrite or AtomOverwrite, are
        queued on self.name_service_list instead of being added.
        """
        ## create atom object
        atm = Structure.Atom(**atm_map)

        ## survey the atom and structure and determine if the atom requires
        ## being passed to the naming service, absence of required fields
        if not atm.fragment_id or not atm.chain_id:
            self.name_service_list.append(atm)
            return atm

        try:
            self.struct.add_atom(atm, True)

        except Structure.FragmentOverwrite:
            ConsoleOutput.warning("FragmentOverwrite: %s" % (atm))
            self.name_service_list.append(atm)

        except Structure.AtomOverwrite as err:
            ConsoleOutput.warning("AtomOverwrite: %s" % (err))
            self.name_service_list.append(atm)

        return atm

    def name_service(self):
        """Runs the name service on all atoms needing to be named.  This is a
        complicated function which corrects most commonly found errors and
        omissions from PDB files.

        Polymer atoms are handled first, then everything left over.  On
        return self.name_service_list is empty.
        """
        if len(self.name_service_list) == 0:
            return

        self._name_polymer_atoms()
        self._name_non_polymer_atoms()

    def _next_chain_id(self, suggest_chain_id):
        """Returns suggest_chain_id if it is not "" and self.struct has no
        such chain, otherwise the first unused candidate chain ID.
        Raises StructureBuilderError when every candidate is in use.
        """
        ## XXX: it's possible to run out of chain IDs!
        ## NOTE(review): a suggestion of None is returned unchanged when
        ## get_chain(None) is false; the non-polymer pass relies on that
        if suggest_chain_id != "":
            if not self.struct.get_chain(suggest_chain_id):
                return suggest_chain_id

        for chain_id in self._CHAIN_ID_CANDIDATES:
            if not self.struct.get_chain(chain_id):
                return chain_id

        raise StructureBuilderError("name_service exhausted new chain_ids")

    def _name_polymer_atoms(self):
        """Groups queued amino/nucleic acid atoms into chains, gives each
        chain a new chain_id and adds the atoms to the structure.  Other
        atoms stay on self.name_service_list in their original order.
        """
        ## What if we are given a list of atoms with res_name, frag_id, and
        ## model_id where the frag_id are sequential?  They can be sequential
        ## several ways using insertion codes, but large breaks often denote
        ## chain breaks.

        ## We need to handle the special case of a list of polymer residues
        ## which do not have chain_ids.  This requires a first pass over the
        ## atom list using different rules than what we use for sorting out
        ## non-polymers.
        current_polymer_type = None
        current_polymer_model_id = None
        current_polymer_chain_id = None
        current_polymer_frag_id = None
        current_polymer_res_name = None
        current_polymer_name_dict = None
        current_frag = None
        current_frag_list = None

        ## polymer_model_dict[model_id] = [chain1, chain2, ...]
        ## chain = [frag1, frag2, ...]; frag = [atm1, atm2, ...]
        polymer_model_dict = {}

        ## non-polymer atoms are collected here rather than calling
        ## list.remove() for every polymer atom, which was quadratic
        remaining = []

        for atm in self.name_service_list:
            ## determine the polymer type of the atom
            if Library.library_is_amino_acid(atm.res_name):
                polymer_type = "protein"
            elif Library.library_is_nucleic_acid(atm.res_name):
                polymer_type = "dna"
            else:
                ## if the atom is not a polymer, we definitely have a break
                ## in this chain
                current_polymer_type = None
                current_polymer_model_id = None
                current_polymer_chain_id = None
                current_polymer_frag_id = None
                current_polymer_res_name = None
                current_polymer_name_dict = None
                current_frag = None
                current_frag_list = None
                remaining.append(atm)
                continue

            fragment_id = Structure.FragmentID(atm.fragment_id)

            ## now we deal with conditions which can terminate the current
            ## polymer chain
            ## NOTE(review): current_polymer_frag_id is only set when a
            ## chain starts, so the last test compares against the first
            ## fragment of the chain, not the previous one -- confirm
            if (polymer_type != current_polymer_type or
                atm.model_id != current_polymer_model_id or
                atm.chain_id != current_polymer_chain_id or
                fragment_id < current_polymer_frag_id):

                current_polymer_type = polymer_type
                current_polymer_model_id = atm.model_id
                current_polymer_chain_id = atm.chain_id
                current_polymer_frag_id = Structure.FragmentID(atm.fragment_id)
                current_polymer_res_name = atm.res_name
                current_polymer_name_dict = {atm.name: True}

                ## create new fragment
                current_frag = [atm]
                current_frag_list = [current_frag]

                ## create new fragment list (chain)
                polymer_model_dict.setdefault(
                    atm.model_id, []).append(current_frag_list)
                continue

            ## if we get here, then we know this atom is destined for the
            ## current chain, and the algorithm needs to place the atom
            ## in the current fragment, or create a new fragment for it
            ## to go into; a new fragment is started when the res_name
            ## changes or the atom name is already recorded
            ## NOTE(review): only the first atom name of each fragment is
            ## ever recorded in current_polymer_name_dict; kept as found
            if (atm.res_name != current_polymer_res_name or
                atm.name in current_polymer_name_dict):
                current_polymer_res_name = atm.res_name
                current_polymer_name_dict = {atm.name: True}

                ## create new fragment and add it to the current fragment list
                current_frag = [atm]
                current_frag_list.append(current_frag)
                continue

            ## okay, put it in the current fragment
            current_frag.append(atm)

        ## every polymer atom has now been dealt with
        self.name_service_list = remaining

        ## now assign chain_ids and add the atoms to the structure
        model_list = [polymer_model_dict[model_id]
                      for model_id in sorted(polymer_model_dict)]

        num_chains = 0
        for model in model_list:
            num_chains = max(num_chains, len(model))

        for chain_index in range(num_chains):
            ## get next available chain_id
            chain_id = self._next_chain_id("")

            ## assign the chain_id to all the atoms in the chain
            ## TODO: check fragment_id too, 2010-09-22
            for model in model_list:
                ## num_chains is the maximum over all models, so a model
                ## with fewer chains has nothing at this index
                if chain_index >= len(model):
                    continue

                for frag in model[chain_index]:
                    for atm in frag:
                        atm.chain_id = chain_id
                        self.struct.add_atom(atm, True)

    def _name_non_polymer_atoms(self):
        """Splits the atoms left on self.name_service_list into fragments,
        assigns chain_id and fragment_id to them, and adds them to the
        structure.  Leaves self.name_service_list empty.
        """
        ## cr = (chain_id, res_name)
        ##
        ## cr_dict[cr_key] = model_dict
        ##
        ## model_dict[model] = frag_list
        ##
        ## frag_list = [ frag1, frag2, frag3, ...]
        ##
        ## frag = [atm1, atm2, atm3, ...]
        cr_dict = {}
        cr_key_list = []

        frag_id = None
        frag = None
        name_dict = {}

        ## split atoms into fragments
        for atm in self.name_service_list:
            atm_id = (atm.name, atm.alt_loc)
            atm_frag_id = (atm.model_id, atm.chain_id, atm.fragment_id,
                           atm.res_name)

            ## if the atom fragment id matches the current fragment id
            ## and doesn't conflict with any other atom name in the fragment
            ## then add it to the fragment
            if atm_frag_id == frag_id and atm_id not in name_dict:
                frag.append(atm)
                name_dict[atm_id] = True
                continue

            cr_key = (atm.chain_id, atm.res_name)

            ### debug
            if frag:
                msg = "name_service: fragment detected in cr=%s" % (
                    str(cr_key))
                ConsoleOutput.debug(msg)
                for a in frag:
                    ConsoleOutput.debug(" " + str(a))
            ### /debug

            try:
                model_dict = cr_dict[cr_key]
            except KeyError:
                model_dict = cr_dict[cr_key] = {}
                cr_key_list.append(cr_key)

            frag_list = model_dict.setdefault(atm.model_id, [])

            name_dict = {atm_id: True}
            frag_id = atm_frag_id
            frag = [atm]
            frag_list.append(frag)

        ## release the queued atoms; the attribute is reset rather than
        ## deleted so that later load_atom/name_service calls still work
        self.name_service_list = []

        new_chain_id = None
        fragment_id_num = None

        for cr_key in cr_key_list:
            ### debug
            msg = "name_service: chain_id / res_name keys\n"
            msg += " cr_key: chain_id='%s' res_name='%s'" % (
                cr_key[0], cr_key[1])
            ConsoleOutput.debug(msg)
            ### /debug

            ## get the next chain ID, use the cfr group's
            ## loaded chain_id if possible
            chain_id = self._next_chain_id(cr_key[0])

            ## if we got a chain ID, use the new chain ID and reset the
            ## fragment_id; otherwise keep filling the previous chain
            ## NOTE(review): _next_chain_id raises when the candidates are
            ## exhausted, so None is only seen here when the group's own
            ## chain_id is None -- confirm this fallback is intended
            if chain_id is not None:
                new_chain_id = chain_id
                fragment_id_num = 0

            elif new_chain_id is None or fragment_id_num is None:
                ConsoleOutput.fatal("name_service: unable to assign any chain ids")

            ## get model dictionary
            model_dict = cr_dict[cr_key]

            ## inspect the model dictionary to determine the number of
            ## fragments in each model -- they should be the same
            ## and have a 1:1 correspondence; if not, match up the
            ## fragments as much as possible
            max_frags = -1
            for frag_list in model_dict.values():
                frag_list_len = len(frag_list)

                if max_frags == -1:
                    max_frags = frag_list_len
                    continue

                if max_frags != frag_list_len:
                    strx = "name_service: model fragments not identical"
                    ConsoleOutput.debug(strx)
                    ConsoleOutput.warning(strx)
                    max_frags = max(max_frags, frag_list_len)

            ## now iterate through the fragment lists in parallel and assign
            ## the new chain_id and fragment_id
            for i in range(max_frags):
                fragment_id_num += 1

                for frag_list in model_dict.values():
                    ## models with fewer fragments are skipped at this index
                    if i >= len(frag_list):
                        continue

                    ## assign new chain_id and fragment_id, then place the
                    ## atom in the structure
                    for atm in frag_list[i]:
                        atm.chain_id = new_chain_id
                        atm.fragment_id = str(fragment_id_num)
                        self.struct.add_atom(atm, True)

            ## logging
            ConsoleOutput.warning("name_service(): added chain_id=%s, res_name=%s, num_residues=%d" % (
                new_chain_id, cr_key[1], fragment_id_num))

    def read_atoms_finalize(self):
        """After loading all atom records, use the list of atom records to
        build the structure.
        """
        ## name atoms which did not fit into the Structure hierarchy with
        ## their names from the file
        self.name_service()

        ## sort structural objects into their correct order
        if self.auto_sort:
            self.struct.sort()

    def read_metadata(self):
        """This method needs to be reimplemented in a functional subclass.
        The subclassed read_metadata method should call the various load_*
        methods to set non-atom coordinate data for the Structure.
        """
        pass

    def load_structure_id(self, structure_id):
        """Sets the structure_id of the Structure being built.  The
        structure_id argument must be a str.
        """
        assert isinstance(structure_id, str)
        self.struct.structure_id = structure_id

    def load_unit_cell(self, ucell_map):
        """Called by the implementation of read_metadata to load the unit cell
        parameters for the structure.  ucell_map must contain the keys
        a, b, c, alpha, beta and gamma; if any is missing a debug message
        is written and no unit cell is set.  The key space_group is
        optional.
        """
        required = ("a", "b", "c", "alpha", "beta", "gamma")

        for key in required:
            if key not in ucell_map:
                ConsoleOutput.debug("ucell_map missing: %s" % (key))
                return

        cell_args = dict((key, ucell_map[key]) for key in required)

        ## space_group is only passed on when the file supplied one
        if "space_group" in ucell_map:
            cell_args["space_group"] = ucell_map["space_group"]

        self.struct.unit_cell = UnitCell.UnitCell(**cell_args)

    def load_bonds(self, bond_map):
        """Call by the implementation of read_metadata to load bond
        information on the structure.  The keys of the bond map are a 2-tuple
        of the bonded Atom instances, and the value is a dictionary
        containing information on the type of bond, which may also
        be a symmetry operator.

        [bond_map]
        keys: (atm1, atm2)
        values: bond_data_map(s)

        [bond_data_map]
        bond_type -> text description of bond type: covalent, salt bridge,
                     hydrogen, cispeptide

        atm1_symop -> symmetry operation (if any) to be applied to atm1
        atm2_symop -> same as above, for atom 2

        The symmetry operations themselves are a 3x4 array of floating point
        values composed of the 3x3 rotation matrix and the 3x1 translation.
        """
        ### TODO: Fix this to build bonds in all models! 2010-09-22
        for ((atm1, atm2), bd_map) in bond_map.items():

            ## check for files which, for some reason, have a bond
            ## entry bonding the atom to itself
            if atm1 == atm2:
                ConsoleOutput.warning("silly file defines self bonded atom")
                continue

            atm1.create_bonds(
                atom = atm2,
                bond_type = bd_map.get("bond_type"),
                atom1_symop = bd_map.get("atm1_symop"),
                atom2_symop = bd_map.get("atm2_symop"),
                standard_res_bond = False)

    def load_sequence(self, sequence_map):
        """The sequence map contains the following keys: chain_id: the
        chain ID of the sequence; num_res: the number of residues in the
        sequence; sequence_list: a list of 3-letter codes of the residues
        in the sequence.  Nothing is done if chain_id or sequence_list
        is missing.
        """
        try:
            chain_id = sequence_map["chain_id"]
            sequence_list = sequence_map["sequence_list"]
        except KeyError:
            return

        ## add a copy of the sequence to each equivalent chain in
        ## all models of the structure
        for model in self.struct.iter_models():
            chain = model.get_chain(chain_id)
            if chain:
                chain.sequence.set_from_three_letter(sequence_list)

    @staticmethod
    def _has_required_keys(desc, keys):
        """Returns True if looking up every key of keys in desc succeeds."""
        for key in keys:
            try:
                desc[key]
            except KeyError:
                return False
        return True

    def load_alpha_helicies(self, helix_list):
        """The argument helix_list is a list of Python dictionaries with
        information to build AlphaHelix objects into the Structure.

        The dictionary has attributes:
          helix_id:    The ID of the helix
          chain_id1:   The chain_id of the start of the helix
          frag_id1:    The start fragment_id of the helix
          chain_id2:   The chain_id of the end of the helix
          frag_id2:    The end fragment_id of the helix
          helix_class: The PDB helix class number
          details:     Text comment about the helix

        Helices missing helix_id, chain_id1, frag_id1, chain_id2 or
        frag_id2 are skipped.
        """
        required = ("helix_id", "chain_id1", "frag_id1",
                    "chain_id2", "frag_id2")

        for helix in helix_list:
            ## get required information or blow off the helix
            if not self._has_required_keys(helix, required):
                continue

            ## build a AlphaHelix for every Model in the Structure
            for model in self.struct.iter_models():
                alpha_helix = Structure.AlphaHelix(
                    model_id=model.model_id, **helix)
                model.add_alpha_helix(alpha_helix)
                alpha_helix.construct_segment()

    def load_beta_sheets(self, beta_sheet_list):
        """The argument beta_sheet_list is a list of Python dictionaries with
        information to build BetaSheet objects into the Structure.

        The dictionary has attributes:
          sheet_id:    ID of the sheet
          num_strands: total number of strands in the beta sheet
          strand_list: list of dictionaries describing the strand with
                       the following attributes:

            chain_id1/frag_id1: chain_id and fragment_id of initial residue
                                in the strand
            chain_id2/frag_id2: chain_id and fragment_id of end residue
                                in the strand
            sense:              the sense of the strand with respect to the
                                previous strand, either the string
                                parallel or anti_parallel

            reg_chain_id, reg_frag_id, reg_atom:
                                registration atom in current strand
            reg_prev_chain_id, reg_prev_frag_id, reg_prev_atom:
                                registration atom in previous strand

        Sheets missing sheet_id or strand_list are skipped, as are
        strands missing chain_id1, frag_id1, chain_id2 or frag_id2.
        """
        sheet_required = ("sheet_id", "strand_list")

        ## the second chain id is required too; the check used to test
        ## frag_id1 twice and never looked at chain_id2
        strand_required = ("chain_id1", "frag_id1", "chain_id2", "frag_id2")

        for sheet in beta_sheet_list:
            ## get required info
            if not self._has_required_keys(sheet, sheet_required):
                continue

            ## iterate over all Models and add the BetaSheet description to
            ## each Model
            for model in self.struct.iter_models():
                ## NOTE(review): the keyword here is "model" while
                ## load_alpha_helicies passes "model_id" -- confirm
                ## against the BetaSheet constructor
                beta_sheet = Structure.BetaSheet(model=model.model_id, **sheet)

                for strand in sheet["strand_list"]:
                    ## required strand info
                    if not self._has_required_keys(strand, strand_required):
                        continue

                    beta_strand = Structure.Strand(**strand)
                    beta_sheet.add_strand(beta_strand)

                model.add_beta_sheet(beta_sheet)
                beta_sheet.construct_segments()

    def load_sites(self, site_list):
        """The argument site_list is a list of Python dictionaries with
        information to build Site objects into the Structure.  Entries
        missing site_id or fragment_list are skipped.
        """
        required = ("site_id", "fragment_list")

        for site_desc in site_list:
            ## check for required site info
            if not self._has_required_keys(site_desc, required):
                continue

            for model in self.struct.iter_models():
                site = Structure.Site(**site_desc)
                model.add_site(site)
                site.construct_fragments()

    def read_metadata_finalize(self):
        """Called after the metadata loading is complete.
        """
        pass

    def read_end(self):
        """This method needs to be reimplemented in a functional subclass.
        The subclassed read_end method can be used for any clean up from
        the file loading process you need, or may be left unimplemented.
        """
        pass

    def read_end_finalize(self):
        """Called for final cleanup after structure source reading is done.
        Depending on the options given to the constructor, this calculates
        the sequence of chains which have none and builds bonds from the
        monomer library and/or from covalent distances.
        """
        ConsoleOutput.debug("read_end_finalize()")

        ## NOTE: the options are compared with "is True", so only the
        ## value True itself (not 1 or another true value) enables a step

        ## calculate sequences for all chains
        if self.calc_sequence is True:
            for model in self.struct.iter_models():
                for chain in model.iter_chains():
                    if len(chain.sequence) == 0:
                        chain.sequence.set_from_fragments(
                            chain.iter_fragments())

        ## build bonds as defined in the monomer library
        if self.library_bonds is True:
            self.struct.add_bonds_from_library()

        ## build bonds by covalent distance calculations
        if self.distance_bonds is True:
            self.struct.add_bonds_from_covalent_distance()