1
2
3
4
5 """mmCIF file and mmCIF dictionary parser. Files are parsed into a set of data
6 structures where they can be further processed. The data structures can also
7 be constructed and written back out as mmCIF. A CIF dictionary parser is also
8 included as a specialized version of the mmCIF parser.
9 """
10 from __future__ import generators
11
12 import re
13 import copy
14 import itertools
15
16
17
18
19
20
21
22
23
24
25
26 # Line-length limit consulted by the writer: values that would not fit are written as multi-line strings.
27 MAX_LINE = 2048
28
29
31 """Base class of errors raised by Structure objects.
32 """
33 pass
34
35
37 """Error carrying a line number and a message text; str() formats both.
38 """
40 Exception.__init__(self)
41 self.line_num = line_num
42 self.text = text
43
45 return "[line: %d] %s" % (self.line_num, self.text)
46
47
49 """Contains one row of data. In a mmCIF file, this is one complete
50 set of data found under a section. The data can be accessed by using
51 the column names as class attributes.
52 """
53 __slots__ = ["table"]
54
56 return id(self) == id(other)
57
59 cif_row = mmCIFRow()
60 for key, val in self.iteritems():
61 cif_row[key] = val
62 return cif_row
63
66
68 assert value is not None
69 dict.__setitem__(self, column.lower(), value)
70
72 try:
73 return self[name]
74 except KeyError:
75 raise AttributeError(name)
76
79
82
85
def get(self, column, default=None):
    """Return the value stored under column, or default if it is absent.

    The column name is lower-cased before the lookup, so the caller may
    pass it in any case.
    """
    key = column.lower()
    return dict.get(self, key, default)
88
90 return dict.get(self, clower, default)
91
93 return dict.has_key(self, column.lower())
94
96 return dict.has_key(self, clower)
97
98
100 """Contains columns and rows of data for a mmCIF section. Rows of data
101 are stored as mmCIFRow classes.
102 """
103 __slots__ = ["name", "columns", "columns_lower", "data"]
104
105 - def __init__(self, name, columns = None):
115
121
123 return id(self) == id(other)
124
126 """Return true if the table is not a _loop table with multiple
127 rows of data.
128 """
129 return len(self) <= 1
130
132 try:
133 return self[name]
134 except KeyError:
135 raise AttributeError(name)
136
138 """Retrieves mmCIFRow at index x from the table if the argument is
139 an integer. If the argument is a string, then the data from the
140 first row is returned.
141 """
142 if isinstance(x, int):
143 return list.__getitem__(self, x)
144
145 elif isinstance(x, str):
146 try:
147 return self[0][x]
148 except (IndexError, KeyError):
149 raise KeyError
150
151 raise TypeError, x
152
154 assert value is not None
155
156 if isinstance(x, int) and isinstance(value, mmCIFRow):
157 value.table = self
158 list.__setitem__(self, x, value)
159
160 elif isinstance(x, str):
161 try:
162 self[0][x] = value
163 except IndexError:
164 row = mmCIFRow()
165 row[x] = value
166 self.append(row)
167
170
def get(self, x, default=None):
    """Return self[x], or default when the lookup raises KeyError.

    Any other exception raised by the lookup is propagated unchanged.
    """
    try:
        found = self[x]
    except KeyError:
        found = default
    return found
176
181
186
191
193 """Sets the list of column(subsection) names to the list of names in
194 columns.
195 """
196 self.columns = list()
197 self.columns_lower = dict()
198 for column in columns:
199 self.append_column(column)
200
212
214 """Tests if the table contains the column name.
215 """
216 return column.lower() in self.columns_lower
217
226
228 """Automatically sets the mmCIFTable column names by inspecting all
229 mmCIFRow objects it contains.
230 """
231 clower_used = {}
232 for cif_row in self:
233 for clower in cif_row.iterkeys():
234 clower_used[clower] = True
235 if clower not in self.columns_lower:
236 self.append_column(clower)
237 for clower in self.columns_lower.keys():
238 if not clower_used.has_key(clower):
239 self.remove_column(clower)
240
242 """Return the first row which has column data matching value.
243 """
244 fpred = lambda r: r.get_lower(clower) == value
245 itertools.ifilter(fpred, self)
246 for row in itertools.ifilter(fpred, self):
247 return row
248 return None
249
251 """Performs a SQL-like 'AND' select against all the rows in the table,
252 and returns the first matching row found. The arguments are a
253 variable list of tuples of the form:
254 (<lower-case-column-name>, <column-value>)
255 For example:
256 get_row(('atom_id','CA'),('entity_id', '1'))
257 returns the first matching row with atom_id=='CA' and entity_id=='1'.
258 """
259 if len(args) == 1:
260 clower, value = args[0]
261 for row in self:
262 if row.get_lower(clower) == value:
263 return row
264 else:
265 for row in self:
266 match_row = True
267 for clower, value in args:
268 if row.get_lower(clower) != value:
269 match_row = False
270 break
271 if match_row:
272 return row
273 return None
274
276 """Creates a new mmCIFRow, adds it to the table, and returns it.
277 """
278 cif_row = mmCIFRow()
279 self.append(cif_row)
280 return cif_row
281
283 """This is the same as get_row, but it iterates over all matching
284 rows in the table.
285 """
286 for cif_row in self:
287 match_row = True
288 for clower, value in args:
289 if cif_row.get_lower(clower) != value:
290 match_row = False
291 break
292 if match_row:
293 yield cif_row
294
296 """Return a dictionary mapping the value of the row's value in
297 column 'key' to the row itself. If there are multiple rows with
298 the same key value, they will be overwritten with the last found
299 row.
300 """
301 dictx = dict()
302 for row in self:
303 try:
304 dictx[row.getitem_lower(clower)] = row
305 except KeyError:
306 pass
307 return dictx
308
309
311 """Contains all information found under a data_ block in a mmCIF file.
312 mmCIF files are represented differently here than their file format
313 would suggest. Since a mmCIF file is more-or-less a SQL database dump,
314 the files are represented here with their sections as "Tables" and
315 their subsections as "Columns". The data is stored in "Rows".
316 """
317 __slots__ = ["name", "file"]
318
323
325 return "mmCIFData(name = %s)" % (self.name)
326
332
334 return id(self) == id(other)
335
337 try:
338 return self[name]
339 except KeyError:
340 raise AttributeError(name)
341
343 if isinstance(x, int):
344 return list.__getitem__(self, x)
345
346 elif isinstance(x, str):
347 name = x.lower()
348 for ctable in self:
349 if ctable.name.lower() == name:
350 return ctable
351 raise KeyError, x
352
353 raise TypeError, x
354
356 """
357 """
358 assert isinstance(table, mmCIFTable)
359
360 try:
361 old_table = self[x]
362 except (KeyError, IndexError):
363 pass
364 else:
365 self.remove(old_table)
366
367 if isinstance(x, int):
368 table.data = self
369 list.__setitem__(self, x, table)
370
371 elif isinstance(x, str):
372 self.append(table)
373
375 """Remove a mmCIFTable by index or table name.
376 """
377 self.remove(self[x])
378
380 """Append a mmCIFTable. This will trigger the removal of any table
381 with the same name.
382 """
383 assert isinstance(table, mmCIFTable)
384 try:
385 del self[table.name]
386 except KeyError:
387 pass
388 table.data = self
389 list.append(self, table)
390
399
404
406 try:
407 self[x]
408 except KeyError:
409 return False
410 else:
411 return True
412
def get(self, x, default=None):
    """Return the table selected by x, or default if there is none.

    x is passed to self[x] unchanged, so it may be an integer index or a
    table name.  A failed name lookup (KeyError) and an out-of-range
    index (IndexError) both yield default, so this accessor never raises
    for a table that simply is not there.  Any other exception from the
    lookup is propagated.
    """
    try:
        return self[x]
    except (KeyError, IndexError):
        # Missing name or out-of-range index: both mean "no such table".
        return default
418
420 try:
421 self[x]
422 except KeyError:
423 return False
424 else:
425 return True
426
428 """Looks up and returns a stored mmCIFTable class by its name. This
429 name is the section key in the mmCIF file.
430 """
431 try:
432 return self[name]
433 except KeyError:
434 return None
435 except IndexError:
436 return None
437
439 """Creates and returns a mmCIFTable object with the given name.
440 The object is added to this object before it is returned.
441 """
442 cif_table = mmCIFTable(name, columns)
443 self.append(cif_table)
444 return cif_table
445
447 cif_table_name, cif_column_name = tag[1:].split(".")
448 return cif_table_name.lower(), cif_column_name.lower()
449
def join_tag(self, cif_table_name, cif_column_name):
    """Build the tag string "_<table>.<column>" from its two parts."""
    tag_parts = (cif_table_name, cif_column_name)
    return "_%s.%s" % tag_parts
452
454 """Get the value stored under the _table.column tag, or None if absent.
455 """
456 table_name, column = self.split_tag(tag)
457 try:
458 return self[table_name][column]
459 except KeyError:
460 return None
461
463 """Set the value stored under the _table.column tag.
464 """
465 table_name, column = self.split_tag(tag)
466 self[table_name][column] = value
467
468
470 """Class to store data from mmCIF dictionary save_ blocks. We treat
471 them as non-nested sections along with data_ sections.
472 This may not be correct!
473 """
474 pass
475
476
478 """Class representing a mmCIF file.
479 """
481 cif_file = mmCIFFile()
482 for data in self:
483 cif_file.append(copy.deepcopy(data, memo))
484 return cif_file
485
487 l = [str(cdata) for cdata in self]
488 return "mmCIFFile([%s])" % (", ".join(l))
489
491 return id(self) == id(other)
492
494 try:
495 return self[name]
496 except KeyError:
497 raise AttributeError(name)
498
500 """Retrieve a mmCIFData object by index or name.
501 """
502 if isinstance(x, int):
503 return list.__getitem__(self, x)
504
505 elif isinstance(x, str):
506 name = x.lower()
507 for cdata in self:
508 if cdata.name.lower() == name:
509 return cdata
510 raise KeyError, x
511
512 raise TypeError, x
513
515 """Remove a mmCIFData by index or data name. Raises IndexError
516 or KeyError if the mmCIFData object is not found, the error raised
517 depends on the argument type.
518 """
519 self.remove(self[x])
520
522 """Append a mmCIFData object. This will trigger the removal of any
523 mmCIFData object in the file with the same name.
524 """
525 assert isinstance(cdata, mmCIFData)
526 try:
527 del self[cdata.name]
528 except KeyError:
529 pass
530 cdata.file = self
531 list.append(self, cdata)
532
534 assert isinstance(cdata, mmCIFData)
535 try:
536 del self[cdata.name]
537 except KeyError:
538 pass
539 cdata.file = self
540 list.insert(self, i, cdata)
541
543 for cdata in self:
544 if cdata.name == x:
545 return True
546 return False
547
def get(self, x, default=None):
    """Return the data block selected by x, or default if there is none.

    x is passed to self[x] unchanged, so it may be an integer index or a
    data block name.  A failed name lookup (KeyError) and an out-of-range
    index (IndexError) both yield default, so this accessor never raises
    for a block that simply is not there.  Any other exception from the
    lookup is propagated.
    """
    try:
        return self[x]
    except (KeyError, IndexError):
        # Missing name or out-of-range index: both mean "no such block".
        return default
553
555 """Load and append the mmCIF data from fil into self. The fil argument
556 may be a path string or a file object (or implement its interface).
557 """
558 if isinstance(fil, str):
559 fileobj = open(fil, "r")
560 else:
561 fileobj = fil
562 mmCIFFileParser().parse_file(fileobj, self)
563
565 if isinstance(fil, str):
566 fileobj = open(fil, "w")
567 else:
568 fileobj = fil
569 mmCIFFileWriter().write_file(fileobj, self)
570
572 """Returns the mmCIFData object with the given name. Returns None
573 if no such object exists.
574 """
575 try:
576 return self[name]
577 except KeyError:
578 return None
579 except IndexError:
580 return None
581
583 """Creates a new mmCIFData object with the given name, adds it
584 to this mmCIFFile, and returns it.
585 """
586 cif_data = mmCIFData(name)
587 self.append(cif_data)
588 return cif_data
589
590
592 """Class representing a mmCIF dictionary. The constructor of this class
593 takes two arguments. The first is the string path for the file, or
594 alternatively a file object.
595 """
596 pass
597
598
599
600
601
602
603
605 """Stateful parser which uses the mmCIFElementFile tokenizer to read
606 a mmCIF file and convert it into the mmCIFData/mmCIFTable/mmCIFRow
607 data hierarchy.
608 """
610 self.line_number = 0
611 token_iter = self.gen_token_iter(fileobj)
612
613 try:
614 self.parse(token_iter, cif_file)
615 except StopIteration:
616 pass
617 else:
618 raise mmCIFError()
619
622
624 """Returns the mmCIF token split into a 2-tuple:
625 (reserved word, name) where directive is one of the mmCIF
626 reserved words: data_, loop_, global_, save_, stop_
627 """
628 i = tokx.find("_")
629 if i == -1:
630 return None, None
631
632 rword = tokx[:i].lower()
633 if rword not in ("data", "loop", "global", "save", "stop"):
634 return None, None
635
636 name = tokx[i+1:]
637 return rword, name
638
639 - def parse(self, token_iter, cif_file):
640 """Stateful parser for mmCIF files.
641
642 XXX: loop_, data_, save_ tags are handled in a case-sensitive
643 manner. These tokens are case-insensitive.
644 """
645 # token_iter yields (table, column, string, token) 4-tuples; parsed blocks are appended to cif_file.
646 cif_table_cache = dict()
647 cif_data = None
648 cif_table = None
649 cif_row = None
650 state = ""
651 # NOTE(review): split_token() lower-cases the reserved word before matching, so the XXX note above may be outdated -- confirm.
652 # Discard everything up to the first reserved word (data_, loop_, global_,
653 # save_ or stop_); the token that ends this scan is processed below.
654 while True:
655 tblx, colx, strx, tokx = token_iter.next()
656 if tokx is None:
657 continue
658 rword, name = self.split_token(tokx)
659 if rword is not None:
660 break
661
662 while True:
663 # Choose the next state from the current token: a _table.column tag
664 # starts a single item, a bare token must be a reserved word, and
665 # anything else (e.g. a lone string) is a syntax error.
666 if tblx is not None:
667 state = "RD_SINGLE"
668
669 elif tokx is not None:
670 rword, name = self.split_token(tokx)
671
672 if rword == "loop":
673 state = "RD_LOOP"
674
675 elif rword == "data":
676 state = "RD_DATA"
677
678 elif rword == "save":
679 state = "RD_SAVE"
680
681 elif rword == "stop":
682 return
683
684 elif rword == "global":
685 self.syntax_error("unable to handle global_ syntax")
686
687 else:
688 self.syntax_error("bad token #1: " + str(tokx))
689
690 else:
691 self.syntax_error("bad token #2")
692 return
693
694 # RD_SINGLE: one "_table.column value" pair. Tables are cached by name;
695 # a table seen for the first time is created, appended to the current
696 # data block and given a row, otherwise row 0 of the cached table is reused.
697 if state == "RD_SINGLE":
698 try:
699 cif_table = cif_table_cache[tblx]
700 except KeyError:
701 cif_table = cif_table_cache[tblx] = mmCIFTable(tblx)
702
703 try:
704 cif_data.append(cif_table)
705 except AttributeError:
706 self.syntax_error("section not contained in data_ block")
707 return
708
709 cif_row = mmCIFRow()
710 cif_table.append(cif_row)
711 else:
712 try:
713 cif_row = cif_table[0]
714 except IndexError:
715 self.syntax_error("bad token #3")
716 return
717
718 # Each column may be defined only once per table.
719 if colx in cif_table.columns:
720 self.syntax_error("redefined subsection (column)")
721 return
722 else:
723 cif_table.append_column(colx)
724
725 # The next token must be the value for this tag: a bare token or a
726 # quoted / multi-line string, never another tag.
727 tx, cx, strx, tokx = token_iter.next()
728 if tx is not None or (strx is None and tokx is None):
729 self.syntax_error("missing data for _%s.%s" % (tblx,colx))
730
731 if tokx is not None:
732 # A reserved word in value position: stop_ ends parsing, others are errors.
733 rword, name = self.split_token(tokx)
734 if rword is not None:
735 if rword == "stop":
736 return
737 self.syntax_error("unexpected reserved word: %s" % (rword))
738
739 if tokx != ".":
740 cif_row[colx] = tokx
741 # (a bare "." is not stored in the row)
742 elif strx is not None:
743 cif_row[colx] = strx
744
745 else:
746 self.syntax_error("bad token #4")
747
748 tblx, colx, strx, tokx = token_iter.next()
749 continue
750
751
752 # RD_LOOP: "loop_" followed by one or more _table.column tags of a
753 # single table, then the values of every row in column order.
754
755
756
757 elif state == "RD_LOOP":
758 # The first tag after loop_ names the table and its first column.
759
760 tblx, colx, strx, tokx = token_iter.next()
761
762 if tblx is None or colx is None:
763 self.syntax_error("bad token #5")
764 return
765
766 if cif_table_cache.has_key(tblx):
767 self.syntax_error("_loop section duplication")
768 return
769
770 cif_table = mmCIFTable(tblx)
771
772 try:
773 cif_data.append(cif_table)
774 except AttributeError:
775 self.syntax_error("_loop section not contained in data_ block")
776 return
777
778 cif_table.append_column(colx)
779
780 # Collect the remaining column tags; the first non-tag token ends the header.
781 while True:
782 tblx, colx, strx, tokx = token_iter.next()
783
784 if tblx is None:
785 break
786
787 if tblx != cif_table.name:
788 self.syntax_error("changed section names in loop_")
789 return
790
791 cif_table.append_column(colx)
792
793 # A reserved word immediately after the column tags means there is no
794 # row data: stop_ ends parsing, anything else is a syntax error.
795 if tokx is not None:
796 rword, name = self.split_token(tokx)
797 if rword is not None:
798 if rword == "stop":
799 return
800 else:
801 self.syntax_error(
802 "unexpected reserved word: %s" % (rword))
803
804 # Read rows, consuming one value per column for each row.
805 while True:
806 cif_row = mmCIFRow()
807 cif_table.append(cif_row)
808
809 for col in cif_table.columns:
810 if tokx is not None:
811 if tokx != ".":
812 cif_row[col] = tokx
813 elif strx is not None:
814 cif_row[col] = strx
815
816 tblx,colx,strx,tokx = token_iter.next()
817
818 # Row reading ends when a new _table.column tag appears ...
819
820 if tblx is not None:
821 break
822
823 # ... or when a reserved word appears.
824 if tokx is not None:
825 rword, name = self.split_token(tokx)
826 if rword is not None:
827 break
828
829 continue
830 # RD_DATA / RD_SAVE: open a new block (name = token minus its 5-char prefix) and reset the table cache.
831 elif state == "RD_DATA":
832 cif_data = mmCIFData(tokx[5:])
833 cif_file.append(cif_data)
834 cif_table_cache = dict()
835 cif_table = None
836
837 tblx,colx,strx,tokx = token_iter.next()
838
839 elif state == "RD_SAVE":
840 cif_data = mmCIFSave(tokx[5:])
841 cif_file.append(cif_data)
842 cif_table_cache = dict()
843 cif_table = None
844
845 tblx,colx,strx,tokx = token_iter.next()
846
847
849 re_tok = re.compile(
850 r"(?:"
851
852 "(?:_(.+?)[.](\S+))" "|"
853
854 "(?:['\"](.*?)(?:['\"]\s|['\"]$))" "|"
855
856 "(?:\s*#.*$)" "|"
857
858 "(\S+)"
859
860 ")")
861
862 file_iter = iter(fileobj)
863
864
865 while True:
866 ln = file_iter.next()
867 self.line_number += 1
868
869
870 if ln.startswith("#"):
871 continue
872
873
874 if ln.startswith(";"):
875 lmerge = [ln[1:]]
876 while True:
877 ln = file_iter.next()
878 self.line_number += 1
879 if ln.startswith(";"):
880 break
881 lmerge.append(ln)
882
883 lmerge[-1] = lmerge[-1].rstrip()
884 yield (None, None, "".join(lmerge), None)
885 continue
886
887
888 tok_iter = re_tok.finditer(ln)
889
890 for tokm in tok_iter:
891 groups = tokm.groups()
892 if groups != (None, None, None, None):
893 yield groups
894
895
897 """Writes out a mmCIF file using the data in the mmCIFData list.
898 """
900 self.fil = fil
901
902
903 self.SPACING = 2
904
905
906
907 for cif_data in cif_data_list:
908 self.cif_data = cif_data
909 self.write_cif_data()
910
913
915 self.fil.write(x + "\n")
916
919
937
939 """Analyze x and return its type: token, qstring, mstring
940 """
941 assert x is not None
942
943 if not isinstance(x, str):
944 x = str(x)
945 return x, "token"
946
947 if x == "" or x == ".":
948 return ".", "token"
949
950 if x.find("\n") != -1:
951 return x, "mstring"
952
953 if x.count(" ") != 0 or x.count("\t") != 0 or x.count("#") != 0:
954 if len(x) > (MAX_LINE - 2):
955 return x, "mstring"
956 if x.count("' ") != 0 or x.count('" ') != 0:
957 return x, "mstring"
958 return x, "qstring"
959
960 if len(x) < MAX_LINE:
961 return x, "token"
962 else:
963 return x, "mstring"
964
966 if isinstance(self.cif_data, mmCIFSave):
967 self.writeln("save_%s" % self.cif_data.name)
968 else:
969 self.writeln("data_%s" % self.cif_data.name)
970
971 self.writeln("#")
972
973 for cif_table in self.cif_data:
974
975 if len(cif_table) == 0:
976 continue
977
978
979 elif len(cif_table) == 1:
980 self.write_one_row_table(cif_table)
981
982
983 elif len(cif_table) > 1 and len(cif_table.columns) > 0:
984 self.write_multi_row_table(cif_table)
985
986 else:
987 raise mmCIFError()
988
989 self.writeln("#")
990
992 row = cif_table[0]
993
994
995 kmax = 0
996 table_len = len(cif_table.name) + 2
997 for col in cif_table.columns:
998 klen = table_len + len(col)
999 assert klen < MAX_LINE
1000 kmax = max(kmax, klen)
1001
1002
1003 kmax += self.SPACING
1004 vmax = MAX_LINE - kmax - 1
1005
1006
1007 for col in cif_table.columns:
1008
1009 cif_key = "_%s.%s" % (cif_table.name, col)
1010 l = [cif_key.ljust(kmax)]
1011
1012 try:
1013 x0 = row[col]
1014 except KeyError:
1015 x = "?"
1016 dtype = "token"
1017 else:
1018 x, dtype = self.data_type(x0)
1019
1020 if dtype == "token":
1021 if len(x) > vmax:
1022 l.append("\n")
1023 l.append("%s\n" % (x))
1024 self.write("".join(l))
1025
1026 elif dtype == "qstring":
1027 if len(x) > vmax:
1028 l.append("\n")
1029 self.write("".join(l))
1030 self.write_mstring(x)
1031
1032 else:
1033 l.append("'%s'\n" % (x))
1034 self.write("".join(l))
1035
1036 elif dtype == "mstring":
1037 l.append("\n")
1038 self.write("".join(l))
1039 self.write_mstring(x)
1040
1042
1043 self.writeln("loop_")
1044 for col in cif_table.columns:
1045 key = "_%s.%s" % (cif_table.name, col)
1046 assert len(key) < MAX_LINE
1047 self.writeln(key)
1048
1049 col_len_map = {}
1050 col_dtype_map = {}
1051
1052 for row in cif_table:
1053 for col in cif_table.columns:
1054
1055 try:
1056 x0 = row[col]
1057 except KeyError:
1058 lenx = 1
1059 dtype = "token"
1060 else:
1061 x, dtype = self.data_type(x0)
1062
1063
1064 if dtype == "token":
1065 lenx = len(x)
1066 elif dtype == "qstring":
1067 lenx = len(x) + 2
1068 else:
1069 lenx = 0
1070
1071 try:
1072 col_dtype = col_dtype_map[col]
1073 except KeyError:
1074 col_dtype_map[col] = dtype
1075 col_len_map[col] = lenx
1076 continue
1077
1078
1079 if col_len_map[col] < lenx:
1080 col_len_map[col] = lenx
1081
1082
1083 if col_dtype != dtype:
1084 if dtype == "mstring":
1085 col_dtype_map[col] = "mstring"
1086 elif col_dtype == "token" and dtype == "qstring":
1087 col_dtype_map[col] = "qstring"
1088
1089
1090
1091 wlist = []
1092 llen = 0
1093 for col in cif_table.columns:
1094 dtype = col_dtype_map[col]
1095
1096 if dtype == "mstring":
1097 llen = 0
1098 wlist.append((None, None, None))
1099 wlist.append((col, dtype, None))
1100 continue
1101
1102 lenx = col_len_map[col]
1103 if llen == 0:
1104 llen = lenx
1105 else:
1106 llen += self.SPACING + lenx
1107
1108 if llen > (MAX_LINE - 1):
1109 wlist.append((None, None, None))
1110 llen = lenx
1111
1112 wlist.append((col, dtype, lenx))
1113
1114
1115 spacing = " " * self.SPACING
1116 add_space = False
1117 listx = []
1118
1119 for row in cif_table:
1120 for (col, dtype, lenx) in wlist:
1121
1122 if col is None:
1123 add_space = False
1124 listx.append("\n")
1125 continue
1126
1127 if add_space == True:
1128 add_space = False
1129 listx.append(spacing)
1130
1131 if dtype == "token":
1132 x = str(row.get(col, "."))
1133 if x == "":
1134 x = "."
1135 x = x.ljust(lenx)
1136 listx.append(x)
1137 add_space = True
1138
1139 elif dtype == "qstring":
1140 x = row.get(col, ".")
1141 if x == "":
1142 x = "."
1143 elif x != "." and x != "?":
1144 x = "'%s'" % (x)
1145 x = x.ljust(lenx)
1146 listx.append(x)
1147 add_space = True
1148
1149 elif dtype == "mstring":
1150 try:
1151 listx.append(self.form_mstring(row[col]))
1152 except KeyError:
1153 listx.append(".\n")
1154 add_space = False
1155
1156 add_space = False
1157 listx.append("\n")
1158
1159
1160
1161 if len(listx) > 1024:
1162 self.write("".join(listx))
1163 listx = []
1164
1165
1166 self.write("".join(listx))
1167
1168
1169
1171 import sys
1172 try:
1173 path = sys.argv[1]
1174 except IndexError:
1175 print "usage: mmCIF.py <mmCIF file path>"
1176 raise SystemExit
1177
1178 cif = mmCIFDictionary()
1179 cif.load_file(path)
1180 cif.save_file(sys.stdout)
1181 # Call test_module() when this file is run as a script.
1182 if __name__ == '__main__':
1183 test_module()
1184
1185