Package mmLib :: Module CIF
[hide private]
[frames] | [no frames]

Source Code for Module mmLib.CIF

  1  ## Copyright 2002-2010 by PyMMLib Development Group (see AUTHORS file) 
  2  ## This code is part of the PyMMLib distribution and governed by 
  3  ## its license.  Please see the LICENSE file that should have been 
  4  ## included as part of this package. 
  5  ## 
  6  ## DESCRIPTION: CIF Parser for CIF 1.1 format 
  7   
  8  from mmCIF import mmCIFSyntaxError 
class CIFSyntaxError(mmCIFSyntaxError):
    """Syntax error raised while parsing a CIF 1.1 file.

    Subclasses mmCIFSyntaxError so callers can catch errors from either
    parser with a single except clause.  Raised throughout this module
    as CIFSyntaxError(line, message).
    """
    pass
#
# Lexical type constants
#
# Token.type is always one of these module-level strings.  The parser
# compares token types with "is" (identity), so code must use these
# exact objects -- never equal-but-distinct strings.
#
L_EOF = "<eof>"          # end of input
L_DATA = "<data>"        # data_<name> block header
L_LOOP = "<loop>"        # loop_ table header
L_STOP = "<stop>"        # stop_ keyword
L_SAVE = "<save>"        # save_<name> frame header (unimplemented)
L_GLOBAL = "<global>"    # global_ keyword
L_TAG = "<tag>"          # _tag name
L_VALUE = "<value>"      # any data value

#
# Parser classes
#
class CIFFile:
    """Parser for reading a CIF 1.1 file.

    Parsed data blocks accumulate in self.data_blocks (a list of
    DataBlock instances), so a single CIFFile may load several files.
    """

    def __init__(self):
        # DataBlock objects, in the order they appear in the input.
        self.data_blocks = []

    def load_file(self, f):
        """Parse CIF data from f, which may be a filename or an open file.

        When given a filename the file is opened here and closed in all
        cases; an already-open file object is left open for the caller.
        """
        import types
        # types.StringTypes exists only under Python 2 (str + unicode);
        # fall back to (str,) so this also works under Python 3.
        string_types = getattr(types, "StringTypes", (str,))
        if isinstance(f, string_types):
            name = f
            f = open(f)
            needClose = True
        else:
            name = "<input>"
            needClose = False
        try:
            self.parse(f, name)
        finally:
            # Only close files we opened ourselves.
            if needClose:
                f.close()

    def parse(self, f, name):
        """Read every data block from the open file f.

        name is used only for diagnostics.  Raises CIFSyntaxError if
        anything other than data blocks appears at the top level.
        """
        lexer = Lexer(f, name)
        token = lexer.next_token()
        while token.type is L_DATA:
            self.data_blocks.append(DataBlock(token.value, lexer, name))
            token = lexer.next_token()
        if token.type is not L_EOF:
            raise CIFSyntaxError(token.line, "input after data block")
57 58
class DataBlock:
    """A CIF data block: a dict of tags plus a list of loop tables."""

    def __init__(self, name, lexer, filename):
        self.name = name
        self.tags = {}          # lowercased tag name -> value string
        self.tables = []        # Table instances, in file order
        self.save_frames = []   # reserved; save frames are unimplemented

        # Consume tokens until end of input or the next data block.
        while True:
            tok = lexer.next_token()
            if tok.type is L_EOF:
                break
            if tok.type is L_DATA:
                # Start of the next block -- leave it for our caller.
                lexer.push_back(tok)
                break
            if tok.type is L_TAG:
                self.get_tag(tok, lexer, filename)
            elif tok.type is L_LOOP:
                self.get_table(tok, lexer, filename)
            elif tok.type is L_SAVE:
                self.get_save_frame(tok, lexer, filename)
            else:
                raise CIFSyntaxError(tok.line, "unexpected %s(%s)" %
                                     (tok.type, tok.value))

    def get_tag(self, token, lexer, filename):
        """Store one tag/value pair in self.tags (tag name lowercased)."""
        if '.' in token.value:
            raise CIFSyntaxError(token.line, "'.' appears in tag name %s" %
                                 token.value)
        value_tok = lexer.next_token()
        if value_tok.type is not L_VALUE:
            raise CIFSyntaxError(token.line, "missing value for tag %s" %
                                 token.value)
        self.tags[token.value.lower()] = value_tok.value

    def get_table(self, token, lexer, filename):
        """Parse the loop following a loop_ keyword into a Table."""
        self.tables.append(Table(lexer, filename))

    def get_save_frame(self, token, lexer, filename):
        """Save frames are not supported by this parser."""
        raise CIFSyntaxError(token.line, "SaveFrame unimplemented")
101 102
class Table:
    """CIF loop table: a list of column names plus a list of row lists."""

    def __init__(self, lexer, filename):
        self.columns = []
        self.rows = []

        # Header: one or more tags naming the columns.
        tok = lexer.next_token()
        if tok.type is not L_TAG:
            raise CIFSyntaxError(tok.line, "missing tags for table")
        while tok.type is L_TAG:
            self.columns.append(tok.value.lower())
            tok = lexer.next_token()

        # Body: a flat stream of values, one row per len(self.columns).
        if tok.type is not L_VALUE:
            raise CIFSyntaxError(tok.line, "missing rows for table")
        width = len(self.columns)
        while tok.type is L_VALUE:
            row = []
            while len(row) < width:
                if tok.type is not L_VALUE:
                    # Value stream ended mid-row.
                    raise CIFSyntaxError(tok.line, "expected value and got %s"
                                         % tok.type)
                row.append(tok.value)
                tok = lexer.next_token()
            self.rows.append(row)
        # The first non-value token belongs to whatever follows the loop.
        lexer.push_back(tok)

        # Column name -> index, used by get_value.
        self._columnIndex = dict((name, n)
                                 for n, name in enumerate(self.columns))

    def get_value(self, column, row):
        """Return the value at (column name, row index).

        Raises KeyError for an unknown column and IndexError for an
        out-of-range row.
        """
        return self.rows[row][self._columnIndex[column]]
137 138 139 # 140 # Lexical analyzer classes 141 #
class Lexer:
    """Lexical analyzer for reading a CIF 1.1 file.

    Characters are read from the underlying file one at a time.  One
    character of lookahead (peek_char) and one token of pushback
    (push_back) are supported.
    """

    def __init__(self, f, filename):
        self.f = f                  # open file object we read from
        self.filename = filename    # used only in diagnostics (msg)
        self.prev_char = None       # character before cur_char
        self.cur_char = None        # most recently returned character
        self.peeked_char = None     # one-character lookahead buffer
        self.pushed_token = None    # one-token pushback buffer
        self.line = 1               # current line number, 1-based

    def next_token(self):
        """Return the next Token from the input.

        Raises CIFSyntaxError on malformed input: EOF inside a quoted
        string, text field or tag, or a (forbidden) bracket string.
        """
        # Return any token from a previous push_back call first.
        if self.pushed_token is not None:
            t = self.pushed_token
            self.pushed_token = None
            return t

        from string import whitespace
        while True:
            #
            # Skip over whitespace
            #
            while True:
                c = self.next_char()
                if not c:
                    return self.token(L_EOF, None)
                if c not in whitespace:
                    break
            #
            # Comments run from '#' to end of line
            #
            if c == '#':
                while True:
                    c = self.next_char()
                    if not c:
                        return self.token(L_EOF, None)
                    if c == '\n':
                        break
                # Start over with the next line
                continue
            #
            # Quoted strings: a closing quote only terminates the value
            # when followed by whitespace, so embedded quotes are legal
            #
            if c == "'" or c == '"':
                endQuote = c
                atEnd = False
                chars = []
                while True:
                    c = self.next_char()
                    if not c:
                        raise CIFSyntaxError(self.line,
                                             "<eof> in quoted string")
                    if atEnd:
                        if c in whitespace:
                            return self.token(L_VALUE, ''.join(chars))
                        else:
                            # The quote did not end the string; keep it.
                            chars.append(endQuote)
                        if c != endQuote:
                            chars.append(c)
                            atEnd = False
                    else:
                        if c == endQuote:
                            atEnd = True
                        else:
                            chars.append(c)
                            atEnd = False
            #
            # Bracket strings are illegal in CIF 1.1
            #
            if c == '[':
                raise CIFSyntaxError(self.line,
                                     "bracket strings not permitted in CIF")
            #
            # Text field: ';' at the start of a line, terminated by the
            # next line that begins with ';'
            #
            if c == ';' and self.prev_char == '\n':
                chars = []
                atStart = False
                while True:
                    c = self.next_char()
                    if not c:
                        raise CIFSyntaxError(self.line, "<eof> in text field")
                    if c == ';' and atStart:
                        return self.token(L_VALUE, ''.join(chars))
                    if atStart:
                        # Emit the newline held back from the prior line.
                        chars.append('\n')
                    if c == '\n':
                        atStart = True
                    else:
                        chars.append(c)
                        atStart = False
            #
            # Tags start with '_' and run to the next whitespace
            #
            if c == '_':
                chars = []
                while True:
                    c = self.next_char()
                    if not c:
                        raise CIFSyntaxError(self.line, "<eof> in tag")
                    if c in whitespace:
                        return self.token(L_TAG, ''.join(chars))
                    chars.append(c)
            #
            # Simple one-character values: unknown '?' and inapplicable '.'
            # ('.' only when not the start of a longer value, e.g. ".5")
            #
            if c == '?':
                return self.token(L_VALUE, c)
            if c == '.':
                if self.peek_char() in whitespace:
                    return self.token(L_VALUE, c)
            #
            # Get a value with no embedded whitespace
            #
            chars = [c]
            while True:
                c = self.next_char()
                if not c or c in whitespace:
                    break
                chars.append(c)

            data = ''.join(chars)
            lc = data.lower()

            # Reserved words carry their payload after the keyword prefix.
            # Slice by the keyword's own length: "global_" is 7 characters,
            # so a fixed [5:] slice would leave "l_" junk in its payload.
            for keyword, ltype in (("data_", L_DATA), ("loop_", L_LOOP),
                                   ("save_", L_SAVE), ("stop_", L_STOP),
                                   ("global_", L_GLOBAL)):
                if lc.startswith(keyword):
                    return self.token(ltype, data[len(keyword):])
            return self.token(L_VALUE, data)

    def next_char(self):
        """Return the next input character ('' at EOF), tracking lines."""
        self.prev_char = self.cur_char
        # Count the line as advanced once the newline has been consumed,
        # so the first character of a line is reported on that line.
        if self.prev_char == '\n':
            self.line += 1
        if self.peeked_char is None:
            self.cur_char = self.f.read(1)
        else:
            self.cur_char = self.peeked_char
            self.peeked_char = None
        return self.cur_char

    def peek_char(self):
        """Return the next character without consuming it ('' at EOF)."""
        if self.peeked_char is None:
            self.peeked_char = self.f.read(1)
        return self.peeked_char

    def token(self, type, value):
        """Build a Token of the given type stamped with the current line."""
        return Token(type, value, self.line)

    def push_back(self, token):
        """Return token to the stream; only one token may be pushed back."""
        assert(self.pushed_token is None)
        self.pushed_token = token

    def msg(self, s):
        """Format s with this lexer's filename and current line number."""
        return formatMessage(self.filename, self.line, s)
307 308
class Token:
    """One lexical token: its type constant, payload value and line number.

    The parser reads the type, value and line attributes directly.
    """

    def __init__(self, type, value, line):
        self.type, self.value, self.line = type, value, line
316 317 318 # 319 # Utility functions 320 #
def formatMessage(filename, line, msg):
    """Return a diagnostic string of the form "filename(line): msg"."""
    location = "%s(%d)" % (filename, line)
    return "%s: %s" % (location, msg)
323
def makeNumber(s):
    """Convert CIF numeric string s to an int or float.

    A trailing standard-uncertainty suffix in parentheses, e.g.
    "1.23(4)", is stripped before conversion.  Raises ValueError if the
    remainder is not numeric.  (A leftover debug print was removed.)
    """
    # Strip an esd suffix such as "(4)".
    paren = s.find('(')  # ) for balance in vim
    if paren != -1:
        s = s[:paren]
    try:
        return int(s)
    except ValueError:
        return float(s)
### <TESTING>
if __name__ == "__main__":
    """Module tests.
    """
    # NOTE(review): these self-tests use Python 2 print statements and
    # expect a "ccd.cif" file in the working directory.
    def lexer_test(test_file):
        # Dump every token in test_file, one per line, for inspection.
        f = open(test_file)
        lexer = Lexer(f, test_file)
        while True:
            token = lexer.next_token()
            if token.type is L_EOF:
                break
            print formatMessage(test_file, token.line,
                                "%s: %s" % (token.type, token.value))

        f.close()

    def parser_test(test_file):
        # Parse test_file and print a one-line summary per data block.
        cif = CIFFile()
        cif.load_file(test_file)
        print "%d data blocks" % len(cif.data_blocks)
        import pprint
        for db in cif.data_blocks:
            print "%s: %d tags, %d tables" % (db.name,
                len(db.tags), len(db.tables))
            pprint.pprint(db.tags)

    #lexer_test("ccd.cif")
    parser_test("ccd.cif")
### </TESTING>