Package PyML :: Package utils :: Module myio
[hide private]
[frames] | no frames]

Source Code for Module PyML.utils.myio

  1   
  2  import os 
  3  import string 
  4  import numpy 
  5  import math 
  6  import misc 
  7  import tempfile 
  8   
def load(fileName):
    '''Load a single pickled object from a file.

    fileName - path of a file that holds one pickled object

    Returns the unpickled object.
    Raises ValueError if nothing exists at fileName.

    NOTE: unpickling can execute arbitrary code; only call this on files
    that come from a trusted source.
    '''
    try:
        import cPickle as pickle  # C implementation on Python 2
    except ImportError:
        import pickle

    if not os.path.exists(fileName):
        raise ValueError("file does not exist at %s" % fileName)

    # Pickles are byte streams: binary mode avoids newline translation and
    # is required by the Python 3 unpickler.  The with-block closes the
    # handle even when unpickling fails.
    with open(fileName, "rb") as fileHandle:
        return pickle.Unpickler(fileHandle).load()
22
def xmlLoad(fileName):
    '''Load an object that was stored with gnosis.xml.pickle.

    fileName - path of the XML pickle file

    Returns whatever gnosis.xml.pickle.load produces for the file.
    Raises ValueError if nothing exists at fileName.
    '''
    # Validate the argument first so a bad path is reported as such even
    # when the optional gnosis package is not installed.
    if not os.path.exists(fileName):
        raise ValueError("file does not exist at %s" % fileName)

    import gnosis.xml.pickle as xml_pickle

    # The with-block closes the handle, which the original never did.
    with open(fileName) as fileHandle:
        return xml_pickle.load(fileHandle)
33
def xmlSave(object, fileName):
    '''Store an object in a file using gnosis.xml.pickle.

    object   - the object to serialize
    fileName - path of the file to create or overwrite
    '''
    import gnosis.xml.pickle as xml_pickle

    # The with-block guarantees the handle is closed even if dump() raises.
    with open(fileName, 'w') as fileHandle:
        xml_pickle.dump(object, fileHandle)
41
def save(object, fileName, binary=1):
    '''Pickle an object into a file.

    object   - the object to serialize
    fileName - path of the file to create or overwrite
    binary   - accepted for backward compatibility; it has never had an
               effect and is still ignored

    The object is written with pickle protocol 0, the format the original
    implementation produced, so files stay readable by older readers.
    '''
    try:
        import cPickle as pickle  # C implementation on Python 2
    except ImportError:
        import pickle

    # Binary mode keeps the byte stream free of newline translation and is
    # required by the Python 3 pickler; the with-block closes the handle
    # even if pickling fails part way through.
    with open(fileName, "wb") as fileHandle:
        pickle.Pickler(fileHandle, 0).dump(object)
50 51
def csvwrite(a, fileName, delim=','):
    '''Write an array to a file in csv (delimited) format.

    a        - a sequence of scalars (written one per line) or a sequence
               of rows (each row written as one delimited line)
    fileName - path of the file to create or overwrite
    delim    - separator placed between the values of a row

    Whether `a` is flat is decided from its first element.  An empty `a`
    produces an empty file.
    '''
    with open(fileName, "w") as fileHandle:
        if len(a) == 0:
            return

        # numpy.ndim is 0 for strings and for any scalar (including numpy
        # scalars), so flat lists and 1-D numpy arrays are both detected.
        if numpy.ndim(a[0]) == 0:
            for value in a:
                fileHandle.write(str(value) + '\n')
        else:
            for row in a:
                fileHandle.write(delim.join([str(v) for v in row]) + '\n')
75
def csvread(fileName, delim=','):
    '''Read a character array from a file in csv format.

    fileName - path of the delimited text file
    delim    - field separator; a single space means "any whitespace"

    The number of columns is taken from the first line.  The result is a
    list with one list of strings per column, or that single list itself
    when the file has exactly one column.

    Lines whose field count differs from the first line are reported on
    stdout.  Extra fields are ignored; lines with too few fields are
    skipped instead of raising IndexError.
    '''
    import misc

    if delim == ' ':
        delim = None

    with open(fileName, "r") as fileHandle:
        line = fileHandle.readline()

        # presumably misc.emptyLOL(n) returns a list of n empty lists --
        # TODO confirm against misc
        data = misc.emptyLOL(len(line.split(delim)))
        dim = len(data)

        while True:
            # Drop only a real newline; a final line without one used to
            # lose its last character.
            if line.endswith('\n'):
                line = line[:-1]
            fields = line.split(delim)
            if len(fields) != dim:
                print('badline: %s' % line)
            if len(fields) >= dim:
                for i in range(dim):
                    data[i].append(fields[i])

            line = fileHandle.readline()
            if not line:
                break

    if len(data) == 1:
        data = data[0]

    return data
103
def csvreadArray(fileName, type='float'):
    '''Read a comma-delimited file into a numpy array.

    Convenience wrapper around dlmreadArray with ',' as the delimiter;
    `type` is passed through unchanged.
    '''
    commaDelimiter = ','
    return dlmreadArray(fileName, commaDelimiter, type)
108 109
def dlmreadArray(fileName, delimiter=' ', type='float'):
    '''Read an array from a delimited file into a numpy array.

    fileName  - path of the delimited text file
    delimiter - field separator; a single space means "any whitespace"
    type      - 'float' or 'int', the element type of the result

    Lines starting with '#' or '%' are comments; blank lines are skipped.
    All data lines are assumed to have the same number of columns as the
    first one: a line with a different count is reported on stdout, extra
    fields are ignored, and a line with too few fields raises IndexError.

    Returns a 1-D array for a single-column file, otherwise an array of
    shape (rows, columns).  A file without data lines yields an empty 1-D
    array.  Values are parsed as floats, so 'int' truncates toward zero.

    Raises ValueError if `type` is neither 'float' nor 'int'.
    '''
    # The builtin types are the dtypes the removed aliases numpy.float_
    # and numpy.int stood for.
    if type == 'float':
        arrayType = float
    elif type == 'int':
        arrayType = int
    else:
        raise ValueError('Wrong type of array')

    # Equality, not identity: `is ' '` only worked by string interning.
    if delimiter == ' ':
        delimiter = None

    commentChar = ['#', '%']
    rows = []
    d = None

    # Single pass: collect the rows first, then build the array.
    with open(fileName, "r") as fileHandle:
        for line in fileHandle:
            if line[0] in commentChar or not line.strip():
                continue
            fields = line.split(delimiter)
            if d is None:
                d = len(fields)
            elif len(fields) != d:
                print('badline: %s' % line)
            rows.append([float(fields[j]) for j in range(d)])

    if d is None:
        return numpy.zeros(0, arrayType)

    X = numpy.array(rows, dtype=arrayType)
    if d == 1:
        X = X.reshape(len(rows))

    return X
156 157
def tableWrite(t, fileName=None, **args):
    '''Output a table out of a list of lists; element number i
    of each list forms row i of the table.

    Usage :
    tableWrite((list1,list2...)) - write table to stdout
    tableWrite((list1,list2...), fileName) - write table to file

    Keyword arguments:
    headings - optional list with one column title per list in t

    Columns are separated by one space and padded to the widest entry.
    Strings are left-justified, ints right-justified, floats formatted
    with "%f" and right-justified.  A value of any other type is written
    as blanks and "unknown data type" is printed on stdout.
    '''
    import sys

    headings = args.get('headings')

    d = len(t)
    if d > 0:
        n = len(t[0])
    else:
        n = 0

    if headings is not None:
        assert len(headings) == d

    def cellText(value):
        # Text used both for measuring and for writing float cells.
        if type(value) is float:
            return "%f" % value
        return str(value)

    # Plain ints: the float widths of numpy.zeros are rejected by
    # str.ljust / str.rjust / str.center.
    maxlen = [0] * d
    for j in range(d):
        for i in range(n):
            maxlen[j] = max(maxlen[j], len(cellText(t[j][i])))
        if headings is not None:
            maxlen[j] = max(maxlen[j], len(headings[j]))

    if fileName is not None:
        fileHandle = open(fileName, "w")
    else:
        fileHandle = sys.stdout

    try:
        if headings is not None:
            titles = [headings[j].center(maxlen[j]) for j in range(d)]
            fileHandle.write(' '.join(titles) + '\n')

        for i in range(n):
            cells = []
            for j in range(d):
                value = t[j][i]
                if type(value) is str:
                    cells.append(value.ljust(maxlen[j]))
                elif type(value) is int:
                    cells.append(str(value).rjust(maxlen[j]))
                elif type(value) is float:
                    cells.append(cellText(value).rjust(maxlen[j]))
                else:
                    cells.append(' ' * maxlen[j])
                    print("unknown data type")
            fileHandle.write(' '.join(cells) + '\n')
    finally:
        # Only close what was opened here; never close sys.stdout.
        if fileName is not None:
            fileHandle.close()
218
def dlmWrite(t, fileName, delim=','):
    '''Write a list of columns to a delimited file.

    t        - a sequence of equal-length columns; element i of every
               column forms line i of the file.  If the first element of
               t has no length (or t is empty), t is treated as a flat
               sequence and written one value per line.
    fileName - path of the file to create or overwrite
    delim    - separator placed between the values of a line
    '''
    # The with-block closes the file on every path; the original never
    # closed it after writing a table.
    with open(fileName, "w") as fileHandle:
        d = len(t)

        try:
            n = len(t[0])
        except (TypeError, IndexError):
            # Scalars have no len(); an empty t has no first element.
            for value in t:
                fileHandle.write(str(value) + '\n')
            return

        for i in range(n):
            row = [str(t[j][i]) for j in range(d)]
            fileHandle.write(delim.join(row) + '\n')
244 245
def writeDict(dict, fileName, delim=','):
    '''Write a dictionary into a file as a set of key,value pairs.

    dict     - the mapping to write; keys and values are converted with str
    fileName - path of the file to create or overwrite
    delim    - separator placed between a key and its value

    One "key<delim>value" line is written per entry, in the dictionary's
    iteration order.
    '''
    # The with-block closes the handle even if a conversion or write fails.
    with open(fileName, 'w') as fileHandle:
        for key in dict:
            fileHandle.write(str(key) + delim + str(dict[key]) + '\n')
256
def subDict(dict, list):
    '''Return a new dictionary restricted to the keys named in `list`.

    Keys in `list` that are missing from `dict` are silently skipped.
    '''
    return {key: dict[key] for key in list if key in dict}
265 266
def concatenateFiles(fileName1, fileName2, fileName3, delim=' '):
    '''Horizontal concatenation of two delimited files into a third file;
    the delimiter is a space by default.

    fileName1, fileName2 - the input files, joined line by line
    fileName3            - the output file (created or overwritten)
    delim                - text inserted between the two joined lines

    Concatenation stops as soon as either input runs out of lines.
    '''
    with open(fileName1) as file1:
        with open(fileName2) as file2:
            with open(fileName3, 'w') as file3:
                while True:
                    line1 = file1.readline()
                    line2 = file2.readline()

                    # The original test `len(line1) == 0 | len(line2) == 0`
                    # parsed as a chained comparison that only held once
                    # BOTH files were exhausted.
                    if not line1 or not line2:
                        break

                    # Strip only a real newline from the left-hand line.
                    if line1.endswith('\n'):
                        line1 = line1[:-1]
                    file3.write(line1 + delim + line2)
290 291
def datasetIntersect(datasetSourceName, datasetIntersectName, newDatasetName):
    '''Keep the patterns in the source dataset that appear in the intersect
    dataset.

    datasetSourceName    - file whose lines are filtered
    datasetIntersectName - file that supplies the ids to keep
    newDatasetName       - output file (created or overwritten)

    The id of a line is the text before its first comma, or the whole line
    (without its newline) when it has no comma.
    '''
    def patternID(line):
        # partition() does not chop a character off a comma-less line that
        # lacks a trailing newline, unlike line[:line.find(',')].
        if line.endswith('\n'):
            line = line[:-1]
        return line.partition(',')[0]

    with open(datasetIntersectName) as intersectFile:
        wantedIDs = set(patternID(line) for line in intersectFile)

    with open(datasetSourceName) as datasetSource:
        with open(newDatasetName, 'w') as newDataset:
            for line in datasetSource:
                if patternID(line) in wantedIDs:
                    newDataset.write(line)
315 316
def datasetUnion(datasetName1, datasetName2, newDatasetName):
    '''Merge two datasets line by line into a new dataset file.

    Assumes that the features in the two datasets have different names!

    Each output line is a line of the first dataset followed by the
    whitespace-separated tokens of the matching line of the second dataset,
    minus that line's first token.  The output has as many lines as the
    first dataset.
    '''
    # All three handles are closed on exit; the original left them open.
    with open(datasetName1) as dataset1:
        with open(datasetName2) as dataset2:
            with open(newDatasetName, 'w') as newDataset:
                for line1 in dataset1:
                    line2 = dataset2.readline()

                    # Strip only a real newline so an unterminated last
                    # line keeps its final character.
                    if line1.endswith('\n'):
                        line1 = line1[:-1]
                    tokens = line2.split()[1:]
                    newDataset.write(line1 + ' ' + ' '.join(tokens) + '\n')
331 332
def makeDataSet(XfileName, labelsFileName, datasetFileName):
    '''Make a sparse format data file out of an unlabeled sparse data file
    and a labels file.

    XfileName       - unlabeled data; each line is "id,<rest of line>"
    labelsFileName  - one "id,label" or "id label" pair per line
    datasetFileName - output file; gets "id,label <rest of line>" for every
                      data line whose id has a label

    The number of labels read is printed on stdout.
    Raises ValueError if either input file does not exist.
    '''
    if not os.path.exists(XfileName):
        raise ValueError("Xfile does not exist at %s" % XfileName)

    if not os.path.exists(labelsFileName):
        # The original referenced the misspelled name `LabelsFileName`
        # here and died with a NameError instead.
        raise ValueError(
            "labels file does not exist at %s" % labelsFileName)

    labels = {}
    with open(labelsFileName) as labelsFile:
        for line in labelsFile:
            if line.endswith('\n'):
                line = line[:-1]
            if not line.strip():
                continue
            tokens = line.split()
            if len(tokens) != 2:
                tokens = line.split(',')
            (patternID, label) = tokens
            labels[patternID] = label

    print(len(labels))

    with open(XfileName) as Xfile:
        with open(datasetFileName, 'w') as datasetFile:
            for line in Xfile:
                # Split on the first comma only, so commas in the data do
                # not break the unpacking.
                (patternID, restOfLine) = line.split(',', 1)
                if patternID in labels:
                    datasetFile.write(
                        patternID + ',' + labels[patternID] + ' ' + restOfLine)
368 369
def formatMotifData(motifFileName, labelsFileName, svmFormatFileName,
                    motifSpecFile=None):
    '''Convert a motif file plus a labels file into a sparse data file.

    motifFileName     - one pattern per line: "ac,...;motif,...;motif,..."
                        (fields separated by ';', the id of a field is the
                        text before its first ',')
    labelsFileName    - one "ac,label" or "ac label" pair per line
    svmFormatFileName - output file; for every pattern that has a label it
                        gets "ac,label motif:count motif:count ..."
    motifSpecFile     - accepted for backward compatibility and ignored.
                        The original code for it never did anything useful
                        and raised UnboundLocalError whenever it was given.
    '''
    labels = {}
    with open(labelsFileName, 'r') as labelsFile:
        for line in labelsFile:
            if line.endswith('\n'):
                line = line[:-1]
            tokens = line.split()
            if len(tokens) != 2:
                tokens = line.split(',')
            (ac, label) = tokens
            labels[ac] = label

    with open(motifFileName, 'r') as motifFile:
        with open(svmFormatFileName, 'w') as svmFormatFile:
            for line in motifFile:
                # Strip the newline so it cannot end up inside the id of
                # the last motif on the line.
                if line.endswith('\n'):
                    line = line[:-1]
                tokens = line.split(';')
                ac = tokens[0].split(',')[0]
                if ac not in labels:
                    continue

                # Count how often each motif occurs in this pattern.
                x = {}
                for token in tokens[1:]:
                    featureID = token.split(',')[0]
                    if not featureID:
                        continue  # empty field, e.g. after a trailing ';'
                    x[featureID] = x.get(featureID, 0) + 1

                parts = ["%s,%s" % (ac, labels[ac])]
                for featureID in x:
                    parts.append("%s:%s" % (featureID, x[featureID]))
                svmFormatFile.write(' '.join(parts) + '\n')
419 420 421
def formatMotifX(motifFileName, XfileName, motifSpecFile=None):
    '''Convert a motif file into an unlabeled sparse data file.

    motifFileName - one pattern per line: "ac,...;motif,...;motif,..."
                    (fields separated by ';', the id of a field is the
                    text before its first ',')
    XfileName     - output file; gets "ac,motif:count motif:count ..." for
                    every pattern
    motifSpecFile - accepted for backward compatibility and ignored.  The
                    original opened it, never read it (it looped over the
                    characters of the file name) and leaked the handle.
    '''
    with open(motifFileName, 'r') as motifFile:
        with open(XfileName, 'w') as Xfile:
            for line in motifFile:
                # Strip the newline so it cannot end up inside the id of
                # the last motif on the line.
                if line.endswith('\n'):
                    line = line[:-1]
                tokens = line.split(';')
                ac = tokens[0].split(',')[0]

                # Count how often each motif occurs in this pattern.
                x = {}
                for token in tokens[1:]:
                    featureID = token.split(',')[0]
                    if not featureID:
                        continue  # empty field, e.g. after a trailing ';'
                    x[featureID] = x.get(featureID, 0) + 1

                features = ["%s:%s" % (key, x[key]) for key in x]
                Xfile.write(ac + ',' + ' '.join(features) + '\n')
456 457
def dlmExtract(inFile, outFields, outFile=None, convert=True,
               filterFile=None, filterField=None,
               inDelim=',', outDelim=','):
    '''Extract from a delimited file a list of fields to another delimited file
    Input:
    inFile - file name with the input data
    outFields - a list of fields (column indices) to extract from inFile;
                a single int is accepted as well
    outFile - output file; when None the extracted rows are returned
    convert - whether to convert numeric inputs from strings (only applies
              when no outFile is given)
    filterFile - optional file with one value per line; only input lines
                 whose filterField column equals one of these values are
                 extracted
    filterField - index of the column that is compared against filterFile
    inDelim - the delimiter in the input file
    outDelim - the delimiter for the output file

    Returns a list of rows (each a list of values) when outFile is None,
    otherwise None.
    '''
    if isinstance(outFields, int):
        outFields = [outFields]

    # Values written to a file must stay strings for join().
    if outFile is not None:
        convert = False

    filterValues = None
    if filterFile is not None:
        # The original called close() on the file NAME (AttributeError);
        # the with-block closes the actual handle.
        with open(filterFile) as filterHandle:
            filterValues = set(
                line[:-1] if line.endswith('\n') else line
                for line in filterHandle)

    if outFile is None:
        data = []
    else:
        data = None

    with open(inFile) as inFileHandle:
        outFileHandle = None
        if outFile is not None:
            outFileHandle = open(outFile, 'w')
        try:
            for line in inFileHandle:
                if line.endswith('\n'):
                    line = line[:-1]
                fields = line.split(inDelim)

                if (filterValues is not None
                        and fields[filterField] not in filterValues):
                    continue

                out = []
                for i in outFields:
                    value = fields[i]
                    if convert:
                        try:
                            value = float(value)
                        except ValueError:
                            pass  # not numeric: keep the string
                    out.append(value)

                if outFileHandle is None:
                    data.append(out)
                else:
                    outFileHandle.write(outDelim.join(out) + '\n')
        finally:
            if outFileHandle is not None:
                outFileHandle.close()

    return data
516
def countLines(fileName):
    '''Return the number of lines in a file.

    Raises ValueError if nothing exists at fileName.
    '''
    if not os.path.exists(fileName):
        raise ValueError("file does not exist at %s" % fileName)

    # The with-block closes the handle even if reading fails.
    with open(fileName) as fileHandle:
        return sum(1 for line in fileHandle)
531 532
def extractLines(fileName, lines):
    '''Extract the lines given by a list of line numbers in the file.

    fileName - path of the text file
    lines    - line numbers to extract; the first line of the file is 1

    The selected lines are printed on stdout without their newline.
    '''
    # presumably misc.list2dict returns a dict keyed by the elements of
    # `lines`; only membership tests are made on it -- TODO confirm
    lineDict = misc.list2dict(lines)

    # The with-block closes the handle even if printing fails.
    with open(fileName) as fileHandle:
        for lineNum, line in enumerate(fileHandle, 1):
            if lineNum in lineDict:
                # Strip only a real newline; an unterminated last line
                # used to lose its final character.
                if line.endswith('\n'):
                    line = line[:-1]
                print(line)
546 547
def splitFile(fileName, numFiles):
    '''Split a file into consecutive pieces of equal line count.

    fileName - path of the file to split
    numFiles - requested number of pieces

    Every piece gets floor(numLines / numFiles) lines (at least one) and
    is written next to the input as "split<k><name>", k counting from 1.

    NOTE(review): as in the original, when the line count is not a
    multiple of numFiles the leftover lines go to one additional file, so
    numFiles + 1 files may be produced -- confirm this is intended.

    Raises ValueError (from countLines) if the file does not exist.
    '''
    numLines = countLines(fileName)

    # At least one line per piece: the original computed a chunk size of
    # zero when numFiles > numLines and crashed in math.fmod.
    numSplit = max(1, int(numLines // numFiles))

    # Prefix the base name, not the whole path, so inputs given with a
    # directory component work too.
    directory, baseName = os.path.split(fileName)

    outFile = None
    try:
        with open(fileName) as inFile:
            for lineNum, line in enumerate(inFile):
                if lineNum % numSplit == 0:
                    if outFile is not None:
                        outFile.close()
                    fileNum = lineNum // numSplit + 1
                    outFile = open(os.path.join(
                        directory, 'split' + str(fileNum) + baseName), 'w')
                outFile.write(line)
    finally:
        # The last piece was never closed by the original.
        if outFile is not None:
            outFile.close()
574 575
def log(message, fileName='progress.log', openMode='a'):
    '''Write a message to a log file.

    message  - text to write (no newline is added)
    fileName - path of the log file
    openMode - mode used to open the file: 'a' appends (the default),
               'w' overwrites
    '''
    # Honour openMode; the original always appended regardless of it.
    with open(fileName, openMode) as fileHandle:
        fileHandle.write(message)
581
def isempty(fileName):
    '''Tell whether a file is missing or has no content.

    Returns 1 if nothing exists at fileName or the file is empty, and 0
    otherwise.
    '''
    if not os.path.exists(fileName):
        return 1

    # One character is enough to decide; the original read every line of
    # the file into memory first.
    with open(fileName) as fileHandle:
        if fileHandle.read(1):
            return 0
    return 1
592 593
def findDelimiter(fileName):
    '''Guess the delimiter of a delimited file from its first line.

    Tries ',', ' ', ';' and tab in that order and returns the first one for
    which misc.split yields the most tokens; returns '' if no candidate
    yields any token.
    '''
    candidates = [',', ' ', ';', '\t']

    handle = open(fileName)
    firstLine = handle.readline()
    handle.close()

    bestDelim, bestCount = '', 0
    for candidate in candidates:
        tokenCount = len(misc.split(firstLine, candidate))
        # strictly greater: on a tie the earlier candidate wins
        if tokenCount > bestCount:
            bestDelim, bestCount = candidate, tokenCount

    return bestDelim
613 614
class UndoHandle:
    """A Python handle that adds functionality for saving lines.

    Wraps a file-like object.  Saved lines are kept in a LIFO fashion and
    are returned before anything else is read from the wrapped handle.

    Added methods:
    saveline    Save a line to be returned next time.
    peekline    Peek at the next line without consuming it.

    Any attribute not defined here is looked up on the wrapped handle.
    """

    def __init__(self, handle):
        self._handle = handle   # the wrapped file-like object
        self._saved = []        # pushed-back lines, next one first

    def __iter__(self):
        # Iterate through readline() so that saved lines are not skipped
        # (delegating iteration to the wrapped handle would bypass them).
        return self

    def __next__(self):
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    next = __next__  # Python 2 iterator protocol

    def readlines(self, *args, **keywds):
        """Return the saved lines followed by the wrapped handle's readlines()."""
        lines = self._saved + self._handle.readlines(*args, **keywds)
        self._saved = []
        return lines

    def readline(self, *args, **keywds):
        """Return the next saved line, or read one from the wrapped handle."""
        if self._saved:
            line = self._saved.pop(0)
        else:
            line = self._handle.readline(*args, **keywds)
        return line

    def read(self, size=-1):
        """Read up to `size` characters (everything if -1), saved text first."""
        if size == -1:
            saved = "".join(self._saved)
            self._saved[:] = []
        else:
            saved = ''
            while size > 0 and self._saved:
                if len(self._saved[0]) <= size:
                    size = size - len(self._saved[0])
                    saved = saved + self._saved.pop(0)
                else:
                    # take part of the first saved line, keep the rest
                    saved = saved + self._saved[0][:size]
                    self._saved[0] = self._saved[0][size:]
                    size = 0
        return saved + self._handle.read(size)

    def saveline(self, line):
        """Push a line back so that it is returned by the next read."""
        if line:
            self._saved = [line] + self._saved

    def peekline(self):
        """Return the next line without consuming it."""
        if self._saved:
            line = self._saved[0]
        else:
            line = self._handle.readline()
            self.saveline(line)
        return line

    def tell(self):
        """Return the wrapped handle's position minus the length of the saved text."""
        savedLength = sum([len(line) for line in self._saved])
        return self._handle.tell() - savedLength

    def seek(self, *args):
        """Discard the saved lines and seek the wrapped handle."""
        self._saved = []
        self._handle.seek(*args)

    def __getattr__(self, attr):
        # Without this guard, looking up `_handle` before __init__ has run
        # (e.g. while copying or unpickling) would recurse forever.
        if attr == '_handle':
            raise AttributeError(attr)
        return getattr(self._handle, attr)
677 678
def removeEmpty(directory):
    '''Remove all files that have size 0 from a directory.

    Only regular, non-hidden files directly inside `directory` are
    considered (hidden files were not listed by the original `ls -l`
    either).  The name of every removed file is printed on stdout.
    '''
    # Use the os module instead of parsing the output of a shell-built
    # `ls -l` command, which broke on names containing spaces.
    for fileName in sorted(os.listdir(directory)):
        if fileName.startswith('.'):
            continue
        path = os.path.join(directory, fileName)
        if os.path.isfile(path) and os.path.getsize(path) == 0:
            print('removing  %s' % fileName)
            os.remove(path)
690
def selectLines(infile, outfile, lines, keepLines=1):
    '''Write to outfile the lines in infile whose line number is in the
    given list of line numbers.

    infile    - path of the input file
    outfile   - path of the output file (created or overwritten)
    lines     - line numbers; the first line of the file is 1
    keepLines - 1 (default) copies the listed lines; any other value
                copies every line EXCEPT the listed ones
    '''
    # presumably misc.list2dict returns a dict keyed by the elements of
    # `lines`; only membership tests are made on it -- TODO confirm
    lineDict = misc.list2dict(lines)
    keepListed = (keepLines == 1)

    # Both handles are closed on exit; the original closed neither.
    with open(infile) as infileHandle:
        with open(outfile, 'w') as outfileHandle:
            for lineNum, line in enumerate(infileHandle, 1):
                if (lineNum in lineDict) == keepListed:
                    outfileHandle.write(line)
708
def return2newLine(inFile, outFile=None):
    """Remove all carriage returns from a file (windows file to linux file).

    Every '\\r' byte is deleted, which turns '\\r\\n' line endings into
    '\\n'.

    inFile  - path of the file to convert
    outFile - path of the converted copy; when None, inFile is replaced by
              its converted version
    """
    # Done in Python rather than through a shell-built `tr` command, which
    # broke on file names containing spaces or shell metacharacters.
    inPlace = outFile is None

    with open(inFile, 'rb') as source:
        if inPlace:
            # Create the temporary file next to inFile so the final rename
            # never has to cross file systems; mkstemp also avoids the
            # race inherent in mktemp.
            descriptor, target = tempfile.mkstemp(
                dir=os.path.dirname(os.path.abspath(inFile)))
            destination = os.fdopen(descriptor, 'wb')
        else:
            target = outFile
            destination = open(outFile, 'wb')

        finished = False
        try:
            while True:
                chunk = source.read(65536)
                if not chunk:
                    break
                destination.write(chunk.replace(b'\r', b''))
            finished = True
        finally:
            destination.close()
            # Do not leave a stray temporary file behind on failure.
            if inPlace and not finished:
                os.remove(target)

    if inPlace:
        os.rename(target, inFile)
719
def concatByNum(filePattern, outfileName, directory='.'):
    r"""Concatenate numbered files into one file, in numeric order.

    filePattern -- a regular expression that looks like: start\d+.dat
    outfileName -- path of the output file (created or overwritten)
    directory   -- directory that is searched for the numbered files

    Every file in `directory` whose whole name matches filePattern is
    appended to the output, ordered by the number matched by the first
    \d+ of the pattern.  The numbers need not be contiguous.
    """
    import re

    # Capture the number so the files that actually exist can be sorted,
    # instead of rebuilding file names from the text of the regex.
    numbered = re.compile(
        '(?:' + filePattern.replace(r'\d+', r'(?P<fileNum>\d+)', 1) + r')\Z')

    found = []
    for fileName in os.listdir(directory):
        match = numbered.match(fileName)
        if match is None:
            continue
        fileNum = match.groupdict().get('fileNum')
        if fileNum is None:
            fileNum = 0  # the pattern contains no \d+
        found.append((int(fileNum), fileName))
    found.sort()

    # Both the output and every input are closed by their with-blocks.
    with open(outfileName, 'w') as outfile:
        for fileNum, fileName in found:
            with open(os.path.join(directory, fileName)) as fileHandle:
                for line in fileHandle:
                    outfile.write(line)
739 740
def _canReadFirstLine(opener, fileName):
    """Tell whether `opener(fileName)` yields a handle whose first line can be read."""
    handle = opener(fileName)
    try:
        try:
            handle.readline()
        except Exception:
            # Deliberately broad: any failure to decode means "the file is
            # not in this format", whatever exception the codec raises.
            return False
        return True
    finally:
        # Always release the probe handle, also when the probe fails.
        handle.close()


def myopen(fileName):
    """
    returns a file handle to a file which is possibly compressed
    using either gzip or bz2

    myopen tries to open the file as a gzip file or a bz2 file.
    if unsuccessful with either it opens it with the standard open
    command in text mode with universal newline support (i.e. all
    variations on \\n yield \\n). it returns the resulting file handle.

    Raises ValueError if fileName does not name an existing regular file.
    """
    if not (os.path.exists(fileName) and os.path.isfile(fileName)):
        raise ValueError('file does not exist at %s' % fileName)

    import bz2
    import gzip
    import sys

    if _canReadFirstLine(gzip.GzipFile, fileName):
        return gzip.GzipFile(fileName)

    if _canReadFirstLine(bz2.BZ2File, fileName):
        return bz2.BZ2File(fileName)

    # Text mode on Python 3 already has universal newlines, and the 'U'
    # flag was removed in Python 3.11; Python 2 still needs it.
    if sys.version_info[0] >= 3:
        return open(fileName, 'r')
    return open(fileName, 'U')
781