1
2 import os
3 import string
4 import numpy
5 import math
6 import misc
7 import tempfile
8
10
11 import cPickle
12
13 if not os.path.exists(fileName) :
14 raise ValueError, "file does not exist at %s" % fileName
15
16 fileHandle = open(fileName, "r")
17
18 u = cPickle.Unpickler(fileHandle)
19 object = u.load()
20 fileHandle.close()
21 return object
22
24
25 import gnosis.xml.pickle as xml_pickle
26
27 if not os.path.exists(fileName) :
28 raise ValueError, "file does not exist at %s" % fileName
29
30 file = open(fileName)
31
32 return xml_pickle.load(file)
33
35
36 import gnosis.xml.pickle as xml_pickle
37
38 file = open(fileName, 'w')
39 xml_pickle.dump(object, file)
40 file.close()
41
def save(object, fileName, binary = 1) :
    '''pickle an object into a file

    object - the object to pickle
    fileName - path of the file to write; an existing file is overwritten
    binary - kept for backward compatibility; it has no effect on the
    output, which is always written with pickle protocol 0
    '''

    # cPickle only exists in python 2; python 3 folded it into pickle
    try :
        import cPickle as pickle
    except ImportError :
        import pickle

    # a pickle is a byte stream, so the file is opened in binary mode;
    # the with-block closes the file even when pickling fails
    with open(fileName, "wb") as fileHandle :
        pickle.Pickler(fileHandle, 0).dump(object)
50
51
53 '''write an array to a file in csv (comma-delimited) format
54 '''
55
56 fileHandle = open(fileName,"w")
57
58 if type(a[0]) == type('') or type(a[0]) == type(1) or type(a[0]) == type(1.0) \
59 or (type(a).__name__ == 'array' and len(numpy.shape(a)) == 1) :
60 for i in range(len(a)) :
61 fileHandle.write(str(a[i]) + '\n')
62 fileHandle.close()
63 return
64
65 for i in range(len(a)) :
66 out = ""
67 for j in range(len(a[i])) :
68 out += str(a[i][j])
69 if j < len(a[i]) - 1 :
70 out = out + delim
71 out += '\n'
72 fileHandle.write(out)
73
74 fileHandle.close()
75
def csvread(fileName, delim = ',') :
    '''read a character array from a file in csv format

    fileName - path of the file to read
    delim - field separator; a single space means "split on any run of
    whitespace"

    returns a list with one list of strings per column (the number of
    columns is taken from the first line), or a flat list of strings
    when the file has a single column.  an empty file gives an empty list.
    a line with a different number of fields is reported on stdout as
    'badline'; extra fields are ignored and lines with too few fields
    are skipped.
    '''

    if delim == ' ' :
        delim = None

    data = []
    dim = None
    with open(fileName, "r") as fileHandle :
        for line in fileHandle :
            # drop the line terminator only; the last line of a file
            # may not have one
            if line.endswith('\n') :
                line = line[:-1]
            fields = line.split(delim)
            if dim is None :
                # the first line fixes the number of columns
                dim = len(fields)
                data = [[] for column in range(dim)]
            if len(fields) != dim :
                print('badline: ' + line)
                if len(fields) < dim :
                    continue
            for i in range(dim) :
                data[i].append(fields[i])

    if len(data) == 1 :
        data = data[0]

    return data
103
105 '''read an array from a file in csv format into a numpy array'''
106
107 return dlmreadArray(fileName, ',', type)
108
109
def dlmreadArray(fileName, delimiter = ' ', type = 'float') :
    '''read an array from a delimited file into a numpy array.
    all lines are assumed to have the same number of columns

    fileName - path of the file to read
    delimiter - field separator; a single space means "split on any run
    of whitespace"
    type - 'float' or 'int', the element type of the returned array

    lines whose first character is '#' or '%' are comments; comment lines
    and lines that contain only whitespace are skipped.  the number of
    columns is taken from the first data line; a line with a different
    number of fields is reported on stdout as 'badline'.

    returns a one dimensional array when the file has a single column and
    an array of shape (rows, columns) otherwise; a file without data
    lines gives an empty one dimensional array.
    raises ValueError when type is neither 'float' nor 'int'.
    '''

    # the builtin types stand in for the numpy.float_ / numpy.int aliases
    # that recent numpy releases no longer provide
    if type == 'float' :
        arrayType = float
    elif type == 'int' :
        arrayType = int
    else :
        raise ValueError('Wrong type of array')

    if delimiter == ' ' :
        delimiter = None

    commentChar = ('#', '%')
    rows = []
    d = None
    with open(fileName, "r") as fileHandle :
        for line in fileHandle :
            if line[0] in commentChar or not line.strip() :
                continue
            fields = line.split(delimiter)
            if d is None :
                d = len(fields)
            if len(fields) != d :
                print('badline: ' + line)
            # every field is parsed as a float, also for 'int' arrays;
            # the conversion below truncates towards zero
            rows.append([float(fields[j]) for j in range(d)])

    if d is None :
        return numpy.zeros(0, arrayType)

    X = numpy.array(rows, dtype = float).astype(arrayType)
    if d == 1 :
        X = X.reshape(len(rows))

    return X
156
157
159 '''Output a table out of a list of lists; elements number i
160 of each list form row i of the table
161 Usage :
162 tableWrite((list1,list2...)) - write table to stdout
163 tableWrite((list1,list2...), fileName) - write table to file
164 '''
165
166
167 import sys
168
169 if fileName is not None :
170 fileHandle = open(fileName, "w")
171 else :
172 fileHandle = sys.stdout
173
174 if 'headings' in args :
175 headings = args['headings']
176 else :
177 headings = None
178
179 d = len(t)
180 n = len(t[0])
181 print d,n
182 maxlen=numpy.zeros(d)
183
184 if headings != None :
185 assert len(headings) == d
186
187 for i in range(n) :
188 for j in range(d) :
189 if type(t[j][i]) == type(1.0) :
190 s = "%f" % t[j][i]
191 else :
192 s = str(t[j][i])
193 if len(s) > maxlen[j] :
194 maxlen[j] = len(s)
195
196
197 if headings != None :
198 for j in range(d) :
199 if len(headings[j]) > maxlen[j] :
200 maxlen[j] = len(headings[j])
201 print >> fileHandle, "%s" % string.center(headings[j], maxlen[j]),
202 print >> fileHandle
203
204 for i in range(n) :
205 for j in range(d) :
206
207 if type(t[j][i]) == type("") :
208 print >> fileHandle, "%s" % string.ljust(t[j][i], maxlen[j]),
209 elif type(t[j][i]) == type(1) :
210 print >> fileHandle, "%s" % string.rjust(str(t[j][i]), maxlen[j]),
211 elif type(t[j][i]) == type(1.0) :
212 s = "%f" % t[j][i]
213 print >> fileHandle, "%s" % string.rjust(s, maxlen[j]),
214 else :
215 print >> fileHandle, "%s" % ' ' * maxlen[j],
216 print "unknown data type"
217 print >> fileHandle
218
def dlmWrite(t, fileName, delim = ',') :
    '''write a list of lists to a delimited file; element number i of
    each list forms row i of the file

    t - a sequence of equal length sequences (the columns), or a flat
    sequence of values, which is written one value per line
    fileName - path of the file to write; an existing file is overwritten
    delim - the separator written between the fields of a row

    the number of rows is the length of the first column.
    '''

    # the with-block closes (and flushes) the file on every path
    with open(fileName, "w") as fileHandle :
        try :
            n = len(t[0])
        except (TypeError, IndexError) :
            # t[0] has no length (flat sequence) or t is empty
            for item in t :
                fileHandle.write(str(item) + '\n')
            return

        d = len(t)
        for i in range(n) :
            row = [str(t[j][i]) for j in range(d)]
            fileHandle.write(delim.join(row) + '\n')
244
245
247 '''write a dictionary into a file as a set of pairs of key,value
248 '''
249
250 file = open(fileName, 'w')
251
252 for k in dict.keys() :
253 file.write(str(k) + delim + str(dict[k]) + '\n')
254
255 file.close()
256
258
259 newDict = {}
260 for l in list :
261 if l in dict :
262 newDict[l] = dict[l]
263
264 return newDict
265
266
268 """Horizontal concatenation of of two delimited files into a third file
269 the delimiter is a space by default
270 """
271
272
273 file1 = open(fileName1)
274 file2 = open(fileName2)
275
276 file3 = open(fileName3, 'w')
277
278 while 1 :
279 line1 = file1.readline()
280 line2 = file2.readline()
281
282 if len(line1) == 0 | len(line2) == 0 :
283 break
284
285 file3.write(line1[:-1] + delim + line2)
286
287 file1.close()
288 file2.close()
289 file3.close()
290
291
293
294 '''keep the patterns in the source dataset that appear in the intersect
295 dataset
296 '''
297
298 idDict = {}
299 datasetIntersect = open(datasetIntersectName)
300 for line in datasetIntersect :
301 idDict[line[:line.find(',')]] = 1
302
303 datasetIntersect.close()
304
305 datasetSource = open(datasetSourceName)
306 newDataset = open(newDatasetName,'w')
307
308 for line in datasetSource :
309 id = line[:line.find(',')]
310 if id in idDict :
311 newDataset.write(line)
312
313 datasetSource.close()
314 newDataset.close()
315
316
def datasetUnion(datasetName1, datasetName2, newDatasetName) :
    '''merge two datasets line by line into a new file.
    assumes that the features in the two datasets have
    different names!

    datasetName1, datasetName2 - paths of the two input files
    newDatasetName - path of the file to write

    each output line is a line of the first dataset followed by a space
    and the whitespace separated tokens of the matching line of the
    second dataset, without its first token.  the first dataset drives
    the loop: when the second one is shorter, the remaining lines are
    written with nothing appended after the space.
    '''

    with open(datasetName1) as dataset1, \
         open(datasetName2) as dataset2, \
         open(newDatasetName, 'w') as newDataset :
        for line1 in dataset1 :
            line2 = dataset2.readline()
            tokens = line2.split()[1:]
            # strip only the newline so that a last line without one
            # keeps its final character
            newDataset.write(line1.rstrip('\n') + ' ')
            newDataset.write(' '.join(tokens) + '\n')
331
332
def makeDataSet(XfileName, labelsFileName, datasetFileName) :
    '''make a sparse format data file out of an unlabeled sparse data file
    and a labels file (comma delimited: id,label)

    XfileName - unlabeled data file; each line is 'id,rest of line'
    labelsFileName - labels file; each line is 'id,label' or 'id label'
    datasetFileName - output file; receives 'id,label rest of line' for
    every line of the data file whose id has a label

    blank lines in the labels file are skipped.  the number of labels
    read is printed to stdout.
    raises ValueError when either input file does not exist.
    '''

    if not os.path.exists(XfileName) :
        raise ValueError("Xfile does not exist at %s" % XfileName)

    if not os.path.exists(labelsFileName) :
        raise ValueError("labels file does not exist at %s" % labelsFileName)

    labels = {}
    with open(labelsFileName) as labelsFile :
        for line in labelsFile :
            # strip only the newline: the last line may not have one
            line = line.rstrip('\n')
            if not line.strip() :
                continue
            tokens = line.split()
            if len(tokens) != 2 :
                tokens = line.split(',')
            (patternID, label) = tokens
            labels[patternID] = label

    print(len(labels))

    with open(XfileName) as Xfile, open(datasetFileName, 'w') as datasetFile :
        for line in Xfile :
            # split on the first comma only, so that commas in the rest
            # of the line do not break the unpacking
            (patternID, restOfLine) = line.split(',', 1)
            if patternID in labels :
                datasetFile.write(patternID + ',' + labels[patternID] +
                                  ' ' + restOfLine)
368
369
419
420
421
456
457
461
462 '''Extract from a delimited file a list of fields to another delimited file
463 Input:
464 inFile - file name with the input data
465 outFields - a list of fields to extract from inFile
466 outFile - output file
467 convert - whether to convert numeric inputs from strings
468 inDelim - the delimiter in the input file
469 outDelim - the delimiter for the output file
470 '''
471
472
473 inFileHandle = open(inFile)
474 if outFile is not None :
475 outFileHandle = open(outFile, 'w')
476 convert = False
477
478 if outFile is None :
479 data = []
480 else :
481 data = None
482
483 if type(outFields) == type(1) :
484 outFields = [outFields]
485
486 if filterFile is not None :
487 filterDict = {}
488 filter = open(filterFile)
489 for line in filter :
490 filterDict[line[:-1]] = 1
491 filterFile.close()
492
493 for line in inFileHandle :
494 line = line[:-1]
495
496 fields = line.split(inDelim)
497
498 if filterFile is None or fields[filterField] in filterDict :
499
500 out = []
501 for i in outFields :
502 if convert :
503 try :
504 out.append(float(fields[i]))
505 except :
506 out.append(fields[i])
507 else :
508 out.append(fields[i])
509 if outFile is None :
510 data.append(out)
511 else :
512 outFileHandle.write(outDelim.join(out) + '\n')
513
514 inFileHandle.close()
515 return data
516
518
519 if not os.path.exists(fileName) :
520 raise ValueError, "file does not exist at %s" % fileName
521
522 file = open(fileName)
523
524 numLines = 0
525 for line in file:
526 numLines += 1
527
528 file.close()
529
530 return numLines
531
532
534 '''extract the lines given by a list of line numbers in the file'''
535
536 file = open(fileName)
537
538 lineDict = misc.list2dict(lines)
539 lineNum = 1
540 for line in file :
541 if lineNum in lineDict :
542 print line[:-1]
543 lineNum += 1
544
545 file.close()
546
547
549
550 numLines = countLines(fileName)
551
552 if not os.path.exists(fileName) :
553 raise ValueError, "file does not exist at %s" % fileName
554
555 file = open(fileName)
556
557 numSplit = int(math.floor(numLines / numFiles))
558
559 lineNum = 0
560 fileNum = 0
561 for line in file :
562 if math.fmod(lineNum, numSplit) == 0 :
563 fileNum += 1
564 try :
565 outFile.close()
566 except :
567 pass
568 outFile = open('split' + str(fileNum) + fileName, 'w')
569
570 outFile.write(line)
571 lineNum += 1
572
573 file.close()
574
575
def log(message, fileName = 'progress.log', openMode = 'a') :
    '''write a message to a log file

    message - the text to write; no newline is added
    fileName - path of the log file
    openMode - mode used to open the file: 'a' (the default) appends to
    the file, 'w' overwrites it
    '''

    with open(fileName, openMode) as file :
        file.write(message)
581
583
584 if not os.path.exists(fileName) :
585 return 1
586 file = open(fileName)
587 if len(file.readlines()) == 0 :
588 file.close()
589 return 1
590 file.close()
591 return 0
592
593
595 '''guess the delimiter of a delimited file according to the first line
596 in the file'''
597
598 delimiters = [',', ' ', ';', '\t']
599
600 file = open(fileName)
601 line = file.readline()
602 file.close()
603
604 maxTokens = 0
605 maxDelim = ''
606 for delim in delimiters :
607 splitLen = len(misc.split(line, delim))
608 if splitLen > maxTokens :
609 maxDelim = delim
610 maxTokens = splitLen
611
612 return maxDelim
613
614
616 """A Python handle that adds functionality for saving lines.
617 Saves lines in a LIFO fashion.
618 Added methods:
619 saveline Save a line to be returned next time.
620 peekline Peek at the next line without consuming it.
621 """
623 self._handle = handle
624 self._saved = []
625
627 lines = self._saved + apply(self._handle.readlines, args, keywds)
628 self._saved = []
629 return lines
630
632 if self._saved:
633 line = self._saved.pop(0)
634 else:
635 line = apply(self._handle.readline, args, keywds)
636 return line
637
def read(self, size=-1):
    """Read up to size characters, consuming the saved lines first.

    Saved lines are returned before anything is read from the wrapped
    handle.  A saved line that is only partly consumed keeps its
    remainder at the front of the saved list.

    size -- maximum number of characters to return; None or any
            negative value means "everything that is left".
    """
    if size is None or size < 0:
        # Unbounded read: hand back every saved line, then the rest of
        # the handle.  -1 is passed on because not every handle
        # accepts None as a size.
        pieces = self._saved[:]
        self._saved[:] = []
        pieces.append(self._handle.read(-1))
        return "".join(pieces)

    pieces = []
    while size > 0 and self._saved:
        head = self._saved[0]
        if len(head) <= size:
            size = size - len(head)
            pieces.append(self._saved.pop(0))
        else:
            pieces.append(head[:size])
            self._saved[0] = head[size:]
            size = 0
    # size is now what the saved lines could not supply (possibly 0)
    pieces.append(self._handle.read(size))
    return "".join(pieces)
653
655 if line:
656 self._saved = [line] + self._saved
657
659 if self._saved:
660 line = self._saved[0]
661 else:
662 line = self._handle.readline()
663 self.saveline(line)
664 return line
665
667 lengths = map(len, self._saved)
668 sum = reduce(lambda x, y: x+y, lengths, 0)
669 return self._handle.tell() - sum
670
def seek(self, *args):
    """Move the wrapped handle to a new position.

    The saved lines are discarded, since they belong to the old
    position.  All arguments are passed on to the seek method of the
    wrapped handle.
    """
    self._saved = []
    # argument unpacking replaces the apply() builtin, which python 3
    # no longer provides
    self._handle.seek(*args)
674
676 return getattr(self._handle, attr)
677
678
680 '''remove all files that have size 0 from a directory'''
681
682 f = os.popen3("ls -l " + directory)
683 lines = f[1].readlines()
684
685 for line in lines[1:] :
686 if int(line.split()[4]) == 0 :
687 fileName = line.split()[-1]
688 print 'removing ',fileName
689 os.remove(os.path.join(directory, fileName))
690
def selectLines(infile, outfile, lines, keepLines = 1) :
    '''write to outfile the lines in infile whose line number is in the
    given list of line numbers

    infile - path of the file to read
    outfile - path of the file to write; an existing file is overwritten
    lines - the line numbers to select; the first line of the file is
    line number 1
    keepLines - when equal to 1 (the default) the listed lines are
    written; for any other value the listed lines are dropped and all
    the other lines are written
    '''

    # a set gives constant time membership tests
    selected = set(lines)
    keep = (keepLines == 1)

    with open(infile) as infileHandle, open(outfile, 'w') as outfileHandle :
        lineNum = 0
        for line in infileHandle :
            lineNum += 1
            if (lineNum in selected) == keep :
                outfileHandle.write(line)
708
710 """convert \r to \n (windows file to linux file)"""
711
712 rename = False
713 if outFile is None :
714 rename = True
715 outFile = tempfile.mktemp()
716 os.system("tr -d '\r' < " + inFile + " > " + outFile)
717 if rename :
718 os.rename(outFile, inFile)
719
def concatByNum(filePattern, outfileName, directory = '.') :
    """
    concatenate numbered files into a single file, in numerical order

    filePattern -- a regular expression that looks like: start\\d+.dat
    outfileName -- path of the file to write
    directory -- the directory that holds the numbered files

    The files in directory whose whole name matches filePattern are
    counted; when there are N of them, the files obtained by replacing
    the \\d+ of the pattern with 1, 2, ..., N are copied to the output
    file in that order.  The files are therefore assumed to be numbered
    consecutively from 1.
    """

    import re

    # the whole name has to match, so that e.g. a 'start1.dat.bak' is not
    # counted as one of the numbered files
    pattern = re.compile('(?:' + filePattern + r')\Z')
    numFiles = 0
    for fileName in os.listdir(directory) :
        if pattern.match(fileName) is not None :
            numFiles += 1

    with open(outfileName, 'w') as outfile :
        for i in range(1, numFiles + 1) :
            fileName = filePattern.replace(r'\d+', str(i))
            with open(os.path.join(directory, fileName)) as fileHandle :
                for line in fileHandle :
                    outfile.write(line)
739
740
742 """
743 returns a file handle to a file which is possibly compressed
744 using either gzip or bz2
745
746 myopen tries to open the file as a gzip file or a bz2 file.
747 if unsuccessful with either it opens it with the standard open
748 command in 'U' that uses universal newline support (i.e. all
749 variations on \n yield \n. it returns the resulting file handle.
750 """
751
752
753 if not ( os.path.exists(fileName) and os.path.isfile(fileName) ):
754 raise ValueError, 'file does not exist at %s' % fileName
755
756 import gzip
757 fileHandle = gzip.GzipFile(fileName)
758 gzippedFile = True
759 try :
760 line = fileHandle.readline()
761 fileHandle.close()
762 except :
763 gzippedFile = False
764
765 if gzippedFile :
766 return gzip.GzipFile(fileName)
767
768 import bz2
769 fileHandle = bz2.BZ2File(fileName)
770 bzippedFile = True
771 try :
772 line = fileHandle.readline()
773 fileHandle.close()
774 except :
775 bzippedFile = False
776
777 if bzippedFile :
778 return bz2.BZ2File(fileName)
779
780 return open(fileName, 'U')
781