1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """
22 A parser for FASTA files.
23
24 Copyright (C) 2003, 2004 by BiRC -- Bioinformatics Research Center
25 University of Aarhus, Denmark
26 Contact: Thomas Mailund <mailund@birc.dk>
27 with changes by Asa Ben-Hur
28 """
29
30 from __future__ import generators
31 import os
32
33
35
def myopen(fileName):
    """
    Open a file for reading, transparently handling gzip compression.

    :Parameters:
      - `fileName` - path of the file to open

    Returns a `gzip.GzipFile` when a line can be read from the file as gzip
    data, and a plain file object (``open(fileName)``) otherwise.

    Raises ValueError if `fileName` does not name an existing regular file.
    """
    # isfile() is already False for paths that do not exist.
    if not os.path.isfile(fileName):
        raise ValueError('file does not exist at %s' % fileName)

    import gzip

    # Detect compression by trying to read one line through a throw-away
    # gzip handle; the handle is closed whether or not the read succeeds.
    probe = gzip.GzipFile(fileName)
    try:
        probe.readline()
        gzippedFile = True
    except Exception:
        # Any read failure means "not gzip".  Deliberately broad (the gzip
        # layer raises different errors across Python versions), but unlike
        # a bare except it lets KeyboardInterrupt/SystemExit through.
        gzippedFile = False
    finally:
        probe.close()

    if gzippedFile:
        return gzip.GzipFile(fileName)
    return open(fileName)
53
54
58
class FastaRecord(object):
    """One fasta entry: a header together with its sequence."""

    def __init__(self, header, sequence):
        """Build a record from `header` (stored without the '>' marker) and `sequence`."""
        self.sequence = sequence
        self.header = header

    def __str__(self):
        """Format the record as fasta text: '>' + header, a newline, the sequence, a newline."""
        pieces = ('>', self.header, '\n', self.sequence, '\n')
        return ''.join(pieces)
70
71
def _fasta_itr_from_file(file):
    """
    Provide an iteration through the fasta records in file.

    :Parameters:
      - `file` - an open file (or any iterable of text lines)

    Yields one FastaRecord per '>' header line.  The header is stored
    without the leading '>', and the sequence is the concatenation of the
    lines that follow it, up to the next header.

    Line endings ('\\n' or '\\r\\n') are stripped only when present, so a
    final line without a trailing newline keeps its last character.  Empty
    lines are skipped, and an empty input yields no records.

    Raises MalformedInput (on the first iteration step) if sequence data
    appears before any header line.
    """
    header = None      # header of the record being collected, None before the first '>'
    chunks = []        # sequence lines collected for that record

    for raw in file:
        line = raw.rstrip('\r\n')
        if not line:
            continue

        if line.startswith('>'):
            # A new header closes the record collected so far.
            if header is not None:
                yield FastaRecord(header, ''.join(chunks))
            header = line[1:]
            chunks = []
        elif header is None:
            raise MalformedInput()
        else:
            chunks.append(line)

    # Emit the record still being collected when the input ends.
    if header is not None:
        yield FastaRecord(header, ''.join(chunks))
95
96
def _fasta_itr_from_name(fname):
    """
    Provide an iteration through the fasta records in the file named fname.

    The file is opened with myopen and parsed with _fasta_itr_from_file.
    It is closed when the iteration finishes, fails, or the generator is
    closed or discarded before being exhausted.
    """
    f = myopen(fname)
    try:
        for rec in _fasta_itr_from_file(f):
            yield rec
    finally:
        f.close()
103
104
106 """Provide an iteration through the fasta records in file `src'.
107
108 Here `src' can be either a file object or the name of a file.
109 """
110 if type(src) == str :
111 return _fasta_itr_from_name(src)
112 elif type(src) == file :
113 return _fasta_itr_from_file(src)
114 else:
115 raise TypeError
116
118 "Return the first record in itr whose header equals name, or None if no record matches."
# Leading and trailing whitespace is ignored on both the requested name and each header.
119 x = name.strip()
120 for rec in itr:
121 if rec.header.strip() == x:
122 return rec
# itr was exhausted without finding a matching header.
123 return None
124
126 "An iterator through a sequence of fasta records; it wraps the record iteration built by _fasta_itr."
127
129 "Create an iterator through the records in src; src is passed unchanged to _fasta_itr."
130
# Keep the underlying record iteration; records are fetched from it on demand.
131 self.__itr = _fasta_itr(src)
132
136
138
# Return the next record of the wrapped iteration; the StopIteration it raises
# when the records run out propagates to the caller.
139 return self.__itr.next()
140
144
146
147 """Provide an iteration through the fasta records in 'src', from
148 record 'first' up to, but not including, record 'last'.
149
150 """
def __init__(self, src, first, last=None):
    """
    Set up iteration over a range of the records in a fasta source.

    :Parameters:
      - `src` - the fasta file/file handle.  file can be gzipped.
      - `first` - the first record to be output (either its index in the
        file, counting from 0, or its identifier)
      - `last` - where the output stops (index in the file or identifier,
        the same kind of value as `first`).  The record designated by
        `last` is itself not output.  With None (the default) the
        iteration runs to the end of the file.

    Raises ValueError if `first` is neither an int nor a string.
    """
    self.__itr = _fasta_itr(src)
    self.__first = first
    self.__last = last

    # Exact type tests (not isinstance) on purpose: `next` picks index vs.
    # identifier mode with the same exact test on `first`.
    if type(first) is int:
        # `next` increments the counter before comparing it with `last`.
        # When no records have to be skipped (first == 0) the counter must
        # therefore start one below zero, so that after each read it holds
        # the index of the record just read.  Starting at 0 made a slice
        # beginning at record 0 end one record too early.
        if first == 0:
            self.__current = -1
        else:
            self.__current = 0
    elif type(first) is str:
        self.__current = None
    else:
        raise ValueError('bad first')

    # Nothing has to be skipped when the slice starts at the very first
    # record (index 0) or when an empty identifier was given.
    self.__foundFirst = (first == 0 or first == '')
172
173
177
179
# Return the next record of the slice.
#
# While the first record has not been located yet, this scans forward through
# the underlying iteration until the record designated by __first is reached
# and returns it; ValueError is raised if the records run out before that.
180 if not self.__foundFirst :
181 for rec in self.__itr :
182 if type(self.__first) == int :
# Index mode: __current counts the records skipped so far, i.e. it is the
# index of the record being examined.
183 if self.__first == self.__current :
184 self.__foundFirst = True
185 break
186 self.__current += 1
187 else :
# Identifier mode: the whole header must equal __first.
188 if rec.header == self.__first :
189 self.__foundFirst = True
190 break
191 self.__current = rec.header
192 if not self.__foundFirst :
193 raise ValueError, 'did not find first record'
194 return rec
195
# First record already located: fetch the following record.  StopIteration
# from the underlying iteration propagates when the records are exhausted.
196 rec = self.__itr.next()
197
# An end marker was given.  Its kind (index or identifier) is decided by the
# type of __first, not by the type of __last.
198 if self.__last is not None :
199 if type(self.__first) == int :
# The counter is advanced before the comparison, and the record just read is
# dropped when it matches, so the record designated by __last is not returned.
200 self.__current += 1
201 if self.__current == self.__last :
202 raise StopIteration
203 else :
204 if rec.header == self.__last :
205 raise StopIteration
206 self.__current = rec.header
207
208 return rec
209
210
214
def save(self, fileName):
    """
    Write the remaining records of this iterator to a file in fasta format.

    :Parameters:
      - `fileName` - name of the output file; it is opened in 'w' mode, so
        an existing file is overwritten

    Each record is written as ``str(record)``.
    """
    outfile = open(fileName, 'w')
    try:
        for record in self:
            outfile.write(str(record))
    finally:
        # Close explicitly so that buffered data reaches the disk even if
        # iterating or writing fails part-way through.
        outfile.close()
220
222 "Return the record in src with the given name, found by indexing a fasta_itr over src with name."
223
# NOTE(review): the lookup itself happens in fasta_itr's item access, which is
# not visible in this listing -- presumably a scan for a matching header; confirm.
224 return fasta_itr(src)[name]
225
226
def fasta_count(src):
    """
    Return how many records a fasta source contains.

    `src` is handed to fasta_itr unchanged, and every record is read once.
    """
    return sum(1 for _ in fasta_itr(src))
237
238
def fasta_split(fileName, num_files, directory=None):
    """
    split a fasta file into a given number of files
    the resulting files are named by adding a number to the provided file name
    (``base.1.ext``, ``base.2.ext``, ...).

    Records are distributed in file order, ceil(records / num_files) per
    output file, so every file except possibly the last holds the same
    number of records.  If the input has fewer records than `num_files`,
    only one file per record is written.

    :Parameters:
      - `fileName` - the fasta file to split
      - `num_files` - the number of files to split into (at least 1)
      - `directory` - the directory into which to write the files; by
        default they are written next to `fileName`

    Raises ValueError if `num_files` is smaller than 1.
    """
    if num_files < 1:
        raise ValueError('num_files must be at least 1')

    num_records = fasta_count(fileName)
    # Progress output kept from the original implementation.
    print(num_records)
    if directory is None:
        base, ext = os.path.splitext(fileName)
    else:
        base, ext = os.path.splitext(os.path.basename(fileName))
        base = os.path.join(directory, base)
    print(base)

    # Ceiling division.  The previous "num_records / num_files + 1" was one
    # too large whenever the division was exact, which produced fewer files
    # than requested (e.g. 10 records into 5 files gave only 4 files).
    recs_per_file = max(1, (num_records + num_files - 1) // num_files)

    outfile = None
    file_num = 1
    try:
        for rec_num, rec in enumerate(fasta_itr(fileName)):
            if rec_num % recs_per_file == 0:
                # Start the next output file, closing the one just filled.
                if outfile is not None:
                    outfile.close()
                outfile = open(base + '.' + str(file_num) + ext, 'w')
                file_num += 1
            outfile.write(str(rec))
    finally:
        # Make sure the last file is flushed and closed, also on errors.
        if outfile is not None:
            outfile.close()
268
270
# Copy to outfileName the records of infileName whose header is one of ids.
#
# ids is used as-is when it is a dict; any other value is first converted with
# misc.list2dict (project helper -- presumably builds a dict keyed by the
# given ids; confirm against misc).
271 if type(ids) != type({}) :
272 import misc
273 ids = misc.list2dict(ids)
274
# NOTE(review): outfile is opened here but never explicitly closed in this block.
275 outfile = open(outfileName, 'w')
276 for rec in fasta_itr(infileName) :
# The complete header string must be a key of ids for the record to be kept.
277 if rec.header in ids :
278 outfile.write(str(rec))
279
281
# Only the first record of fastaFile is inspected.
282 rec = fasta_itr(fastaFile).next()
# Report '|' when that record's header contains a '|' character, None otherwise.
283 if rec.header.find('|') >= 0 :
284 return '|'
285 else :
286 return None
287
288
if __name__ == '__main__':
    # Small demonstration, run as a script with one argument (a fasta file):
    # print every record, then the slice of records with indices 1 up to 3.
    import sys

    if len(sys.argv) != 2:
        # Diagnostics belong on stderr so they do not mix with record output.
        sys.stderr.write("missing file name\n")
        sys.exit(2)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('iterating through all sequences in input file')
    for rec in fasta_itr(sys.argv[1]):
        print(rec)

    print('iterating through input, from the second sequence')
    for rec in fasta_slice(sys.argv[1], 1, 3):
        print(rec)
303