1
2 from PyML.containers.labels import Labels
3 from PyML.containers.ext import csequencedata
4 from PyML.containers.baseDatasets import WrapperDataSet
5 from PyML.containers import ker
6 from PyML.containers.vectorDatasets import SparseDataSet
7
8 -class SequenceData (WrapperDataSet, csequencedata.SequenceData) :
9
39
40
def copy(self, other, patterns, deepcopy):
    """Re-initialise this object's container from a selection of ``other``.

    other    -- the object handed to ``self.container.__init__`` as the
                source; only its ``len()`` is consulted here.
    patterns -- indices to take from ``other``; ``None`` selects every
                index ``0 .. len(other) - 1``.
    deepcopy -- accepted for interface compatibility; this method does
                not read it.
    """
    # Fall back to the full index range when no explicit selection is given.
    selected = range(len(other)) if patterns is None else patterns
    self.container.__init__(self, other, selected)
46
48
49 print 'reading from', fileName
50 from PyML.utils import fasta
51
52 headerHandler = fastaHeaderHandler
53 if 'headerHandler' in args :
54 headerHandler = args['headerHandler']
55 numPatterns = fasta.fasta_count(fileName)
56 self.container.__init__(self, numPatterns)
57
58 patternIDs = []
59 L = []
60 for record in fasta.fasta_itr(fileName) :
61 self.addPattern(record.sequence)
62 patternID, label = headerHandler(record.header)
63 patternIDs.append(patternID)
64 if label is not None :
65 L.append(label)
66
67 self.attachLabels(Labels(L, patternID = patternIDs, **args))
68
69
75
76
80
def save(self, fileName):
    """Write every sequence to ``fileName`` as FASTA-style records.

    For each index ``i`` in ``range(len(self))`` two lines are written:
    a header line ``'>' + self.labels.patternID[i]`` and then the
    sequence returned by ``self.getSequence(i)``.

    fileName -- path of the file to create or overwrite (opened with
                mode ``'w'``).

    The file is always closed before returning, even if a write or a
    sequence lookup raises; the exception itself is propagated.
    """
    fileHandle = open(fileName, 'w')
    # try/finally (rather than ``with``) keeps this valid on old Python 2
    # interpreters while still guaranteeing the handle is released.
    try:
        for seqid in range(len(self)):
            fileHandle.write('>' + self.labels.patternID[seqid] + '\n')
            fileHandle.write(self.getSequence(seqid) + '\n')
    finally:
        fileHandle.close()
89
91
92 kerneltype = 'PositionalKmer'
93 if stringKernel is not None :
94 k = stringKernel.duplicate()
95 else :
96 if 'stringKernel' in args :
97 kerneltype = args['stringKernel']
98 if kerneltype == 'PositionalKmer' :
99 k = ker.PositionalKmerDispatcher(**args)
100 else :
101 raise ValueError, 'unrecognized type of string kernel'
102
103 self.stringKernel = k
104
106
107 return header.split()[0], None
108
110
111
112 kmerList = []
113 for s in sequences :
114 kmers = {}
115 for i in range(len(s) - k + 1) :
116 kmer = s[i:i+k] + addon
117 if kmer not in kmers :
118 kmers[kmer] = 0
119 kmers[kmer] += 1.0
120 kmerList.append(kmers)
121
122 return kmerList
123
124
144