1
2 __docformat__ = "restructuredtext en"
3 import random
4 from PyML.containers import parsers
5
7 """
8 A class that holds the labels of a dataset.
9
10 Attributes::
11
12 L - labels provided by the user (strings)
13 Y - internal label representation - an integer from 0 to numClasses - 1
14 for multilabel problems each pattern has a list of integer labels
15 patternID - a list of the ids of each pattern
16 classDict - a mapping from L to Y
17 classLabels - a list providing the name of class i
18 classSize - a list with the number of patterns in each class
19 numClasses - the number of classes in the data
20 """
21
def __init__(self, arg=None, **args):
    """Build a label container from a file, another Labels object, or a list.

    :Parameters:
      - `arg` - a file name from which to read labels, an object of the
        same class (copy construction), or a list of labels.  `None` or an
        empty list gives an unlabeled container.

    :Keywords:
      - `patternID` - a list of pattern IDs; consulted only when `arg` is a
        list or `None` (the file and copy paths supply their own IDs)
      - `patterns` - in case of copy construction, which patterns to copy
      - `numericLabels` - a Boolean, indicating whether the labels are
        class labels or numeric values (class labels by default).
      - `positiveClass` - for a two class problem, the identity of the
        positive class.  If the labels are '+1' and '-1' or '1' and '-1',
        the positive class is detected automatically.
      - `classLabels` - a list of class names, stored as `self.classLabels`
        and used by `processLabels` in place of names derived from the data
      - `forgetClassLabels` - when using copy construction the default
        behavior is to return a Labels object that remembers the set of
        classes that the original object had, even if some classes are no
        longer represented.  This keyword allows you to change this
        behavior, so that the classes of the original object are forgotten.
    """
    self.L = None
    self.numericLabels = args.get('numericLabels', False)
    self.positiveClass = args.get('positiveClass')
    if 'classLabels' in args:
        self.classLabels = args['classLabels']

    if isinstance(arg, str):
        # A string is a file name; the parser is told to expect csv.
        args['hint'] = 'csv'
        p = parsers.parserDispatcher(arg, **args)
        L, self.patternID = p.readLabels()
    elif arg.__class__ == self.__class__:
        L, self.patternID = self.copy(arg, **args)
    else:
        L = list(arg) if arg is not None else None
        if not L:
            # An empty label list means "unlabeled", same as None.
            L = None
        if 'patternID' in args:
            self.patternID = args['patternID']
        elif L is not None:
            # A real list (not a lazy range) so extend() can append to it.
            self.patternID = list(range(len(L)))
        else:
            # No labels and no IDs: previously crashed on len(None).
            self.patternID = []

    if L is not None:
        self.processLabels(L)
73
def copy(self, other, **args):
    """Copy label settings from `other` and return the selected labels and IDs.

    :Parameters:
      - `other` - the labels object to copy from

    :Keywords:
      - `patterns` - indices of the patterns to copy (all of them by default)
      - `forgetClassLabels` - if true, the class names of `other` are not
        carried over to this object

    Returns a tuple `(labels, patternIDs)`; `labels` is `None` when `other`
    is unlabeled.
    """
    forget = args.get('forgetClassLabels', False)
    self.numericLabels = other.numericLabels

    # Class names are only remembered for labeled, non-numeric data.
    remember = other.L is not None and not self.numericLabels and not forget
    if remember:
        self.numClasses = other.numClasses
        self.classLabels = other.classLabels[:]

    if 'patterns' in args:
        selected = args['patterns']
    else:
        selected = range(len(other))

    if other.L is None:
        labels = None
    else:
        labels = [other.L[idx] for idx in selected]
    ids = [other.patternID[idx] for idx in selected]
    return (labels, ids)
95
96
def extend(self, other, patterns=None):
    """Append labels and pattern IDs taken from another labels object.

    :Parameters:
      - `other` - the labels object to take patterns from
      - `patterns` - indices into `other` to add (all of them by default)

    The labels are re-processed afterwards when this object is labeled.
    """
    if patterns is None:
        patterns = range(len(other))

    labeled = self.L is not None
    has_ids = self.patternID is not None

    for idx in patterns:
        if labeled:
            self.L.append(other.L[idx])
        if has_ids:
            self.patternID.append(other.patternID[idx])

    if labeled:
        self.processLabels(self.L)
110
111
113
def processLabels(self, L, **args):
    """Convert user labels into the internal representation.

    :Parameters:
      - `L` - a non-empty list of labels

    :Keywords:
      - `forgetClassLabels` - if true, ignore any class names already
        stored on this object and derive them from `L` again

    For numeric labels (`self.numericLabels` is set, or the first label is
    a float) only `Y` and `L` are set, both to the list of float values.
    Otherwise sets `L`, `Y`, `classSize`, `classLabels`, `classDict`,
    `classes` (pattern indices per class) and `numClasses`.

    Raises ValueError if `positiveClass` is not one of the two classes.
    """
    forget = args.get('forgetClassLabels', False)

    if self.numericLabels or isinstance(L[0], float):
        self.Y = [float(y) for y in L]
        self.L = self.Y
        return

    if not forget and hasattr(self, 'classLabels'):
        classLabels = self.classLabels
        # classLabels may have been supplied as a constructor keyword, in
        # which case numClasses was never set; derive it from the names.
        numClasses = getattr(self, 'numClasses', len(classLabels))
    else:
        classLabels = sorted(set(L))
        numClasses = len(classLabels)
        # NOTE(review): the ordering rules below are applied only to freshly
        # derived class names - confirm against the original layout.
        # Put the positive class at index 1 for +1/-1 style labelings.
        if classLabels == ['+1', '-1'] or classLabels == ['1', '-1']:
            classLabels.reverse()
        if self.positiveClass is not None and numClasses == 2:
            if self.positiveClass not in classLabels:
                raise ValueError('unrecognized positiveClass')
            if classLabels[1] != self.positiveClass:
                classLabels.reverse()
        # In a one-against-rest labeling 'rest' is class 0.
        if 'rest' in classLabels and numClasses == 2:
            if classLabels[1] == 'rest':
                classLabels.reverse()

    classDict = dict((name, index) for index, name in enumerate(classLabels))

    classSize = [0] * numClasses
    classes = [[] for _ in range(numClasses)]
    Y = []
    for i, label in enumerate(L):
        y = classDict[label]
        classSize[y] += 1
        Y.append(y)
        classes[y].append(i)

    self.L = L
    self.Y = Y
    self.classSize = classSize
    self.classLabels = classLabels
    self.classDict = classDict
    self.classes = classes
    self.numClasses = numClasses
164
def flip(self, patterns):
    """Switch the class of the given patterns in a two class labeling.

    :Parameters:
      - `patterns` - indices of the patterns whose label is flipped

    Raises ValueError if the labeling does not have exactly two classes;
    this includes numeric labelings, which never set `numClasses`.
    """
    if getattr(self, 'numClasses', None) != 2:
        raise ValueError('not a two class labeling')
    for p in patterns:
        # Y[p] is 0 or 1, so (Y[p] + 1) % 2 is the index of the other class.
        self.L[p] = self.classLabels[(self.Y[p] + 1) % 2]
    self.processLabels(self.L)
172
174
175 return len(self.patternID)
176
178
179 rep = ''
180 if self.L is not None and type(self.L[0]) == type('') :
181 rep += 'class Label / Size \n'
182 for i in range(self.numClasses) :
183 rep += ' %s : %d\n' % (self.classLabels[i],self.classSize[i])
184
185 return rep
186
188
189 if self.L is None :
190 return False
191 else :
192 return True
193
def save(self, fileName, delim='\t'):
    """Write the pattern IDs, and the labels if present, to a text file.

    :Parameters:
      - `fileName` - the file to write (overwritten if it exists)
      - `delim` - the separator between pattern ID and label (tab by default)

    One line is written per pattern: the ID followed, for labeled data, by
    `delim` and the label.
    """
    fileHandle = open(fileName, 'w')
    try:
        for i in range(len(self)):
            # IDs default to integers, so convert before concatenating.
            line = str(self.patternID[i])
            if self.L is not None:
                line += delim + str(self.L[i])
            fileHandle.write(line + '\n')
    finally:
        # Close even if a write fails so the handle is not leaked.
        fileHandle.close()
202
204
205 for i in range(len(self.L)) :
206 self.L[i] = string.join(self.L[i], ";")
207
208
def mergeClasses(self, classList, newLabel=None):
    """Merge a list of classes into a new class.

    :Parameters:
      - `classList` - a list of classes to merge; can either provide the
        names of the classes or the index.
      - `newLabel` - the name of the new class (if not given then the label
        is formed by concatenating the names of the merged classes)

    The labels are modified in place and then re-processed with the old
    class names forgotten.
    """
    if isinstance(classList[0], int):
        # Indices were given; translate them to class names.
        classList = [self.classLabels[label] for label in classList]

    if newLabel is None:
        try:
            newLabel = "+".join(classList)
        except TypeError:
            # Class names that are not strings cannot be joined.
            newLabel = str(classList)

    for classLabel in classList:
        for p in self.classes[self.classDict[classLabel]]:
            self.L[p] = newLabel

    self.processLabels(self.L, forgetClassLabels=True)
233
235
def oneAgainstRest(self, classLabels, className=None):
    """Create a one-against-the-rest labeling.

    :Parameters:
      - `classLabels` - a single class name or index, or a list of class
        names or indices
      - `className` - if given, the new name given to the class (by default
        the selected class names joined with '+')

    Patterns of the selected classes get the label `className`; all other
    patterns get the label "rest".  The new labels are then processed with
    the old class names forgotten.
    """
    if isinstance(classLabels, (int, str)):
        classLabels = [classLabels]
    if isinstance(classLabels[0], int):
        # Indices were given; translate them to class names.
        classLabels = [self.classLabels[label] for label in classLabels]

    if className is None:
        try:
            className = '+'.join(classLabels)
        except TypeError:
            # Class names that are not strings cannot be joined.
            className = str(classLabels)

    newL = []
    for label in self.L:
        if label in classLabels:
            newL.append(className)
        else:
            newL.append("rest")

    self.processLabels(newL, forgetClassLabels=True)
264
265
267 """Merge a list of classes into a new class.
268
269 :Parameters:
270 - `data` - a dataset container
271 - `classList` - a list of classes to merge; can either provide the
272 names of the classes or the index.
273 - `newLabel` - the name of the new class (if not given then the label
274 is formed by concatenating the names of the merged classes)
275
276 calls Labels.mergeClasses and returns the dataset with the modified labels
277
278 """
279
280 data.labels.mergeClasses(classList, newLabel)
281 data.attachLabels(data.labels)
282
283 return data
284
285
287
288 """
289 creates a one-against-the-rest dataset/labels object
290
291 :Parameters:
292
293 - `data` - a dataset
294 - `classLabels` - a single class name, or a list of class names (string
295 or a list of strings)
296 - `className` - if given, the new name given to the class
297
298 Return value::
299
300 returns a dataset object where all class labels that are different
301 from the given class label/s are converted to a single class
302 """
303
304 data.labels.oneAgainstRest(classLabels, className)
305 data.attachLabels(data.labels)
306
307 return data
308
309
311 """shuffle the vector Y"""
312
313 Yrand = Y[:]
314 random.shuffle(Yrand)
315
316 return Yrand
317
319
320 patterns = [i for i in range(len(data.n))
321 if len(data.labels.L[i].split(';')) == 1]
322
323 return data.__class__(data, patterns = patterns)
324
325
327 """returns a dataset that contains the classes of d that contain
328 at least size patterns"""
329
330 patterns = []
331 for i in range(len(data)) :
332 if data.labels.classSize[data.labels.Y[i]] >= size :
333 patterns.append(i)
334
335 return d.__class__(d, patterns = patterns)
336