1
2 from PyML.containers import labels
3 from PyML.classifiers import svm
4 from PyML.containers import ker
5 from PyML.utils import myio, misc
6 from PyML.evaluators import assess
7
8 import numpy
9 import random
10
11 '''classes for performing feature selection'''
12
13 __docformat__ = "restructuredtext en"
14
15
17
18 '''API for feature selection objects'''
19
20 type = 'featureSelector'
21
22 - def select(self, data, *options, **args) :
23 """
24 invokes ``selectFeatures`` to find predictive features and eliminates
25 the rest of the features from the input dataset
26 """
27
28 features = self.selectFeatures(data, *options, **args)
29 print '*** number of features: *** ', len(features)
30 data.keepFeatures(features)
31
33 """
34 :Returns:
35 a list of predictive features
36 """
37 raise NotImplementedError
38
39 - def score(self, data, **args) :
40 """
41 :Returns:
42 a score for each feature in the input dataset
43 """
44 raise NotImplementedError
45
46 - def rank(self, data, **args) :
47 """
48 :Returns:
49 a ranking of the features in the dataset by converting the scores
50 to ranks
51 """
52 scores = self.score(data, **args)
53
54 return weights2ranks(scores, data)
55
56 - def test(self, data, *options, **args) :
59
60 train = select
61
63 '''Use a two-class feature selection method for a multi-class problem
64 by doing feature selection in a one-against-the-rest manner and
65 returning the union of all the features selected.
66
67 Construction::
68
69 OneAgainstRestSelect(featureSelector) -- featureSelector is either
70 a OneAgainstRestSelect object for copy construction, or a featureSelector
71 object
72 '''
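# Example (sketch): wrapping a two-class selector for multi-class data; `data`
# is a hypothetical multi-class dataset, and the inner selector can be any
# featureSelector object (here a Filter over the Golub score):
#
#   selector = OneAgainstRestSelect(Filter(FeatureScore('golub'), numFeatures = 50))
#   selector.select(data)   # keeps the union of the per-class selections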
73
75
76 if (not hasattr(featureSelector, 'type') or
77 featureSelector.type != 'featureSelector') :
78 raise ValueError, 'need a feature selector as input'
79
80 if featureSelector.__class__ == self.__class__ :
81 self.featureSelector = featureSelector.featureSelector.__class__(
82 featureSelector.featureSelector)
83 else :
84 self.featureSelector = featureSelector.__class__(featureSelector)
85
98
99
100 -class RFE (FeatureSelector) :
101
102 '''
103 RFE (Recursive Feature Elimination) uses the vector *w* of an SVM for
104 feature selection.
105
106 The method alternates between training a linear SVM and removing the features
107 with the smallest value of the weight vector.
108
109 You can either specify the number of features to keep or let RFE choose the
110 number of features automatically; in the latter case it chooses the minimal
111 number of features such that the number of support vectors is within one
112 standard deviation of the minimum number of support vectors.
113
114 Reference:
115
116 I. Guyon and J. Weston and S. Barnhill and V. Vapnik
117 Gene selection for cancer classification using support vector machines.
118 Machine Learning 46:389-422, 2002.
119
120 '''
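# Example (sketch): typical RFE usage on a hypothetical two-class dataset `data`;
# the keyword names are those documented in the constructor below:
#
#   rfe = RFE(mode = 'byFraction', fractionToEliminate = 0.1,
#             numFeatures = 50, autoSelect = False)
#   rfe.select(data)   # eliminates all but the selected features from data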
121
123
124 self.data = data.__class__(data, deepcopy = 1)
125 if self.selectNumFeatures :
126 self.featureLists = [data.featureID]
127
128
129
130 self.wList = []
131 self.numSV = []
132
133 - def __init__(self, arg = None, **settings) :
134
135 """
136 :Keywords:
137 - `numFeatures` - perform backward elimination until this many features are
138 left (default = 20)
139 - `mode` - values - 'byFraction' or 'byNum' (default = 'byFraction')
140 - `numToEliminate` - specifies the number of features to eliminate at each
141 iteration in the byNum mode (default = 10)
142 - `fractionToEliminate` - the fraction of features to eliminate at each
143 iteration in the byFraction mode (default = 0.05)
144 - `autoSelect` [True] - whether the number of features should be chosen
145 automatically
146 - `useScore` - whether to modulate the vector w by the Golub coefficient,
147 as in RSVM
148
149 """
150
151 self.selectNumFeatures = True
152 self.fractionToEliminate = 0.05
153 self.numToEliminate = 10
154 self.mode = 'byFraction'
155 self.numFeatures = 20
156 self.featureScore = FeatureScore('golub')
157 self.useScore = False
158 self.rankFeatures = False
159
160 if arg is None :
161 self.svm = svm.SVM()
162 elif arg.__class__ == self.__class__ :
163 other = arg
164 self.fractionToEliminate = other.fractionToEliminate
165 self.numToEliminate = other.numToEliminate
166 self.mode = other.mode
167 self.numFeatures = other.numFeatures
168 self.selectNumFeatures = other.selectNumFeatures
169 self.useScore = other.useScore
170 self.svm = other.svm.__class__(other.svm)
171 elif arg.__class__.__name__ == 'SVM' :
172 self.svm = arg.__class__(arg)
173 else :
174 raise ValueError, 'unknown type of argument for RFE ' + str(arg)
175
176 if 'mode' in settings :
177 self.mode = settings['mode']
178 if 'numToEliminate' in settings :
179 self.numToEliminate = settings['numToEliminate']
180 if 'numFeatures' in settings :
181 self.numFeatures = settings['numFeatures']
182 if 'fractionToEliminate' in settings :
183 self.fractionToEliminate = settings['fractionToEliminate']
184 if 'autoSelect' in settings :
185 self.selectNumFeatures = settings['autoSelect']
186 if 'useScore' in settings :
187 self.useScore = settings['useScore']
188
190 rep = '<' + self.__class__.__name__ + ' instance>\n'
191 rep += 'mode: ' + self.mode + '\n'
192 if self.mode == "byNum" :
193 rep += 'number of features to eliminate each iteration : %d\n' \
194 % self.numToEliminate
195 elif self.mode == "byFraction" :
196 rep += 'Fraction to eliminate each iteration : %f\n' \
197 % self.fractionToEliminate
198 rep += 'target number of features : %d\n' % self.numFeatures
199 rep += 'automatic selection of the number of features : %d' % \
200 self.selectNumFeatures
201
202
203 return rep
204
208
210
211 if self.mode == 'byNum' :
212 numToElim = min(self.numToEliminate,
213 numFeatures - self.numFeatures)
214 elif self.mode == 'byFraction' :
215 numToElim = min(int(self.fractionToEliminate * len(w)),
216 numFeatures - self.numFeatures)
217 else :
218 raise ValueError, 'invalid elimination mode'
219
220 if numToElim == 0: numToElim = 1
221 print 'numFeaturesToEliminate: ', numToElim
222
223 if type(w) == type({}) :
224 w2 = numpy.zeros(numFeatures, numpy.float)
225 for wKey in w.keys():
226 w2[wKey] = w[wKey]
227 w = w2
228
229 w = numpy.absolute(w)
230
231 if self.useScore :
232 w = w * self.featureScore.score(self.data)
233
234 numZero = numpy.sum(numpy.equal(w, 0))
235 if numZero > numToElim : numToElim = numZero
236
237 I = numpy.argsort(w)
238 featuresToEliminate = I[:numToElim]
239
240 self.features = I[numToElim:]
241 self.w = w
242
243 return featuresToEliminate
244
245
247
248 data = self.data
249
250 if data.numFeatures <= self.numFeatures :
251 raise StopIteration
252
253 self.svm.train(data)
254
255
256 self.numSV.append(self.svm.model.numSV)
257
258 featuresToEliminate = self.getFeatures(self.svm.model.warray,
259 data.numFeatures)
260 if self.rankFeatures :
261 if len(self.weights) == 0 :
262 maxWeight = 0
263 else :
264 maxWeight = max(self.weights.values())
265 for feature in featuresToEliminate :
266 self.weights[data.featureID[feature]] = self.w[feature] + maxWeight
267
268 data.eliminateFeatures(featuresToEliminate)
269 print '** numFeatures: ', data.numFeatures
270
271 if self.selectNumFeatures :
272 self.featureLists.append(data.featureID)
273
274
275 - def run(self, data, *options, **args) :
276
277 if data.labels.numClasses != 2 :
278 raise ValueError, 'RFE supports only two class problems'
279
280 self.initialize(data)
281 features = data.featureID[:]
282
283 rfeIter = iter(self)
284 for f in rfeIter : pass
285
286 if self.selectNumFeatures :
287
288
289
290
291 minNumSV = len(self.data) + 1
292
293 for i in range(len(self.numSV)) :
294 print 'numSV', self.numSV[i], minNumSV
295 if self.numSV[i] < minNumSV :
296 minNumSV = self.numSV[i]
297 features = self.featureLists[i]
298
299
300 self.features = data.featureNames2IDs(features)
301
302
304
305 self.run(data, *options, **args)
306
307 return self.features
308
309 - def rank(self, data, *options, **args):
310
311 self.rankFeatures = True
312 self.weights = {}
313
314 self.run(data, *options, **args)
315
316
317 if len(self.weights) == 0 :
318 maxWeight = 0
319 else :
320 maxWeight = max(self.weights.values())
321 print data.numFeatures
322 for feature in range(self.data.numFeatures) :
323 self.weights[self.data.featureID[feature]] = self.w[feature] + maxWeight
324
325 weights = [self.weights[data.featureID[i]]
326 for i in range(data.numFeatures)]
327 I = numpy.argsort(weights)
328
329
330
331 return weights2ranks(weights, data)
332
333
335 '''Multiplicative update uses the vector w of an SVM to do feature selection.
336 At each iteration an SVM is trained and each feature of the data is rescaled
337 (multiplied) by the corresponding component of the classifier's weight vector.
338
339 Reference:
340
341 J. Weston, A. Elisseeff, M. Tipping and B. Scholkopf.
342 Use of the zero norm with linear models and kernel methods.
343 JMLR special Issue on Variable and Feature selection, 2002.
344 '''
345
346
347 - def __init__(self, arg = None, **settings) :
348
349 self.eps = 0.01
350 self.rankFeatures = False
351
352 if arg.__class__ == self.__class__ :
353 other = arg
354 self.eps = other.eps
355 self.rankFeatures = other.rankFeatures
356 elif arg.__class__.__name__ == 'SVM' :
357 self.svm = arg.__class__(arg)
358
359 if 'eps' in settings :
360 self.eps = settings['eps']
361
362
364 rep = '<' + self.__class__.__name__ + ' instance>\n'
365 rep += 'epsilon : %f\n' % self.eps
366
367 return rep
368
372
374
375 self.scaleData = data.__class__(data, deepcopy = True)
376 if not linearlySeparable (data) :
377 print 'not linearly separable!!!!!!!!!!!!!!!!!!!!!!'
378 self.svm = svm.SVM(ker.LinearRidge())
379 else :
380 self.svm = svm.SVM()
381 print 'linearly separable**************************'
382 self.svm.C = 1000
383
384
386
387 data = self.scaleData
388 self.svm.train(data)
389
390 w = self.svm.model.warray
391 if self.svm.kernel.__class__.__name__ == "LinearRidge" :
392 wRidge = 0.0
393 for i in range(self.svm.model.numSV) :
394 wRidge += self.svm.model.alpha[i] * \
395 self.svm.ridge[self.svm.model.svID[i]]
396 wRidge = abs(wRidge)
397 for i in range(len(data)) :
398 self.svm.ridge[i] *= wRidge
399
400 data.scale(w)
401 self.w = w
402 print 'scaled'
403 wc = numpy.compress(numpy.greater(w, 1e-3), w)
404
405 if numpy.allclose(wc, numpy.ones(len(wc), numpy.float), 0.3) :
406 raise StopIteration
407
408
410 '''XXX for multi-class -- do one against the rest
411 and use the absolute value of the average/maximum value of w to rescale
412 multi-class
413 '''
414
415 if data.labels.numClasses != 2 :
416 raise ValueError, 'MU supports only two class problems'
417
418 self.initialize(data)
419
420 muIter = iter(self)
421 for f in muIter : pass
422
423 featuresToKeep = numpy.nonzero(numpy.greater(self.w, 1e-3))[0]
424
425 print 'numFeatures', len(featuresToKeep)
426
427 return featuresToKeep
428
429
430 -class Random (FeatureSelector) :
431 '''
432 A feature selection method that keeps a random set of features
433
434 Construction::
435
436 Random(numFeatures)
437 '''
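# Example (sketch): Random(100).select(data) keeps 100 randomly chosen features
# of a hypothetical dataset `data`.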
438
439 - def __init__(self, arg1, *options, **settings) :
440
441 if arg1.__class__ == self.__class__ :
442 other = arg1
443 self.numFeatures = other.numFeatures
444 elif type(arg1) == type(1) :
445 self.numFeatures = arg1
446 else :
447 raise ValueError, 'bad argument for Random constructor'
448
450 rep = '<' + self.__class__.__name__ + ' instance>\n'
451 rep += 'number of features to keep : %d\n' % self.numFeatures
452
453 return rep
454
461
462
463 -class Filter (FeatureSelector) :
464 '''
465 A simple feature selection method that filters features according
466 to a feature score.
467 It uses a feature score (instance of FeatureScore) to eliminate
468 features in one of three possible modes:
469
470 - keep a specified number of features [default]
471 - eliminate all features whose score is below some threshold
472 - keep only those features whose score is a certain number of standard
473 deviations above the score obtained using random labels
474 '''
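# Example (sketch): filtering a hypothetical two-class dataset `data` in the
# first two modes:
#
#   Filter(FeatureScore('golub'), numFeatures = 100).select(data)  # keep the top 100
#   Filter(FeatureScore('roc'), threshold = 0.7).select(data)      # keep scores above 0.7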
475
476 - def __init__(self, arg1, *options, **settings) :
477 """
478 :Keywords:
479 - `numFeatures` - keep ``numFeatures`` features with the highest score
480 - `threshold` - keep all features with score above the threshold
481 - `sigma` - keep features whose score is above the average by this many
482 standard deviations
483 """
484 self.sigma = 2.5
485 if arg1.__class__ == self.__class__ :
486 other = arg1
487 self.featureScore = other.featureScore.__class__(other.featureScore)
488 self.numFeatures = other.numFeatures
489 self.mode = other.mode
490 self.numRand = other.numRand
491 self.sigma = other.sigma
492 try :
493 self.threshold = other.threshold
494 except :
495 pass
496 try :
497 self.significance = other.significance
498 except :
499 pass
500 try :
501 self.numFeatures = other.numFeatures
502 except :
503 pass
504 elif hasattr(arg1, 'score') :
505 self.featureScore = arg1
506 self.mode = "byNum"
507 self.numFeatures = 20
508 self.numRand = 20
509 if 'numFeatures' in settings :
510 self.numFeatures = settings['numFeatures']
511 self.mode = "byNum"
512 if 'sigma' in settings :
513 self.sigma = settings['sigma']
514 self.mode = "bySignificance"
515 if 'threshold' in settings :
516 self.threshold = settings['threshold']
517 self.mode = "byThreshold"
518 else :
519 raise ValueError, 'bad argument for Filter constructor'
520
522 rep = '<' + self.__class__.__name__ + ' instance>\n'
523 rep += 'mode: ' + self.mode + '\n'
524 if self.mode == "byNum" :
525 rep += 'number of features to keep : %d\n' % self.numFeatures
526 elif self.mode == "bySignificance" :
527 rep += 'sigma : %f\n' \
528 % self.sigma
529
530 elif self.mode == "byThreshold" :
531 rep += 'score threshold for keeping features : %f\n' % self.threshold
532 rep += self.featureScore.__repr__()
533
534 return rep
535
536 - def selectFeatures(self, data, targetClass=None, otherClass = None, *options, **args) :
537
538 s = self.featureScore.score(data, targetClass, otherClass, **args)
539
540 if self.mode == "byNum" :
541 featuresToEliminate = numpy.argsort(s)\
542 [:data.numFeatures - self.numFeatures]
543 elif self.mode == "byThreshold" :
544 featuresToEliminate = numpy.nonzero(numpy.less(s, self.threshold))[0]
545 elif self.mode == "bySignificance" :
546 t = self.significanceThreshold(data)
547 self.thresholds = t
548 featuresToEliminate = numpy.nonzero(numpy.less(s, t))[0]
549 else :
550 raise ValueError, 'unknown elimination mode in filter'
551
552 print 'eliminating ',len(featuresToEliminate), ' features'
553
554 return misc.setminus(range(data.numFeatures), featuresToEliminate)
555
556
571
572
573 -def parseArgs(data, targetClass, otherClass = None, **args) :
574 '''parse arguments for a feature scoring function'''
575
576 if 'feature' in args :
577 feature = args['feature']
578 else :
579 feature = None
580 if 'Y' in args :
581 Y = args['Y']
582 if otherClass is None :
583 otherI = numpy.nonzero(numpy.not_equal(Y, targetClass))[0]
584 else :
585 otherI = numpy.nonzero(numpy.equal(Y, otherClass))[0]
586 targetClassSize = numpy.sum(numpy.equal(Y, targetClass))
587 else :
588 Y = None
589 if otherClass is None :
590 otherI = numpy.nonzero(numpy.not_equal(data.labels.Y, targetClass))[0]
591 else :
592 otherI = data.labels.classes[otherClass]
593 targetClassSize = len(data.labels.classes[targetClass])
594
595 otherClassSize = len(otherI)
596
597 return Y, targetClassSize, otherClassSize, otherI, feature
598
599
601
602 Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
603 data, targetClass, otherClass, **args)
604
605 if Y is None : Y = data.labels.Y
606 if data.__class__.__name__ != 'DataSet' :
607 raise ValueError, 'data should be of type DataSet'
608
609 Xsort = numpy.sort(data.X, 0)
610 d = data.numFeatures
611 n = len(data)
612 Isort = numpy.argsort(data.X, 0)
613 print Isort
614 print Y
615 succRate = numpy.zeros(d, numpy.float)
616 threshold = numpy.zeros(d, numpy.float)
617 num1 = numpy.sum(numpy.equal(Y, 1))
618 num0 = n - num1
619
620 for i in range(d) :
621 succRate[i] = 0
622 num0below = 0
623 num1below = 0
624 for j in range(0, n - 1) :
625 if Y[Isort[j][i]] == 1 :
626 num1below += 1
627 else :
628 num0below += 1
629 num0above = num0 - num0below
630 num1above = num1 - num1below
631 currSuccRate = float(max(num0above + num1below, num0below + num1above)) / \
632 float(n)
633 if currSuccRate > succRate[i] :
634 succRate[i] = currSuccRate
635 threshold[i] = (Xsort[j][i] + Xsort[j + 1][i]) / 2
636
637 return succRate,threshold
638
639
640 -def predictivity(data, targetClass, otherClass = None, **args) :
641
642 '''A feature score for discrete data; the score for feature i is:
643 s_i = P(Fi | C1) - P(Fi | C2),
644 where P(Fi | C) is the estimated probability of Feature i being nonzero given
645 the class variable
646 This is estimated as:
647 s_i = #(patterns in target class with feature i) / #(target class) -
648 #(patterns in other class with feature i) / #(other class)
652 '''
653
654 Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
655 data, targetClass, otherClass, **args)
656
657
658 s1 = numpy.array(featureCount(data, targetClass=targetClass, Y=Y,
659 feature=feature)) / float(targetClassSize)
660
661 s2 = numpy.array(featureCount(data, I = otherI, Y=Y,
662 feature=feature)) / float(otherClassSize)
663
664 return (s1 - s2)
665
666
667 -def countDiff(data, targetClass, otherClass = None, **args) :
668 '''A feature score for discrete data; the score for feature i is:
669 s_i = (#(Fi | C) - #(Fi | not C)) / #(C)
670 '''
671
672 Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
673 data, targetClass, otherClass, **args)
674
675 s1 = featureCount(data, targetClass=targetClass, Y=Y,
676 feature=feature)
677
678 s2 = featureCount(data, I = otherI, Y=Y,
679 feature=feature)
680
681 s = (s1 - s2) / float(targetClassSize)
682
683 return s
684
685
686 -def sensitivity(data, targetClass, otherClass = None, **args) :
687 '''A feature score for discrete data
688 (alternatively, with a threshold it could be used for continuous data)
689 s_i = #(Fi | C) / #(C)
690 '''
691
692 Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
693 data, targetClass, otherClass, **args)
694
695 return (featureCount(data, targetClass=targetClass, Y=Y, feature=feature) /
696 float(targetClassSize))
697
698
699
700 -def ppv(data, targetClass, otherClass = None, **args) :
701 '''A feature score for discrete data
702 s_i = #(Fi | C) / #(Fi)
703 '''
704
705 Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
706 data, targetClass, otherClass, **args)
707
708 s1 = featureCount(data, targetClass=targetClass, Y=Y, feature=feature)
709
710 s2 = featureCount(data, feature = feature)
711
712 numpy.putmask(s2, numpy.equal(s2, 0), 1)
713
714 if type(s1) == type(1) :
715 return float(s1) / float(s2)
716 else :
717 return numpy.array(s1, numpy.float)/s2
718
719 -def ppvThreshold(data, targetClass, otherClass = None, **args) :
720 '''A feature score for discrete data
721 s_i = #(Fi | C) / #(Fi) if #(Fi | C) > threshold and 0 otherwise
722 '''
723
724 Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
725 data, targetClass, otherClass, **args)
726 if 'threshold' in args :
727 threshold = args['threshold']
728 else :
729 threshold = 2
730
731 s1 = featureCount(data, targetClass=targetClass, Y=Y, feature=feature)
732
733 numpy.putmask(s1, numpy.less_equal(s1, threshold), 0)
734
735 s2 = featureCount(data, feature = feature)
736
737 numpy.putmask(s2, numpy.equal(s2, 0), 1)
738
739 if type(s1) == type(1) :
740 return float(s1) / float(s2)
741 else :
742 return numpy.array(s1, numpy.float)/s2
743
744
745 -def specificity(data, targetClass, otherClass = None, **args) :
746 '''A feature score for discrete data
747 s_i = #(Fi | C) / #(Fi)
748
749 (note: as implemented this is identical to ``ppv``; an alternative
750 definition would be 1 - #(Fi | not C) / #(not C))
751 '''
752
753 Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
754 data, targetClass, otherClass, **args)
755
756 s1 = featureCount(data, targetClass=targetClass, Y=Y, feature=feature)
757
758 s2 = featureCount(data, feature = feature)
759
760 numpy.putmask(s2, numpy.equal(s2, 0), 1)
761
762 if type(s1) == type(1) :
763 return float(s1) / float(s2)
764 else :
765 return numpy.array(s1, numpy.float)/s2
766
767
768 -def usefullness(data, targetClass, otherClass = None, **args) :
769 '''A feature score for discrete data:
770 s_i = 1 - #(Fi | not C) / #(not C) if #(Fi | C) >= threshold, 0 otherwise
771 optional arguments: threshold [5], fraction [0.0]
772 (the threshold used is max(threshold, fraction * #(C)))
773 '''
774
775 if 'threshold' in args :
776 threshold = args['threshold']
777 else :
778 threshold = 5
779 if 'fraction' in args :
780 fraction = args['fraction']
781 else :
782 fraction = 0.0
783
784 Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
785 data, targetClass, otherClass, **args)
786
787 threshold = max(threshold, fraction * float(targetClassSize))
788
789 s1 = featureCount(data, targetClass=targetClass, Y=Y, feature=feature)
790
791 s2 = featureCount(data, I = otherI, Y=Y,
792 feature=feature) / float(otherClassSize)
793
794 s2 = 1 - s2
795
796 numpy.putmask(s2, numpy.less(s1, threshold), 0.0)
797
798 return s2
799
800
801 -def abundance(data, targetClass, otherClass = None, **args) :
802
803 '''Fraction of patterns that have a feature: A(F,C) = #(F | C) / #(C)'''
804
805 Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
806 data, targetClass, otherClass, **args)
807
808 s = featureCount(data, targetClass=targetClass, Y=Y, feature=feature) / \
809 float(targetClassSize)
810
811 return s
812
813
814
815 -def oddsRatio(data, targetClass, otherClass = None, **args) :
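'''The odds-ratio feature score for discrete data (with pseudo-counts to avoid
zero numerators and denominators); as implemented below:
s_i = (#(Fi | C) * #(not Fi | not C)) / (#(Fi | not C) * #(not Fi | C))
'''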
816
817
818 Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
819 data, targetClass, otherClass, **args)
820
821 count1 = numpy.array(featureCount(data, targetClass=targetClass, Y=Y,
822 feature=feature), numpy.float)
823 count2 = numpy.array(featureCount(data, I=otherI, Y=Y,
824 feature=feature), numpy.float)
825
826 pseudoCount1 = 1.0 / float(targetClassSize)
827 pseudoCount2 = 1.0 / float(otherClassSize)
828 numpy.putmask(count1, numpy.equal(count1, 0), pseudoCount1)
829 numpy.putmask(count2, numpy.equal(count2, 0), pseudoCount2)
830 numpy.putmask(count1, numpy.equal(count1, targetClassSize),
831 targetClassSize - pseudoCount1)
832 numpy.putmask(count2, numpy.equal(count2, len(otherI)),
833 len(otherI) - pseudoCount2)
834
835
836 s = (count1 * (otherClassSize - count2)) / (count2 * (targetClassSize - count1))
837
838 return s
839
840 -def logOddsRatio(data, targetClass, otherClass = None, **args) :
841
842 return numpy.log(oddsRatio(data, targetClass, otherClass, **args))
843
844
845
847
848 if type(data.X[0]) == type({}) :
849 raise ValueError, "Wrong type of dataset"
850 if data.labels.numClasses != 2 :
851 raise ValueError, 'not a two class problem'
852
853 K = numpy.dot (data.X, numpy.transpose (data.X))
854
855 w = numpy.zeros(data.numFeatures, numpy.float)
856 for i in range(len(data)) :
857 bestInClass = 0
858 simInClass = -1e10
859 bestOutOfClass = 0
860 simOutOfClass = -1e10
861 for j in range(len(data)) :
862 if j == i : continue
863 if data.labels.Y[i] == data.labels.Y[j] :
864 if K[i][j] > simInClass :
865 bestInClass = j
866 simInClass = K[i][j]
867 else :
868 if K[i][j] > simOutOfClass :
869 bestOutOfClass = j
870 simOutOfClass = K[i][j]
871 w += data.X[bestInClass] - data.X[bestOutOfClass]
872
873 return w / len(data)
874
875
876 -def golub(data, targetClass, otherClass, **args) :
877 '''The Golub feature score:
878 s = (mu1 - mu2) / sqrt(sigma1^2 + sigma2^2)
879 '''
880
881 if 'Y' in args :
882 Y = args['Y']
883 targetClassSize = numpy.sum(numpy.equal(Y, targetClass))
884 otherClassSize = numpy.sum(numpy.equal(Y, otherClass))
885 else :
886 Y = None
887 targetClassSize = data.labels.classSize[targetClass]
888 otherClassSize = data.labels.classSize[otherClass]
889
890 m1 = numpy.array(featureMean(data, targetClass, Y))
891 m2 = numpy.array(featureMean(data, otherClass, Y))
892 s1 = numpy.array(featureStd(data, targetClass, Y))
893 s2 = numpy.array(featureStd(data, otherClass, Y))
894
895 s = numpy.sqrt(s1**2 + s2**2)
896 m = (m1 + m2) / 2.0
897
898
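# guard against zero denominators: where the pooled standard deviation is zero,
# use the mean instead, and fall back to 1 if the mean is also zero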
899 numpy.putmask(s, numpy.equal(s, 0), m)
900
901 numpy.putmask(s, numpy.equal(s, 0) ,1)
902
903 g = (m1 - m2) / s
904
905 return g
906
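# A minimal standalone sketch of the Golub score using plain numpy arrays
# (illustrative only; X and y below are made-up toy data):
#
#   X = numpy.array([[1.0, 0.2], [0.8, 0.1], [0.1, 0.3], [0.2, 0.4]])
#   y = numpy.array([1, 1, 0, 0])
#   m1, m2 = X[y == 1].mean(0), X[y == 0].mean(0)
#   s1, s2 = X[y == 1].std(0), X[y == 0].std(0)
#   g = (m1 - m2) / numpy.sqrt(s1 ** 2 + s2 ** 2)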
907 -def succ(data, targetClass, otherClass, **args) :
908 """the score of feature j is the success rate of a classifier that
909 classifies into the target class all points whose value of the feature
910 are higher than some threshold (linear 1-d classifier).
911 """
912 Y = data.labels.Y
913 numPos = float(data.labels.classSize[targetClass])
914 numNeg = len(data) - numPos
915 s = numpy.zeros(data.numFeatures, numpy.float_)
916 values = numpy.zeros(data.numFeatures, numpy.float_)
917 balanced = False
918 if 'balanced' in args :
919 balanced = args['balanced']
920
921
922 for j in range(data.numFeatures) :
923 feat = data.getFeature(j)
924 I = numpy.argsort(feat)
925 feat = numpy.sort(feat)
926 posBelow = 0
927 negBelow = 0
928 for i in range(len(data)) :
929 if Y[I[i]] == targetClass :
930 posBelow += 1
931 else :
932 negBelow += 1
933
934
935
936 if i < len(data)-1 and feat[i] != feat[i + 1] :
937 if balanced :
938 succRate = max(posBelow / numPos + (numNeg - negBelow) / numNeg,
939 (numPos - posBelow) / numPos + negBelow / numNeg)
940 else :
941 succRate = max(posBelow + (numNeg - negBelow),
942 (numPos - posBelow) + negBelow)
943 if succRate > s[j] :
944 s[j] = succRate
945 values[j] = feat[i]
946
947 if not balanced :
948 s = s / len(data)
949 else :
950 s = s / 2.0
951
952 if 'getValues' in args and args['getValues'] :
953 return s,values
954 else :
955 return s
956
958 """the score of feature j is the success rate of a classifier that
959 classifies into the target class all points whose value of the feature
960 are higher than some threshold (linear 1-d classifier).
961 """
962
963 return succ(data, targetClass, otherClass, **{'balanced' : True})
964
965 -def roc(data, targetClass, otherClass, **args) :
966
967 rocN = None
968 if 'rocN' in args :
969 rocN = args['rocN']
970 s = numpy.zeros(data.numFeatures, numpy.float_)
971 for i in range(data.numFeatures) :
972 featureValues = data.getFeature(i)
973 s[i] = assess.roc(None, data.labels.Y, featureValues, rocN, targetClass)[2]
974
975
976
977
978
979 return s
980
981
983 '''
984 returns a vector where component i gives the number of patterns where
985 feature i is nonzero
986 INPUTS:
987 data - a dataset
988 targetClass - class for which to count (optional, default behavior is
989 to look at all patterns)
990 Y - alternative label vector (optional)
991 feature - either a feature or list of features - counts the number of
992 patterns for which the feature or list of features is non-zero
993 I - a list of indices on which to do feature count
994 OPTIONS:
995 "complement" - look at the complement of the target class
996 '''
997
998 singleFeature = 0
999 if 'feature' in args and args['feature'] is not None :
1000 feature = args['feature']
1001 singleFeature = 1
1002 featureCount = 0
1003 else :
1004 featureCount = numpy.zeros(data.numFeatures)
1005
1006 if 'Y' in args and args['Y'] is not None :
1007 Y = args['Y']
1008 elif 'labels' in args :
1009 Y = args['labels'].Y
1010 elif data.labels.L is not None :
1011 Y = data.labels.Y
1012
1013 if "targetClass" in args :
1014 targetClass = args['targetClass']
1015 if "complement" in options :
1016 I = numpy.nonzero(numpy.not_equal(Y, targetClass))[0]
1017 else :
1018 I = numpy.nonzero(numpy.equal(Y, targetClass))[0]
1019 else :
1020 I = range(len(data))
1021
1022 if 'I' in args :
1023 I = args['I']
1024
1025 if singleFeature :
1026 featureCount = data.featureCount(feature, I)
1027 else :
1028 featureCount = data.featureCounts(I)
1029
1030 return featureCount
1031
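# Example calls (sketch; `data` is a hypothetical labeled dataset):
#
#   featureCount(data)                   # per-feature counts over all patterns
#   featureCount(data, targetClass = 1)  # counts restricted to patterns of class 1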
1032
1034 '''returns a vector where component i is the mean of feature i
1035 INPUT:
1036 data - a dataset
1037 targetClass - class for which to take the mean (optional)
1038 Y - alternative label vector (optional)
1039 '''
1040
1041 if targetClass is None :
1042 I = range(len(data))
1043 elif Y is None :
1044 I = numpy.nonzero(numpy.equal(data.labels.Y, targetClass))[0]
1045 else :
1046 I = numpy.nonzero(numpy.equal(Y, targetClass))[0]
1047
1048 return data.mean(I)
1049
1050
1051
1052 -def featureStd(data, targetClass = None, Y = None) :
1053 '''returns a vector where component i is the standard deviation of feature i
1054 INPUT:
1055 data - a dataset
1056 targetClass - class for which to take the mean (optional)
1057 Y - alternative label vector (optional)
1058 '''
1059
1060 if targetClass is None :
1061 I = range(len(data))
1062 elif Y is None :
1063 I = numpy.nonzero(numpy.equal(data.labels.Y, targetClass))[0]
1064 else :
1065 I = numpy.nonzero(numpy.equal(Y, targetClass))[0]
1066
1067 if len(I) == 0 :
1068 return numpy.zeros(data.numFeatures, numpy.float_)
1069
1070 return data.std(I)
1071
1072
1074 '''removes from the data features whose feature count is below a threshold
1075 data - a dataset
1076 threshold - number of occurrences of the feature below which it will be
1077 eliminated
1078 '''
1079
1080 fCount = featureCount(data)
1081
1082 below = numpy.nonzero(numpy.less(fCount, threshold))[0]
1083 data.eliminateFeatures(below)
1084
1085
1086
1088 '''Compute a set of nonredundant features for a 0/1 sparse dataset
1089 a feature is defined as redundant if there is another feature which has
1090 nonzero value for exactly the same patterns, and has a larger weight
1091 INPUT: a dataset and a list of weights for each feature in the data
1092 weights are optional.
1093 OUTPUT: a list of nonredundant features
1094 '''
1095
1096
1097
1098 bestFeature = {}
1099 featureWeight = {}
1100
1101 for f in range(data.numFeatures) :
1102 if f % 100 == 0 :
1103 print f
1104 pattern = ''
1105 for i in range(len(data)) :
1106 if data.X[i].has_key(f) :
1107 pattern += '1'
1108 else :
1109 pattern += '0'
1110 if pattern in bestFeature :
1111 if w is not None :
1112 if featureWeight[pattern] < w[f] :
1113 featureWeight[pattern] = w[f]
1114 bestFeature[pattern] = f
1115 else :
1116 if w is not None :
1117 featureWeight[pattern] = w[f]
1118 bestFeature[pattern] = f
1119
1120 nonredundant = bestFeature.values()
1121
1122 return nonredundant
1123
1124
1126 """base class for objects that have a 'score' function
1127 for scoring the features of a dataset
1128 """
1129
1130 type = 'featureScorer'
1131
1133
1134 raise NotImplementedError
1135
1136 train = score
1137
1138 - def test(self, data, *options, **args) :
1141
1143 """
1144 A class for scoring the features of a dataset
1145 USAGE:
1146 construction:
1147 f = FeatureScore(scoreName, mode = modeValue)
1148 or using copy construction :
1149 f = FeatureScore(otherFeatureScore)
1150 scoreName is the name of the score function; available scores are the keys
1151 of the ``scoreFuncs`` dictionary, e.g. "predictivity", "oddsRatio", "golub", "roc"
1152 mode is one of the following:
1153 oneAgainstOne (default)
1154 oneAgainstRest
1155 """
1156
1157 scoreFuncs = {"predictivity" : predictivity,
1158 "oddsRatio" : "oddsRatio", "logOddsRatio" : logOddsRatio,
1159 "golub" : golub, "countDiff" : countDiff,
1160 "usefullness" : usefullness, "abundance" : abundance,
1161 "specificity" : specificity, "ppv" : ppv,
1162 "ppvThreshold" : ppvThreshold,
1163 "succ" : succ,
1164 "balancedSucc" : balancedSucc, "roc" : roc}
1165
1166
1167
1168
1169
1170
1171 multiClass = ["IG"]
1172
1173
1174
1175
1176
1177
1178 asym = ["predictivity", "logOddsRatio", "golub"]
1179
1180 - def __init__(self, arg1 = None, *options, **args) :
1181
1182 self.mode = "oneAgainstOne"
1183 self.scoreName = "predictivity"
1184 self.scoreFunc = predictivity
1185 self.minClassSize = 5
1186 self.bothSides = True
1187
1188 if arg1.__class__ == self.__class__ :
1189 other = arg1
1190 self.mode = other.mode
1191 self.scoreName = other.scoreName
1192 self.scoreFunc = other.scoreFunc
1193 self.bothSides = other.bothSides
1194 elif arg1.__class__ == ''.__class__ :
1195 scoreName = arg1
1196 if scoreName in self.scoreFuncs :
1197 self.scoreFunc = self.scoreFuncs[scoreName]
1198 else :
1199 raise ValueError, 'unknown filter name'
1200 self.scoreName = scoreName
1201 elif arg1.__class__.__base__.__name__ == 'FeatureScorer' :
1202 self.scoreFunc = arg1.score
1203 self.scoreName = ''
1204
1205 if 'mode' in args :
1206 if args['mode'] == "oneAgainstRest" :
1207 self.mode = "oneAgainstRest"
1208 if 'minClassSize' in args :
1209 self.minClassSize = args['minClassSize']
1210
1211
1213 rep = '<' + self.__class__.__name__ + ' instance>\n'
1214 rep += 'score name : ' + self.scoreName + '\n'
1215 rep += 'mode : ' + self.mode + '\n'
1216
1217 return rep
1218
1219
1220
1221 - def score(self, data, *options, **args) :
1222
1223 if 'targetClass' in args :
1224 targetClass = args['targetClass']
1225 else :
1226 targetClass = None
1227 if 'otherClass' in args :
1228 otherClass = args['otherClass']
1229 else :
1230 otherClass = None
1231
1232 if (targetClass is not None and otherClass is not None) or (
1233 self.scoreName in self.multiClass) :
1234 return self.scoreFunc(data, targetClass, otherClass, **args)
1235 elif data.labels.numClasses == 2 :
1236 return self._score(data, **args)
1237 elif self.mode == "oneAgainstRest" :
1238 if targetClass is not None :
1239 restLabels = labels.oneAgainstRest(data.labels, targetClass)
1240 return self._score(data, 1, 0, Y=restLabels.Y)
1241 else :
1242 raise ValueError, 'need to specify a target class'
1243 elif self.mode == 'oneAgainstOne' :
1244 return self.oneAgainstOne(data, targetClass, **args)
1245
1246 train = score
1247
1248 - def _score(self, data, class1 = None, class2 = None, **args) :
1249
1250 if class1 is None and class2 is None :
1251 class1 = 0
1252 class2 = 1
1253
1254 if self.scoreName in self.asym or not self.bothSides :
1255 s = numpy.absolute(
1256 self.scoreFunc(data, class1, class2, **args))
1257 else :
1258 s = numpy.maximum(
1259 self.scoreFunc(data, class1, class2, **args),
1260 self.scoreFunc(data, class2, class1, **args))
1261
1262 return s
1263
1265 '''XXXX change maximum into average or add this as another option'''
1266
1267 if 'Y' in args :
1268 Y = args['Y']
1269 classSize = misc.count(Y)
1270 else :
1271 classSize = data.labels.classSize
1272
1273 s = numpy.zeros(data.numFeatures, numpy.float_)
1274
1275 if targetClass is None :
1276 for class1 in range(data.labels.numClasses - 1) :
1277 for class2 in range(class1 + 1, data.labels.numClasses) :
1278 if (classSize[class1] > self.minClassSize and
1279 classSize[class2] > self.minClassSize) :
1280
1281 t = self._score(data, class1, class2, **args)
1282 s = numpy.maximum(s, t)
1283
1284 else :
1285 for class2 in range(data.labels.numClasses) :
1286 if class2 != targetClass and classSize[class2] > self.minClassSize:
1287 t = self._score(data, targetClass, class2, **args)
1288 s = numpy.maximum(s, t)
1289
1290 return s
1291
1293
1295
1296 self.measure = 'successRate'
1297 self.targetNumFeatures = 2
1298 if arg.__class__ == self.__class__ :
1299 self.measure = arg.measure
1300 self.targetNumFeatures = arg.targetNumFeatures
1301 self.classifier = arg.classifier.__class__(arg.classifier)
1302 else :
1303 self.classifier = arg.__class__(arg)
1304 if 'targetNumFeatures' in args :
1305 self.targetNumFeatures = args['targetNumFeatures']
1306 if 'measure' in args :
1307 self.measure = args['measure']
1308
1310
1311 self.eliminated = []
1312 self.measures = []
1313 cvArgs = {}
1314 import re
1315 rocExp = re.compile(r"roc(?P<rocN>[0-9]+)area")
1316 match = rocExp.match(self.measure)
1317 if match is not None :
1318 measureStr = 'rocNarea'
1319 cvArgs['rocN'] = match.groupdict()['rocN']
1320 else :
1321 measureStr = self.measure
1322
1323 print cvArgs
1324 data = _data.__class__(_data, deepcopy = True)
1325 for i in range(self.targetNumFeatures, _data.numFeatures) :
1326 maxScore = 0
1327
1328 for feature in range(data.numFeatures) :
1329 featureName = data.featureID[feature]
1330 data.eliminateFeatures([feature])
1331 res = self.classifier.stratifiedCV(data, **cvArgs)
1332 score = getattr(res, measureStr)
1333 if score > maxScore :
1334 maxScore = score
1335 bestFeatureName = featureName
1336 data = _data.__class__(_data, deepcopy = True)
1337 data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
1338 data = _data.__class__(_data, deepcopy = True)
1339 self.eliminated.append(bestFeatureName)
1340 data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
1341 self.measures.append(maxScore)
1342
1343 return misc.setminus(range(_data.numFeatures),
1344 _data.featureNames2IDs(self.eliminated))
1345
1347 '''returns True if the data is linearly separable and False otherwise.
1348 More specifically, it trains a soft margin SVM with a large C and checks
1349 whether all training points are correctly classified
1350 '''
1351
1352 s = svm.SVM(C = 1000)
1353 s.train(data)
1354 r = s.test(data)
1355 r.computeStats()
1356
1357 successRate = r.get('successRate')
1358 if successRate == 1 :
1359 return True
1360 else :
1361 return False
1362
1363
1365
1366 r = myio.load(resultsFileName)
1367
1368 numFeatures = {}
1369 if type(r) == type({}) :
1370 info = misc.extractAttribute(r, 'foldInfo')
1371 for key in info :
1372 numFeat = []
1373 for lines in info[key] :
1374 for line in lines.split('\n') :
1375 if line.find('number of features') == 0 :
1376 numFeat.append(float(line.split(':')[1]))
1377 numFeatures[key] = numpy.average(numFeat)
1378 return numFeatures
1379
1381
1382 if type(weights) == type({}) :
1383 weights = [weights[data.featureID[i]]
1384 for i in range(data.numFeatures)]
1385 weights = numpy.array(weights)
1386 I = numpy.argsort(-weights)
1387 ranks = [data.featureID[i] for i in I]
1388
1389 return ranks
1390
1391 -def featureReport(data, score = 'roc', targetClass = 1, otherClass = 0) :
1392
1393 if score == 'roc' :
1394 s = roc(data, targetClass, otherClass)
1395 elif score == 'golub' :
1396 s = golub(data, targetClass, otherClass)
1397
1398 for i in range(data.numFeatures) :
1399 print data.featureID[i], s[i]
1400