Package PyML :: Package classifiers :: Module modelSelection
[frames] | [no frames]

Source Code for Module PyML.classifiers.modelSelection

  1  from PyML.utils import misc 
  2   
  3  from baseClassifiers import Classifier,IteratorClassifier 
  4  from composite import CompositeClassifier 
  5  import svm 
  6   
  7  '''classes for model selection''' 
  8   
  9  __docformat__ = "restructuredtext en" 
 10   
 11   
class Param (IteratorClassifier) :
    """
    A class for training a classifier with several values of a parameter.
    Training trains a classifier for each value of the parameter.
    Testing returns a list evaluating each trained classifier on the given
    dataset.

    Example::

        p = Param(svm.SVM(), 'C', [0.1, 1, 10, 100, 1000])
    """

    def __init__(self, arg, attribute = 'C', values = [0.1, 1, 10, 100, 1000]) :
        """
        :Parameters:
          - `arg` - another Param object, a classifier to be used as a
            template, or a list of pre-configured classifiers
          - `attribute` - the attribute of the classifier that needs tuning
            (a dotted path such as 'kernel.gamma' is allowed; resolution is
            delegated to misc.mysetattr)
          - `values` - a list of values to try
        """

        if arg.__class__ == self.__class__ :
            # copy construction from another Param: deep-copy the values
            # list and clone each contained classifier
            other = arg
            self.attribute = other.attribute
            self.values = other.values[:]
            self.classifiers = [classifier.__class__(classifier)
                                for classifier in other.classifiers]
            self._assignValues()
        elif hasattr(arg, 'type') and arg.type == 'classifier' :
            # a template classifier: clone it once per parameter value.
            # copy `values` so later mutation of self.values cannot corrupt
            # the shared mutable default argument
            self.attribute = attribute
            self.values = list(values)
            self.classifiers = [arg.__class__(arg)
                                for i in range(len(self.values))]
            self._assignValues()
        elif type(arg) == type([]) :
            # a list of ready-made classifiers.  NOTE: self.attribute and
            # self.values are not set in this case, so __repr__ will fail;
            # callers relying on them must set them explicitly.
            self.classifiers = [arg[i].__class__(arg[i])
                                for i in range(len(arg))]
        else :
            # previously this fell through silently, leaving a half-built
            # object that failed later with an obscure AttributeError
            raise ValueError('bad type of input for Param')

    def _assignValues(self) :
        """Set self.attribute on each classifier to its matching value."""
        for i in range(len(self)) :
            misc.mysetattr(self.classifiers[i], self.attribute, self.values[i])

    def __len__(self) :
        """Number of classifiers (one per parameter value)."""
        return len(self.classifiers)

    def __repr__(self) :

        rep = '<' + self.__class__.__name__ + ' instance>\n'
        rep += 'classifier:\n'
        rep += self.classifiers[0].__repr__()
        rep += 'attribute: %s\n' % self.attribute
        rep += 'values:' + str(self.values) + '\n'

        return rep

    def train(self, data, **args) :
        """Train every contained classifier on the given dataset."""
        for classifier in self.classifiers :
            classifier.train(data, **args)
        #self.log.trainingTime = self.getTrainingTime()
class ParamGrid (Param) :
    """
    A class for training and testing a classifier on a grid of parameter
    values for two attributes of the classifier.

    Example::

        p = ParamGrid(svm.SVM(ker.Gaussian()), 'C', [0.1, 1, 10, 100, 1000],
                      'kernel.gamma', [0.001, 0.01, 0.1, 1, 10])
    """

    def __init__(self, arg,
                 attribute1 = 'C', values1 = [0.1, 1, 10, 100, 1000],
                 attribute2 = 'kernel.gamma', values2 = [0.001, 0.01, 0.1, 1, 10]) :
        """
        :Parameters:
          - `arg` - another ParamGrid object, or the classifier to be used
          - `attribute1` - the first attribute of the classifier that needs tuning
          - `values1` - a list of values to try for attribute1
          - `attribute2` - the second attribute
          - `values2` - a list of values to try for attribute2
        """

        if arg.__class__ == self.__class__ :
            # copy construction: clone the other grid's settings and classifiers
            # (the clones already carry their parameter values)
            self.attribute1 = arg.attribute1
            self.values1 = arg.values1[:]
            self.attribute2 = arg.attribute2
            self.values2 = arg.values2[:]
            self.classifiers = [c.__class__(c) for c in arg.classifiers]
        elif hasattr(arg, 'type') and arg.type == 'classifier' :
            self.attribute1 = attribute1
            self.values1 = values1
            self.attribute2 = attribute2
            self.values2 = values2

            # one clone per grid cell, filled in row-major order
            # (attribute2 varies fastest)
            gridSize = len(values1) * len(values2)
            self.classifiers = [arg.__class__(arg) for i in range(gridSize)]

            cell = 0
            for value1 in self.values1 :
                for value2 in self.values2 :
                    misc.mysetattr(self.classifiers[cell], self.attribute1, value1)
                    misc.mysetattr(self.classifiers[cell], self.attribute2, value2)
                    cell += 1

    def __repr__(self) :
        """Human-readable summary: first classifier plus both value lists."""
        parts = ['<' + self.__class__.__name__ + ' instance>\n',
                 'classifier:\n',
                 self.classifiers[0].__repr__(),
                 'attribute1: %s\n' % self.attribute1,
                 'values1:' + str(self.values1) + '\n',
                 'attribute2: %s\n' % self.attribute2,
                 'values2:' + str(self.values2) + '\n']

        return ''.join(parts)
class ModelSelector (CompositeClassifier) :
    """
    A model selector decides on the best classifier parameters
    using the param object it receives as input.
    Parameters are chosen according to the success rate in CV (or success
    on a dataset provided to the train method.
    """

    attributes = {'numFolds' : 5,
                  'measure' : 'balancedSuccessRate',
                  'foldsToPerform' : 5,}

    def __init__(self, arg, **args) :
        """
        :Parameters:
          - `arg` - another ModelSelector or a Param object

        :Keywords:
          - `measure` - which measure of accuracy to use for selecting the
            best classifier (default = 'balancedSuccessRate')
            supported measures are: 'balancedSuccessRate', 'successRate',
            'roc', 'roc50' (you can substitute any number instead of 50)
          - `numFolds` - number of CV folds to use when performing model selection
          - `foldsToPerform` - the number of folds to actually perform
        """

        Classifier.__init__(self, **args)

        if arg.__class__ == self.__class__ :
            # copy construction -- also copy foldsToPerform, which the old
            # code omitted while copying the other two tunables
            self.param = arg.param.__class__(arg.param)
            self.measure = arg.measure
            self.numFolds = arg.numFolds
            self.foldsToPerform = arg.foldsToPerform
        elif arg.__class__.__name__.find('Param') >= 0 :
            self.param = arg.__class__(arg)
        else :
            # call form instead of the Python-2-only statement form
            raise ValueError('wrong type of input for ModelSelector')

        self.classifier = None

    def __repr__(self) :
        """Show the selected classifier if trained, else the search space."""
        rep = '<' + self.__class__.__name__ + ' instance>\n'
        if self.classifier is not None :
            rep += self.classifier.__repr__()
        else :
            rep += self.param.__repr__()

        return rep

    def train(self, data, **args) :
        """
        Run stratified CV over the param grid and keep the best classifier.

        :Keywords:
          - `train` - boolean - whether to train the best classifier
            (default: True)
        """

        Classifier.train(self, data, **args)

        # start below any attainable score so a classifier is always chosen;
        # the previous init of 0 combined with a strict '>' left
        # bestClassifier as None (and crashed below) when every candidate
        # scored exactly 0
        maxSuccessRate = -1
        bestClassifier = None
        classifierIdx = 0
        args['numFolds'] = self.numFolds
        args['foldsToPerform'] = self.foldsToPerform
        for r in self.param.stratifiedCV(data, **args) :
            successRate = getattr(r, self.measure)
            if successRate > maxSuccessRate :
                bestClassifier = classifierIdx
                maxSuccessRate = successRate
            classifierIdx += 1

        self.log.maxSuccessRate = maxSuccessRate

        # keep a fresh copy of the winning classifier
        self.classifier = self.param.classifiers[bestClassifier].__class__(
            self.param.classifiers[bestClassifier])

        if 'train' not in args or args['train'] is True :
            self.classifier.train(data, **args)

        self.classifier.log.trainingTime = self.getTrainingTime()
        self.classifier.log.classifier = self.classifier.__class__(self.classifier)

    def save(self, fileHandle) :
        """Delegate saving to the selected classifier."""
        self.classifier.save(fileHandle)
class SVMselect (ModelSelector) :
    """
    A model selector for searching for best parameters for an
    SVM classifier with a Gaussian kernel
    Its search strategy is as follows:
    First optimize the width of the Gaussian (gamma) for a fixed (low)
    value of C, and then optimize C.
    """

    attributes = {'C' : [0.01, 0.1, 1, 10, 100, 1000],
                  'gamma' : [0.001, 0.01, 0.1, 1, 10],
                  'Clow' : 10,
                  'numFolds' : 5,
                  'measure' : 'balancedSuccessRate'}

    def __init__(self, arg = None, **args) :
        """
        :Parameters:
          - `arg` - another ModelSelector object

        :Keywords:
          - `C` - a list of values to try for C
          - `gamma` - a list of value to try for gamma
          - `measure` - which measure of accuracy to use for selecting the
            best classifier (default = 'balancedSuccessRate')
            supported measures are: 'balancedSuccessRate', 'successRate',
            'roc', 'roc50' (you can substitute another number instead of 50)
          - `numFolds` - number of CV folds to use when performing model selection
        """

        # NOTE(review): unlike ModelSelector.__init__, `arg` is forwarded to
        # Classifier.__init__ here -- preserved as-is; confirm against
        # Classifier's signature
        Classifier.__init__(self, arg, **args)

        self.classifier = None

    def __repr__(self) :
        """Summary of the selected classifier (if any) and the search lists."""
        pieces = ['<' + self.__class__.__name__ + ' instance>\n']
        if self.classifier is not None :
            pieces.append(self.classifier.__repr__())
        pieces.append('C: ' + str(self.C) + '\n')
        pieces.append('gamma: ' + str(self.gamma) + '\n')

        return ''.join(pieces)

    def train(self, data, **args) :
        """
        Two-stage search: tune gamma at C = Clow, then tune C at that gamma.

        :Keywords:
          - `train` - boolean - whether to train the best classifier
            (default: True)
          - `vdata` - data to use for testing instead of using cross-validation
            (not implemented yet)
        """
        Classifier.train(self, data, **args)

        import ker

        # stage 1: pick the kernel width with C held at the low value
        gammaSelect = ModelSelector(
            Param(svm.SVM(ker.Gaussian(), C = self.Clow),
                  'kernel.gamma', self.gamma),
            measure = self.measure,
            numFolds = self.numFolds)
        gammaSelect.train(data)
        bestGamma = gammaSelect.classifier.kernel.gamma

        # stage 2: pick C using the width found above
        cSelect = ModelSelector(
            Param(svm.SVM(ker.Gaussian(gamma = bestGamma)), 'C', self.C),
            measure = self.measure,
            numFolds = self.numFolds)
        cSelect.train(data)

        winner = cSelect.classifier
        self.classifier = winner.__class__(winner)

        if 'train' not in args or args['train'] is True :
            self.classifier.train(data, **args)

        self.classifier.log.trainingTime = self.getTrainingTime()
        self.classifier.log.classifier = self.classifier.__class__(self.classifier)