Package PyML :: Package classifiers :: Module svm
[frames] | [no frames]

Source Code for Module PyML.classifiers.svm

  1  import os 
  2  import random 
  3  import numpy 
  4   
  5  from PyML.utils import misc 
  6  from PyML.classifiers.baseClassifiers import Classifier 
  7  from PyML.classifiers.ext.libsvm import C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR 
  8  from PyML.classifiers.ext.libsvm import LINEAR, POLY, RBF, SIGMOID, PRECOMPUTED 
  9  from PyML.classifiers.ext import csvmodel,libsvm,mylibsvm 
 10  from PyML.classifiers.ext import csvmodel 
 11  from PyML.utils import arrayWrap 
 12  from PyML.evaluators import assess 
 13  from PyML.containers.vectorDatasets import VectorDataSet, SparseDataSet 
 14  from PyML.containers import ker 
 15   
 16  from PyML.classifiers.ext import csmo 
 17  from PyML.classifiers.ext import cgist 
 18   
 19   
 20  """various flavors of SVMs and training algorithms""" 
 21   
 22  __docformat__ = "restructuredtext en" 
 23   
 24  containersNotSupported = ['PySparseDataSet', 'PyVectorDataSet'] 
 25   
26 -class SVM (Classifier) :
27 """ 28 An SVM classifier class. 29 30 SVM is trained using either libsvm, or using a PyML SMO implementation 31 based on libsvm 32 """ 33 34 svm_type = C_SVC 35 attributes = {'C' : 10, 36 'nu' : 0.5, 37 'Cmode': 'classProb', 38 'optimizer' : 'libsvm', 39 'cacheSize' : 256, 40 'nu' : 0.1, 41 'eps' : 0.01} 42
43 - def __init__(self, arg = None, **args):
44 45 """ 46 :Parameters: 47 - `arg` - another SVM object or a kernel object; if no argument is given 48 the kernel function of the training dataset is used 49 50 :Keywords: 51 - `C` - the svm C parameter 52 - `Cmode` - the way the C parameter is used; values: 'equal', 'classProb', 53 'fromData'. 54 In 'equal' mode C is set to be the same for both classes 55 In 'classProb' mode each class is assigned a C value that is 56 proportional to the size of the other class. This results in 57 margin error costs being proportional to the ratio of the 58 sizes of the two classes. 59 This is useful for datasets with an unbalanced class distribution. 60 In 'fromData' the value of C for each pattern is taken from the 61 'C' attribute of the training data. 62 - `optimizer` - which optimizer to use. values: 'libsvm' -- run libsvm 63 'mysmo' - use the PyML native optmizer (based on libsvm) 64 'gist' - use a gist-like optimizer. 65 - `cacheSize` - size of the kernel cache (in MB). 66 """ 67 68 Classifier.__init__(self, arg, **args) 69 70 self.kernel = None 71 if arg.__class__ == self.__class__ : 72 if arg.kernel is not None : 73 self.kernel = arg.kernel.__class__(arg.kernel) 74 elif hasattr(arg, 'type') and arg.type == 'kernel' : 75 self.kernel = arg.__class__(arg) 76 elif arg is not None : 77 raise ValueError, 'unknown type of argument'
78
79 - def __repr__(self) :
80 81 rep = ['<' + self.__class__.__name__ + ' instance>'] 82 if hasattr(self, 'C') : 83 rep.append('C : %f' % self.C) 84 rep.append('Cmode: %s' % self.Cmode) 85 if hasattr(self, 'kernel') and self.kernel is not None : 86 rep.append(str(self.kernel)) 87 if hasattr(self, 'model') : 88 if hasattr(self, 'model') : 89 rep.append(str(self.model)) 90 91 return '\n'.join(rep)
92
93 - def save(self, fileName) :
94 95 """ 96 save an SVM model to a file. 97 use the loadSVM method to then load the saved model 98 be sure the call the SVM train function as: 99 train(data, saveSpace=False) 100 101 :Parameters: 102 - `fileName` - a file name or file handle 103 """ 104 105 self.model.save(fileName)
106
107 - def train(self, data, **args) :
108 109 """ 110 train an SVM 111 112 :Keywords: 113 - `saveSpace` -- whether to save memory when constructing an SVM model 114 [default: True] 115 you need to set this keyword to False if you want to save the 116 resulting model 117 """ 118 119 if data.__class__.__name__ in containersNotSupported : 120 raise ValueError, 'convert your data into one of the C++ containers' 121 122 Classifier.train(self, data, **args) 123 if self.kernel is not None : 124 data.attachKernel(self.kernel) 125 126 # libsvm optimizer can only be used with vector data: 127 if (not data.isVector) and self.optimizer == 'libsvm' : 128 self.optimizer = 'mysmo' 129 130 if self.optimizer == 'libsvm' : 131 alpha,b,svID = self.trainLibsvm(data, **args) 132 elif self.optimizer == 'gist' : 133 alpha,b,svID = self.trainGist(data, **args) 134 elif self.optimizer == 'gradient' : 135 alpha,b,svID = self.trainGradient(data, **args) 136 else : 137 alpha,b,svID = self.trainMySMO(data, **args) 138 139 self.model = self.modelDispatcher(data, svID, alpha, b, **args) 140 141 self.trained = True 142 self.log.numSV = len(alpha) 143 self.log.trainingTime = self.getTrainingTime()
144 145
146 - def modelDispatcher(self, data, svID, alpha, b, **args) :
147 148 if (data.kernel.__class__.__name__.find('Linear') == 0 149 and data.isVector) : 150 return LinearSVModel(data, svID, alpha, b, **args) 151 else : 152 return SVModel(data, svID, alpha, b, **args)
153 154
155 - def trainLibsvm(self, data, **args) :
156 157 # setting C for the positive and negative classes 158 if (self.svm_type == ONE_CLASS or 159 self.svm_type == EPSILON_SVR or 160 self.svm_type == NU_SVR) : 161 Cpos = 0 162 Cneg = 0 163 else : 164 if data.labels.numClasses != 2 : 165 raise ValueError, 'svm is a two class classifier' 166 if self.Cmode == "classProb": 167 Cpos = self.C * (float(data.labels.classSize[0]) / float(len(data))) 168 Cneg = self.C * (float(data.labels.classSize[1]) / float(len(data))) 169 else: 170 Cpos = Cneg = self.C 171 172 print 'Cpos, Cneg: ', Cpos,Cneg 173 174 # prepare data for the libsvm wrapper : 175 # set kernel: 176 if hasattr(self, 'kernel') and self.kernel is not None : 177 kernel = self.kernel 178 else : 179 kernel = data.kernel 180 kernelType = kernel.__class__.__name__ 181 182 param = libsvm.svm_parameter() 183 misc.update(param, 184 kernel_type = LINEAR, 185 svm_type = self.svm_type, 186 cache_size = self.cacheSize, 187 eps = self.eps, 188 C = self.C, 189 nu = self.nu, 190 degree = 2, 191 p = 0.1, 192 shrinking = 1, 193 nr_weight = 0, 194 coef0 = 0) 195 196 if kernelType == "Polynomial" : 197 # (gamma x' y + coef0)^degree 198 param.kernel_type = POLY 199 param.degree = kernel.degree 200 param.coef0 = kernel.additiveConst 201 param.gamma = 1 202 elif kernelType == "Gaussian": 203 # exp(-gamma * |x - y|^2) 204 param.kernel_type = RBF 205 param.gamma = kernel.gamma 206 elif kernelType == "Cosine" : 207 # i'm using the sigmoid kernel as the cosine kernel 208 param.kernel_type = SIGMOID 209 210 s=libsvm.DecisionFunction() 211 212 prob = libsvm.svm_problem() 213 data.libsvm_construct(prob) 214 libsvm.svm_train_one_pyml(prob.this, param.this, Cpos, Cneg, s.this) 215 mylibsvm.libsvm_destroy(prob) 216 217 b = -s.rho 218 219 numSV = s.numSV 220 alpha = arrayWrap.doubleVector2list(s.alpha) 221 svID = arrayWrap.intVector2list(s.svID) 222 223 return alpha, b, svID
224
225 - def getC(self, data) :
226 227 if self.Cmode == "fromData" : 228 C = data.C 229 elif self.Cmode == "classProb": 230 Cpos = self.C * (float(data.labels.classSize[0]) / float(len(data))) 231 Cneg = self.C * (float(data.labels.classSize[1]) / float(len(data))) 232 c = [Cneg, Cpos] 233 C = [c[data.labels.Y[i]] for i in range(len(data))] 234 else: 235 C = [self.C for i in range(len(data))] 236 237 return C
238
239 - def trainGist(self, data, **args) :
240 241 if data.labels.numClasses != 2 : 242 raise ValueError, 'svm is a two class classifier' 243 244 alpha, b = runGist(self, data) 245 246 svID = [i for i in range(len(alpha)) 247 if alpha[i] > 0] 248 alpha = [alpha[i] * (data.labels.Y[i] * 2 - 1) for i in range(len(alpha)) 249 if alpha[i] > 0] 250 251 return alpha, b, svID
252
253 - def trainGradient(self, data, **args) :
254 255 if data.labels.numClasses != 2 : 256 raise ValueError, 'svm is a two class classifier' 257 258 alpha, b = runGradientDescent(self, data) 259 260 svID = [i for i in range(len(alpha)) 261 if alpha[i] > 0] 262 alpha = [alpha[i] * (data.labels.Y[i] * 2 - 1) for i in range(len(alpha)) 263 if alpha[i] > 0] 264 265 return alpha, b, svID
266 267
268 - def trainMySMO(self, data, **args) :
269 270 if data.labels.numClasses != 2 : 271 raise ValueError, 'svm is a two class classifier' 272 print 'training using MySMO' 273 alpha, b = runMySMO(self, data) 274 svID = [i for i in range(len(alpha)) 275 if alpha[i] > 0] 276 alpha = [alpha[i] * (data.labels.Y[i] * 2 - 1) for i in range(len(alpha)) 277 if alpha[i] > 0] 278 b = - b 279 280 return alpha, b, svID
281
282 - def decisionFunc(self, data, i) :
283 284 return self.model.decisionFunc(data, i)
285
286 - def classify(self, data, i) :
287 288 margin = self.decisionFunc(data, i) 289 if margin > 0 : 290 return (1,margin) 291 else: 292 return (0,margin)
293 294
def loadSVM(fileName, **args) :

    """
    returns a trained SVM object constructed from a saved SVM model.

    The saved SVM model stores the support vectors in sparse
    vector format. When creating the model it then represents the
    support vectors in some dataset container. The type of the
    container needs to agree with the type of dataset of your test
    data. By default the support vectors are represented using the
    SparseDataSet container. You can set this using the 'datasetClass'
    keyword argument e.g. datasetClass = SparseDataSet

    :Parameters:
      - `fileName` - name of a file written by SVModel.save

    :Returns: an SVM object whose model is ready for classification
    """

    if 'datasetClass' in args :
        datasetClass = args['datasetClass']
    else :
        datasetClass = SparseDataSet

    data = None
    # the model file starts with '#'-prefixed header lines that hold the
    # bias (b=), the alphas (alpha=) and the kernel (k=); the support
    # vectors follow as dataset records
    infile = open(fileName)
    for line in infile :
        if line[0] != '#' : break
        if line.find('b=') > 0 :
            b = float(line[3:])
        if line.find('alpha=') > 0 :
            tokens = line.split('=')[1].split()
            alpha = [float(token) for token in tokens]
        if line.find('k=') > 0 :
            import PyML
            # SECURITY NOTE(review): this evaluates text taken from the
            # model file -- only load model files from a trusted source
            exec 'kernel = ' + line.split('k=')[1]
    infile.close()
    # NOTE(review): if the header is missing any of the b=/alpha=/k= lines,
    # the code below raises a NameError

    # presumably the container constructor ignores the '#' header lines
    # when re-reading the same file -- verify against the container code
    data = datasetClass(fileName, **args)
    data.attachKernel(kernel)
    # defining a non-linear kernel is not enough; the kernel must also be
    # attached to the data
    s = SVM(kernel)
    s.labels = misc.Container()
    s.labels.addAttributes(data.labels, ['numClasses', 'classLabels'])
    s.featureID = data.featureID[:]
    # all saved patterns are support vectors, hence range(len(data))
    if (kernel.__class__.__name__ == 'Linear' or
        kernel.__class__.__name__ == 'Cosine') :
        s.model = LinearSVModel(data, range(len(data)), alpha, b, **args)
    else :
        s.model = SVModel(data, range(len(data)), alpha, b, **args)

    return s
342
class SVR (SVM) :
    """A class for SVM regression (libsvm wrapper)."""

    svm_type = EPSILON_SVR
    resultsObject = assess.RegressionResults
    # for regression the value of the decision function *is* the prediction
    classify = SVM.decisionFunc

    def __repr__(self) :
        return '<%s instance>\n' % self.__class__.__name__
355 356 357
class OneClassSVM (SVM) :
    """wrapper for the libsvm one-class SVM"""

    svm_type = ONE_CLASS
    resultsObject = misc.DecisionFuncResults

    def __repr__(self) :
        return '<%s instance>\n' % self.__class__.__name__
369
370 -class SVC (Classifier) :
371 372 attributes = {'lineSampleSize' : 10, 373 'nu' : 0.1, 374 'eps' : 0.001} 375 376
377 - def __init__(self, arg=None, **args) :
378 379 Classifier.__init__(self, arg, **args)
380
381 - def train(self, data, **args) :
382 383 Classifier.train(self, data, **args) 384 self.oneClass = OneClassSVM(nu = self.nu, eps = self.eps) 385 self.oneClass.train(data) 386 self.data = data 387 print 'computing connected components' 388 self.clusters = self.connectedComponents()
389
390 - def decisionFunc(self, data, i) :
391 392 return self.oneClass.decisionFunc(data, i)
393
394 - def classify(self, data, i) :
395 396 margin = self.decisionFunc(data, i) 397 if margin > 0 : 398 return (1,margin) 399 else: 400 return (0,margin)
401 402
403 - def adjacent(self, i, j) :
404 405 xi = numpy.array(self.data.getPattern(i)) 406 xj = numpy.array(self.data.getPattern(j)) 407 stepSize = 1.0 / (self.lineSampleSize + 1) 408 lambdas = numpy.arange(0, 1, stepSize) 409 X = [] 410 for l in lambdas[1:] : 411 X.append((xi * l + xj * (1 - l)).tolist()) 412 testdata = VectorDataSet(X) 413 414 for i in range(len(testdata)) : 415 f = self.decisionFunc(testdata, i) 416 if f < 0 : 417 return False 418 return True
419
420 - def connectedComponents(self) :
421 422 # the set of patterns that do not belong in a connected component 423 patterns = set(range(len(self.data))) 424 # start with an empty set of connected components (clusters): 425 clusters = [] 426 # all the patterns that are currently in a cluster: 427 incluster = set() 428 while len(patterns) > 0 : 429 cluster = set() 430 fringe = [patterns.pop()] 431 while fringe : 432 pattern = fringe.pop() 433 if pattern not in cluster : 434 cluster.add(pattern) 435 if pattern in patterns : patterns.remove(pattern) 436 incluster.add(pattern) 437 fringe.extend([neighbor for neighbor in patterns 438 if self.adjacent(pattern, neighbor)]) 439 440 clusters.append([i for i in cluster]) 441 442 return clusters
443 444
class SVModel (object) :

    """a trained kernel-SVM classification model: the support vectors,
    their signed coefficients, and the bias term"""

    def __init__(self, data, svID, alpha, b, **args) :

        """
        :Parameters:
          - `data` - the training dataset
          - `svID` - indices of the support vectors in `data`
          - `alpha` - signed coefficients of the support vectors
          - `b` - the bias term

        :Keywords:
          - `saveSpace` - when False, keep a copy of the support vectors
            so the model can be saved [default: True]
        """

        self.saveSpace = True
        if 'saveSpace' in args :
            self.saveSpace = args['saveSpace']

        self.alpha = alpha
        self.b = b
        self.svID = svID
        self.numSV = len(svID)
        if not data.isWrapper or not self.saveSpace :
            self.svdata = data.__class__(data, patterns = svID)
        if data.isWrapper :
            # C++-backed containers get a fast C-level decision function
            self.cmodel = csvmodel.SVModel(data.castToBase(), svID, alpha, b)

    def __repr__(self) :

        rep = '<' + self.__class__.__name__ + ' instance>\n'
        rep += 'number of SVs: %d\n' % len(self)

        return rep

    def __len__(self) :

        # the number of support vectors
        return self.numSV

    def setBias(self, bias) :

        # keep the python-level and C-level models in sync
        self.b = bias
        if hasattr(self, 'cmodel') :
            self.cmodel.b = bias

    def decisionFunc(self, data, i) :

        if hasattr(self, 'cmodel') :
            return self.cmodel.decisionFunc(data.castToBase(), i)
        # pure-python fallback: sum_j alpha_j * K(sv_j, x_i) + b
        # ('total' avoids shadowing the builtin sum, unlike the original)
        total = 0.0
        for j in range(len(self)) :
            total += self.svdata.kernel.eval(
                self.svdata, self.svdata.X[j], data.X[i]) * self.alpha[j]

        return total + self.b

    def save(self, fileName) :

        """write the model (bias, alphas, kernel and support vectors) in
        the format understood by loadSVM; requires that the SVM was
        trained with saveSpace = False

        :Parameters:
          - `fileName` - a file name or file handle
        """

        if self.saveSpace :
            raise ValueError('in order to save a dataset you need to train '
                             'as: s.train(data, saveSpace = False)')

        if isinstance(fileName, str) :
            outfile = open(fileName, 'w')
        else :
            outfile = fileName

        outfile.write('#b=' + str(self.b) + '\n')
        outfile.write('#alpha=')
        alphaStr = [str(alpha) for alpha in self.alpha]
        outfile.write(' '.join(alphaStr))
        outfile.write('\n')
        outfile.write('#k=' + self.svdata.kernel.dump() + '\n')
        # VectorDataSet saves as csv, everything else in sparse format
        # ('fileFormat' avoids shadowing the builtin format)
        fileFormat = 'sparse'
        if self.svdata.__class__.__name__ == 'VectorDataSet' :
            fileFormat = 'csv'
        self.svdata.save(outfile, format = fileFormat)
511
512 -class LinearSVModel (SVModel) :
513
514 - def __init__(self, data, svID, alpha, b, **args) :
515 516 self.saveSpace = True 517 if 'saveSpace' in args : 518 self.saveSpace = args['saveSpace'] 519 if not self.saveSpace : 520 self.svdata = data.__class__(data, patterns = svID) 521 self.alpha = alpha 522 self.svID = svID 523 self.numSV = len(svID) 524 self.b = b 525 526 if data.isWrapper : 527 if data.__class__.__name__ == 'SparseDataSet' : 528 self.cmodel = csvmodel.LinearSparseSVModel(data, svID, alpha, b) 529 else : 530 self.cmodel = csvmodel.LinearSVModel(data, svID, alpha, b) 531 self.w = self.cmodel.getWvec(); 532 self.warray = self.w 533 else : 534 self.w = self.computeW(data, svID, alpha) 535 if type(self.w) == type({}) : 536 self.warray = numpy.zeros(data.numFeatures, numpy.float_) 537 for i in range(data.numFeatures) : 538 if data.featureKey[i] in self.w : 539 self.warray[i] = self.w[data.featureKey[i]] 540 else : 541 self.warray[i] = 0 542 else : 543 self.warray = self.w 544 self.dotProduct = data.dotProduct 545 print 'constructed model'
546
547 - def __repr__(self) :
548 549 rep = '<' + self.__class__.__name__ + ' instance>\n' 550 rep += 'number of SVs: %d\n' % len(self) 551 552 return rep
553
554 - def decisionFunc(self, data, i) :
555 556 if hasattr(self, 'cmodel') : 557 return self.cmodel.decisionFunc(data, i) 558 else : 559 return data.dotProduct(self.w, data.X[i]) + self.b
560
561 - def computeW(self, data, svID, alpha) :
562 563 if type(data.X[0]) == type({}) : #Sparse dataset 564 w = {} 565 for i in range(len(svID)): 566 svKeys = data.X[svID[i]].keys() 567 for svKey in svKeys: 568 if not w.has_key(svKey): 569 w[svKey] = 0.0 570 w[svKey] += data.X[svID[i]][svKey] * alpha[i] 571 else : # nonsparse dataset 572 w = numpy.zeros(len(data.featureID), numpy.float_) 573 for i in range(len(svID)): 574 w += alpha[i] * data.X[svID[i]] 575 576 return w
577 578
def runMySMO(svmInstance, data) :

    """drive the native SMO optimizer; returns (alpha, b)"""

    # the C++ SMO routine fills alphaVec in place and returns the bias
    alphaVec = arrayWrap.doubleVector()
    cost = svmInstance.getC(data)
    cache = int(svmInstance.cacheSize)
    b = csmo.runSMO(data.castToBase(), cost, alphaVec, cache)
    alpha = []
    for idx in range(len(alphaVec)) :
        alpha.append(alphaVec[idx])

    return alpha, b
587
def runGist(classifier, data) :

    """drive the C gist-like optimizer; returns (alpha, 0.0) -- the bias
    is always zero for this optimizer"""

    # the C++ routine fills alphaVec in place
    alphaVec = arrayWrap.doubleVector()
    cost = classifier.getC(data)
    cgist.runGist(data.castToBase(), cost, alphaVec,
                  int(classifier.cacheSize), 10000)
    alpha = []
    for idx in range(len(alphaVec)) :
        alpha.append(alphaVec[idx])

    return alpha, 0.0
597
def runGradientDescent(classifier, data) :

    """drive the C gradient-descent optimizer; returns (alpha, 0.0) --
    the bias is always zero for this optimizer"""

    # the C++ routine fills alphaVec in place
    alphaVec = arrayWrap.doubleVector()
    cost = classifier.getC(data)
    cgist.runGradientDescent(data.castToBase(), cost, alphaVec,
                             int(classifier.cacheSize), 10000)
    alpha = []
    for idx in range(len(alphaVec)) :
        alpha.append(alphaVec[idx])

    return alpha, 0.0
607