1 import os
2 import random
3 import numpy
4
5 from PyML.utils import misc
6 from PyML.classifiers.baseClassifiers import Classifier
7 from PyML.classifiers.ext.libsvm import C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR
8 from PyML.classifiers.ext.libsvm import LINEAR, POLY, RBF, SIGMOID, PRECOMPUTED
9 from PyML.classifiers.ext import csvmodel,libsvm,mylibsvm
10 from PyML.classifiers.ext import csvmodel
11 from PyML.utils import arrayWrap
12 from PyML.evaluators import assess
13 from PyML.containers.vectorDatasets import VectorDataSet, SparseDataSet
14 from PyML.containers import ker
15
16 from PyML.classifiers.ext import csmo
17 from PyML.classifiers.ext import cgist
18
19
20 """various flavors of SVMs and training algorithms"""
21
22 __docformat__ = "restructuredtext en"
23
24 containersNotSupported = ['PySparseDataSet', 'PyVectorDataSet']
25
class SVM (Classifier) :
    """
    An SVM classifier class.

    SVM is trained using either libsvm, or using a PyML SMO implementation
    based on libsvm
    """

    svm_type = C_SVC
    # Default hyper-parameters.  NOTE: the original dict listed 'nu' twice
    # (0.5 and then 0.1); only the last occurrence was effective, so the
    # duplicate key is removed here and the effective value 0.1 is kept.
    attributes = {'C' : 10,
                  'Cmode' : 'classProb',
                  'optimizer' : 'libsvm',
                  'cacheSize' : 256,
                  'nu' : 0.1,
                  'eps' : 0.01}
42
44
45 """
46 :Parameters:
47 - `arg` - another SVM object or a kernel object; if no argument is given
48 the kernel function of the training dataset is used
49
50 :Keywords:
51 - `C` - the svm C parameter
52 - `Cmode` - the way the C parameter is used; values: 'equal', 'classProb',
53 'fromData'.
54 In 'equal' mode C is set to be the same for both classes
55 In 'classProb' mode each class is assigned a C value that is
56 proportional to the size of the other class. This results in
57 margin error costs being proportional to the ratio of the
58 sizes of the two classes.
59 This is useful for datasets with an unbalanced class distribution.
60 In 'fromData' the value of C for each pattern is taken from the
61 'C' attribute of the training data.
62 - `optimizer` - which optimizer to use. values: 'libsvm' -- run libsvm
63 'mysmo' - use the PyML native optmizer (based on libsvm)
64 'gist' - use a gist-like optimizer.
65 - `cacheSize` - size of the kernel cache (in MB).
66 """
67
68 Classifier.__init__(self, arg, **args)
69
70 self.kernel = None
71 if arg.__class__ == self.__class__ :
72 if arg.kernel is not None :
73 self.kernel = arg.kernel.__class__(arg.kernel)
74 elif hasattr(arg, 'type') and arg.type == 'kernel' :
75 self.kernel = arg.__class__(arg)
76 elif arg is not None :
77 raise ValueError, 'unknown type of argument'
78
80
81 rep = ['<' + self.__class__.__name__ + ' instance>']
82 if hasattr(self, 'C') :
83 rep.append('C : %f' % self.C)
84 rep.append('Cmode: %s' % self.Cmode)
85 if hasattr(self, 'kernel') and self.kernel is not None :
86 rep.append(str(self.kernel))
87 if hasattr(self, 'model') :
88 if hasattr(self, 'model') :
89 rep.append(str(self.model))
90
91 return '\n'.join(rep)
92
93 - def save(self, fileName) :
94
95 """
96 save an SVM model to a file.
97 use the loadSVM method to then load the saved model
98 be sure the call the SVM train function as:
99 train(data, saveSpace=False)
100
101 :Parameters:
102 - `fileName` - a file name or file handle
103 """
104
105 self.model.save(fileName)
106
107 - def train(self, data, **args) :
108
109 """
110 train an SVM
111
112 :Keywords:
113 - `saveSpace` -- whether to save memory when constructing an SVM model
114 [default: True]
115 you need to set this keyword to False if you want to save the
116 resulting model
117 """
118
119 if data.__class__.__name__ in containersNotSupported :
120 raise ValueError, 'convert your data into one of the C++ containers'
121
122 Classifier.train(self, data, **args)
123 if self.kernel is not None :
124 data.attachKernel(self.kernel)
125
126
127 if (not data.isVector) and self.optimizer == 'libsvm' :
128 self.optimizer = 'mysmo'
129
130 if self.optimizer == 'libsvm' :
131 alpha,b,svID = self.trainLibsvm(data, **args)
132 elif self.optimizer == 'gist' :
133 alpha,b,svID = self.trainGist(data, **args)
134 elif self.optimizer == 'gradient' :
135 alpha,b,svID = self.trainGradient(data, **args)
136 else :
137 alpha,b,svID = self.trainMySMO(data, **args)
138
139 self.model = self.modelDispatcher(data, svID, alpha, b, **args)
140
141 self.trained = True
142 self.log.numSV = len(alpha)
143 self.log.trainingTime = self.getTrainingTime()
144
145
147
148 if (data.kernel.__class__.__name__.find('Linear') == 0
149 and data.isVector) :
150 return LinearSVModel(data, svID, alpha, b, **args)
151 else :
152 return SVModel(data, svID, alpha, b, **args)
153
154
156
157
158 if (self.svm_type == ONE_CLASS or
159 self.svm_type == EPSILON_SVR or
160 self.svm_type == NU_SVR) :
161 Cpos = 0
162 Cneg = 0
163 else :
164 if data.labels.numClasses != 2 :
165 raise ValueError, 'svm is a two class classifier'
166 if self.Cmode == "classProb":
167 Cpos = self.C * (float(data.labels.classSize[0]) / float(len(data)))
168 Cneg = self.C * (float(data.labels.classSize[1]) / float(len(data)))
169 else:
170 Cpos = Cneg = self.C
171
172 print 'Cpos, Cneg: ', Cpos,Cneg
173
174
175
176 if hasattr(self, 'kernel') and self.kernel is not None :
177 kernel = self.kernel
178 else :
179 kernel = data.kernel
180 kernelType = kernel.__class__.__name__
181
182 param = libsvm.svm_parameter()
183 misc.update(param,
184 kernel_type = LINEAR,
185 svm_type = self.svm_type,
186 cache_size = self.cacheSize,
187 eps = self.eps,
188 C = self.C,
189 nu = self.nu,
190 degree = 2,
191 p = 0.1,
192 shrinking = 1,
193 nr_weight = 0,
194 coef0 = 0)
195
196 if kernelType == "Polynomial" :
197
198 param.kernel_type = POLY
199 param.degree = kernel.degree
200 param.coef0 = kernel.additiveConst
201 param.gamma = 1
202 elif kernelType == "Gaussian":
203
204 param.kernel_type = RBF
205 param.gamma = kernel.gamma
206 elif kernelType == "Cosine" :
207
208 param.kernel_type = SIGMOID
209
210 s=libsvm.DecisionFunction()
211
212 prob = libsvm.svm_problem()
213 data.libsvm_construct(prob)
214 libsvm.svm_train_one_pyml(prob.this, param.this, Cpos, Cneg, s.this)
215 mylibsvm.libsvm_destroy(prob)
216
217 b = -s.rho
218
219 numSV = s.numSV
220 alpha = arrayWrap.doubleVector2list(s.alpha)
221 svID = arrayWrap.intVector2list(s.svID)
222
223 return alpha, b, svID
224
225 - def getC(self, data) :
226
227 if self.Cmode == "fromData" :
228 C = data.C
229 elif self.Cmode == "classProb":
230 Cpos = self.C * (float(data.labels.classSize[0]) / float(len(data)))
231 Cneg = self.C * (float(data.labels.classSize[1]) / float(len(data)))
232 c = [Cneg, Cpos]
233 C = [c[data.labels.Y[i]] for i in range(len(data))]
234 else:
235 C = [self.C for i in range(len(data))]
236
237 return C
238
240
241 if data.labels.numClasses != 2 :
242 raise ValueError, 'svm is a two class classifier'
243
244 alpha, b = runGist(self, data)
245
246 svID = [i for i in range(len(alpha))
247 if alpha[i] > 0]
248 alpha = [alpha[i] * (data.labels.Y[i] * 2 - 1) for i in range(len(alpha))
249 if alpha[i] > 0]
250
251 return alpha, b, svID
252
254
255 if data.labels.numClasses != 2 :
256 raise ValueError, 'svm is a two class classifier'
257
258 alpha, b = runGradientDescent(self, data)
259
260 svID = [i for i in range(len(alpha))
261 if alpha[i] > 0]
262 alpha = [alpha[i] * (data.labels.Y[i] * 2 - 1) for i in range(len(alpha))
263 if alpha[i] > 0]
264
265 return alpha, b, svID
266
267
269
270 if data.labels.numClasses != 2 :
271 raise ValueError, 'svm is a two class classifier'
272 print 'training using MySMO'
273 alpha, b = runMySMO(self, data)
274 svID = [i for i in range(len(alpha))
275 if alpha[i] > 0]
276 alpha = [alpha[i] * (data.labels.Y[i] * 2 - 1) for i in range(len(alpha))
277 if alpha[i] > 0]
278 b = - b
279
280 return alpha, b, svID
281
285
287
288 margin = self.decisionFunc(data, i)
289 if margin > 0 :
290 return (1,margin)
291 else:
292 return (0,margin)
293
294
296
def loadSVM(fileName, **args) :
    """
    returns a trained SVM object constructed from a saved SVM model.
    (the 'def' line was lost in extraction and is reconstructed here)

    The saved SVM model stores the support vectors in sparse
    vector format.  When creating the model it then represents the
    support vectors in some dataset container.  The type of the
    container needs to agree with the type of dataset of your test
    data.  By default the support vectors are represented using the
    SparseDataSet container.  You can set this using the 'datasetClass'
    keyword argument e.g. datasetClass = SparseDataSet
    """

    if 'datasetClass' in args :
        datasetClass = args['datasetClass']
    else :
        datasetClass = SparseDataSet

    # parse the '#'-prefixed header lines for the bias, the alphas and the
    # kernel expression; the first non-comment line terminates the header
    infile = open(fileName)
    for line in infile :
        if line[0] != '#' : break
        if line.find('b=') > 0 :
            b = float(line[3:])
        if line.find('alpha=') > 0 :
            tokens = line.split('=')[1].split()
            alpha = [float(token) for token in tokens]
        if line.find('k=') > 0 :
            import PyML
            # SECURITY NOTE: this evaluates the kernel expression stored in
            # the model file (originally via an 'exec' statement) -- only
            # load model files from a trusted source.
            kernel = eval(line.split('k=')[1])
    infile.close()

    data = datasetClass(fileName, **args)
    data.attachKernel(kernel)

    s = SVM(kernel)
    s.labels = misc.Container()
    s.labels.addAttributes(data.labels, ['numClasses', 'classLabels'])
    s.featureID = data.featureID[:]
    if (kernel.__class__.__name__ == 'Linear' or
        kernel.__class__.__name__ == 'Cosine') :
        s.model = LinearSVModel(data, range(len(data)), alpha, b, **args)
    else :
        s.model = SVModel(data, range(len(data)), alpha, b, **args)

    return s
342
355
356
357
class OneClassSVM (SVM) :
    """wrapper for the libsvm one-class SVM
    (the 'class' and 'def' lines were lost in extraction and are
    reconstructed here -- TODO confirm the base class against the
    original PyML source)"""

    svm_type = ONE_CLASS
    resultsObject = misc.DecisionFuncResults

    def __repr__(self) :
        rep = '<' + self.__class__.__name__ + ' instance>\n'

        return rep
369
class SVC (Classifier) :
    """a classifier whose methods sample points along line segments between
    patterns and group patterns into clusters of the 'adjacent' relation
    (see the method bodies below)."""

    # default parameters: number of points sampled on each line segment,
    # and the nu / eps optimizer settings
    attributes = {'lineSampleSize' : 10,
                  'nu' : 0.1,
                  'eps' : 0.001}
375
376
380
381 - def train(self, data, **args) :
389
393
395
396 margin = self.decisionFunc(data, i)
397 if margin > 0 :
398 return (1,margin)
399 else:
400 return (0,margin)
401
402
404
405 xi = numpy.array(self.data.getPattern(i))
406 xj = numpy.array(self.data.getPattern(j))
407 stepSize = 1.0 / (self.lineSampleSize + 1)
408 lambdas = numpy.arange(0, 1, stepSize)
409 X = []
410 for l in lambdas[1:] :
411 X.append((xi * l + xj * (1 - l)).tolist())
412 testdata = VectorDataSet(X)
413
414 for i in range(len(testdata)) :
415 f = self.decisionFunc(testdata, i)
416 if f < 0 :
417 return False
418 return True
419
421
422
423 patterns = set(range(len(self.data)))
424
425 clusters = []
426
427 incluster = set()
428 while len(patterns) > 0 :
429 cluster = set()
430 fringe = [patterns.pop()]
431 while fringe :
432 pattern = fringe.pop()
433 if pattern not in cluster :
434 cluster.add(pattern)
435 if pattern in patterns : patterns.remove(pattern)
436 incluster.add(pattern)
437 fringe.extend([neighbor for neighbor in patterns
438 if self.adjacent(pattern, neighbor)])
439
440 clusters.append([i for i in cluster])
441
442 return clusters
443
444
446
class SVModel (object) :
    """a kernel-expansion classification model:
    f(x) = sum_j alpha_j * K(sv_j, x) + b
    (the 'class' line was lost in extraction and is reconstructed here)"""

    def __init__(self, data, svID, alpha, b, **args) :
        """
        :Parameters:
          - `data` - the training dataset
          - `svID` - indices of the support vectors in data
          - `alpha` - the (signed) support vector coefficients
          - `b` - the bias term

        :Keywords:
          - `saveSpace` - when False, keep a copy of the support vectors
            (required for saving the model) [default: True]
        """

        self.saveSpace = True
        if 'saveSpace' in args :
            self.saveSpace = args['saveSpace']

        self.alpha = alpha
        self.b = b
        self.svID = svID
        self.numSV = len(svID)
        # keep a python-side copy of the support vectors when there is no
        # C++ wrapper, or when the model needs to be saved
        if not data.isWrapper or not self.saveSpace :
            self.svdata = data.__class__(data, patterns = svID)
        if data.isWrapper :
            self.cmodel = csvmodel.SVModel(data.castToBase(), svID, alpha, b)
461
463
464 rep = '<' + self.__class__.__name__ + ' instance>\n'
465 rep += 'number of SVs: %d\n' % len(self)
466
467 return rep
468
470
471 return self.numSV
472
474
475 self.b = bias
476 if hasattr(self, 'cmodel') :
477 self.cmodel.b = bias
478
480
481 if hasattr(self, 'cmodel') :
482 return self.cmodel.decisionFunc(data.castToBase(), i)
483 sum = 0.0
484 for j in range(len(self)) :
485 sum += self.svdata.kernel.eval(
486 self.svdata, self.svdata.X[j], data.X[i]) * self.alpha[j]
487
488 return sum + self.b
489
490 - def save(self, fileName) :
491
492 if self.saveSpace :
493 raise ValueError, 'in order to save a dataset you need to train ' \
494 'as: s.train(data, saveSpace = False)'
495
496 if type(fileName) == type('') :
497 outfile = open(fileName, 'w')
498 else :
499 outfile = fileName
500
501 outfile.write('#b=' + str(self.b) + '\n')
502 outfile.write('#alpha=')
503 alphaStr = [str(alpha) for alpha in self.alpha]
504 outfile.write(' '.join(alphaStr))
505 outfile.write('\n')
506 outfile.write('#k=' + self.svdata.kernel.dump() + '\n')
507 format = 'sparse'
508 if self.svdata.__class__.__name__ == 'VectorDataSet' :
509 format = 'csv'
510 self.svdata.save(outfile, format = format)
511
513
class LinearSVModel (SVModel) :
    """an SVM model with a linear kernel, represented by its primal weight
    vector w so that f(x) = w . x + b.
    (the 'class' line was lost in extraction and is reconstructed here)"""

    def __init__(self, data, svID, alpha, b, **args) :
        """see SVModel.__init__ for the parameters"""

        self.saveSpace = True
        if 'saveSpace' in args :
            self.saveSpace = args['saveSpace']
        if not self.saveSpace :
            self.svdata = data.__class__(data, patterns = svID)
        self.alpha = alpha
        self.svID = svID
        self.numSV = len(svID)
        self.b = b

        if data.isWrapper :
            # C++ containers compute the weight vector at the C level
            if data.__class__.__name__ == 'SparseDataSet' :
                self.cmodel = csvmodel.LinearSparseSVModel(data, svID, alpha, b)
            else :
                self.cmodel = csvmodel.LinearSVModel(data, svID, alpha, b)
            self.w = self.cmodel.getWvec()
            self.warray = self.w
        else :
            self.w = self.computeW(data, svID, alpha)
            if type(self.w) == type({}) :
                # sparse w: expand into a dense array ordered by featureKey
                self.warray = numpy.zeros(data.numFeatures, numpy.float_)
                for i in range(data.numFeatures) :
                    if data.featureKey[i] in self.w :
                        self.warray[i] = self.w[data.featureKey[i]]
                    else :
                        self.warray[i] = 0
            else :
                self.warray = self.w
            self.dotProduct = data.dotProduct
        print('constructed model')
546
548
549 rep = '<' + self.__class__.__name__ + ' instance>\n'
550 rep += 'number of SVs: %d\n' % len(self)
551
552 return rep
553
555
556 if hasattr(self, 'cmodel') :
557 return self.cmodel.decisionFunc(data, i)
558 else :
559 return data.dotProduct(self.w, data.X[i]) + self.b
560
561 - def computeW(self, data, svID, alpha) :
562
563 if type(data.X[0]) == type({}) :
564 w = {}
565 for i in range(len(svID)):
566 svKeys = data.X[svID[i]].keys()
567 for svKey in svKeys:
568 if not w.has_key(svKey):
569 w[svKey] = 0.0
570 w[svKey] += data.X[svID[i]][svKey] * alpha[i]
571 else :
572 w = numpy.zeros(len(data.featureID), numpy.float_)
573 for i in range(len(svID)):
574 w += alpha[i] * data.X[svID[i]]
575
576 return w
577
578
580
def runMySMO(svmInstance, data) :
    """run the native SMO optimizer; returns (alpha, rho).
    (the 'def' line was lost in extraction and is reconstructed here)"""

    C = svmInstance.getC(data)
    alphaVec = arrayWrap.doubleVector()
    b = csmo.runSMO(data.castToBase(), C, alphaVec, int(svmInstance.cacheSize))
    alpha = [alphaVec[i] for i in range(len(alphaVec))]

    return alpha, b
587
589
def runGist(classifier, data) :
    """run the gist-like optimizer; returns (alpha, 0.0) -- this optimizer
    produces no bias term.
    (the 'def' line was lost in extraction and is reconstructed here)"""

    C = classifier.getC(data)
    alphaVec = arrayWrap.doubleVector()
    cgist.runGist(data.castToBase(), C, alphaVec,
                  int(classifier.cacheSize), 10000)
    alpha = [alphaVec[i] for i in range(len(alphaVec))]

    return alpha, 0.0
597
599
def runGradientDescent(classifier, data) :
    """run the gradient descent optimizer; returns (alpha, 0.0) -- this
    optimizer produces no bias term.
    (the 'def' line was lost in extraction and is reconstructed here)"""

    C = classifier.getC(data)
    alphaVec = arrayWrap.doubleVector()
    cgist.runGradientDescent(data.castToBase(), C, alphaVec,
                             int(classifier.cacheSize), 10000)
    alpha = [alphaVec[i] for i in range(len(alphaVec))]

    return alpha, 0.0
607