1
2 import sys
3 import math
4 import os
5
6 from ext import ckernel
7 from ext import cstringkernel
8 from PyML.utils import misc
9 from PyML.base.pymlObject import PyMLobject
10 from ext.ckernel import NONE, COSINE, TANIMOTO, DICES
11
12 normalizationMethods = ['none', 'cosine', 'tanimoto', 'dices']
13
14 __docformat__ = "restructuredtext en"
15
16 """functionality for dealing with kernels and kernel objects"""
17
19 """base class for kernel objects
20
21 each kernel class defines an ``eval`` function:
22 eval(self, datai, i, j, dataj = None) that evaluates the kernel between
23 patterns i and j of dataset ``datai``; if dataj is given then pattern j
24 is assumed to come from dataset ``dataj``
25
26 """
27
28 type = 'kernel'
29
31
32 rep = '<' + self.__class__.__name__ + ' instance>\n'
33
34 return rep
35
37 """
38 returns a string that can be used to construct an equivalent object
39 """
40 kstr = self.__module__ + '.' + self.__class__.__name__ + '(' + \
41 self.constructionParams() + ')'
42
43 return kstr
44
46
47 raise NotImplementedError
48
49 - def eval(self, datai, i, j, dataj = None) :
50
51 """evaluate the kernel function between
52 patterns i and j of dataset ``datai``; if dataj is given then pattern j
53 is assumed to come from dataset ``dataj``
54 """
55
56 raise NotImplementedError
57
58
59 -class Linear (Kernel, ckernel.Linear) :
60 """A Linear kernel (dot product)
61
62 Construction:
63 k = Linear()
64 """
65
66 - def __init__(self, arg = None, **args) :
67
68 if arg.__class__ == self.__class__ :
69 ckernel.Linear.__init__(self, arg)
70 else :
71 ckernel.Linear.__init__(self)
72 if 'normalization' in args :
73 self.normalization = args['normalization']
74 else :
75 self.normalization = NONE
76
80
81 - def eval (self, datai, i, j, dataj = None) :
86
87 -class Cosine (Kernel, ckernel.Cosine) :
88 """A Cosine kernel (normalized dot product)
89 Construction:
90 k = Cosine()
91 """
92
99
103
104 - def eval (self, datai, i, j, dataj = None) :
109
110
112 """
113 A Polynomial kernel
114 K(x,y) = (x \dot y + additiveConst) ** degree
115
116 Construction:
117 k = Polynomial(degree, additiveConst)
118
119 Attributes:
120 additiveConst, degree - kernel parameters
121 """
122
123 attributes = {'normalization' : NONE,
124 'degree' : 2,
125 'additiveConst' : 1.0}
126
128
129 if arg.__class__ == self.__class__ :
130 ckernel.Polynomial.__init__(self, arg)
131 else :
132 ckernel.Polynomial.__init__(self)
133 for attribute in self.attributes :
134 if attribute in args :
135 setattr(self, attribute, args[attribute])
136 else :
137 setattr(self, attribute, self.attributes[attribute])
138 if arg != 2 :
139 self.degree = arg
140
142
143 rep = '<' + self.__class__.__name__ + ' instance>\n'
144 rep += 'degree : ' + str(self.degree) + '\n'
145 rep += 'affine coefficient : ' + str(self.additiveConst)
146
147 return rep
148
150
151 return 'degree = ' + str(self.degree) + ',' + \
152 'additiveConst = ' + str(self.additiveConst)
153
154 - def eval (self, datai, i, j, dataj = None) :
159
160 -class Gaussian (Kernel, ckernel.Gaussian) :
161
162 """
163 A Gaussian (RBF) kernel
164 K(x,y) = exp( - gamma * ||x - y||**2 )
165
166 Construction:
167 k = Gaussian(gamma)
168
169 Attributes:
170 gamma - kernel width parameter
171 """
172
173 attributes = {'normalization' : NONE,
174 'gamma' : 1.0,}
175
176 - def __init__(self, arg = 1.0, **args) :
177
178 if arg.__class__ == self.__class__ :
179 ckernel.Gaussian.__init__(self, arg)
180 else :
181 ckernel.Gaussian.__init__(self)
182 for attribute in self.attributes :
183 if attribute in args :
184 setattr(self, attribute, args[attribute])
185 else :
186 setattr(self, attribute, self.attributes[attribute])
187 if arg != 1.0 :
188 self.gamma = arg
189
191
192 rep = '<' + self.__class__.__name__ + ' instance>\n'
193 rep += 'gamma : ' + str(self.gamma)
194
195 return rep
196
198
199 return 'gamma = ' + str(self.gamma)
200
201 - def eval (self, datai, i, j, dataj = None) :
206
207
209
210 if 'normalization' in args :
211 if args['normalization'].lower() not in normalizationMethods :
212 raise ValueError, 'unrecognized normalization method'
213 args['normalization'] = normalizationMethods.index(args['normalization'].lower())
214 if type(kernel) == type('') :
215 kernel = kernel.lower()
216 if kernel == 'linear' :
217 k = Linear(**args)
218 elif kernel == 'cosine' :
219 k = Cosine()
220 elif kernel == 'polynomial' or kernel == 'poly' :
221 k = Polynomial(**args)
222 elif kernel == 'rbf' or kernel == 'gaussian' :
223 k = Gaussian(**args)
224 else :
225 raise ValueError, 'unrecognized type of kernel'
226 elif hasattr(kernel, 'type') and kernel.type == 'dataset' :
227 other = kernel
228 k = other._kernel.__class__(other._kernel)
229 elif hasattr(kernel, 'type') and kernel.type == 'kernel' :
230 k = eval(kernel.__class__.__name__ + '(kernel)')
231
232
233 if hasattr(data, '_kernel') :
234 data._kernel.thisown = True
235
236 data._kernel = k
237 data._kernel.thisown = False
238 data.setKernel(k.castToBase())
239 kernelName = k.__class__.__name__
240 if kernelName == 'Cosine' or kernelName == 'Gaussian' or k.normalization != NONE :
241 data.computeNorms()
242
243
245
246 """compute a kernel matrix and save it to a file in tab delimited format
247
248 :Parameters:
249 - `data` - a dataset
250 - `fileName` - file name to save the kernel
251
252 :Keywords:
253 - `format` - the format in which to save the kernel: pyml or gist formats [default: 'gist']
254 gist format has an additional header line that contains the ids.
255 """
256
257 if fileName is None or fileName == '-' :
258 outfile = sys.stdout
259 fileName = 'stdout'
260 else :
261 outfile = open(fileName, 'w')
262
263 format = 'gist'
264 if 'format' in args :
265 format = args['format']
266 import tempfile
267 tmpfile = tempfile.mktemp()
268 ckernel.kernel2file(data.castToBase(), tmpfile)
269 tmp = open(tmpfile)
270 outfile = open(fileName, 'w')
271 if format == 'gist' :
272 outfile.write(fileName + '\t')
273 outfile.write('\t'.join(data.labels.patternID) + '\n')
274 i = 0
275 for line in tmp :
276 if data.labels.patternID is not None :
277 outfile.write(data.labels.patternID[i])
278 i += 1
279 outfile.write(line)
280 os.remove(tmpfile)
281
def averageEntry(fileName, ignoreDiagonal = True, delim = None) :
    """Return the mean of all numeric tokens found in a delimited file.

    Each line is split on ``delim``; every token that parses as a float is
    added to the running sum, and tokens that do not parse (e.g. pattern IDs
    or header fields) are skipped.

    :Parameters:
      - `fileName` - name of the file to read
      - `ignoreDiagonal` - accepted for backward compatibility; it is not
        consulted, so every numeric token (diagonal included) is averaged
      - `delim` - field delimiter handed to ``str.split``; None splits on
        runs of whitespace

    :Returns:
      the sum of the numeric tokens divided by their count

    :Raises:
      ZeroDivisionError if the file holds no numeric token
    """

    total = 0
    numEntries = 0
    # the context manager closes the file even if reading fails
    with open(fileName) as kernelFile :
        for line in kernelFile :
            for token in line.split(delim) :
                try :
                    value = float(token)
                except ValueError :
                    # not a number: an ID or a header field
                    continue
                total += value
                numEntries += 1

    return total / numEntries
299
300
302 """
303 A string kernel inspired by Raetsch et al's weighted degree kernel
304 """
305
306 values = {'mink' : 6,
307 'maxk' : 8,
308 'mismatches' : 1,
309 'mismatchProfile' : [0,0,1,1,1,1,2,2,3,3,3,3],
310 'maxShift' : 0,
311 'noShiftStart' : 0,
312 'noShiftEnd' : 0,
313 }
314
315 values.update(args)
316
317 if len(values['mismatchProfile']) < values['maxk'] and values['mismatches'] > 0 :
318 raise ValueError, 'mismatchProfile not long enough'
319
320
321 if values['mismatches'] == 0 :
322 values['mismatchProfile'] = [0 for i in range(values['mink'],
323 values['maxk'] + 1) ]
324
325 return cstringkernel.PositionalKmer(values['mink'],
326 values['maxk'],
327 values['mismatches'],
328 values['mismatchProfile'],
329 values['maxShift'],
330 values['noShiftStart'],
331 values['noShiftEnd'])
332
333
def _readFirstDataLine(kernelFile, delim) :
    """Return the first data line of an open kernel file.

    The first line is considered a header, and the line after it is returned
    instead, when its last field does not parse as a float.
    """

    line = kernelFile.readline()
    try :
        float(line.split(delim)[-1])
    except (ValueError, IndexError) :
        line = kernelFile.readline()
    return line


def combineKernels(ker1file, ker2file, kerOutFile, operation = 'add', **args) :
    """combine two kernels by either adding or multiplying them.
    In the case of addition the resulting kernel is of the form:
    K_out(i,j) = weight * K1(i,j) + (1-weight) * K2(i,j)
    where the default weight is 0.5
    In the case of multiplication the resulting kernel is:
    K_out(i,j) = (const1 + K1(i,j)) * (const2 + K2(i, j))
    where const1 and const2 are 0 by default.

    Notes:  It is assumed that the kernels have the same size and the ids
    are in the same order (an exception is raised if this is not satisfied).
    A leading non-numeric header line in either input is skipped and is not
    reproduced in the output.

    :Parameters:
      - `ker1file`, `ker2file` - names of the two input kernel files
      - `kerOutFile` - name of the file the combined kernel is written to
      - `operation` - which operation to perform between the kernels; it is
        a string with supported values 'add' or 'multiply' (add by default);
        any value other than 'add' selects multiplication

    :Keywords:
      - `weight` - weighting of kernels for kernel addition
      - `const1,const2` - additive factor in case of kernel multiplication

    :Raises:
      ValueError if the row ids of the two kernels differ or if two
      corresponding rows do not have the same number of fields
    """

    weight = args.get('weight', 0.5)
    const1 = args.get('const1', 0)
    const2 = args.get('const2', 0)

    # NOTE(review): function-local import kept exactly as in the original;
    # confirm it resolves to the module that provides ``getDelim``
    import misc
    delim1 = misc.getDelim(ker1file)
    delim2 = misc.getDelim(ker2file)

    with open(ker1file) as ker1 :
        with open(ker2file) as ker2 :
            with open(kerOutFile, 'w') as kerOut :
                line1 = _readFirstDataLine(ker1, delim1)
                line2 = _readFirstDataLine(ker2, delim2)

                # Rows start with an id when the first field of the first
                # data row is not a number.  The row has to be tokenized
                # before it is inspected; the previous code read an
                # unassigned variable here, so every file was treated as
                # having an id column.
                firstToken = 0
                try :
                    float(line1.split(delim1)[0])
                except (ValueError, IndexError) :
                    firstToken = 1

                while len(line1) > 0 :
                    tokens1 = line1.split(delim1)
                    tokens2 = line2.split(delim2)
                    if len(tokens1) != len(tokens2) :
                        raise ValueError('kernels do not have the same size')
                    if firstToken > 0 :
                        if tokens1[0] != tokens2[0] :
                            print('%s %s' % (tokens1[0], tokens2[0]))
                            raise ValueError('kernels do not have the same ids')
                        kerOut.write(tokens1[0] + delim1)
                    columns = range(firstToken, len(tokens1))
                    if operation == 'add' :
                        outTokens = [str(float(tokens1[i]) * weight +
                                         float(tokens2[i]) * (1 - weight))
                                     for i in columns]
                    else :
                        outTokens = [str((const1 + float(tokens1[i])) *
                                         (const2 + float(tokens2[i])))
                                     for i in columns]
                    kerOut.write(delim1.join(outTokens) + '\n')
                    line1 = ker1.readline()
                    line2 = ker2.readline()
409
def sortKernel(kernelInFile, kernelOutFile, format = 'gist', **args) :
    """
    sort a kernel matrix according to its pattern ID

    The kernel is loaded through ``KernelData`` and written back with rows
    and columns ordered by sorted pattern ID; every output row starts with
    its pattern ID.

    :Parameters:
      - `kernelInFile` - the kernel input file name
      - `kernelOutFile` - the output file name
      - `format` - when equal to 'gist' a header line made of the output file
        name followed by the sorted ids is written first

    :Keywords:
      - `delim` - the field delimiter (default = tab)
    """

    from PyML.containers import KernelData
    kdata = KernelData(kernelInFile)
    patternIDs = kdata.labels.patternID
    # pattern ID -> position of that pattern in the loaded kernel
    idDict = misc.list2dict(patternIDs, range(len(kdata)))
    ids = sorted(patternIDs)
    delim = args.get('delim', '\t')

    # the context manager makes sure the output is flushed and closed
    with open(kernelOutFile, 'w') as kernelFile :
        if format == 'gist' :
            kernelFile.write(kernelOutFile + delim + delim.join(ids) + '\n')

        for id1 in ids :
            row = idDict[id1]
            tokens = [str(kdata.kernel.eval(kdata, row, idDict[id2]))
                      for id2 in ids]
            kernelFile.write(id1 + delim)
            kernelFile.write(delim.join(tokens) + '\n')
440
441
def _writeCommonSubKernel(kdata, ids, commonIDs, outFileName, delim) :
    """Write the sub-kernel of ``kdata`` restricted to ``ids`` to a file.

    ``commonIDs`` is only used for membership tests; each output row is the
    pattern ID followed by the kernel values against every id in ``ids``.
    """

    # pattern ID -> position in kdata, for the shared patterns only
    position = {}
    for i in range(len(kdata)) :
        patternID = kdata.labels.patternID[i]
        if patternID in commonIDs :
            position[patternID] = i

    with open(outFileName, 'w') as outFile :
        for id1 in ids :
            print(id1)
            outFile.write(id1 + delim)
            tokens = [str(kdata.kernel.eval(kdata, position[id1], position[id2]))
                      for id2 in ids]
            outFile.write(delim.join(tokens) + '\n')


def commonKernel(kernelFile1, kernelFile2, kernelOutFileName1, kernelOutFileName2) :
    """Restrict two kernels to the patterns they have in common.

    Both kernels are loaded through ``KernelData`` and the intersection of
    their pattern IDs is sorted.  For each kernel that contains patterns
    outside that intersection, the sub-kernel over the shared ids is written
    (space delimited, one id-prefixed row per pattern, no header line).  A
    kernel whose size already equals the number of shared ids gets no output
    file.  Progress is reported on standard output.

    :Parameters:
      - `kernelFile1`, `kernelFile2` - names of the input kernel files
      - `kernelOutFileName1`, `kernelOutFileName2` - names of the files the
        reduced kernels are written to
    """

    delim = ' '
    # NOTE(review): function-local imports kept exactly as in the original;
    # confirm they still resolve inside the current package layout
    from datafunc import KernelData
    import misc
    kdata1 = KernelData(kernelFile1)
    kdata2 = KernelData(kernelFile2)
    print('loaded data')
    ids = misc.intersect(kdata1.labels.patternID, kdata2.labels.patternID)
    ids.sort()
    idDict1 = misc.list2dict(ids)

    if len(ids) != len(kdata1) :
        _writeCommonSubKernel(kdata1, ids, idDict1, kernelOutFileName1, delim)

    if len(ids) != len(kdata2) :
        _writeCommonSubKernel(kdata2, ids, idDict1, kernelOutFileName2, delim)
479
480
def expandKernel(inKernelFile, referenceKernelFile, outKernelFile, **args) :

    """
    Given a kernel matrix that might have missing entries, fill those as 0
    on the basis of the patterns in a reference kernel (it is checked that
    the reference kernel is sorted).

    :Parameters:
      - `inKernelFile` - input kernel file name
      - `referenceKernelFile` - file name for the reference kernel
      - `outKernelFile` - file name to output expanded kernel

    :Keywords:
      - `format` - when equal to 'gist' (the default) a header line made of
        the output file name followed by the ids is written first

    :Raises:
      ValueError if the pattern IDs of the reference kernel are not sorted
    """

    outFormat = args.get('format', 'gist')
    delim = '\t'

    # NOTE(review): function-local imports kept exactly as in the original;
    # confirm they still resolve inside the current package layout
    from datafunc import KernelData
    import misc
    import numpy

    inKernel = KernelData(inKernelFile)
    refKernel = KernelData(referenceKernelFile)
    print('loaded data')
    ids = refKernel.labels.patternID[:]
    ids.sort()
    if ids != refKernel.labels.patternID :
        raise ValueError('reference kernel not sorted')

    # presumably maps each pattern ID of the input kernel to its index there
    # (its values are used as indices below) -- verify against misc.list2dict
    idDict = misc.list2dict(inKernel.labels.patternID)
    numPatterns = len(refKernel)

    with open(outKernelFile, 'w') as outKernel :
        if outFormat == 'gist' :
            outKernel.write(outKernelFile + delim)
            outKernel.write(delim.join(ids) + '\n')

        for i in range(numPatterns) :
            outKernel.write(ids[i] + delim)
            # one zero-filled row per reference pattern; it must be created
            # once per row so that the filled-in columns are kept
            values = numpy.zeros(numPatterns, float)
            if ids[i] in idDict :
                row = idDict[ids[i]]
                for j in range(numPatterns) :
                    if ids[j] in idDict :
                        values[j] = inKernel.kernel.eval(inKernel, row,
                                                         idDict[ids[j]])
            tokens = [str(value) for value in values]
            outKernel.write(delim.join(tokens) + '\n')
527
def showKernel(dataOrMatrix, fileName = None, useLabels = True, **args) :
    """Display a kernel matrix as an image using matplotlib.

    :Parameters:
      - `dataOrMatrix` - either a dataset (an object whose ``type`` attribute
        equals 'dataset'), whose kernel matrix and labels are used, or the
        kernel matrix itself
      - `fileName` - when given, the figure is saved under this name; a name
        containing '.eps' selects the 'PS' matplotlib backend
      - `useLabels` - whether to draw lines at the class boundaries

    :Keywords:
      - `labels` - labels object to use when a plain matrix is passed
    """

    labels = None
    if hasattr(dataOrMatrix, 'type') and dataOrMatrix.type == 'dataset' :
        data = dataOrMatrix
        k = data.getKernelMatrix()
        labels = data.labels
    else :
        k = dataOrMatrix
        labels = args.get('labels')

    import matplotlib

    # the backend has to be chosen before pylab is imported
    if fileName is not None and fileName.find('.eps') > 0 :
        matplotlib.use('PS')
    from matplotlib import pylab

    pylab.matshow(k)

    # ``labels`` stays None when a bare matrix is passed without the
    # ``labels`` keyword; skip the class boundaries in that case
    # NOTE(review): block nesting below was reconstructed from a listing
    # without indentation -- confirm against the original layout
    if useLabels and labels is not None and labels.L is not None :
        numPatterns = 0
        for i in range(labels.numClasses) :
            numPatterns += labels.classSize[i]
            pylab.axhline(numPatterns, color = 'black', linewidth = 1)
            pylab.axvline(numPatterns, color = 'black', linewidth = 1)
        pylab.axis([0, len(labels), 0, len(labels)])
    if fileName is not None :
        pylab.savefig(fileName)
        pylab.close()
561
562
def sortKernel2(kernelInFile, kernelOutFile, ids, format = 'gist', **args) :
    """
    sort a kernel matrix according to the given list of ids

    The kernel is loaded through ``KernelData`` and written back with rows
    and columns in the order given by ``ids``; every output row starts with
    its pattern ID.

    :Parameters:
      - `kernelInFile` - the kernel input file name
      - `kernelOutFile` - the output file name
      - `ids` - pattern IDs in the desired output order
      - `format` - when equal to 'gist' a header line made of the output file
        name followed by the ids is written first

    :Keywords:
      - `delim` - the field delimiter (default = tab)

    :Raises:
      KeyError if an entry of ``ids`` is not a pattern ID of the kernel
    """

    from PyML.containers import KernelData
    kdata = KernelData(kernelInFile)
    K = kdata.getKernelMatrix()
    # Map each pattern ID to its position in the loaded kernel.  Indexing by
    # position within ``ids`` (as was done before) would copy the matrix in
    # its stored order under the new labels instead of reordering it.
    idDict = misc.list2dict(kdata.labels.patternID, range(len(kdata)))

    delim = args.get('delim', '\t')

    # the context manager makes sure the output is flushed and closed
    with open(kernelOutFile, 'w') as kernelFile :
        if format == 'gist' :
            kernelFile.write(kernelOutFile + delim + delim.join(ids) + '\n')

        for id1 in ids :
            row = K[idDict[id1]]
            tokens = [str(row[idDict[id2]]) for id2 in ids]
            kernelFile.write(id1 + delim)
            kernelFile.write(delim.join(tokens) + '\n')
592