1
2 import arrayWrap
3 import ker
4 from ext import ckernel
5 from ext import caggregate
6 from ext import csparsedataset
7 from ext import cvectordataset
8 from ext import ckerneldata
9 from ext import csequencedata
10 from ext import cstringkernel
11
12 import string
13 import numpy
14 import math
15 import copy
16 import random
17
18 import misc,myio
19 import parsers
20
21 """classes for reading and handling a dataset"""
22
23 __docformat__ = "restructuredtext en"
24
25
27 """A sparse dataset container"""
28
32
34
35 return len(self.featureID)
36
38
39 raise ValueError, 'do not call this function!'
40
41 numFeatures = property (getNumFeatures, setNumFeatures,
42 None, 'The number of features in a dataset')
43
def copy(self, other, patternsToCopy, deepcopy):
    """Copy the X variable and the feature tables of a sparse dataset.

    INPUT:
    other - the dataset to copy from
    patternsToCopy - a list of indices of the patterns to copy;
      None means all the patterns of other
    deepcopy - a 0/1 flag telling whether each pattern dictionary is
      deep-copied (1) or shared with other (0)

    Sets self.X, self.featureKey, self.featureKeyDict and self.featureID.
    When the number of requested patterns equals len(other), the feature
    tables of other are copied as they are; otherwise they are rebuilt
    from the feature keys that occur in the copied patterns.
    """
    # Imported locally under an alias: this function is itself called
    # "copy" and one of its parameters is called "deepcopy".
    import copy as copyModule

    if patternsToCopy is None:
        patternsToCopy = range(len(other))

    # The feature tables are rebuilt whenever the lengths differ.  Keys
    # used to be gathered only for *shorter* lists, so a longer list
    # (e.g. sampling with replacement) ended up with empty tables.
    sameSize = len(patternsToCopy) == len(other)

    X = None
    keysInUse = {}
    if other.X is not None:
        X = []
        for p in patternsToCopy:
            pattern = other.X[p]
            if deepcopy:
                X.append(copyModule.deepcopy(pattern))
            else:
                X.append(pattern)
            if not sameSize:
                for featureKey in pattern:
                    keysInUse[featureKey] = 1

    if sameSize:
        self.featureKeyDict = copyModule.deepcopy(other.featureKeyDict)
        self.featureKey = other.featureKey[:]
        self.featureID = other.featureID[:]
    else:
        # sorted() works on both Python 2 and 3 (keys().sort() does not)
        self.featureKey = sorted(keysInUse)
        self.featureKeyDict = {}
        for position, featureKey in enumerate(self.featureKey):
            self.featureKeyDict[featureKey] = position
        # keep the IDs of the surviving features, in other's order
        self.featureID = [other.featureID[i]
                          for i in range(other.numFeatures)
                          if other.featureKey[i] in self.featureKeyDict]

    self.X = X
82
83
84
88
90
91 if type(x) == type({}) :
92 self.X.append(x)
93 else :
94 xDict = {}
95 for i in range(len(x)) :
96 xDict[i] = x[i]
97 self.X.append(xDict)
98
100
101 hashID = hash(id)
102 if hashID in self.featureKeyDict :
103 raise ValueError, 'feature already exists, or hash problem'
104 for i in range(len(self)) :
105 if values[i] != 0 :
106 self.X[i][hashID] = values[i]
107
108
109 pos = numpy.searchsorted(self.featureKey, hashID)
110 self.featureKey.insert(pos, hashID)
111 self.featureID.insert(pos, id)
112 self.featureKeyDict = misc.list2dict(self.featureKey, range(len(self.featureKey)))
113
114
118
122
def extendX(self, other, patterns):
    """Append patterns taken from another sparse dataset to self.X.

    INPUT:
    other - the dataset supplying the patterns
    patterns - indices into other.X of the patterns to append

    The appended pattern dictionaries are the same objects held by
    other.X (no copy is made).  Only self.X is modified here.
    """
    self.X.extend(other.X[index] for index in patterns)
127
128
130 """eliminate a list of features from a dataset
131 INPUT:
132 featureList - a list of features to eliminate; these are numbers
133 between 0 and numFeatures-1 (indices of features, not their IDs)"""
134
135 if len(featureList) == 0 : return
136
137 if self.verbose :
138 print 'eliminating features...'
139
140 if type(featureList[0]) == type('') :
141 featureList = self.featureNames2IDs(features)
142
143 elimDict = {}
144 for feature in featureList :
145 elimDict[self.featureKey[feature]] = 1
146
147 featureKeyDict = {}
148 for i in range(len(self)) :
149 if self.verbose and i % 1000 == 0 and i > 0 :
150 print i
151 featureKeys = self.X[i].keys()
152 for featureKey in featureKeys :
153 if featureKey in elimDict :
154 del self.X[i][featureKey]
155 else :
156 featureKeyDict[featureKey] = 1
157
158 oldFeatureKey = self.featureKey
159 self.featureKey = featureKeyDict.keys()
160 self.featureKey.sort()
161 self.featureKeyDict = {}
162 for i in range(len(self.featureKey)) :
163 self.featureKeyDict[self.featureKey[i]] = i
164 self.featureID = [self.featureID[i] for i in range(len(self.featureID))
165 if oldFeatureKey[i] in self.featureKeyDict]
166
167
169 """F is a list where F[i] is a dictionary whose entries are the non
170 zero entries of feature number i:
171 F[self.featureKeyDict[f]][i] = X[i][f]
172 """
173
174 F = [{} for i in range(self.numFeatures)]
175
176 for i in range(len(self)) :
177 for f in self.X[i].keys() :
178 F[self.featureKeyDict[f]][i] = self.X[i][f]
179
180 self.F = F
181
182
184
185 if patterns is None :
186 patterns = range(len(self))
187 values = numpy.zeros(len(patterns), numpy.float_)
188 for i in range(len(patterns)) :
189 if self.featureKey[feature] in self.X[patterns[i]] :
190 values[i] = self.X[patterns[i]][self.featureKey[feature]]
191
192 return values
193
194
196
197 if type(x) == type(1) :
198 x = self.X[x]
199 if other is not None :
200 y = other.X[y]
201 else :
202 y = self.X[y]
203 sum = 0.0
204 xKeys = x.keys()
205 for xKey in xKeys :
206 if y.has_key(xKey) :
207 sum += y[xKey] * x[xKey]
208 return sum
209
def norm(self, pattern, p=1):
    """Return the p-norm (p = 1 or 2) of one pattern of a sparse dataset.

    INPUT:
    pattern - index of the pattern in self.X
    p - 1 for the sum of absolute values, 2 for the Euclidean norm

    Raises ValueError for any other value of p.
    """
    # Validate up front: p used to be checked only while looping over the
    # entries, so an empty pattern silently returned 0.0 for any p.
    if p != 1 and p != 2:
        raise ValueError('wrong value for p')

    values = self.X[pattern].values()
    if p == 1:
        return sum([abs(value) for value in values], 0.0)
    return math.sqrt(sum([value * value for value in values], 0.0))
225
227 """normalize dataset according to the p-norm, p=1,2"""
228
229 for i in range(len(self)) :
230 norm = self.norm(i, p)
231 if norm == 0 : continue
232 for xKey in self.X[i] :
233 self.X[i][xKey] /= norm
234
235
237 """rescale the columns of the data matrix by a weight vector w:
238 set X[i][j] = X[i][j] * w[j]
239 w is either a dictionary or an array
240 """
241
242 if type(w) != type({}) :
243 wDict = {}
244 for i in range(self.numFeatures) :
245 wDict[self.featureKey[i]] = w[i]
246 w = wDict
247 for i in range(len(self)) :
248 for featureKey in self.X[i] :
249 if featureKey in w :
250 self.X[i][featureKey] *= w[featureKey]
251 else :
252 self.X[i][featureKey] = 0.0
253
254
def mean(self, patterns=None):
    """Return the per-feature mean over a set of patterns.

    INPUT:
    patterns - indices of the patterns to average over; None means all
      the patterns in the dataset

    Returns a numpy array of length self.numFeatures, indexed through
    self.featureKeyDict.  The sum of the stored entries is divided by
    len(patterns), so a feature missing from a pattern counts as zero.
    """
    if patterns is None:
        patterns = range(len(self))

    # numpy.float64 rather than the numpy.float_ alias, which was
    # removed in NumPy 2.0 (same dtype on older versions).
    featureSum = numpy.zeros(self.numFeatures, numpy.float64)

    for i in patterns:
        for featureKey, value in self.X[i].items():
            featureSum[self.featureKeyDict[featureKey]] += value

    return featureSum / len(patterns)
266
268 """subtract the input array from the data.
269 the sparsity of the data is not altered, ie, zero entries are not
270 made nonzero by the translation
271 """
272 for i in range(len(self)) :
273 for featureKey in self.X[i] :
274 self.X[i][featureKey] -= translation[self.featureKeyDict[featureKey]]
275
def std(self, patterns=None):
    """Return the per-feature standard deviation over a set of patterns.

    INPUT:
    patterns - indices of the patterns to use; None means all the
      patterns in the dataset

    Computed as sqrt(E[x^2] - E[x]^2) with both averages taken over
    len(patterns), so a feature missing from a pattern counts as zero.
    Returns a numpy array of length self.numFeatures.
    """
    if patterns is None:
        patterns = range(len(self))

    # numpy.float64 rather than the numpy.float_ alias, which was
    # removed in NumPy 2.0 (same dtype on older versions).
    featureSq = numpy.zeros(self.numFeatures, numpy.float64)

    for i in patterns:
        for featureKey, value in self.X[i].items():
            featureSq[self.featureKeyDict[featureKey]] += value ** 2

    featureVar = featureSq / float(len(patterns)) - self.mean(patterns) ** 2

    # Round-off can leave a variance slightly below zero, so clamp from
    # below only; the former upper clip at 1e10 silently capped the
    # result at 1e5 for widely spread features.
    return numpy.sqrt(numpy.maximum(featureVar, 0.0))
289
291
292 if patterns is None :
293 patterns = range(len(self))
294
295 count = 0
296 featureKey = self.featureKey[feature]
297 for i in patterns :
298 if data.X[i].has_key(featureKey) and data.X[i][featureKey] != 0 :
299 count += 1
300
301 return count
302
304
305 if patterns is None :
306 patterns = range(len(self))
307
308 counts = numpy.zeros(self.numFeatures, numpy.float_)
309 for i in patterns :
310 for featureKey in data.X[i] :
311 feature = data.featureKeyDict[featureKey]
312 if data.X[i][featureKey] != 0 :
313 counts[feature] += 1
314
315 return counts
316
317
318
319
321 """A non-sparse dataset container; uses a numpy array"""
322
324 """the number of patterns in the dataset"""
325
326 if self.X is not None :
327 return len(self.X)
328 else :
329 raise ValueError, "no data here!"
330
332
333 return len(self.featureID)
334
336
337 raise ValueError, 'do not call this function!'
338
339 numFeatures = property (getNumFeatures, setNumFeatures,
340 None, 'The number of features in a dataset')
341
345
347
348 if type(x) == type(1) :
349 x = self.X[x]
350 if other is not None :
351 y = other.X[y]
352 else :
353 y = self.X[y]
354
355 return numpy.dot(x, y)
356
358
359 self.X = numpy.zeros((numPatterns, numFeatures), numpy.float_)
360
362
363 for j in range(len(x)) :
364 self.X[i][j] = x[j]
365
369
def extendX(self, other, patterns):
    """Append rows taken from another non-sparse dataset to self.X.

    INPUT:
    other - the dataset supplying the rows
    patterns - indices into other.X of the rows to append

    self.X is replaced by a new float array holding the old rows
    followed by the selected rows of other, in the order given.
    """
    numOld = len(self)
    patterns = list(patterns)
    oldX = self.X

    # numFeatures is already a number (the former len(self.numFeatures)
    # raised TypeError); numpy.float64 replaces the numpy.float_ alias
    # removed in NumPy 2.0.
    extended = numpy.zeros((numOld + len(patterns), self.numFeatures),
                           numpy.float64)
    extended[:numOld] = oldX

    # The destination row is the *position* in patterns, not the pattern
    # index itself, which used to scatter rows or run out of bounds.
    for offset, p in enumerate(patterns):
        extended[numOld + offset] = other.X[p]

    self.X = extended
379
383
def copy(self, other, patternsToCopy, deepcopy):
    """Copy the data matrix and feature tables of a non-sparse dataset.

    INPUT:
    other - the dataset to copy from
    patternsToCopy - a list of indices of the rows to copy; None means
      all the rows of other
    deepcopy - ignored; numpy.take always builds a new array

    Sets self.X, self.featureID, self.featureKey and self.featureKeyDict.
    When patternsToCopy is given, self.origID is set as well: to the
    matching entries of other.origID if other has that attribute, else
    to a copy of patternsToCopy.
    """
    # Imported locally under an alias: this function is itself called
    # "copy" and one of its parameters is called "deepcopy".
    import copy as copyModule

    if patternsToCopy is None:
        patternsToCopy = list(range(len(other)))
    elif hasattr(other, 'origID'):
        self.origID = [other.origID[p] for p in patternsToCopy]
    else:
        self.origID = patternsToCopy[:]

    X = None
    if other.X is not None:
        # axis=0 selects whole rows; without an axis numpy.take indexes
        # the flattened matrix and returns single elements.
        X = numpy.take(other.X, patternsToCopy, axis=0)

    self.X = X
    self.featureID = other.featureID[:]
    self.featureKey = other.featureKey[:]
    self.featureKeyDict = copyModule.deepcopy(other.featureKeyDict)
407
408
409
411 """eliminate a list of features from a dataset
412 Input:
413 featureList - a list of features to eliminate; these are numbers
414 between 0 and numFeatures-1 (indices of features, not their IDs)"""
415
416 if len(featureList) == 0 : return
417 if type(featureList[0]) == type('') :
418 featureList = self.featureNames2IDs(features)
419 featuresToTake = misc.setminus(range(self.numFeatures), featureList)
420 featuresToTake.sort()
421 self.featureID = [self.featureID[i] for i in featuresToTake]
422 self.featureKey = [self.featureKey[i] for i in featuresToTake]
423 self.featureKeyDict = {}
424 for i in range(len(self.featureKey)) :
425 self.featureKeyDict[self.featureKey[i]] = i
426
427 self.X = numpy.take(self.X, featuresToTake, 1)
428
429
430
432
433 if patterns is None :
434 patterns = range(len(self))
435 values = numpy.zeros(len(patterns), numpy.float_)
436 for i in range(len(patterns)) :
437 values[i] = self.X[i][feature]
438
439 return values
440
def norm(self, pattern, p=1):
    """Return the p-norm (p = 1 or 2) of one row of the data matrix.

    INPUT:
    pattern - index of the row in self.X
    p - 1 for the sum of absolute values, 2 for the Euclidean norm

    Raises ValueError for any other value of p.
    """
    if p != 1 and p != 2:
        raise ValueError('wrong value of p')

    row = self.X[pattern]
    if p == 1:
        return numpy.sum(numpy.absolute(row))
    # numpy.dot needs both operands; it was called with a single
    # argument, which raises TypeError.
    return math.sqrt(numpy.dot(row, row))
449
451 """normalize dataset according to the p-norm, p=1,2"""
452
453 for i in range(len(self)) :
454 norm = self.norm(i, p)
455 if norm == 0 : continue
456 self.X[i] = self.X[i] / norm
457
459 """rescale the columns of the data matrix by a weight vector w:
460 set X[i][j] = X[i][j] * w[j]
461 """
462
463 self.X = self.X * w
464
466
467 self.X = self.X - numpy.resize(c, (len(self), len(c)))
468
def mean(self, patterns=None):
    """Return the per-feature mean over a set of rows of the data matrix.

    INPUT:
    patterns - indices of the rows to average over; None means all the
      rows in the dataset

    Returns a numpy array with one entry per column of self.X.
    """
    if patterns is None:
        # axis=0 averages each column; without an axis numpy.mean
        # collapses the whole matrix into one scalar.
        return numpy.mean(self.X, axis=0)

    # Always average exactly the requested rows.  The former shortcut
    # "len(patterns) == len(self)" returned the whole-data mean even
    # when patterns contained repeated indices.
    index = numpy.asarray(patterns, dtype=int)
    selected = numpy.take(self.X, index, axis=0)
    return numpy.sum(selected, axis=0) / float(len(index))
480
481
def std(self, patterns=None):
    """Return the per-feature standard deviation of the data matrix.

    INPUT:
    patterns - indices of the rows to use; None means all the rows in
      the dataset

    Returns a numpy array with one entry per column of self.X.

    With patterns None (or a list as long as the dataset) the result is
    the sample standard deviation (divisor n - 1).  For any other list
    it is sqrt(E[x^2] - E[x]^2) with divisor len(patterns).
    NOTE(review): the two branches use different divisors, as in the
    code this replaces - confirm which one callers expect.
    """
    if patterns is None or len(patterns) == len(self):
        # axis=0 gives one value per column (no axis collapses the whole
        # matrix into one scalar); ddof=1 is the n - 1 divisor that the
        # former "* n / (n - 1)" factor, applied to the std instead of
        # the variance, was approximating.
        return numpy.std(self.X, axis=0, ddof=1)

    # numpy.float64 rather than the numpy.float_ alias, which was
    # removed in NumPy 2.0 (same dtype on older versions).
    featureSq = numpy.zeros(self.numFeatures, numpy.float64)

    for i in patterns:
        featureSq += self.X[i] ** 2

    featureVar = featureSq / float(len(patterns)) - self.mean(patterns) ** 2

    # Round-off can leave a variance slightly below zero, so clamp from
    # below only (the former upper clip at 1e10 capped the result).
    return numpy.sqrt(numpy.maximum(featureVar, 0.0))
495
497
498 if patterns is None :
499 patterns = range(len(self))
500
501 count = 0
502 for p in patterns :
503 if data.X[p][feature] != 0 : count+=1
504
505 return count
506
508
509 if patterns is None :
510 patterns = range(len(self))
511
512 counts = numpy.zeros(self.numFeatures)
513 for i in patterns :
514 counts += numpy.not_equal(data.X[i], 0)
515
516 return counts
517
def csvwrite(self, fileName, delim=' ', idCol=-1):
    """Write the data matrix and the labels to a delimited text file.

    One line is written per pattern: its feature values separated by
    delim, followed by its label.  When self.labels.numClasses is 2 each
    label is written as Y * 2 - 1 (so 0/1 become -1/+1); otherwise
    self.labels.Y is written as it is.

    INPUT:
    fileName - path of the file to write (overwritten if it exists)
    delim - separator placed between the fields of a line
    idCol - not used by this method
    """
    if self.labels.numClasses == 2:
        Y = [self.labels.Y[i] * 2 - 1 for i in range(len(self))]
    else:
        Y = self.labels.Y

    # "with" closes the file even if formatting or writing a row fails.
    with open(fileName, 'w') as fileHandle:
        for i in range(len(self)):
            fields = [str(self.X[i][j]) for j in range(self.numFeatures)]
            fields.append(str(Y[i]))
            fileHandle.write(delim.join(fields) + '\n')
532