1
2 import numpy
3
4 from PyML.containers.baseDatasets import WrapperDataSet, BaseVectorDataSet
5 from PyML.utils import arrayWrap,misc
6 from ext import csparsedataset,cvectordataset
7
9 """A base class for vector dataset containers implemented in C++"""
10
16
def copy(self, other, patterns, deepcopy):
    """
    Initialize this dataset as a copy of another wrapper dataset.

    :Parameters:
      - `other` - the dataset to copy from
      - `patterns` - a list of indices of the patterns to copy; ``None``
        selects every pattern of `other`
      - `deepcopy` - a 0/1 flag telling whether to do deepcopy or not
        (accepted for interface compatibility; this method does not read it)
    """
    if patterns is None:
        # Materialize the indices: on Python 3 a bare range() is not a list,
        # while the sibling statistics methods always hand the container a list.
        patterns = list(range(len(other)))
    # The container's copy-constructor does the actual data copy.
    self.container.__init__(self, other, patterns)
    # Feature bookkeeping is copied so later edits do not alias `other`.
    self.featureDict = other.featureDict.copy()
    self.featureID = other.featureID[:]
32
33
37
38
40
# Convert the pattern `x` into the (keys, values) vectors handed to the container:
#   - a dictionary is split into two vectors by arrayWrap.dict2vectors;
#   - a numpy array or a list becomes the values, with an empty key vector;
#   - any other type is rejected with TypeError.
41 if type(x) == type({}) :
42 keys,values = arrayWrap.dict2vectors(x)
43 elif type(x) == type(numpy.array(1)) or type(x) == type([]) :
44 keys = arrayWrap.longVector([])
45 values = arrayWrap.doubleVector(x)
46 else:
47 raise TypeError,"data vectors must be dictionary, list or arrays"
48 self.container.addPattern(self, keys, values)
49
51
# One value per pattern is required: len(values) must equal self.size().
52 if len(values) != self.size() :
53 raise ValueError, \
54 'number of values provided does not match dataset size'
# Integer ids are converted to strings before hashing.
55 if type(id) == type(1) :
56 id = str(id)
57 hashID = hash(id)
# Build featureKeyDict on first use, then refuse an id whose hash is already
# present (either the same feature or a colliding hash).
58 if not hasattr(self, 'featureKeyDict') :
59 self.addFeatureKeyDict()
60 if hashID in self.featureKeyDict :
61 raise ValueError, 'Feature already exists, or hash clash'
# Any non-list sequence of values is copied into a plain list for the container.
62 if type(values) != type([]) :
63 values = [v for v in values]
64
# The container is keyed by the hash; the Python-side feature bookkeeping is
# then updated with the (string) id.
65 self.container.addFeature(self, hashID, values)
66 self.updateFeatureDict(id)
67
69
# Both datasets must hold the same number of examples.
70 if len(other) != len(self) :
71 raise ValueError, 'number of examples does not match'
# Build featureKeyDict on first use, then check every feature id of `other`
# before touching the container, so a clash leaves this dataset unmodified.
72 if not hasattr(self, 'featureKeyDict') :
73 self.addFeatureKeyDict()
74 for id in other.featureID :
75 if hash(id) in self.featureKeyDict :
76 raise ValueError, 'Feature already exists, or hash clash'
77
# Append the features in the container, then refresh the feature bookkeeping.
78 self.container.addFeatures(self, other)
79 self.updateFeatureDict(other)
80
81
83
# Only indices in [0, len(self)) are accepted (negative indices are rejected,
# not wrapped), then the lookup is delegated to the container.
84 if i < 0 or i >= len(self) :
85 raise ValueError, 'Index out of range'
86 return self.container.getPattern(self, i)
87
88 - def extendX(self, other, patterns) :
91
93 """eliminate a list of features from a dataset
94 INPUT:
95 featureList - a list of features to eliminate; these are numbers
96 between 0 and numFeatures-1 (indices of features, not their IDs)"""
97
# An empty featureList is a no-op.
98 if len(featureList) == 0 : return
# NOTE: feature names are accepted as well: when the first element is a string
# the whole list is translated with featureNames2IDs.
99 if type(featureList[0]) == type('') :
100 featureList = self.featureNames2IDs(featureList)
# NOTE(review): sort() runs before the conversion to a list below, so the input
# must already provide a sort() method, and a caller-supplied list is sorted in
# place.
101 featureList.sort()
102 if type(featureList) != type([]) :
103 featureList = list(featureList)
# Every index must lie in [0, self.numFeatures).
104 if max(featureList) >= self.numFeatures or min(featureList) < 0 :
105 raise ValueError, 'Bad feature list'
106
# Remove the features in the container, then refresh the feature bookkeeping.
107 self.container.eliminateFeatures(self, featureList)
108 self.updateFeatureDict(featureList)
109
111 """rescale the columns of the data matrix by a weight vector w:
112 set X[i][j] = X[i][j] * w[j]
113 """
114
# A single float is repeated once per feature; note that only the exact float
# type takes this branch (an int scalar does not).
115 if type(w) == type(1.0) :
116 w = [w for i in range(self.numFeatures)]
# Any other non-list sequence is copied into a plain list for the container.
117 if type(w) != type([]) :
118 w = list(w)
119
120 self.container.scale(self, w)
121
127
def mean(self, patterns=None):
    """
    Return the result of the container's ``mean`` over a set of patterns.

    :Parameters:
      - `patterns` - indices of the patterns to use; ``None`` (the default)
        selects every pattern in the dataset

    :Returns:
      whatever ``self.container.mean`` returns (presumably the per-feature
      means -- confirm against the C++ container)

    :Raises:
      ``ValueError`` if an index lies outside ``[0, len(self))``; an empty
      `patterns` also raises ``ValueError`` (from ``min()``)
    """
    if patterns is None:
        patterns = range(len(self))
    # The container is always handed a plain list of indices.
    if type(patterns) is not list:
        patterns = list(patterns)
    if min(patterns) < 0 or max(patterns) >= len(self):
        # Call form of raise: same behaviour on Python 2, valid on Python 3.
        raise ValueError('Pattern index out of range')
    return self.container.mean(self, patterns)
135
def std(self, patterns=None):
    """
    Return the result of the container's ``standardDeviation`` over a set
    of patterns.

    :Parameters:
      - `patterns` - indices of the patterns to use; ``None`` (the default)
        selects every pattern in the dataset

    :Returns:
      whatever ``self.container.standardDeviation`` returns (presumably the
      per-feature standard deviations -- confirm against the C++ container)

    :Raises:
      ``ValueError`` if an index lies outside ``[0, len(self))``; an empty
      `patterns` also raises ``ValueError`` (from ``min()``)
    """
    if patterns is None:
        patterns = range(len(self))
    # The container is always handed a plain list of indices.
    if type(patterns) is not list:
        patterns = list(patterns)
    if min(patterns) < 0 or max(patterns) >= len(self):
        # Call form of raise: same behaviour on Python 2, valid on Python 3.
        raise ValueError('Pattern index out of range')
    return self.container.standardDeviation(self, patterns)
143
145
# Default to every pattern, hand the container a plain list, and reject any
# index outside [0, len(self)) before delegating the count for `feature`.
146 if patterns is None : patterns = range(len(self))
147 if type(patterns) != type([]) : patterns = list(patterns)
148 if min(patterns) < 0 or max(patterns) >= len(self) :
149 raise ValueError, 'Pattern index out of range'
150 return self.container.featureCount(self, feature, patterns)
151
153
# Default to every pattern, hand the container a plain list, and reject any
# index outside [0, len(self)) before delegating to the container.
154 if patterns is None : patterns = range(len(self))
155 if type(patterns) != type([]) : patterns = list(patterns)
156 if min(patterns) < 0 or max(patterns) >= len(self) :
157 raise ValueError, 'Pattern index out of range'
158 return self.container.featureCounts(self, patterns)
159
def nonzero(self, feature, patterns=None):
    """
    Return the result of the container's ``nonzero`` for one feature over a
    set of patterns.

    :Parameters:
      - `feature` - the feature to query, passed to the container unchanged
      - `patterns` - indices of the patterns to use; ``None`` (the default)
        selects every pattern in the dataset

    :Returns:
      whatever ``self.container.nonzero`` returns (presumably the patterns
      in which `feature` is nonzero -- confirm against the C++ container)

    :Raises:
      ``ValueError`` if an index lies outside ``[0, len(self))``; an empty
      `patterns` also raises ``ValueError`` (from ``min()``)
    """
    if patterns is None:
        patterns = range(len(self))
    # The container is always handed a plain list of indices.
    if type(patterns) is not list:
        patterns = list(patterns)
    if min(patterns) < 0 or max(patterns) >= len(self):
        # Call form of raise: same behaviour on Python 2, valid on Python 3.
        raise ValueError('Pattern index goes outside of range')
    return self.container.nonzero(self, feature, patterns)
167
169
# The container reports the shared features of the two patterns as feature
# keys; each key is mapped back through featureKeyDict (which must already exist).
170 return [self.featureKeyDict[featureKey] for featureKey in
171 self.container.commonFeatures(self, pattern1, pattern2)]
172
179
180
# Dataset class combining BaseCVectorDataSet with the cvectordataset.VectorDataSet
# extension class.
181 -class VectorDataSet (BaseCVectorDataSet, cvectordataset.VectorDataSet) :
182
183 - def __init__(self, arg = None, **args):
186
188
# Add one pattern: only a numpy array or a list is accepted here (no
# dictionaries), and it is passed to the container as a vector of doubles.
189 if type(x) == type(numpy.array(1)) or type(x) == type([]) :
190 values = arrayWrap.doubleVector(x)
191 else:
192 raise TypeError, "data vectors must be list or array"
193 self.container.addPattern(self, values)
194
195
197
# Refresh featureID / featureDict after a change, depending on what `arg` is
# (this matches the updateFeatureDict(...) calls made by the base class):
#   - a dataset of the same class: its feature ids were appended;
#   - a list: indices of features that were eliminated;
#   - an int or a string: the id of a single feature that was added.
198 if arg.__class__ == self.__class__ :
199
200 other = arg
201 self.featureID.extend(other.featureID)
202 elif type(arg) == type([]) :
203
# presumably list2dict builds a dict keyed by the list elements, used here
# only for membership tests -- verify against PyML.utils.misc
204 eliminated = misc.list2dict(arg)
205 self.featureID = [self.featureID[i] for i in range(len(self.featureID))
206 if i not in eliminated]
207 elif type(arg) == type(1) or type(arg) == type('') :
208
# A single new feature takes the last index, so the dictionary is patched in
# place and the full rebuild below is skipped.
209 id = arg
210 self.featureID.append(id)
211 self.featureDict[id] = self.numFeatures - 1
212 return
213
# Rebuild the id -> index mapping from scratch (also reached when `arg`
# matches none of the cases above).
214 self.featureDict = {}
215 for i in range(self.numFeatures) :
216 self.featureDict[self.featureID[i]] = i
217
218
# Dataset class combining BaseCVectorDataSet with the csparsedataset.SparseDataSet
# extension class.
219 -class SparseDataSet (BaseCVectorDataSet, csparsedataset.SparseDataSet) :
220
221 - def __init__(self, arg = None, **args):
224
226
# Refresh featureID / featureDict after a change, depending on what `arg` is:
#   - a dataset of the same class: its feature ids are appended;
#   - a list: indices of features that were eliminated;
#   - an int or a string: the id of a single feature that was added.
# After an append/extend, featureID is re-sorted by hash(id).
# NOTE: list.sort(cmp=...) and the cmp() builtin exist only on Python 2.
227 if arg.__class__ == self.__class__ :
228 other = arg
229 self.featureID.extend(other.featureID)
230 self.featureID.sort(cmp = lambda x,y : cmp(hash(x), hash(y)))
231 elif type(arg) == type([]) :
232
# presumably list2dict builds a dict keyed by the list elements, used here
# only for membership tests -- verify against PyML.utils.misc
233 eliminated = misc.list2dict(arg)
234 self.featureID = [self.featureID[i] for i in range(len(self.featureID))
235 if i not in eliminated]
236 elif type(arg) == type(1) or type(arg) == type('') :
237
238 id = arg
239 self.featureID.append(id)
240 self.featureID.sort(cmp = lambda x,y : cmp(hash(x), hash(y)))
241
# Unlike the single-id shortcut in VectorDataSet, the id -> index mapping is
# always rebuilt here, because re-sorting can move every feature.
242 self.featureDict = {}
243 for i in range(len(self.featureID)) :
244 self.featureDict[self.featureID[i]] = i
245