Package PyML :: Package preproc :: Module preproc
[frames] | no frames]

Source Code for Module PyML.preproc.preproc

  1   
  2  from PyML.base.pymlObject import PyMLobject 
  3  from PyML.utils import misc 
  4  import numpy 
  5  import math 
  6   
  7  import random 
  8   
def pca(X, numcomp = None) :
    '''Project the matrix X onto its numcomp leading principal components.

    :param X: 2-D array of shape (n, d); assumed already centered -- TODO confirm
    :param numcomp: number of leading components to keep; all d components
        are kept when None
    :returns: array of shape (n, numcomp), X represented in the component basis
    '''

    d = numpy.shape(X)[1]
    if numcomp is None :
        numcomp = d

    # columns of v (after the transpose) are the principal directions
    [u, s, v] = numpy.linalg.svd(X)
    v = numpy.transpose(v)
    v = v[:, :numcomp]
    # numpy.matrixmultiply is a Numeric-era name that does not exist in numpy;
    # use numpy.dot (debug print of the shape removed)
    return numpy.dot(X, v)
def centerColumns(X) :
    '''returns X - mean(X), where the mean is taken over the columns of X,
    i.e. each column of the result has zero mean'''

    X = numpy.asarray(X)
    # numpy.mean with no axis averages the *flattened* array (a scalar), which
    # broke the original len(m); the per-column mean needs axis=0 explicitly
    m = numpy.mean(X, axis=0)

    # broadcasting subtracts the column means from every row
    return X - m
def centerRows(X) :
    '''returns X with the mean of each row subtracted (zero-mean rows)'''

    X = numpy.asarray(X)
    # row means kept 2-D ((n,1)) so the subtraction broadcasts along each row;
    # computed directly rather than via transpose + centerColumns
    m = numpy.mean(X, axis=1).reshape(-1, 1)
    return X - m
def standardizeColumns(X) :
    '''returns (X - mean(X)) / std(X), where mean and std are computed
    per column (each column of the result has zero mean, unit std)'''

    X = numpy.asarray(X, dtype=float)
    # the original axis-less mean/std return scalars in numpy (flattened
    # statistics), breaking len(m); per-column statistics need axis=0
    m = numpy.mean(X, axis=0)
    s = numpy.std(X, axis=0)

    # NOTE(review): a constant column has s == 0 and divides by zero,
    # as in the original -- behavior preserved
    return (X - m) / s
def standardizeRows(X) :
    '''returns (X - mean(X)) / std(X), where mean and std are computed
    per row (each row of the result has zero mean, unit std)'''

    X = numpy.asarray(X, dtype=float)
    # computed directly rather than via transpose + standardizeColumns;
    # (n,1) shapes make the operations broadcast along each row
    m = numpy.mean(X, axis=1).reshape(-1, 1)
    s = numpy.std(X, axis=1).reshape(-1, 1)

    return (X - m) / s
def maxvar(X, numVariables) :
    '''returns the numVariables variables (columns) with the highest variance

    :param X: 2-D array, one variable per column
    :param numVariables: number of columns to keep
    '''

    X = numpy.asarray(X)
    # per-variable (per-column) std needs axis=0; the axis-less numpy.std
    # used originally returns a single scalar over the flattened array
    s = numpy.std(X, axis=0)
    I = numpy.argsort(s)

    # the last numVariables indices have the largest std
    Xout = numpy.take(X, I[-numVariables:], 1)

    return Xout
def dmat(X) :
    '''returns the Euclidean distance-squared matrix between the rows of X'''

    X = numpy.asarray(X)
    # numpy.matrixmultiply is a Numeric-era name absent from numpy; use dot
    K = numpy.dot(X, numpy.transpose(X))
    n = numpy.shape(K)[0]
    # numpy.float alias was removed in numpy 1.24; plain float is equivalent
    D = numpy.zeros((n, n), float)

    # the original loop started at i = 1, leaving row/column 0 of D
    # permanently zero -- off-by-one fixed to cover all pairs
    for i in range(n - 1) :
        for j in range(i + 1, n) :
            D[i, j] = K[i, i] - 2 * K[i, j] + K[j, j]
            D[j, i] = D[i, j]

    return D
def norm2(x) :
    '''return the 2-norm of a vector given as a list or numpy array'''

    v = numpy.asarray(x)
    # sqrt of the vector's inner product with itself
    return math.sqrt(numpy.dot(v, v))
def normalizeNorm(X) :
    '''normalize each row of X to a unit (2-norm) vector'''

    # numpy.float alias was removed in numpy 1.24; work on a float copy and
    # let broadcasting replace the explicit python loop
    X = numpy.asarray(X, dtype=float)
    # row-wise 2-norms, kept 2-D ((n,1)) so the division broadcasts per row
    norms = numpy.sqrt(numpy.sum(X * X, axis=1)).reshape(-1, 1)

    # NOTE(review): an all-zero row divides by zero, as in the original
    return X / norms
class Correlator (object) :
    """Caches row means/stds of a dataset and answers correlation queries
    between patterns, by integer index or by pattern ID.

    Unknown IDs are handled by falling back to correlations estimated from
    random sampling (see ``corrcoef``).
    """

    def __init__(self, data) :
        # data: a PyML dataset object exposing .X (2-D array), .labels.patternID
        # and len(); a string is treated as a csv file name to load
        if type(data) == type('') :
            print 'file name:', data
            # NOTE(review): `datafunc` is not imported in this module -- this
            # branch raises NameError as written; verify the intended import
            data = datafunc.PyVectorDataSet(data, idColumn = 0, headerRow = True, hint = 'csv')

        self.data = data
        # map pattern ID -> row index for ID-based lookups in corrcoef
        self.idDict = misc.list2dict(data.labels.patternID,
                                     range(len(data)))

        print numpy.shape(data.X)
        # per-row mean and (population) std, precomputed for _corrcoef;
        # `std` is the module-level helper defined later in this file
        self.mean = numpy.mean(data.X, 1)
        self.std = std(data.X, 1)
        # rows with ~zero std would make _corrcoef divide by zero;
        # replace their std with 1
        eps = 1e-5
        I = numpy.nonzero(numpy.less(self.std, eps))[0]
        print 'num zeros:',len(I)
        numpy.put(self.std, I, 1)

        # estimate the mean correlation between random pattern pairs,
        # used as the answer when both query IDs are unknown
        # NOTE(review): numpy.float was removed in numpy >= 1.24; this line
        # fails on modern numpy
        self.numCorrelations = 10000
        correlations = numpy.zeros(self.numCorrelations, numpy.float)

        for i in range(self.numCorrelations) :
            i1 = random.randrange(0, len(data))
            i2 = random.randrange(0, len(data))
            correlations[i] = self._corrcoef(i1, i2)
        self.meanCorrelation = numpy.mean(correlations)
        # fewer samples are used for per-query estimates than for the
        # global estimate above -- presumably a speed/accuracy trade-off
        self.numCorrelations = 1000

    def corrcoef(self, id1, id2) :
        """Correlation between two patterns given by ID (or by int index).

        Falls back to sampled estimates when one or both IDs are unknown.
        """
        if id1 == id2 : return 1.0
        # integer arguments are treated as row indices directly
        if type(id1) == type(1) :
            return self._corrcoef(id1, id2)
        # both IDs unknown: return the precomputed global mean correlation
        if id1 not in self.idDict and id2 not in self.idDict :
            return self.meanCorrelation
        if id1 in self.idDict and id2 in self.idDict :
            return self._corrcoef(self.idDict[id1], self.idDict[id2])
        else :
            # we want to assume that id1 not in data:
            if id2 not in self.idDict :
                id1,id2 = id2,id1
            # average the correlation of the known pattern against
            # numCorrelations randomly drawn patterns
            # NOTE(review): numpy.float fails on numpy >= 1.24 (see __init__)
            i2 = self.idDict[id2]
            correlations = numpy.zeros(self.numCorrelations, numpy.float)
            for i in range(self.numCorrelations) :
                i1 = random.randrange(0, len(self.data))
                correlations[i] = self._corrcoef(i1, i2)
            return numpy.mean(correlations)

    def _corrcoef(self, i1, i2) :
        # Pearson correlation of rows i1, i2 using the cached per-row mean
        # and population std (the d * std_i * std_j normalization matches
        # the 1/d covariance estimate)
        return numpy.dot(self.data.X[i1] - self.mean[i1],
                         self.data.X[i2] - self.mean[i2]) / \
               (len(self.data.X[i1]) * self.std[i1] * self.std[i2])
def corrcoef2(X) :
    '''compute the correlation between the rows of the matrix X;
    more space efficient than the numpy version

    :returns: (n, n) symmetric matrix of Pearson correlations with unit diagonal
    '''

    X = numpy.asarray(X)
    (n, d) = numpy.shape(X)

    m = numpy.mean(X, 1)
    std = numpy.std(X, 1)

    # numpy.float alias was removed in numpy 1.24; plain float is equivalent
    K = numpy.ones((n, n), float)

    for i in range(0, n - 1) :
        for j in range(i + 1, n) :
            # bug fix: the original centered X[j] with m[i]; each row must
            # be centered by its own mean (m[j] for row j)
            K[i][j] = numpy.dot(X[i] - m[i], X[j] - m[j]) / (d * std[i] * std[j])
            K[j][i] = K[i][j]

    return K
def std(m, axis=0):
    """std(m, axis=0) -- the standard deviation of m along the given axis.

    Note: this is the biased (population) estimate, dividing by N.
    Integer input yields a floating point result.
    """
    x = numpy.asarray(m)
    n = float(x.shape[axis])
    # mean along the axis, with that axis kept as length 1 so it
    # broadcasts against x when subtracting
    centered = x - numpy.expand_dims(numpy.mean(x, axis), axis)
    return numpy.sqrt(numpy.add.reduce(centered * centered, axis) / n)
def corrcoef(X) :
    '''Return the (n, n) matrix of Pearson correlations between the rows of X.'''

    X = numpy.asarray(X, dtype=float)
    (n, d) = numpy.shape(X)

    # standardize each row in place of the sibling standardizeRows helper,
    # so this function is self-contained
    m = numpy.mean(X, axis=1).reshape(-1, 1)
    s = numpy.std(X, axis=1).reshape(-1, 1)
    Xn = (X - m) / s

    # the rows are standardized with the population (1/d) std, so the correct
    # normalization is d, not d - 1: with d the diagonal is exactly 1 and the
    # result matches corrcoef2/corrcoefij elsewhere in this module
    return numpy.dot(Xn, numpy.transpose(Xn)) / d
def corrcoefij(X, i, j) :
    '''Return the Pearson correlation between rows i and j of X.'''

    X = numpy.asarray(X)
    (n, d) = numpy.shape(X)

    m = numpy.mean(X, 1)
    std = numpy.std(X, 1)

    # bug fix: the original centered X[j] with m[i]; each row must be
    # centered by its own mean (m[j] for row j)
    return numpy.dot(X[i] - m[i], X[j] - m[j]) / (d * std[i] * std[j])
class Standardizer (PyMLobject) :
    """
    class for performing feature normalization

    For each feature the Standardizer subtracts the feature's mean
    and divides by its standard deviation

    this rescaling is composed of two operations:

    1. ``centering`` -- subtract from a feature its mean value;
       this is referred to as 'translation'; the translation attribute
       gives the value with which to translate each feature
    2. ``scaling`` -- divide a feature by a scale, e.g. its standard deviation;
       the 'scale' attribute gives the value with which to scale each feature

    the 'train' method of the class computes the translation and scaling
    factors, and performs normalization of the training data
    the 'test' method uses values computed on the training data to normalize
    the test data.

    **caveat:**
    Beware of performing training multiple times on the same dataset:
    if a dataset has already been standardized, re-standardization
    will recompute mean and standard deviation, which will be approximately
    0 and 1 for each feature; subsequent application on test data will
    have no effect. Because of this an exception is raised if the user
    attempts to re-train an already trained Rescale object.
    """

    # class-level defaults consumed by PyMLobject's attribute machinery
    attributes = {'translate' : True,
                  'rescale' : True,
                  'translation' : None,
                  'scale' : None}

    def __init__(self, **args) :
        # NOTE(review): args is passed positionally (as a dict), matching the
        # original call -- presumably what PyMLobject.__init__ expects; verify
        PyMLobject.__init__(self, args)

    def train(self, data, *options, **args) :
        """Compute translation/scale from data and normalize it in place.

        :raises ValueError: if the object was already trained (see class caveat)
        """
        if self.translation is not None or self.scale is not None :
            # ValueError(...) call syntax replaces the Python-2-only
            # `raise ValueError, msg` form (works in both 2 and 3)
            raise ValueError('object already trained')
        if self.translate :
            self.translation = data.mean()
        if self.rescale :
            self.scale = numpy.array(data.std())
            # need to avoid division by 0, so
            # scales that are equal to 0 are replaced with a value of 1
            eps = 1e-5
            I = numpy.nonzero(numpy.less(self.scale, eps))[0]
            numpy.put(self.scale, I, 1)
            # checking for nan via the x != x property; the original tested
            # `scale[i] == 0 and scale[i] == 1`, which can never be true,
            # so nan scales were silently kept
            for i in range(len(self.scale)) :
                if self.scale[i] != self.scale[i] :
                    self.scale[i] = 1

        self.preproc(data)

    def preproc(self, data) :
        """Apply the stored translation and scaling to data in place."""
        if self.translate :
            data.translate(self.translation)
        if self.rescale :
            data.scale(1.0 / self.scale)

    def test(self, data, *options, **args) :
        """Normalize test data using factors computed by 'train'."""
        self.preproc(data)