Package PyML :: Package datagen :: Module sample
[frames] | no frames]

Source Code for Module PyML.datagen.sample

  1   
  2  import random 
  3  from PyML.utils import misc 
  4   
  5  """ 
  6  a collection of functions for sampling from a dataset 
  7  """ 
  8   
def shuffle(x):
    """
    return a shuffled copy of a sequence

    the input is left untouched; a new list holding the same elements in
    random order is returned.

    :Parameters:
      - `x` - a list (any iterable is accepted and copied into a list)

    :Returns:
      a new list with the elements of `x` in random order
    """

    # list() copies any iterable into a mutable list; a slice copy (x[:])
    # of a tuple or range stays immutable and random.shuffle cannot
    # reorder it in place
    shuffled = list(x)
    random.shuffle(shuffled)

    return shuffled
18
def _sampleFromClass(rand, members, count):
    """
    pick `count` members of one class without replacement; the string 'all'
    returns every member in stored order without using the generator
    """
    chosen = list(members)
    if count == 'all':
        return chosen
    rand.shuffle(chosen)
    return chosen[:count]


def sample(data, size, **args):
    """
    sample from a dataset without replacement

    :Parameters:
      - `data` - a dataset object
      - `size` - can be one of the following:
        An integer - in this case the given number of patterns are chosen.
        A list (or tuple) - size[i] specifies how many examples to sample
        from class i (data.labels.classLabels will tell how they are indexed).
        A dictionary whose keys are the class names e.g. {'+1': 100, '-1':100}.
        If an entry in the list or dictionary is 'all' then all members of
        the corresponding class are sampled.

    :Keywords:
      - `stratified` - whether to perform stratified sampling [default: True].
        This applies only when a global 'size' parameter is provided
      - `seed` - random number generator seed

    :Returns:
      a new object of the same class as `data`, constructed as
      data.__class__(data, patterns = <sampled indices>)

    :Raises:
      TypeError if `size` is not an integer, a list/tuple or a dictionary
    """

    stratified = args.get('stratified', True)
    # random.Random(None) seeds from the system, exactly like random.Random()
    rand = random.Random(args.get('seed'))

    patterns = []
    if isinstance(size, int):
        if stratified:
            labels = data.labels
            total = len(data)
            # an empty dataset has nothing to sample; avoid dividing by zero
            fraction = float(size) / total if total else 0.0
            lastClass = labels.numClasses - 1
            for i in range(labels.numClasses):
                if i < lastClass:
                    numToSample = int(fraction * labels.classSize[i])
                else:
                    # the last class absorbs the rounding remainder so that
                    # the requested total is reached whenever possible
                    numToSample = size - len(patterns)
                patterns.extend(
                    _sampleFromClass(rand, labels.classes[i], numToSample))
        else:
            # materialize the indices: shuffle works in place and a
            # Python 3 range object is immutable
            I = list(range(len(data)))
            rand.shuffle(I)
            patterns = I[:size]
    elif isinstance(size, (list, tuple)):
        for i, count in enumerate(size):
            patterns.extend(
                _sampleFromClass(rand, data.labels.classes[i], count))
    elif isinstance(size, dict):
        for classLabel in size:
            classIndex = data.labels.classDict[classLabel]
            patterns.extend(
                _sampleFromClass(rand, data.labels.classes[classIndex],
                                 size[classLabel]))
    else:
        # previously an unsupported type silently produced an empty sample
        raise TypeError(
            "size must be an integer, a list or a dictionary, got %s"
            % type(size).__name__)

    return data.__class__(data, patterns=patterns)
84
def splitDataset(data, fraction, **args):
    """
    split a dataset into two.
    randomly splits a dataset into two datasets whose sizes are determined
    by the 'fraction' parameter (the first dataset will contain that fraction
    of the examples).

    for example:
    train, test = splitDataset(data, 0.7)
    will split the data -- 70% for training and 30% for test

    :Parameters:
      - `data` - a dataset object; an object whose class is named 'Labels'
        is used directly as the labels instead of reading data.labels
      - `fraction` - the fraction of the examples to put in the first split

    :Keywords:
      - `stratified` - whether to perform stratified splitting, i.e. whether to
        keep the class ratio in the two datasets [default: True]
      - `seed` - random number generator seed
      - `indicesOnly` - if this flag is set, the indices of the two splits are
        returned instead of the datasets [default: False]

    :Returns:
      a pair (first, second): two objects built as
      data.__class__(data, patterns = ...), or the two index lists when
      `indicesOnly` is set.  The first index list is sorted; the second is
      whatever misc.setminus returns for the remaining indices.
    """

    # random.Random(None) seeds from the system, exactly like random.Random()
    rand = random.Random(args.get('seed'))
    indicesOnly = args.get('indicesOnly', False)
    stratified = args.get('stratified', True)

    if data.__class__.__name__ == 'Labels':
        labels = data
    else:
        labels = data.labels

    numExamples = len(data)
    sampleSize = int(numExamples * fraction)

    if stratified:
        patterns = []
        lastClass = labels.numClasses - 1
        for i in range(labels.numClasses):
            if i < lastClass:
                numToSample = int(fraction * labels.classSize[i])
            else:
                # the last class absorbs the rounding remainder so the first
                # split reaches sampleSize whenever possible
                numToSample = sampleSize - len(patterns)
            I = list(labels.classes[i])
            rand.shuffle(I)
            patterns.extend(I[:numToSample])
    else:
        # materialize the indices: shuffle works in place and a
        # Python 3 range object is immutable
        I = list(range(numExamples))
        rand.shuffle(I)
        patterns = I[:sampleSize]
    # indices of the first split are sorted for both sampling modes
    patterns.sort()

    # everything that was not drawn goes to the second split
    remaining = misc.setminus(list(range(numExamples)), patterns)

    if indicesOnly:
        return patterns, remaining
    return (data.__class__(data, patterns=patterns),
            data.__class__(data, patterns=remaining))
150
def bootstrap(data, **args):
    """
    draw a bootstrap sample (sampling with replacement) from a dataset

    the sample has as many patterns as the dataset itself.

    :Parameters:
      - `data` - a dataset object

    :Keywords:
      - `stratified` - whether to bootstrap each class separately so that the
        class sizes are kept [default: True]; ignored (treated as False)
        when data.labels.isLabeled() is false
      - `seed` - random number generator seed

    :Returns:
      a new object of the same class as `data`, constructed as
      data.__class__(data, patterns = <sorted drawn indices>)
    """

    # random.Random(None) seeds from the system, exactly like random.Random()
    rand = random.Random(args.get('seed'))

    stratified = args.get('stratified', True)
    if not data.labels.isLabeled():
        # without labels there are no classes to stratify on
        stratified = False

    drawn = []
    if stratified:
        # resample every class to its own size, drawing members uniformly
        for classIndex in range(len(data.labels.classLabels)):
            members = data.labels.classes[classIndex]
            count = len(members)
            for _ in range(count):
                drawn.append(members[rand.randint(0, count - 1)])
    else:
        numExamples = len(data)
        for _ in range(numExamples):
            drawn.append(rand.randint(0, numExamples - 1))

    drawn.sort()
    return data.__class__(data, patterns=drawn)
186