1
2 import random
3 from PyML.utils import misc
4
5 """
6 a collection of functions for sampling from a dataset
7 """
8
10 """
11 shuffle a list
12 """
13
14 shuffled = x[:]
15 random.shuffle(shuffled)
16
17 return shuffled
18
def sample(data, size, **args):
    """
    sample from a dataset without replacement

    :Parameters:
      - `data` - a dataset object
      - `size` - can be one of the following:
        An integer - in this case the given number of patterns are chosen.
        A list - size[i] specifies how many examples to sample from
        class i (data.labels.classLabels will tell how they are indexed).
        A dictionary whose keys are the class names e.g. {'+1': 100, '-1':100}.
        If an entry in the list or dictionary is 'all' then all members of
        the corresponding class are sampled.
        Any other type of `size` selects no patterns at all.

    :Keywords:
      - `stratified` - whether to perform stratified sampling [default: True].
        This applies only when a global 'size' parameter is provided
      - `seed` - random number generator seed

    :Returns:
      a new object of the same class as `data`, constructed as
      data.__class__(data, patterns = <the sampled pattern indices>)
    """
    # Imported locally so the block does not depend on a new file-level import.
    import numbers

    stratified = args.get('stratified', True)
    # random.Random(None) seeds itself exactly like random.Random().
    rand = random.Random(args.get('seed'))
    labels_of = lambda: data.labels  # looked up lazily: unused when not stratified

    def draw(indices, count):
        # Shuffle a private copy so the caller's index containers are never
        # reordered; list() also turns a Python 3 range into something
        # random.shuffle can work on.
        pool = list(indices)
        rand.shuffle(pool)
        return pool[:count]

    def members(classIndex, count):
        # 'all' keeps the class members in their stored order and does not
        # consume any random numbers.
        if count == 'all':
            return list(labels_of().classes[classIndex])
        return draw(labels_of().classes[classIndex], count)

    patterns = []
    # bool is excluded on purpose: True/False were never valid sizes.
    if isinstance(size, numbers.Integral) and not isinstance(size, bool):
        if stratified:
            labels = labels_of()
            total = len(data)
            # An empty dataset yields an empty sample instead of dividing by zero.
            fraction = float(size) / total if total else 0.0
            lastClass = labels.numClasses - 1
            for i in range(labels.numClasses):
                if i < lastClass:
                    numToSample = int(fraction * labels.classSize[i])
                else:
                    # The last class absorbs the rounding remainder.
                    numToSample = size - len(patterns)
                patterns.extend(draw(labels.classes[i], numToSample))
        else:
            patterns = draw(range(len(data)), size)
    elif isinstance(size, list):
        for i, count in enumerate(size):
            patterns.extend(members(i, count))
    elif isinstance(size, dict):
        for classLabel, count in size.items():
            patterns.extend(members(labels_of().classDict[classLabel], count))

    return data.__class__(data, patterns=patterns)
84
86 """
87 split a dataset into two.
88 randomly splits a dataset into two datasets whose sizes are determined
89 by the 'fraction' parameter (the first dataset will contain that fraction
90 of the examples).
91
92 for example:
93 train, test = splitDataset(data, 0.7)
94 will split the data -- 70% for training and 30% for test
95
96 :Parameters:
97 - `data` - a dataset object
98 - `fraction` - the fraction of the examples to put in the first split
99
100 :Keywords:
101 - `stratified` - whether to perform stratified splitting, i.e. whether to
102 keep the class ratio in the two datasets [default: True]
103 - `seed` - random number generator seed
104 - `indicesOnly` - if this flag is set, the indices of the two splits are
105 returned instead of the datasets [default: False]
106 """
107
108 if 'seed' in args :
109 seed = args['seed']
110 rand = random.Random(seed)
111 else :
112 rand = random.Random()
113
114 indicesOnly = False
115 if 'indicesOnly' in args :
116 indicesOnly = args['indicesOnly']
117
118 if data.__class__.__name__ == 'Labels' :
119 labels = data
120 else :
121 labels = data.labels
122
123 sampleSize = int(len(data) * fraction)
124
125 stratified = True
126 if 'stratified' in args :
127 stratified = args['stratified']
128
129 if stratified :
130 patterns = []
131 for i in range(labels.numClasses) :
132 if i < labels.numClasses - 1 :
133 numToSample = int(fraction * labels.classSize[i])
134 else :
135 numToSample = sampleSize - len(patterns)
136 I = labels.classes[i][:]
137 rand.shuffle(I)
138 patterns.extend(I[:numToSample])
139 else :
140 I = range(len(data))
141 rand.shuffle(I)
142 patterns = I[:sampleSize]
143 patterns.sort()
144
145 if not indicesOnly :
146 return (data.__class__(data, patterns = patterns),
147 data.__class__(data, patterns = misc.setminus(range(len(data)), patterns) ) )
148 else :
149 return patterns, misc.setminus(range(len(data)), patterns)
150
152 """
153 return a bootstrap sample from a dataset
154
155 :Parameters:
156 - `data` - a dataset object
157
158 :Keywords:
159 - `stratified` - whether to perform stratified bootstrapping, i.e. whether to
160 keep the class ratio
161 - `seed` - random number generator seed
162 """
163
164 if 'seed' in args :
165 seed = args['seed']
166 rand = random.Random(seed)
167 else :
168 rand = random.Random()
169 stratified = True
170 if 'stratified' in args :
171 stratified = args['stratified']
172 if not data.labels.isLabeled() :
173 stratified = False
174
175 if not stratified :
176 patterns = [rand.randint(0, len(data) - 1) for i in range(len(data))]
177 else :
178 patterns = []
179 for c in range(len(data.labels.classLabels)) :
180 classSize = len(data.labels.classes[c])
181 patterns.extend([data.labels.classes[c][rand.randint(0, classSize - 1)]
182 for i in range(classSize)])
183
184 patterns.sort()
185 return data.__class__(data, patterns = patterns)
186