Line | Hits | Source |
---|---|---|
1 | /* | |
2 | * Copyright (c) 2004, the JUNG Project and the Regents of the University | |
3 | * of California | |
4 | * All rights reserved. | |
5 | * | |
6 | * This software is open-source under the BSD license; see either | |
7 | * "license.txt" or | |
8 | * http://jung.sourceforge.net/license.txt for a description. | |
9 | * | |
10 | * Created on Aug 12, 2004 | |
11 | */ | |
12 | package edu.uci.ics.jung.algorithms.cluster; | |
13 | ||
14 | import java.util.Arrays; | |
15 | import java.util.Collection; | |
16 | import java.util.Comparator; | |
17 | import java.util.HashMap; | |
18 | import java.util.HashSet; | |
19 | import java.util.Iterator; | |
20 | import java.util.LinkedList; | |
21 | import java.util.Map; | |
22 | import java.util.Set; | |
23 | ||
24 | import cern.jet.random.engine.DRand; | |
25 | import cern.jet.random.engine.RandomEngine; | |
26 | import edu.uci.ics.jung.algorithms.cluster.KMeansClusterer.NotEnoughClustersException; | |
27 | import edu.uci.ics.jung.algorithms.importance.VoltageRanker; | |
28 | import edu.uci.ics.jung.graph.ArchetypeGraph; | |
29 | import edu.uci.ics.jung.graph.ArchetypeVertex; | |
30 | import edu.uci.ics.jung.graph.Vertex; | |
31 | import edu.uci.ics.jung.graph.decorators.UserDatumNumberVertexValue; | |
32 | import edu.uci.ics.jung.statistics.DiscreteDistribution; | |
33 | ||
34 | /** | |
35 | * <p>Clusters vertices of a <code>Graph</code> based on their ranks as | |
36 | * calculated by <code>VoltageRanker</code>. This algorithm is based on, | |
37 | * but not identical with, the method described in the paper below. | |
38 | * The primary difference is that Wu and Huberman assume a priori that the clusters | |
39 | * are of approximately the same size, and therefore use a more complex | |
40 | * method than k-means (which is used here) for determining cluster | |
41 | * membership based on co-occurrence data.</p> | |
42 | * | |
43 | * <p>The algorithm proceeds as follows: | |
44 | * <ul> | |
45 | * <li/>first, generate a set of candidate clusters as follows: | |
46 | * <ul> | |
47 | * <li/>pick (widely separated) vertex pair, run VoltageRanker | |
48 | * <li/>group the vertices in two clusters according to their voltages | |
49 | * <li/>store resulting candidate clusters | |
50 | * </ul> | |
51 | * <li/>second, generate k-1 clusters as follows: | |
52 | * <ul> | |
53 | * <li/>pick a vertex v as a cluster 'seed' | |
54 | * <br>(Wu/Huberman: most frequent vertex in candidate clusters) | |
55 | * <li/>calculate co-occurrence over all candidate clusters of v with each other | |
56 | * vertex | |
57 | * <li/>separate co-occurrence counts into high/low; | |
58 | * high vertices constitute a cluster | |
59 | * <li/>remove v's vertices from candidate clusters; continue | |
60 | * </ul> | |
61 | * <li/>finally, remaining unassigned vertices are assigned to the kth ("garbage") | |
62 | * cluster. | |
63 | * </ul></p> | |
64 | * | |
65 | * <p><b>NOTE</b>: Depending on how the co-occurrence data splits the data into | |
66 | * clusters, the number of clusters returned by this algorithm may be less than the | |
67 | * number of clusters requested. The number of clusters will never be more than | |
68 | * the number requested, however.</p> | |
69 | * | |
70 | * @author Joshua O'Madadhain | |
71 | * @see <a href="http://www.hpl.hp.com/research/idl/papers/linear/">'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman</a> | |
72 | * @see VoltageRanker | |
73 | * @see KMeansClusterer | |
74 | */ | |
75 | public class VoltageClusterer | |
76 | { | |
77 | public final static String VOLTAGE_KEY = "edu.uci.ics.jung.algorithms.cluster.VoltageClusterer.KEY"; | |
78 | ||
79 | protected int num_candidates; | |
80 | protected KMeansClusterer kmc; | |
81 | protected UserDatumNumberVertexValue vv; | |
82 | protected VoltageRanker vr; | |
83 | protected RandomEngine rand; | |
84 | ||
85 | /** | |
86 | * Creates an instance of a VoltageCluster with the specified parameters. | |
87 | * These are mostly parameters that are passed directly to VoltageRanker | |
88 | * and KMeansClusterer. | |
89 | * | |
90 | * @param num_candidates the number of candidate clusters to create | |
91 | * @param rank_iterations the number of iterations to run VoltageRanker | |
92 | * @param cluster_iterations the number of iterations to run KMeansClusterer | |
93 | * @param cluster_convergence the convergence value for KMeansClusterer | |
94 | */ | |
95 | public VoltageClusterer(int num_candidates, int rank_iterations, | |
96 | double rank_convergence, int cluster_iterations, double cluster_convergence) | |
97 | 0 | { |
98 | 0 | if (num_candidates < 1) |
99 | 0 | throw new IllegalArgumentException("must generate >=1 candidates"); |
100 | ||
101 | 0 | this.num_candidates = num_candidates; |
102 | 0 | this.vv = new UserDatumNumberVertexValue(VOLTAGE_KEY); |
103 | 0 | this.vr = new VoltageRanker(vv, rank_iterations, rank_convergence); |
104 | 0 | this.kmc = new KMeansClusterer(cluster_iterations, cluster_convergence); |
105 | 0 | rand = new DRand(); |
106 | 0 | } |
107 | ||
108 | protected void setRandomSeed(int random_seed) | |
109 | { | |
110 | 0 | rand = new DRand(random_seed); |
111 | 0 | } |
112 | ||
113 | /** | |
114 | * Returns a community (cluster) centered around <code>v</code>. | |
115 | * @param v the vertex whose community we wish to discover | |
116 | */ | |
117 | public Collection getCommunity(ArchetypeVertex v) | |
118 | { | |
119 | 0 | return cluster_internal(v.getGraph(), v, 2); |
120 | } | |
121 | ||
122 | /** | |
123 | * Clusters the vertices of <code>g</code> into | |
124 | * <code>num_clusters</code> clusters, based on their connectivity. | |
125 | * @param g the graph whose vertices are to be clustered | |
126 | * @param num_clusters the number of clusters to identify | |
127 | */ | |
128 | public Collection cluster(ArchetypeGraph g, int num_clusters) | |
129 | { | |
130 | 0 | return cluster_internal(g, null, num_clusters); |
131 | } | |
132 | ||
133 | /** | |
134 | * Clears the voltage decoration values from the vertices of <code>g</code>. | |
135 | */ | |
136 | public void clear(ArchetypeGraph g) | |
137 | { | |
138 | 0 | vv.clear(g); |
139 | 0 | } |
140 | ||
141 | /** | |
142 | * Does the work of <code>getCommunity</code> and <code>cluster</code>. | |
143 | * @param g the graph whose vertices we're clustering | |
144 | * @param origin the center (if one exists) of the graph to cluster | |
145 | * @param num_clusters | |
146 | */ | |
147 | protected Collection cluster_internal(ArchetypeGraph g, | |
148 | ArchetypeVertex origin, int num_clusters) | |
149 | { | |
150 | // generate candidate clusters | |
151 | // repeat the following 'samples' times: | |
152 | // * pick (widely separated) vertex pair, run VoltageRanker | |
153 | // * use k-means to identify 2 communities in ranked graph | |
154 | // * store resulting candidate communities | |
155 | 0 | Set vertices = g.getVertices(); |
156 | 0 | int num_vertices = vertices.size(); |
157 | 0 | ArchetypeVertex[] v = new ArchetypeVertex[num_vertices]; |
158 | 0 | int i = 0; |
159 | 0 | for (Iterator iter = vertices.iterator(); iter.hasNext(); ) |
160 | 0 | v[i++] = (ArchetypeVertex)iter.next(); |
161 | ||
162 | 0 | LinkedList candidates = new LinkedList(); |
163 | ||
164 | 0 | for (int j = 0; j < num_candidates; j++) |
165 | { | |
166 | ArchetypeVertex source; | |
167 | 0 | if (origin == null) |
168 | 0 | source = v[(int)(rand.nextDouble() * num_vertices)]; |
169 | else | |
170 | 0 | source = origin; |
171 | 0 | ArchetypeVertex target = null; |
172 | do | |
173 | { | |
174 | 0 | target = v[(int)(rand.nextDouble() * num_vertices)]; |
175 | } | |
176 | 0 | while (source == target); |
177 | 0 | vr.calculateVoltages((Vertex)source, (Vertex)target); |
178 | ||
179 | 0 | Map voltage_ranks = new HashMap(); |
180 | 0 | for (Iterator iter = vertices.iterator(); iter.hasNext(); ) |
181 | { | |
182 | 0 | ArchetypeVertex w = (ArchetypeVertex)iter.next(); |
183 | 0 | double[] voltage = {vv.getNumber(w).doubleValue()}; |
184 | 0 | voltage_ranks.put(w, voltage); |
185 | } | |
186 | ||
187 | // Map clusterMap; | |
188 | Collection clusters; | |
189 | try | |
190 | { | |
191 | 0 | clusters = kmc.cluster(voltage_ranks, 2); |
192 | 0 | Iterator iter = clusters.iterator(); |
193 | 0 | candidates.add(((Map)iter.next()).keySet()); |
194 | 0 | candidates.add(((Map)iter.next()).keySet()); |
195 | // candidates.addAll(clusters); | |
196 | } | |
197 | 0 | catch (NotEnoughClustersException e) |
198 | { | |
199 | // ignore this candidate, continue | |
200 | 0 | } |
201 | } | |
202 | ||
203 | // repeat the following k-1 times: | |
204 | // * pick a vertex v as a cluster seed | |
205 | // (Wu/Huberman: most frequent vertex in candidates) | |
206 | // * calculate co-occurrence (in candidate clusters) | |
207 | // of this vertex with all others | |
208 | // * use k-means to separate co-occurrence counts into high/low; | |
209 | // high vertices are a cluster | |
210 | // * remove v's vertices from candidate clusters | |
211 | ||
212 | 0 | Collection clusters = new LinkedList(); |
213 | 0 | Collection remaining = new HashSet(g.getVertices()); |
214 | 0 | Object[] seed_candidates = getSeedCandidates(candidates); |
215 | 0 | int seed_index = 0; |
216 | ||
217 | 0 | for (int j = 0; j < (num_clusters - 1); j++) |
218 | { | |
219 | 0 | if (remaining.isEmpty()) |
220 | 0 | break; |
221 | ||
222 | Object seed; | |
223 | 0 | if (seed_index == 0 && origin != null) |
224 | 0 | seed = origin; |
225 | else | |
226 | { | |
227 | 0 | do { seed = seed_candidates[seed_index++]; } |
228 | 0 | while (!remaining.contains(seed)); |
229 | } | |
230 | ||
231 | 0 | Map occur_counts = getObjectCounts(candidates, seed); |
232 | 0 | if (occur_counts.size() < 2) |
233 | 0 | break; |
234 | ||
235 | // now that we have the counts, cluster them... | |
236 | try | |
237 | { | |
238 | 0 | Collection high_low = kmc.cluster(occur_counts, 2); |
239 | // ...get the cluster with the highest-valued centroid... | |
240 | 0 | Iterator h_iter = high_low.iterator(); |
241 | 0 | Map cluster1 = (Map)h_iter.next(); |
242 | 0 | Map cluster2 = (Map)h_iter.next(); |
243 | 0 | double[] centroid1 = DiscreteDistribution.mean(cluster1.values()); |
244 | 0 | double[] centroid2 = DiscreteDistribution.mean(cluster2.values()); |
245 | // double[] centroid1 = (double[])h_iter.next(); | |
246 | // double[] centroid2 = (double[])h_iter.next(); | |
247 | Collection new_cluster; | |
248 | 0 | if (centroid1[0] >= centroid2[0]) |
249 | 0 | new_cluster = cluster1.keySet(); |
250 | else | |
251 | 0 | new_cluster = cluster2.keySet(); |
252 | ||
253 | // ...remove the elements of new_cluster from each candidate... | |
254 | 0 | for (Iterator iter = candidates.iterator(); iter.hasNext(); ) |
255 | { | |
256 | 0 | Collection cluster = (Collection)iter.next(); |
257 | 0 | cluster.removeAll(new_cluster); |
258 | } | |
259 | 0 | clusters.add(new_cluster); |
260 | 0 | remaining.removeAll(new_cluster); |
261 | } | |
262 | 0 | catch (NotEnoughClustersException nece) |
263 | { | |
264 | // all remaining vertices are in the same cluster | |
265 | 0 | break; |
266 | 0 | } |
267 | } | |
268 | ||
269 | // identify remaining vertices (if any) as a 'garbage' cluster | |
270 | 0 | if (!remaining.isEmpty()) |
271 | 0 | clusters.add(remaining); |
272 | ||
273 | 0 | return clusters; |
274 | } | |
275 | ||
276 | protected Object getRandomElement(Collection c) | |
277 | { | |
278 | 0 | return c.toArray()[(int)(rand.nextDouble() * c.size())]; |
279 | } | |
280 | ||
281 | /** | |
282 | * Returns an array of cluster seeds, ranked in decreasing order | |
283 | * of number of appearances in the specified collection of candidate | |
284 | * clusters. | |
285 | * @param candidates | |
286 | */ | |
287 | protected Object[] getSeedCandidates(Collection candidates) | |
288 | { | |
289 | 0 | final Map occur_counts = getObjectCounts(candidates, null); |
290 | ||
291 | 0 | Object[] occurrences = occur_counts.keySet().toArray(); |
292 | 0 | Arrays.sort(occurrences, new Comparator() |
293 | { | |
294 | public int compare(Object arg0, Object arg1) | |
295 | { | |
296 | double[] count0 = (double[])occur_counts.get(arg0); | |
297 | double[] count1 = (double[])occur_counts.get(arg1); | |
298 | if (count0[0] < count1[0]) | |
299 | return -1; | |
300 | else if (count0[0] > count1[0]) | |
301 | return 1; | |
302 | else | |
303 | return 0; | |
304 | } | |
305 | }); | |
306 | 0 | return occurrences; |
307 | } | |
308 | ||
309 | protected Map getObjectCounts(Collection candidates, Object seed) | |
310 | { | |
311 | 0 | Map occur_counts = new HashMap(); |
312 | 0 | for (Iterator iter = candidates.iterator(); iter.hasNext(); ) |
313 | { | |
314 | 0 | Collection candidate = (Collection)iter.next(); |
315 | 0 | if (seed == null || candidate.contains(seed)) |
316 | { | |
317 | 0 | for (Iterator c_iter = candidate.iterator(); c_iter.hasNext(); ) |
318 | { | |
319 | 0 | Object element = c_iter.next(); |
320 | 0 | double[] count = (double[])occur_counts.get(element); |
321 | 0 | if (count == null) |
322 | { | |
323 | 0 | count = new double[1]; |
324 | 0 | occur_counts.put(element, count); |
325 | } | |
326 | 0 | else count[0]++; |
327 | } | |
328 | } | |
329 | } | |
330 | ||
331 | 0 | return occur_counts; |
332 | } | |
333 | } |
this report was generated by version 1.0.5 of jcoverage. |
copyright © 2003, jcoverage ltd. all rights reserved. |