| 1 | /* $Id$Revision: */ |
| 2 | /* vim:set shiftwidth=4 ts=8: */ |
| 3 | |
| 4 | /************************************************************************* |
| 5 | * Copyright (c) 2011 AT&T Intellectual Property |
| 6 | * All rights reserved. This program and the accompanying materials |
| 7 | * are made available under the terms of the Eclipse Public License v1.0 |
| 8 | * which accompanies this distribution, and is available at |
| 9 | * http://www.eclipse.org/legal/epl-v10.html |
| 10 | * |
| 11 | * Contributors: See CVS logs. Details at http://www.graphviz.org/ |
| 12 | *************************************************************************/ |
| 13 | |
| 14 | #define STANDALONE |
| 15 | #include "general.h" |
| 16 | #include "SparseMatrix.h" |
| 17 | #include "clustering.h" |
| 18 | |
| 19 | |
| 20 | |
| 21 | static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_init(SparseMatrix A, int level){ |
| 22 | Multilevel_Modularity_Clustering grid; |
| 23 | int n = A->n, i, j; |
| 24 | |
| 25 | assert(A->type == MATRIX_TYPE_REAL); |
| 26 | assert(SparseMatrix_is_symmetric(A, FALSE)); |
| 27 | |
| 28 | if (!A) return NULL; |
| 29 | assert(A->m == n); |
| 30 | grid = MALLOC(sizeof(struct Multilevel_Modularity_Clustering_struct)); |
| 31 | grid->level = level; |
| 32 | grid->n = n; |
| 33 | grid->A = A; |
| 34 | grid->P = NULL; |
| 35 | grid->R = NULL; |
| 36 | grid->next = NULL; |
| 37 | grid->prev = NULL; |
| 38 | grid->delete_top_level_A = FALSE; |
| 39 | grid->matching = MALLOC(sizeof(real)*(n)); |
| 40 | grid->deg = NULL; |
| 41 | grid->agglomerate_regardless = FALSE; |
| 42 | |
| 43 | if (level == 0){ |
| 44 | real modularity = 0; |
| 45 | int *ia = A->ia, *ja = A->ja, n = A->n; |
| 46 | real deg_total = 0; |
| 47 | real *deg, *a = (real*) (A->a); |
| 48 | real *indeg; |
| 49 | |
| 50 | grid->deg_total = 0.; |
| 51 | grid->deg = MALLOC(sizeof(real)*(n)); |
| 52 | deg = grid->deg; |
| 53 | |
| 54 | indeg = MALLOC(sizeof(real)*n); |
| 55 | for (i = 0; i < n; i++){ |
| 56 | deg[i] = 0; |
| 57 | indeg[i] = 0.; |
| 58 | for (j = ia[i]; j < ia[i+1]; j++){ |
| 59 | deg[i] += a[j]; |
| 60 | if (ja[j] == i) indeg[i] = a[j]; |
| 61 | } |
| 62 | deg_total += deg[i]; |
| 63 | } |
| 64 | if (deg_total == 0) deg_total = 1; |
| 65 | for (i = 0; i < n; i++){ |
| 66 | modularity += (indeg[i] - deg[i]*deg[i]/deg_total)/deg_total; |
| 67 | } |
| 68 | grid->deg_total = deg_total; |
| 69 | grid->deg = deg; |
| 70 | grid->modularity = modularity; |
| 71 | FREE(indeg); |
| 72 | } |
| 73 | |
| 74 | |
| 75 | return grid; |
| 76 | } |
| 77 | |
| 78 | static void Multilevel_Modularity_Clustering_delete(Multilevel_Modularity_Clustering grid){ |
| 79 | if (!grid) return; |
| 80 | if (grid->A){ |
| 81 | if (grid->level == 0) { |
| 82 | if (grid->delete_top_level_A) SparseMatrix_delete(grid->A); |
| 83 | } else { |
| 84 | SparseMatrix_delete(grid->A); |
| 85 | } |
| 86 | } |
| 87 | SparseMatrix_delete(grid->P); |
| 88 | SparseMatrix_delete(grid->R); |
| 89 | FREE(grid->matching); |
| 90 | FREE(grid->deg); |
| 91 | |
| 92 | Multilevel_Modularity_Clustering_delete(grid->next); |
| 93 | FREE(grid); |
| 94 | } |
| 95 | |
| 96 | static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_establish(Multilevel_Modularity_Clustering grid, int ncluster_target){ |
| 97 | int *matching = grid->matching; |
| 98 | SparseMatrix A = grid->A; |
| 99 | int n = grid->n, level = grid->level, nc = 0; |
| 100 | real modularity = 0; |
| 101 | int *ia = A->ia, *ja = A->ja; |
| 102 | real *a; |
| 103 | real *deg = grid->deg; |
| 104 | real *deg_new; |
| 105 | int i, j, jj, jc, jmax; |
| 106 | real inv_deg_total = 1./(grid->deg_total); |
| 107 | real *deg_inter, gain; |
| 108 | int *mask; |
| 109 | real maxgain; |
| 110 | real total_gain = 0; |
| 111 | modularity = grid->modularity; |
| 112 | |
| 113 | deg_new = MALLOC(sizeof(real)*n); |
| 114 | deg_inter = MALLOC(sizeof(real)*n); |
| 115 | mask = MALLOC(sizeof(int)*n); |
| 116 | for (i = 0; i < n; i++) mask[i] = -1; |
| 117 | |
| 118 | assert(n == A->n); |
| 119 | for (i = 0; i < n; i++) matching[i] = UNMATCHED; |
| 120 | |
| 121 | /* gain in merging node i into cluster j is |
| 122 | deg(i,j)/deg_total - 2*deg(i)*deg(j)/deg_total^2 |
| 123 | */ |
| 124 | a = (real*) A->a; |
| 125 | for (i = 0; i < n; i++){ |
| 126 | if (matching[i] != UNMATCHED) continue; |
| 127 | /* accumulate connections between i and clusters */ |
| 128 | for (j = ia[i]; j < ia[i+1]; j++){ |
| 129 | jj = ja[j]; |
| 130 | if (jj == i) continue; |
| 131 | if ((jc=matching[jj]) != UNMATCHED){ |
| 132 | if (mask[jc] != i) { |
| 133 | mask[jc] = i; |
| 134 | deg_inter[jc] = a[j]; |
| 135 | } else { |
| 136 | deg_inter[jc] += a[j]; |
| 137 | } |
| 138 | } |
| 139 | } |
| 140 | |
| 141 | maxgain = 0; |
| 142 | jmax = -1; |
| 143 | for (j = ia[i]; j < ia[i+1]; j++){ |
| 144 | jj = ja[j]; |
| 145 | if (jj == i) continue; |
| 146 | if ((jc=matching[jj]) == UNMATCHED){ |
| 147 | /* the first 2 is due to the fact that deg_iter gives edge weight from i to jj, but there are also edges from jj to i */ |
| 148 | gain = (2*a[j] - 2*deg[i]*deg[jj]*inv_deg_total)*inv_deg_total; |
| 149 | } else { |
| 150 | if (deg_inter[jc] > 0){ |
| 151 | /* the first 2 is due to the fact that deg_iter gives edge weight from i to jc, but there are also edges from jc to i */ |
| 152 | gain = (2*deg_inter[jc] - 2*deg[i]*deg_new[jc]*inv_deg_total)*inv_deg_total; |
| 153 | // printf("mod = %f deg_inter[jc] =%f, deg[i] = %f, deg_new[jc]=%f, gain = %f\n",modularity, deg_inter[jc],deg[i],deg_new[jc],gain); |
| 154 | deg_inter[jc] = -1; /* so that we do not redo the calulation when we hit another neighbor in cluster jc */ |
| 155 | } else { |
| 156 | gain = -1; |
| 157 | } |
| 158 | } |
| 159 | if (jmax < 0 || gain > maxgain){ |
| 160 | maxgain = gain; |
| 161 | jmax = jj; |
| 162 | } |
| 163 | |
| 164 | } |
| 165 | |
| 166 | /* now merge i and jmax */ |
| 167 | if (maxgain > 0 || grid->agglomerate_regardless){ |
| 168 | total_gain += maxgain; |
| 169 | jc = matching[jmax]; |
| 170 | if (jc == UNMATCHED){ |
| 171 | //fprintf(stderr, "maxgain=%f, merge %d, %d\n",maxgain, i, jmax); |
| 172 | matching[i] = matching[jmax] = nc; |
| 173 | deg_new[nc] = deg[i] + deg[jmax]; |
| 174 | nc++; |
| 175 | } else { |
| 176 | //fprintf(stderr, "maxgain=%f, merge with existing cluster %d, %d\n",maxgain, i, jc); |
| 177 | deg_new[jc] += deg[i]; |
| 178 | matching[i] = jc; |
| 179 | } |
| 180 | } else { |
| 181 | assert(maxgain <= 0); |
| 182 | matching[i] = nc; |
| 183 | deg_new[nc] = deg[i]; |
| 184 | nc++; |
| 185 | } |
| 186 | |
| 187 | } |
| 188 | |
| 189 | if (Verbose) fprintf(stderr,"modularity = %f new modularity = %f level = %d, n = %d, nc = %d, gain = %g\n" , modularity, modularity + total_gain, |
| 190 | level, n, nc, total_gain); |
| 191 | |
| 192 | /* !!!!!!!!!!!!!!!!!!!!!! */ |
| 193 | if (ncluster_target > 0){ |
| 194 | if (nc <= ncluster_target && n >= ncluster_target){ |
| 195 | if (n - ncluster_target > ncluster_target - nc){/* ncluster = nc */ |
| 196 | |
| 197 | } else if (n - ncluster_target <= ncluster_target - nc){/* ncluster_target close to n */ |
| 198 | fprintf(stderr,"ncluster_target = %d, close to n=%d\n" , ncluster_target, n); |
| 199 | for (i = 0; i < n; i++) matching[i] = i; |
| 200 | FREE(deg_new); |
| 201 | goto RETURN; |
| 202 | } |
| 203 | } else if (n < ncluster_target){ |
| 204 | fprintf(stderr,"n < target\n" ); |
| 205 | for (i = 0; i < n; i++) matching[i] = i; |
| 206 | FREE(deg_new); |
| 207 | goto RETURN; |
| 208 | } |
| 209 | } |
| 210 | |
| 211 | if (nc >= 1 && (total_gain > 0 || nc < n)){ |
| 212 | /* now set up restriction and prolongation operator */ |
| 213 | SparseMatrix P, R, R0, B, cA; |
| 214 | real one = 1.; |
| 215 | Multilevel_Modularity_Clustering cgrid; |
| 216 | |
| 217 | R0 = SparseMatrix_new(nc, n, 1, MATRIX_TYPE_REAL, FORMAT_COORD); |
| 218 | for (i = 0; i < n; i++){ |
| 219 | jj = matching[i]; |
| 220 | SparseMatrix_coordinate_form_add_entries(R0, 1, &jj, &i, &one); |
| 221 | } |
| 222 | R = SparseMatrix_from_coordinate_format(R0); |
| 223 | SparseMatrix_delete(R0); |
| 224 | P = SparseMatrix_transpose(R); |
| 225 | B = SparseMatrix_multiply(R, A); |
| 226 | if (!B) goto RETURN; |
| 227 | cA = SparseMatrix_multiply(B, P); |
| 228 | if (!cA) goto RETURN; |
| 229 | SparseMatrix_delete(B); |
| 230 | grid->P = P; |
| 231 | grid->R = R; |
| 232 | level++; |
| 233 | cgrid = Multilevel_Modularity_Clustering_init(cA, level); |
| 234 | deg_new = REALLOC(deg_new, nc*sizeof(real)); |
| 235 | cgrid->deg = deg_new; |
| 236 | cgrid->modularity = grid->modularity + total_gain; |
| 237 | cgrid->deg_total = grid->deg_total; |
| 238 | cgrid = Multilevel_Modularity_Clustering_establish(cgrid, ncluster_target); |
| 239 | grid->next = cgrid; |
| 240 | cgrid->prev = grid; |
| 241 | } else { |
| 242 | /* if we want a small number of cluster but right now we have too many, we will force agglomeration */ |
| 243 | if (ncluster_target > 0 && nc > ncluster_target && !(grid->agglomerate_regardless)){ |
| 244 | grid->agglomerate_regardless = TRUE; |
| 245 | FREE(deg_inter); |
| 246 | FREE(mask); |
| 247 | FREE(deg_new); |
| 248 | return Multilevel_Modularity_Clustering_establish(grid, ncluster_target); |
| 249 | } |
| 250 | /* no more improvement, stop and final clustering found */ |
| 251 | for (i = 0; i < n; i++) matching[i] = i; |
| 252 | FREE(deg_new); |
| 253 | } |
| 254 | |
| 255 | RETURN: |
| 256 | FREE(deg_inter); |
| 257 | FREE(mask); |
| 258 | return grid; |
| 259 | } |
| 260 | |
| 261 | static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_new(SparseMatrix A0, int ncluster_target){ |
| 262 | /* ncluster_target is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters |
| 263 | is desired. The resulting clustering will give as close to this number as possible. |
| 264 | If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity. |
| 265 | . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2, |
| 266 | . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be |
| 267 | . selected among nc or nc2, which ever is closer to ncluster_target. |
| 268 | Default: ncluster_target <= 0 */ |
| 269 | |
| 270 | Multilevel_Modularity_Clustering grid; |
| 271 | SparseMatrix A = A0; |
| 272 | |
| 273 | if (!SparseMatrix_is_symmetric(A, FALSE) || A->type != MATRIX_TYPE_REAL){ |
| 274 | A = SparseMatrix_get_real_adjacency_matrix_symmetrized(A); |
| 275 | } |
| 276 | grid = Multilevel_Modularity_Clustering_init(A, 0); |
| 277 | |
| 278 | grid = Multilevel_Modularity_Clustering_establish(grid, ncluster_target); |
| 279 | |
| 280 | if (A != A0) grid->delete_top_level_A = TRUE;/* be sure to clean up later */ |
| 281 | return grid; |
| 282 | } |
| 283 | |
| 284 | |
| 285 | static void hierachical_modularity_clustering(SparseMatrix A, int ncluster_target, |
| 286 | int *nclusters, int **assignment, real *modularity, int *flag){ |
| 287 | /* find a clustering of vertices by maximize modularity |
| 288 | A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1. |
| 289 | |
| 290 | ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters |
| 291 | is desired. The resulting clustering will give as close to this number as possible. |
| 292 | If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity. |
| 293 | . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2, |
| 294 | . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be |
| 295 | . selected among nc or nc2, which ever is closer to ncluster_target. |
| 296 | Default: ncluster_target <= 0 |
| 297 | |
| 298 | nclusters: on output the number of clusters |
| 299 | assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters |
| 300 | */ |
| 301 | |
| 302 | Multilevel_Modularity_Clustering grid, cgrid; |
| 303 | int *matching, i; |
| 304 | SparseMatrix P; |
| 305 | real *u; |
| 306 | assert(A->m == A->n); |
| 307 | |
| 308 | *modularity = 0.; |
| 309 | |
| 310 | *flag = 0; |
| 311 | |
| 312 | grid = Multilevel_Modularity_Clustering_new(A, ncluster_target); |
| 313 | |
| 314 | /* find coarsest */ |
| 315 | cgrid = grid; |
| 316 | while (cgrid->next){ |
| 317 | cgrid = cgrid->next; |
| 318 | } |
| 319 | |
| 320 | /* project clustering up */ |
| 321 | u = MALLOC(sizeof(real)*cgrid->n); |
| 322 | for (i = 0; i < cgrid->n; i++) u[i] = (real) (cgrid->matching)[i]; |
| 323 | *nclusters = cgrid->n; |
| 324 | *modularity = cgrid->modularity; |
| 325 | |
| 326 | while (cgrid->prev){ |
| 327 | real *v = NULL; |
| 328 | P = cgrid->prev->P; |
| 329 | SparseMatrix_multiply_vector(P, u, &v, FALSE); |
| 330 | FREE(u); |
| 331 | u = v; |
| 332 | cgrid = cgrid->prev; |
| 333 | } |
| 334 | |
| 335 | if (*assignment){ |
| 336 | matching = *assignment; |
| 337 | } else { |
| 338 | matching = MALLOC(sizeof(int)*(grid->n)); |
| 339 | *assignment = matching; |
| 340 | } |
| 341 | for (i = 0; i < grid->n; i++) (matching)[i] = (int) u[i]; |
| 342 | FREE(u); |
| 343 | |
| 344 | Multilevel_Modularity_Clustering_delete(grid); |
| 345 | |
| 346 | } |
| 347 | |
| 348 | |
| 349 | |
| 350 | void modularity_clustering(SparseMatrix A, int inplace, int ncluster_target, int use_value, |
| 351 | int *nclusters, int **assignment, real *modularity, int *flag){ |
| 352 | /* find a clustering of vertices by maximize modularity |
| 353 | A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1. |
| 354 | inplace: whether A can e modified. If true, A will be modified by removing diagonal. |
| 355 | ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters |
| 356 | is desired. The resulting clustering will give as close to this number as possible. |
| 357 | If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity. |
| 358 | . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2, |
| 359 | . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be |
| 360 | . selected among nc or nc2, which ever is closer to ncluster_target. |
| 361 | Default: ncluster_target <= 0 |
| 362 | nclusters: on output the number of clusters |
| 363 | assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters |
| 364 | */ |
| 365 | SparseMatrix B; |
| 366 | |
| 367 | *flag = 0; |
| 368 | |
| 369 | assert(A->m == A->n); |
| 370 | |
| 371 | B = SparseMatrix_symmetrize(A, FALSE); |
| 372 | |
| 373 | if (!inplace && B == A) { |
| 374 | B = SparseMatrix_copy(A); |
| 375 | } |
| 376 | |
| 377 | B = SparseMatrix_remove_diagonal(B); |
| 378 | |
| 379 | if (B->type != MATRIX_TYPE_REAL || !use_value) B = SparseMatrix_set_entries_to_real_one(B); |
| 380 | |
| 381 | hierachical_modularity_clustering(B, ncluster_target, nclusters, assignment, modularity, flag); |
| 382 | |
| 383 | if (B != A) SparseMatrix_delete(B); |
| 384 | |
| 385 | } |
| 386 | |