[37] | 1 | //---------------------------------------------------------------------- |
---|
| 2 | // File: rand.cpp |
---|
| 3 | // Programmer: Sunil Arya and David Mount |
---|
| 4 | // Description: Routines for random point generation |
---|
| 5 | // Last modified: 08/04/06 (Version 1.1.1) |
---|
| 6 | //---------------------------------------------------------------------- |
---|
| 7 | // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and |
---|
| 8 | // David Mount. All Rights Reserved. |
---|
| 9 | // |
---|
| 10 | // This software and related documentation is part of the Approximate |
---|
| 11 | // Nearest Neighbor Library (ANN). This software is provided under |
---|
| 12 | // the provisions of the Lesser GNU Public License (LGPL). See the |
---|
| 13 | // file ../ReadMe.txt for further information. |
---|
| 14 | // |
---|
| 15 | // The University of Maryland (U.M.) and the authors make no |
---|
| 16 | // representations about the suitability or fitness of this software for |
---|
| 17 | // any purpose. It is provided "as is" without express or implied |
---|
| 18 | // warranty. |
---|
| 19 | //---------------------------------------------------------------------- |
---|
| 20 | // History: |
---|
| 21 | // Revision 0.1 03/04/98 |
---|
| 22 | // Initial release |
---|
| 23 | // Revision 0.2 03/26/98 |
---|
| 24 | // Changed random/srandom declarations for SGI's. |
---|
| 25 | // Revision 1.0 04/01/05 |
---|
| 26 | // annClusGauss centers distributed over [-1,1] rather than [0,1] |
---|
| 27 | // Added annClusOrthFlats distribution |
---|
| 28 | // Changed procedure names to avoid namespace conflicts |
---|
| 29 | // Added annClusFlats distribution |
---|
| 30 | // Added rand/srand option and fixed annRan0() initialization. |
---|
| 31 | // Revision 1.1.1 08/04/06 |
---|
| 32 | // Added planted distribution |
---|
| 33 | //---------------------------------------------------------------------- |
---|
| 34 | |
---|
| 35 | #include "rand.h" // random generator declarations |
---|
| 36 | |
---|
| 37 | using namespace std; // make std:: accessible |
---|
| 38 | |
---|
| 39 | //---------------------------------------------------------------------- |
---|
| 40 | // Globals |
---|
| 41 | //---------------------------------------------------------------------- |
---|
| 42 | int annIdum = 0; // used for random number generation |
---|
| 43 | |
---|
| 44 | //------------------------------------------------------------------------ |
---|
| 45 | // annRan0 - (safer) uniform random number generator |
---|
| 46 | // |
---|
| 47 | // The code given here is taken from "Numerical Recipes in C" by |
---|
| 48 | // William Press, Brian Flannery, Saul Teukolsky, and William |
---|
| 49 | // Vetterling. The task of the code is to do an additional randomizing |
---|
| 50 | // shuffle on the system-supplied random number generator to make it |
---|
| 51 | // safer to use. |
---|
| 52 | // |
---|
| 53 | // Returns a uniform deviate between 0.0 and 1.0 using the |
---|
| 54 | // system-supplied routine random() or rand(). Set the global |
---|
| 55 | // annIdum to any negative value to initialise or reinitialise |
---|
| 56 | // the sequence. |
---|
| 57 | //------------------------------------------------------------------------ |
---|
| 58 | |
---|
| 59 | double annRan0() |
---|
| 60 | { |
---|
| 61 | const int TAB_SIZE = 97; // table size: any large number |
---|
| 62 | int j; |
---|
| 63 | |
---|
| 64 | static double y, v[TAB_SIZE]; |
---|
| 65 | static int iff = 0; |
---|
| 66 | const double RAN_DIVISOR = double(ANN_RAND_MAX + 1UL); |
---|
| 67 | if (RAN_DIVISOR < 0) { |
---|
| 68 | cout << "RAN_DIVISOR " << RAN_DIVISOR << endl; |
---|
| 69 | exit(0); |
---|
| 70 | } |
---|
| 71 | |
---|
| 72 | //-------------------------------------------------------------------- |
---|
| 73 | // As a precaution against misuse, we will always initialize on the |
---|
| 74 | // first call, even if "annIdum" is not set negative. Determine |
---|
| 75 | // "maxran", the next integer after the largest representable value |
---|
| 76 | // of type int. We assume this is a factor of 2 smaller than the |
---|
| 77 | // corresponding value of type unsigned int. |
---|
| 78 | //-------------------------------------------------------------------- |
---|
| 79 | |
---|
| 80 | if (annIdum < 0 || iff == 0) { // initialize |
---|
| 81 | iff = 1; |
---|
| 82 | ANN_SRAND(annIdum); // (re)seed the generator |
---|
| 83 | annIdum = 1; |
---|
| 84 | |
---|
| 85 | for (j = 0; j < TAB_SIZE; j++) // exercise the system routine |
---|
| 86 | ANN_RAND(); // (values intentionally ignored) |
---|
| 87 | |
---|
| 88 | for (j = 0; j < TAB_SIZE; j++) // then save TAB_SIZE-1 values |
---|
| 89 | v[j] = ANN_RAND(); |
---|
| 90 | y = ANN_RAND(); // generate starting value |
---|
| 91 | } |
---|
| 92 | |
---|
| 93 | //-------------------------------------------------------------------- |
---|
| 94 | // This is where we start if not initializing. Use the previously |
---|
| 95 | // saved random number y to get an index j between 1 and TAB_SIZE-1. |
---|
| 96 | // Then use the corresponding v[j] for both the next j and as the |
---|
| 97 | // output number. |
---|
| 98 | //-------------------------------------------------------------------- |
---|
| 99 | |
---|
| 100 | j = int(TAB_SIZE * (y / RAN_DIVISOR)); |
---|
| 101 | y = v[j]; |
---|
| 102 | v[j] = ANN_RAND(); // refill the table entry |
---|
| 103 | return y / RAN_DIVISOR; |
---|
| 104 | } |
---|
| 105 | |
---|
| 106 | //------------------------------------------------------------------------ |
---|
| 107 | // annRanInt - generate a random integer from {0,1,...,n-1} |
---|
| 108 | // |
---|
| 109 | // If n == 0, then -1 is returned. |
---|
| 110 | //------------------------------------------------------------------------ |
---|
| 111 | |
---|
| 112 | static int annRanInt( |
---|
| 113 | int n) |
---|
| 114 | { |
---|
| 115 | int r = (int) (annRan0()*n); |
---|
| 116 | if (r == n) r--; // (in case annRan0() == 1 or n == 0) |
---|
| 117 | return r; |
---|
| 118 | } |
---|
| 119 | |
---|
| 120 | //------------------------------------------------------------------------ |
---|
| 121 | // annRanUnif - generate a random uniform in [lo,hi] |
---|
| 122 | //------------------------------------------------------------------------ |
---|
| 123 | |
---|
| 124 | static double annRanUnif( |
---|
| 125 | double lo, |
---|
| 126 | double hi) |
---|
| 127 | { |
---|
| 128 | return annRan0()*(hi-lo) + lo; |
---|
| 129 | } |
---|
| 130 | |
---|
| 131 | //------------------------------------------------------------------------ |
---|
| 132 | // annRanGauss - Gaussian random number generator |
---|
| 133 | // Returns a normally distributed deviate with zero mean and unit |
---|
| 134 | // variance, using annRan0() as the source of uniform deviates. |
---|
| 135 | //------------------------------------------------------------------------ |
---|
| 136 | |
---|
| 137 | static double annRanGauss() |
---|
| 138 | { |
---|
| 139 | static int iset=0; |
---|
| 140 | static double gset; |
---|
| 141 | |
---|
| 142 | if (iset == 0) { // we don't have a deviate handy |
---|
| 143 | double v1, v2; |
---|
| 144 | double r = 2.0; |
---|
| 145 | while (r >= 1.0) { |
---|
| 146 | //------------------------------------------------------------ |
---|
| 147 | // Pick two uniform numbers in the square extending from -1 to |
---|
| 148 | // +1 in each direction, see if they are in the circle of radius |
---|
| 149 | // 1. If not, try again |
---|
| 150 | //------------------------------------------------------------ |
---|
| 151 | v1 = annRanUnif(-1, 1); |
---|
| 152 | v2 = annRanUnif(-1, 1); |
---|
| 153 | r = v1 * v1 + v2 * v2; |
---|
| 154 | } |
---|
| 155 | double fac = sqrt(-2.0 * log(r) / r); |
---|
| 156 | //----------------------------------------------------------------- |
---|
| 157 | // Now make the Box-Muller transformation to get two normal |
---|
| 158 | // deviates. Return one and save the other for next time. |
---|
| 159 | //----------------------------------------------------------------- |
---|
| 160 | gset = v1 * fac; |
---|
| 161 | iset = 1; // set flag |
---|
| 162 | return v2 * fac; |
---|
| 163 | } |
---|
| 164 | else { // we have an extra deviate handy |
---|
| 165 | iset = 0; // so unset the flag |
---|
| 166 | return gset; // and return it |
---|
| 167 | } |
---|
| 168 | } |
---|
| 169 | |
---|
| 170 | //------------------------------------------------------------------------ |
---|
| 171 | // annRanLaplace - Laplacian random number generator |
---|
| 172 | // Returns a Laplacian distributed deviate with zero mean and |
---|
| 173 | // unit variance, using annRan0() as the source of uniform deviates. |
---|
| 174 | // |
---|
| 175 | // prob(x) = b/2 * exp(-b * |x|). |
---|
| 176 | // |
---|
| 177 | // b is chosen to be sqrt(2.0) so that the variance of the Laplacian |
---|
| 178 | // distribution [2/(b^2)] becomes 1. |
---|
| 179 | //------------------------------------------------------------------------ |
---|
| 180 | |
---|
| 181 | static double annRanLaplace() |
---|
| 182 | { |
---|
| 183 | const double b = 1.4142136; |
---|
| 184 | |
---|
| 185 | double laprand = -log(annRan0()) / b; |
---|
| 186 | double sign = annRan0(); |
---|
| 187 | if (sign < 0.5) laprand = -laprand; |
---|
| 188 | return(laprand); |
---|
| 189 | } |
---|
| 190 | |
---|
| 191 | //---------------------------------------------------------------------- |
---|
| 192 | // annUniformPts - Generate uniformly distributed points |
---|
| 193 | // A uniform distribution over [-1,1]. |
---|
| 194 | //---------------------------------------------------------------------- |
---|
| 195 | |
---|
| 196 | void annUniformPts( // uniform distribution |
---|
| 197 | ANNpointArray pa, // point array (modified) |
---|
| 198 | int n, // number of points |
---|
| 199 | int dim) // dimension |
---|
| 200 | { |
---|
| 201 | for (int i = 0; i < n; i++) { |
---|
| 202 | for (int d = 0; d < dim; d++) { |
---|
| 203 | pa[i][d] = (ANNcoord) (annRanUnif(-1,1)); |
---|
| 204 | } |
---|
| 205 | } |
---|
| 206 | } |
---|
| 207 | |
---|
| 208 | //---------------------------------------------------------------------- |
---|
| 209 | // annGaussPts - Generate Gaussian distributed points |
---|
| 210 | // A Gaussian distribution with zero mean and the given standard |
---|
| 211 | // deviation. |
---|
| 212 | //---------------------------------------------------------------------- |
---|
| 213 | |
---|
| 214 | void annGaussPts( // Gaussian distribution |
---|
| 215 | ANNpointArray pa, // point array (modified) |
---|
| 216 | int n, // number of points |
---|
| 217 | int dim, // dimension |
---|
| 218 | double std_dev) // standard deviation |
---|
| 219 | { |
---|
| 220 | for (int i = 0; i < n; i++) { |
---|
| 221 | for (int d = 0; d < dim; d++) { |
---|
| 222 | pa[i][d] = (ANNcoord) (annRanGauss() * std_dev); |
---|
| 223 | } |
---|
| 224 | } |
---|
| 225 | } |
---|
| 226 | |
---|
| 227 | //---------------------------------------------------------------------- |
---|
| 228 | // annLaplacePts - Generate Laplacian distributed points |
---|
| 229 | // Generates a Laplacian distribution (zero mean and unit variance). |
---|
| 230 | //---------------------------------------------------------------------- |
---|
| 231 | |
---|
| 232 | void annLaplacePts( // Laplacian distribution |
---|
| 233 | ANNpointArray pa, // point array (modified) |
---|
| 234 | int n, // number of points |
---|
| 235 | int dim) // dimension |
---|
| 236 | { |
---|
| 237 | for (int i = 0; i < n; i++) { |
---|
| 238 | for (int d = 0; d < dim; d++) { |
---|
| 239 | pa[i][d] = (ANNcoord) annRanLaplace(); |
---|
| 240 | } |
---|
| 241 | } |
---|
| 242 | } |
---|
| 243 | |
---|
| 244 | //---------------------------------------------------------------------- |
---|
| 245 | // annCoGaussPts - Generate correlated Gaussian distributed points |
---|
| 246 | // Generates a Gauss-Markov distribution of zero mean and unit |
---|
| 247 | // variance. |
---|
| 248 | //---------------------------------------------------------------------- |
---|
| 249 | |
---|
| 250 | void annCoGaussPts( // correlated-Gaussian distribution |
---|
| 251 | ANNpointArray pa, // point array (modified) |
---|
| 252 | int n, // number of points |
---|
| 253 | int dim, // dimension |
---|
| 254 | double correlation) // correlation |
---|
| 255 | { |
---|
| 256 | double std_dev_w = sqrt(1.0 - correlation * correlation); |
---|
| 257 | for (int i = 0; i < n; i++) { |
---|
| 258 | double previous = annRanGauss(); |
---|
| 259 | pa[i][0] = (ANNcoord) previous; |
---|
| 260 | for (int d = 1; d < dim; d++) { |
---|
| 261 | previous = correlation*previous + std_dev_w*annRanGauss(); |
---|
| 262 | pa[i][d] = (ANNcoord) previous; |
---|
| 263 | } |
---|
| 264 | } |
---|
| 265 | } |
---|
| 266 | |
---|
| 267 | //---------------------------------------------------------------------- |
---|
| 268 | // annCoLaplacePts - Generate correlated Laplacian distributed points |
---|
| 269 | // Generates a Laplacian-Markov distribution of zero mean and unit |
---|
| 270 | // variance. |
---|
| 271 | //---------------------------------------------------------------------- |
---|
| 272 | |
---|
| 273 | void annCoLaplacePts( // correlated-Laplacian distribution |
---|
| 274 | ANNpointArray pa, // point array (modified) |
---|
| 275 | int n, // number of points |
---|
| 276 | int dim, // dimension |
---|
| 277 | double correlation) // correlation |
---|
| 278 | { |
---|
| 279 | double wn; |
---|
| 280 | double corr_sq = correlation * correlation; |
---|
| 281 | |
---|
| 282 | for (int i = 0; i < n; i++) { |
---|
| 283 | double previous = annRanLaplace(); |
---|
| 284 | pa[i][0] = (ANNcoord) previous; |
---|
| 285 | for (int d = 1; d < dim; d++) { |
---|
| 286 | double temp = annRan0(); |
---|
| 287 | if (temp < corr_sq) |
---|
| 288 | wn = 0.0; |
---|
| 289 | else |
---|
| 290 | wn = annRanLaplace(); |
---|
| 291 | previous = correlation * previous + wn; |
---|
| 292 | pa[i][d] = (ANNcoord) previous; |
---|
| 293 | } |
---|
| 294 | } |
---|
| 295 | } |
---|
| 296 | |
---|
| 297 | //---------------------------------------------------------------------- |
---|
| 298 | // annClusGaussPts - Generate clusters of Gaussian distributed points |
---|
| 299 | // Cluster centers are uniformly distributed over [-1,1], and the |
---|
| 300 | // standard deviation within each cluster is fixed. |
---|
| 301 | // |
---|
| 302 | // Note: Once cluster centers have been set, they are not changed, |
---|
| 303 | // unless new_clust = true. This is so that subsequent calls generate |
---|
| 304 | // points from the same distribution. It follows, of course, that any |
---|
| 305 | // attempt to change the dimension or number of clusters without |
---|
| 306 | // generating new clusters is asking for trouble. |
---|
| 307 | // |
---|
| 308 | // Note: Cluster centers are not generated by a call to uniformPts(). |
---|
| 309 | // Although this could be done, it has been omitted for |
---|
| 310 | // compatibility with annClusGaussPts() in the colored version, |
---|
| 311 | // rand_c.cc. |
---|
| 312 | //---------------------------------------------------------------------- |
---|
| 313 | |
---|
| 314 | void annClusGaussPts( // clustered-Gaussian distribution |
---|
| 315 | ANNpointArray pa, // point array (modified) |
---|
| 316 | int n, // number of points |
---|
| 317 | int dim, // dimension |
---|
| 318 | int n_clus, // number of colors |
---|
| 319 | ANNbool new_clust, // generate new clusters. |
---|
| 320 | double std_dev) // standard deviation within clusters |
---|
| 321 | { |
---|
| 322 | static ANNpointArray clusters = NULL;// cluster storage |
---|
| 323 | |
---|
| 324 | if (clusters == NULL || new_clust) {// need new cluster centers |
---|
| 325 | if (clusters != NULL) // clusters already exist |
---|
| 326 | annDeallocPts(clusters); // get rid of them |
---|
| 327 | clusters = annAllocPts(n_clus, dim); |
---|
| 328 | // generate cluster center coords |
---|
| 329 | for (int i = 0; i < n_clus; i++) { |
---|
| 330 | for (int d = 0; d < dim; d++) { |
---|
| 331 | clusters[i][d] = (ANNcoord) annRanUnif(-1,1); |
---|
| 332 | } |
---|
| 333 | } |
---|
| 334 | } |
---|
| 335 | |
---|
| 336 | for (int i = 0; i < n; i++) { |
---|
| 337 | int c = annRanInt(n_clus); // generate cluster index |
---|
| 338 | for (int d = 0; d < dim; d++) { |
---|
| 339 | pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + clusters[c][d]); |
---|
| 340 | } |
---|
| 341 | } |
---|
| 342 | } |
---|
| 343 | |
---|
| 344 | //---------------------------------------------------------------------- |
---|
| 345 | // annClusOrthFlats - points clustered along orthogonal flats |
---|
| 346 | // |
---|
| 347 | // This distribution consists of a collection points clustered |
---|
| 348 | // among a collection of axis-aligned low dimensional flats in |
---|
| 349 | // the hypercube [-1,1]^d. A set of n_clus orthogonal flats are |
---|
| 350 | // generated, each whose dimension is a random number between 1 |
---|
| 351 | // and max_dim. The points are evenly distributed among the clusters. |
---|
| 352 | // For each cluster, we generate points uniformly distributed along |
---|
| 353 | // the flat within the hypercube. |
---|
| 354 | // |
---|
| 355 | // This is done as follows. Each cluster is defined by a d-element |
---|
| 356 | // control vector whose components are either: |
---|
| 357 | // |
---|
| 358 | // CO_FLAG indicating that this component is to be generated |
---|
| 359 | // uniformly in [-1,1], |
---|
| 360 | // x a value other than CO_FLAG in the range [-1,1], |
---|
| 361 | // which indicates that this coordinate is to be |
---|
| 362 | // generated as x plus a Gaussian random deviation |
---|
| 363 | // with the given standard deviation. |
---|
| 364 | // |
---|
| 365 | // The number of zero components is the dimension of the flat, which |
---|
| 366 | // is a random integer in the range from 1 to max_dim. The points |
---|
| 367 | // are disributed between clusters in nearly equal sized groups. |
---|
| 368 | // |
---|
| 369 | // Note: Once cluster centers have been set, they are not changed, |
---|
| 370 | // unless new_clust = true. This is so that subsequent calls generate |
---|
| 371 | // points from the same distribution. It follows, of course, that any |
---|
| 372 | // attempt to change the dimension or number of clusters without |
---|
| 373 | // generating new clusters is asking for trouble. |
---|
| 374 | // |
---|
| 375 | // To make this a bad scenario at query time, query points should be |
---|
| 376 | // selected from a different distribution, e.g. uniform or Gaussian. |
---|
| 377 | // |
---|
| 378 | // We use a little programming trick to generate groups of roughly |
---|
| 379 | // equal size. If n is the total number of points, and n_clus is |
---|
| 380 | // the number of clusters, then the c-th cluster (0 <= c < n_clus) |
---|
| 381 | // is given floor((n+c)/n_clus) points. It can be shown that this |
---|
| 382 | // will exactly consume all n points. |
---|
| 383 | // |
---|
| 384 | // This procedure makes use of the utility procedure, genOrthFlat |
---|
| 385 | // which generates points in one orthogonal flat, according to |
---|
| 386 | // the given control vector. |
---|
| 387 | // |
---|
| 388 | //---------------------------------------------------------------------- |
---|
| 389 | const double CO_FLAG = 999; // special flag value |
---|
| 390 | |
---|
| 391 | static void genOrthFlat( // generate points on an orthog flat |
---|
| 392 | ANNpointArray pa, // point array |
---|
| 393 | int n, // number of points |
---|
| 394 | int dim, // dimension |
---|
| 395 | double *control, // control vector |
---|
| 396 | double std_dev) // standard deviation |
---|
| 397 | { |
---|
| 398 | for (int i = 0; i < n; i++) { // generate each point |
---|
| 399 | for (int d = 0; d < dim; d++) { // generate each coord |
---|
| 400 | if (control[d] == CO_FLAG) // dimension on flat |
---|
| 401 | pa[i][d] = (ANNcoord) annRanUnif(-1,1); |
---|
| 402 | else // dimension off flat |
---|
| 403 | pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + control[d]); |
---|
| 404 | } |
---|
| 405 | } |
---|
| 406 | } |
---|
| 407 | |
---|
| 408 | void annClusOrthFlats( // clustered along orthogonal flats |
---|
| 409 | ANNpointArray pa, // point array (modified) |
---|
| 410 | int n, // number of points |
---|
| 411 | int dim, // dimension |
---|
| 412 | int n_clus, // number of colors |
---|
| 413 | ANNbool new_clust, // generate new clusters. |
---|
| 414 | double std_dev, // standard deviation within clusters |
---|
| 415 | int max_dim) // maximum dimension of the flats |
---|
| 416 | { |
---|
| 417 | static ANNpointArray control = NULL; // control vectors |
---|
| 418 | |
---|
| 419 | if (control == NULL || new_clust) { // need new cluster centers |
---|
| 420 | if (control != NULL) { // clusters already exist |
---|
| 421 | annDeallocPts(control); // get rid of them |
---|
| 422 | } |
---|
| 423 | control = annAllocPts(n_clus, dim); |
---|
| 424 | |
---|
| 425 | for (int c = 0; c < n_clus; c++) { // generate clusters |
---|
| 426 | int n_dim = 1 + annRanInt(max_dim); // number of dimensions in flat |
---|
| 427 | for (int d = 0; d < dim; d++) { // generate side locations |
---|
| 428 | // prob. of picking next dim |
---|
| 429 | double Prob = ((double) n_dim)/((double) (dim-d)); |
---|
| 430 | if (annRan0() < Prob) { // add this one to flat |
---|
| 431 | control[c][d] = CO_FLAG; // flag this entry |
---|
| 432 | n_dim--; // one fewer dim to fill |
---|
| 433 | } |
---|
| 434 | else { // don't take this one |
---|
| 435 | control[c][d] = annRanUnif(-1,1);// random value in [-1,1] |
---|
| 436 | } |
---|
| 437 | } |
---|
| 438 | } |
---|
| 439 | } |
---|
| 440 | int offset = 0; // offset in pa array |
---|
| 441 | for (int c = 0; c < n_clus; c++) { // generate clusters |
---|
| 442 | int pick = (n+c)/n_clus; // number of points to pick |
---|
| 443 | // generate the points |
---|
| 444 | genOrthFlat(pa+offset, pick, dim, control[c], std_dev); |
---|
| 445 | offset += pick; // increment offset |
---|
| 446 | } |
---|
| 447 | } |
---|
| 448 | |
---|
| 449 | //---------------------------------------------------------------------- |
---|
| 450 | // annClusEllipsoids - points clustered around axis-aligned ellipsoids |
---|
| 451 | // |
---|
| 452 | // This distribution consists of a collection points clustered |
---|
| 453 | // among a collection of low dimensional ellipsoids whose axes |
---|
| 454 | // are alligned with the coordinate axes in the hypercube [-1,1]^d. |
---|
| 455 | // The objective is to model distributions in which the points are |
---|
| 456 | // distributed in lower dimensional subspaces, and within this |
---|
| 457 | // lower dimensional space the points are distributed with a |
---|
| 458 | // Gaussian distribution (with no correlation between the |
---|
| 459 | // dimensions). |
---|
| 460 | // |
---|
| 461 | // The distribution is given the number of clusters or "colors" |
---|
| 462 | // (n_clus), maximum number of dimensions (max_dim) of the lower |
---|
| 463 | // dimensional subspace, a "small" standard deviation |
---|
| 464 | // (std_dev_small), and a "large" standard deviation range |
---|
| 465 | // (std_dev_lo, std_dev_hi). |
---|
| 466 | // |
---|
| 467 | // The algorithm generates n_clus cluster centers uniformly from |
---|
| 468 | // the hypercube [-1,1]^d. For each cluster, it selects the |
---|
| 469 | // dimension of the subspace as a random number r between 1 and |
---|
| 470 | // max_dim. These are the dimensions of the ellipsoid. Then it |
---|
| 471 | // generates a d-element std dev vector whose entries are the |
---|
| 472 | // standard deviation for the coordinates of each cluster in the |
---|
| 473 | // distribution. Among the d-element control vector, r randomly |
---|
| 474 | // chosen values are chosen uniformly from the range [std_dev_lo, |
---|
| 475 | // std_dev_hi]. The remaining values are set to std_dev_small. |
---|
| 476 | // |
---|
| 477 | // Note that annClusGaussPts is a special case of this in which |
---|
| 478 | // max_dim = 0, and std_dev = std_dev_small. |
---|
| 479 | // |
---|
| 480 | // If the flag new_clust is set, then new cluster centers are |
---|
| 481 | // generated. |
---|
| 482 | // |
---|
| 483 | // This procedure makes use of the utility procedure genGauss |
---|
| 484 | // which generates points distributed according to a Gaussian |
---|
| 485 | // distribution. |
---|
| 486 | // |
---|
| 487 | //---------------------------------------------------------------------- |
---|
| 488 | |
---|
| 489 | static void genGauss( // generate points on a general Gaussian |
---|
| 490 | ANNpointArray pa, // point array |
---|
| 491 | int n, // number of points |
---|
| 492 | int dim, // dimension |
---|
| 493 | double *center, // center vector |
---|
| 494 | double *std_dev) // standard deviation vector |
---|
| 495 | { |
---|
| 496 | for (int i = 0; i < n; i++) { |
---|
| 497 | for (int d = 0; d < dim; d++) { |
---|
| 498 | pa[i][d] = (ANNcoord) (std_dev[d]*annRanGauss() + center[d]); |
---|
| 499 | } |
---|
| 500 | } |
---|
| 501 | } |
---|
| 502 | |
---|
| 503 | void annClusEllipsoids( // clustered around ellipsoids |
---|
| 504 | ANNpointArray pa, // point array (modified) |
---|
| 505 | int n, // number of points |
---|
| 506 | int dim, // dimension |
---|
| 507 | int n_clus, // number of colors |
---|
| 508 | ANNbool new_clust, // generate new clusters. |
---|
| 509 | double std_dev_small, // small standard deviation |
---|
| 510 | double std_dev_lo, // low standard deviation for ellipses |
---|
| 511 | double std_dev_hi, // high standard deviation for ellipses |
---|
| 512 | int max_dim) // maximum dimension of the flats |
---|
| 513 | { |
---|
| 514 | static ANNpointArray centers = NULL; // cluster centers |
---|
| 515 | static ANNpointArray std_dev = NULL; // standard deviations |
---|
| 516 | |
---|
| 517 | if (centers == NULL || new_clust) { // need new cluster centers |
---|
| 518 | if (centers != NULL) // clusters already exist |
---|
| 519 | annDeallocPts(centers); // get rid of them |
---|
| 520 | if (std_dev != NULL) // std deviations already exist |
---|
| 521 | annDeallocPts(std_dev); // get rid of them |
---|
| 522 | |
---|
| 523 | centers = annAllocPts(n_clus, dim); // alloc new clusters and devs |
---|
| 524 | std_dev = annAllocPts(n_clus, dim); |
---|
| 525 | |
---|
| 526 | for (int i = 0; i < n_clus; i++) { // gen cluster center coords |
---|
| 527 | for (int d = 0; d < dim; d++) { |
---|
| 528 | centers[i][d] = (ANNcoord) annRanUnif(-1,1); |
---|
| 529 | } |
---|
| 530 | } |
---|
| 531 | for (int c = 0; c < n_clus; c++) { // generate cluster std dev |
---|
| 532 | int n_dim = 1 + annRanInt(max_dim); // number of dimensions in flat |
---|
| 533 | for (int d = 0; d < dim; d++) { // generate std dev's |
---|
| 534 | // prob. of picking next dim |
---|
| 535 | double Prob = ((double) n_dim)/((double) (dim-d)); |
---|
| 536 | if (annRan0() < Prob) { // add this one to ellipse |
---|
| 537 | // generate random std dev |
---|
| 538 | std_dev[c][d] = annRanUnif(std_dev_lo, std_dev_hi); |
---|
| 539 | n_dim--; // one fewer dim to fill |
---|
| 540 | } |
---|
| 541 | else { // don't take this one |
---|
| 542 | std_dev[c][d] = std_dev_small;// use small std dev |
---|
| 543 | } |
---|
| 544 | } |
---|
| 545 | } |
---|
| 546 | } |
---|
| 547 | |
---|
| 548 | int offset = 0; // next slot to fill |
---|
| 549 | for (int c = 0; c < n_clus; c++) { // generate clusters |
---|
| 550 | int pick = (n+c)/n_clus; // number of points to pick |
---|
| 551 | // generate the points |
---|
| 552 | genGauss(pa+offset, pick, dim, centers[c], std_dev[c]); |
---|
| 553 | offset += pick; // increment offset in array |
---|
| 554 | } |
---|
| 555 | } |
---|
| 556 | |
---|
| 557 | //---------------------------------------------------------------------- |
---|
| 558 | // annPlanted - Generates points from a "planted" distribution |
---|
| 559 | // In high dimensional spaces, interpoint distances tend to be |
---|
| 560 | // highly clustered around the mean value. Approximate nearest |
---|
| 561 | // neighbor searching makes little sense in this context, unless it |
---|
| 562 | // is the case that each query point is significantly closer to its |
---|
| 563 | // nearest neighbor than to other points. Thus, the query points |
---|
| 564 | // should be planted close to the data points. Given a source data |
---|
| 565 | // set, this procedure generates a set of query points having this |
---|
| 566 | // property. |
---|
| 567 | // |
---|
| 568 | // We are given a source data array and a standard deviation. We |
---|
| 569 | // generate points as follows. We select a random point from the |
---|
| 570 | // source data set, and we generate a Gaussian point centered about |
---|
| 571 | // this random point and perturbed by a normal distributed random |
---|
| 572 | // variable with mean zero and the given standard deviation along |
---|
| 573 | // each coordinate. |
---|
| 574 | // |
---|
| 575 | // Note that this essentially the same a clustered Gaussian |
---|
| 576 | // distribution, but where the cluster centers are given by the |
---|
| 577 | // source data set. |
---|
| 578 | //---------------------------------------------------------------------- |
---|
| 579 | |
---|
| 580 | void annPlanted( // planted nearest neighbors |
---|
| 581 | ANNpointArray pa, // point array (modified) |
---|
| 582 | int n, // number of points |
---|
| 583 | int dim, // dimension |
---|
| 584 | ANNpointArray src, // source point array |
---|
| 585 | int n_src, // source size |
---|
| 586 | double std_dev) // standard deviation about source |
---|
| 587 | { |
---|
| 588 | for (int i = 0; i < n; i++) { |
---|
| 589 | int c = annRanInt(n_src); // generate source index |
---|
| 590 | for (int d = 0; d < dim; d++) { |
---|
| 591 | pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + src[c][d]); |
---|
| 592 | } |
---|
| 593 | } |
---|
| 594 | } |
---|