Changeset 65 for proiecte/Parallel-DT/R8
- Timestamp:
- Jan 7, 2010, 9:00:33 AM (14 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
proiecte/Parallel-DT/R8/Src/build.c
r26 r65 5 5 /* */ 6 6 /*************************************************************************/ 7 8 7 9 8 #include "defns.i" 10 9 #include "types.i" 11 10 #include "extern.i" 12 13 14 ItemCount 15 *Weight, /* Weight[i] = current fraction of item i */ 16 **Freq, /* Freq[x][c] = no. items of class c with outcome x */ 17 *ValFreq, /* ValFreq[x] = no. items with outcome x */ 18 *ClassFreq; /* ClassFreq[c] = no. items of class c */ 19 20 float 21 *Gain, /* Gain[a] = info gain by split on att a */ 22 *Info, /* Info[a] = potential info of split on att a */ 23 *Bar, /* Bar[a] = best threshold for contin att a */ 24 *UnknownRate; /* UnknownRate[a] = current unknown rate for att a */ 25 26 Boolean 27 *Tested, /* Tested[a] set if att a has already been tested */ 28 MultiVal; /* true when all atts have many values */ 29 30 31 /* External variables initialised here */ 32 33 extern float 34 *SplitGain, /* SplitGain[i] = gain with att value of item i as threshold */ 35 *SplitInfo; /* SplitInfo[i] = potential info ditto */ 36 37 extern ItemCount 38 *Slice1, /* Slice1[c] = saved values of Freq[x][c] in subset.c */ 39 *Slice2; /* Slice2[c] = saved values of Freq[y][c] */ 40 41 extern Set 42 **Subset; /* Subset[a][s] = subset s for att a */ 43 44 extern short 45 *Subsets; /* Subsets[a] = no. subsets for att a */ 46 47 11 //#include "buildex.i" 12 13 #include <omp.h> 14 15 16 ItemCount *Weight, /* Weight[i] = current fraction of item i */ 17 **Freq, /* Freq[x][c] = no. items of class c with outcome x */ 18 *ValFreq, /* ValFreq[x] = no. items with outcome x */ 19 *ClassFreq; /* ClassFreq[c] = no. items of class c */ 20 21 float *Gain, /* Gain[a] = info gain by split on att a */ 22 *Info, /* Info[a] = potential info of split on att a */ 23 *Bar, /* Bar[a] = best threshold for contin att a */ 24 *UnknownRate; /* UnknownRate[a] = current unknown rate for att a */ 25 26 Boolean *Tested, /* Tested[a] set if att a has already been tested */ 27 MultiVal; /* true when all atts have many values */ 28 29 /* External variables initialised here */ 30 31 extern float *SplitGain, /* SplitGain[i] = gain with att value of item i as threshold */ 32 *SplitInfo; /* SplitInfo[i] = potential info ditto */ 33 34 extern ItemCount *Slice1, /* Slice1[c] = saved values of Freq[x][c] in subset.c */ 35 *Slice2; /* Slice2[c] = saved values of Freq[y][c] */ 36 37 extern Set **Subset; /* Subset[a][s] = subset s for att a */ 38 39 extern short *Subsets; /* Subsets[a] = no. subsets for att a */ 48 40 49 41 /*************************************************************************/ … … 53 45 /*************************************************************************/ 54 46 55 56 InitialiseTreeData() 47 InitialiseTreeData() 57 48 /* ------------------ */ 58 { 59 DiscrValue v; 60 Attribute a; 61 62 Tested = (char *) calloc(MaxAtt+1, sizeof(char)); 63 64 Gain = (float *) calloc(MaxAtt+1, sizeof(float)); 65 Info = (float *) calloc(MaxAtt+1, sizeof(float)); 66 Bar = (float *) calloc(MaxAtt+1, sizeof(float)); 67 68 Subset = (Set **) calloc(MaxAtt+1, sizeof(Set *)); 69 ForEach(a, 0, MaxAtt) 70 { 71 if ( MaxAttVal[a] ) 72 { 73 Subset[a] = (Set *) calloc(MaxDiscrVal+1, sizeof(Set)); 74 ForEach(v, 0, MaxAttVal[a]) 75 { 76 Subset[a][v] = (Set) malloc((MaxAttVal[a]>>3) + 1); 77 } 78 } 79 } 80 Subsets = (short *) calloc(MaxAtt+1, sizeof(short)); 81 82 SplitGain = (float *) calloc(MaxItem+1, sizeof(float)); 83 SplitInfo = (float *) calloc(MaxItem+1, sizeof(float)); 84 85 Weight = (ItemCount *) calloc(MaxItem+1, sizeof(ItemCount)); 86 87 Freq = (ItemCount **) calloc(MaxDiscrVal+1, sizeof(ItemCount *)); 88 ForEach(v, 0, MaxDiscrVal) 89 { 90 Freq[v] = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount)); 91 } 92 93 ValFreq = (ItemCount *) calloc(MaxDiscrVal+1, sizeof(ItemCount)); 94 ClassFreq = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount)); 95 96 Slice1 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount)); 97 Slice2 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount)); 98 99 UnknownRate = (float *) calloc(MaxAtt+1, sizeof(float)); 100 101 /* Check whether all attributes have many discrete values */ 102 103 MultiVal = true; 104 if ( ! SUBSET ) 105 { 106 for ( a = 0 ; MultiVal && a <= MaxAtt ; a++ ) 107 { 108 if ( SpecialStatus[a] != IGNORE ) 109 { 110 MultiVal = MaxAttVal[a] >= 0.3 * (MaxItem + 1); 111 } 112 } 113 } 49 { 50 DiscrValue v; 51 Attribute a; 52 53 Tested = (char *) calloc(MaxAtt + 1, sizeof(char)); 54 55 Gain = (float *) calloc(MaxAtt + 1, sizeof(float)); 56 Info = (float *) calloc(MaxAtt + 1, sizeof(float)); 57 Bar = (float *) calloc(MaxAtt + 1, sizeof(float)); 58 59 Subset = (Set **) calloc(MaxAtt + 1, sizeof(Set *)); 60 ForEach(a, 0, MaxAtt) { 61 if (MaxAttVal[a]) { 62 Subset[a] = (Set *) calloc(MaxDiscrVal + 1, sizeof(Set)); 63 ForEach(v, 0, MaxAttVal[a]) { 64 Subset[a][v] = (Set) malloc((MaxAttVal[a] >> 3) + 1); 65 } 66 } 67 } 68 Subsets = (short *) calloc(MaxAtt + 1, sizeof(short)); 69 70 SplitGain = (float *) calloc(MaxItem + 1, sizeof(float)); 71 SplitInfo = (float *) calloc(MaxItem + 1, sizeof(float)); 72 73 Weight = (ItemCount *) calloc(MaxItem + 1, sizeof(ItemCount)); 74 75 Freq = (ItemCount **) calloc(MaxDiscrVal + 1, sizeof(ItemCount *)); 76 ForEach(v, 0, MaxDiscrVal) { 77 Freq[v] = (ItemCount *) calloc(MaxClass + 1, sizeof(ItemCount)); 78 } 79 80 ValFreq = (ItemCount *) calloc(MaxDiscrVal + 1, sizeof(ItemCount)); 81 ClassFreq = (ItemCount *) calloc(MaxClass + 1, sizeof(ItemCount)); 82 83 Slice1 = (ItemCount *) calloc(MaxClass + 2, sizeof(ItemCount)); 84 Slice2 = (ItemCount *) calloc(MaxClass + 2, sizeof(ItemCount)); 85 86 UnknownRate = (float *) calloc(MaxAtt + 1, sizeof(float)); 87 88 /* Check whether all attributes have many discrete values */ 89 90 MultiVal = true; 91 if (!SUBSET) { 92 for (a = 0; MultiVal && a <= MaxAtt; a++) { 93 if (SpecialStatus[a] != IGNORE) { 94 MultiVal = MaxAttVal[a] >= 0.3 * (MaxItem + 1); 95 } 96 } 97 } 98 99 114 100 } 115 101 116 117 118 102 /*************************************************************************/ 119 103 /* */ … … 122 106 /*************************************************************************/ 123 107 124 125 InitialiseWeights() 108 InitialiseWeights() 126 109 /* ----------------- */ 127 110 { 128 ItemNo i; 129 130 ForEach(i, 0, MaxItem) 131 { 132 Weight[i] = 1.0; 133 } 111 ItemNo i; 112 113 ForEach(i, 0, MaxItem) { 114 Weight[i] = 1.0; 115 } 134 116 } 135 136 137 117 138 118 /*************************************************************************/ … … 159 139 /*************************************************************************/ 160 140 161 162 141 Tree FormTree(Fp, Lp) 163 /* --------- */ 164 ItemNo Fp, Lp; 165 { 166 ItemNo i, Kp, Ep, Group(); 167 ItemCount Cases, NoBestClass, KnownCases, CountItems(); 168 float Factor, BestVal, Val, AvGain=0, Worth(); 169 Attribute Att, BestAtt, Possible=0; 170 ClassNo c, BestClass; 171 Tree Node, Leaf(); 172 DiscrValue v; 173 Boolean PrevAllKnown; 174 175 Cases = CountItems(Fp, Lp); 176 177 /* Generate the class frequency distribution */ 178 179 ForEach(c, 0, MaxClass) 180 { 181 ClassFreq[c] = 0; 182 } 183 ForEach(i, Fp, Lp) 184 { 185 ClassFreq[ Class(Item[i]) ] += Weight[i]; 186 } 187 188 /* Find the most frequent class */ 189 190 BestClass = 0; 191 ForEach(c, 0, MaxClass) 192 { 193 if ( ClassFreq[c] > ClassFreq[BestClass] ) 142 /* --------- */ 143 ItemNo Fp, Lp; { 144 ItemNo i, Kp, Ep, Group(); 145 ItemCount Cases, NoBestClass, KnownCases, CountItems(); 146 float Factor, BestVal, Val, AvGain = 0, Worth(); 147 Attribute Att, BestAtt, Possible = 0; 148 ClassNo c, BestClass; 149 Tree Node, Leaf(); 150 DiscrValue v; 151 Boolean PrevAllKnown; 152 153 Cases = CountItems(Fp, Lp); 154 155 /* Generate the class frequency distribution */ 156 157 // ########### begin parallel region ############## // 158 #pragma omp parallel default(shared) 194 159 { 195 BestClass = c; 196 } 197 } 198 NoBestClass = ClassFreq[BestClass]; 199 200 Node = Leaf(ClassFreq, BestClass, Cases, Cases - NoBestClass); 201 202 /* If all cases are of the same class or there are not enough 203 cases to divide, the tree is a leaf */ 204 205 if ( NoBestClass == Cases || Cases < 2 * MINOBJS ) 206 { 160 161 //printf("The parallel region is executed by thread %d\n", omp_get_thread_num()); 162 /* THIS CAN BE PARALELIZED */ 163 #pragma omp for private(c) 164 ForEach(c, 0, MaxClass) { 165 ClassFreq[c] = 0; 166 } 167 168 /* THIS CAN BE PARALELIZED */ 169 #pragma omp for private(i) 170 ForEach(i, Fp, Lp) { 171 #pragma omp atomic 172 ClassFreq[Class(Item[i])] += Weight[i]; 173 } 174 } 175 176 /* Find the most frequent class */ 177 /* THIS CAN BE PARALELIZED */ 178 BestClass = 0; 179 180 ForEach(c, 0, MaxClass) { 181 if (ClassFreq[c] > ClassFreq[BestClass]) { 182 BestClass = c; 183 } 184 } 185 186 NoBestClass = ClassFreq[BestClass]; 187 188 Node = Leaf(ClassFreq, BestClass, Cases, Cases - NoBestClass); 189 190 /* If all cases are of the same class or there are not enough 191 cases to divide, the tree is a leaf */ 192 193 if (NoBestClass == Cases || Cases < 2 * MINOBJS) { 194 return Node; 195 } 196 197 Verbosity(1) 198 printf("\n%d items, total weight %.1f\n", Lp - Fp + 1, Cases); 199 200 /* For each available attribute, find the information and gain */ 201 202 ForEach(Att, 0, MaxAtt) { 203 Gain[Att] = -Epsilon; 204 205 if (SpecialStatus[Att] == IGNORE) 206 continue; 207 208 if (MaxAttVal[Att]) { 209 /* discrete valued attribute */ 210 211 if (SUBSET && MaxAttVal[Att] > 2) { 212 EvalSubset(Att, Fp, Lp, Cases); 213 } else if (!Tested[Att]) { 214 EvalDiscreteAtt(Att, Fp, Lp, Cases); 215 } 216 } else { 217 /* continuous attribute */ 218 219 EvalContinuousAtt(Att, Fp, Lp); 220 } 221 222 /* Update average gain, excluding attributes with very many values */ 223 224 if (Gain[Att] > -Epsilon && (MultiVal || MaxAttVal[Att] < 0.3 * (MaxItem + 1))) { 225 Possible++; 226 AvGain += Gain[Att]; 227 } 228 229 } 230 231 /* Find the best attribute according to the given criterion */ 232 BestVal = -Epsilon; 233 BestAtt = None; 234 AvGain = (Possible ? AvGain / Possible : 1E6); 235 236 Verbosity(2) { 237 if (AvGain < 1E6) 238 printf("\taverage gain %.3f\n", AvGain); 239 } 240 241 ForEach(Att, 0, MaxAtt) { 242 if (Gain[Att] > -Epsilon) { 243 Val = Worth(Info[Att], Gain[Att], AvGain); 244 if (Val > BestVal) { 245 BestAtt = Att; 246 BestVal = Val; 247 } 248 } 249 } 250 251 252 /* Decide whether to branch or not */ 253 254 if (BestAtt != None) { 255 Verbosity(1) { 256 printf("\tbest attribute %s", AttName[BestAtt]); 257 if (!MaxAttVal[BestAtt]) { 258 printf(" cut %.3f", Bar[BestAtt]); 259 } 260 printf(" inf %.3f gain %.3f val %.3f\n", Info[BestAtt], 261 Gain[BestAtt], BestVal); 262 } 263 264 /* Build a node of the selected test */ 265 266 if (MaxAttVal[BestAtt]) { 267 /* Discrete valued attribute */ 268 269 if (SUBSET && MaxAttVal[BestAtt] > 2) { 270 SubsetTest(Node, BestAtt); 271 } else { 272 DiscreteTest(Node, BestAtt); 273 } 274 } else { 275 /* Continuous attribute */ 276 277 ContinTest(Node, BestAtt); 278 } 279 280 /* Remove unknown attribute values */ 281 282 PrevAllKnown = AllKnown; 283 284 Kp = Group(0, Fp, Lp, Node) + 1; 285 if (Kp != Fp) 286 AllKnown = false; 287 KnownCases = Cases - CountItems(Fp, Kp - 1); 288 UnknownRate[BestAtt] = (Cases - KnownCases) / (Cases + 0.001); 289 290 Verbosity(1) { 291 if (UnknownRate[BestAtt] > 0) { 292 printf("\tunknown rate for %s = %.3f\n", AttName[BestAtt], 293 UnknownRate[BestAtt]); 294 } 295 } 296 297 /* Recursive divide and conquer */ 298 299 ++Tested[BestAtt]; 300 301 Ep = Kp - 1; 302 Node->Errors = 0; 303 304 ForEach(v, 1, Node->Forks) { 305 Ep = Group(v, Kp, Lp, Node); 306 307 if (Kp <= Ep) { 308 Factor = CountItems(Kp, Ep) / KnownCases; 309 310 ForEach(i, Fp, Kp-1) { 311 Weight[i] *= Factor; 312 } 313 314 Node->Branch[v] = FormTree(Fp, Ep); 315 Node->Errors += Node->Branch[v]->Errors; 316 317 Group(0, Fp, Ep, Node); 318 ForEach(i, Fp, Kp-1) { 319 Weight[i] /= Factor; 320 } 321 } else { 322 Node->Branch[v] = Leaf(Node->ClassDist, BestClass, 0.0, 0.0); 323 } 324 } 325 326 --Tested[BestAtt]; 327 AllKnown = PrevAllKnown; 328 329 /* See whether we would have been no worse off with a leaf */ 330 331 if (Node->Errors >= Cases - NoBestClass - Epsilon) { 332 Verbosity(1) 333 printf("Collapse tree for %d items to leaf %s\n", Lp - Fp + 1, 334 ClassName[BestClass]); 335 336 Node->NodeType = 0; 337 } 338 } 339 else { 340 Verbosity(1) 341 printf("\tno sensible splits %.1f/%.1f\n", Cases, Cases 342 - NoBestClass); 343 } 344 207 345 return Node; 208 } 209 210 Verbosity(1) 211 printf("\n%d items, total weight %.1f\n", Lp - Fp + 1, Cases); 212 213 /* For each available attribute, find the information and gain */ 214 215 ForEach(Att, 0, MaxAtt) 216 { 217 Gain[Att] = -Epsilon; 218 219 if ( SpecialStatus[Att] == IGNORE ) continue; 220 221 if ( MaxAttVal[Att] ) 346 } 347 348 Tree FormTree_Discr(Fp, Lp) 349 /* --------- */ 350 ItemNo Fp, Lp; { 351 ItemNo i, Kp, Ep, Group(); 352 ItemCount Cases, NoBestClass, KnownCases, CountItems(); 353 float Factor, BestVal, Val, AvGain = 0, Worth(); 354 Attribute Att, BestAtt, Possible = 0; 355 ClassNo c, BestClass; 356 Tree Node, Leaf(); 357 DiscrValue v; 358 Boolean PrevAllKnown; 359 ItemCount Freq_discr[MAX_DISCR_VAL + 1][MAX_CLASS + 1], ValFreq_discr[MAX_DISCR_VAL + 1]; 360 float UnknownRate_discr[MAX_ATT + 1]; 361 362 Cases = CountItems(Fp, Lp); 363 364 /* Generate the class frequency distribution */ 365 366 // ########### begin parallel region ############## // 367 #pragma omp parallel default(shared) 222 368 { 223 /* discrete valued attribute */ 224 225 if ( SUBSET && MaxAttVal[Att] > 2 ) 226 { 227 EvalSubset(Att, Fp, Lp, Cases); 228 } 229 else 230 if ( ! Tested[Att] ) 231 { 232 EvalDiscreteAtt(Att, Fp, Lp, Cases); 233 } 234 } 235 else 236 { 237 /* continuous attribute */ 238 239 EvalContinuousAtt(Att, Fp, Lp); 240 } 241 242 /* Update average gain, excluding attributes with very many values */ 243 244 if ( Gain[Att] > -Epsilon && 245 ( MultiVal || MaxAttVal[Att] < 0.3 * (MaxItem + 1) ) ) 369 370 //printf("The parallel region is executed by thread %d\n", omp_get_thread_num()); 371 /* THIS CAN BE PARALELIZED */ 372 #pragma omp for private(c) 373 ForEach(c, 0, MaxClass) { 374 ClassFreq[c] = 0; 375 } 376 377 /* THIS CAN BE PARALELIZED */ 378 #pragma omp for private(i) 379 ForEach(i, Fp, Lp) { 380 #pragma omp atomic 381 ClassFreq[Class(Item[i])] += Weight[i]; 382 } 383 } 384 385 /* Find the most frequent class */ 386 /* THIS CAN BE PARALELIZED */ 387 BestClass = 0; 388 389 ForEach(c, 0, MaxClass) { 390 if (ClassFreq[c] > ClassFreq[BestClass]) { 391 BestClass = c; 392 } 393 } 394 395 NoBestClass = ClassFreq[BestClass]; 396 397 Node = Leaf(ClassFreq, BestClass, Cases, Cases - NoBestClass); 398 399 /* If all cases are of the same class or there are not enough 400 cases to divide, the tree is a leaf */ 401 402 if (NoBestClass == Cases || Cases < 2 * MINOBJS) { 403 return Node; 404 } 405 406 Verbosity(1) 407 printf("\n%d items, total weight %.1f\n", Lp - Fp + 1, Cases); 408 409 410 /* For each available attribute, find the information and gain */ 411 /* THIS MUST BE PARALELIZED */ 412 #pragma omp parallel default(shared) private(Freq_discr, ValFreq_discr, UnknownRate_discr) 246 413 { 247 Possible++; 248 AvGain += Gain[Att]; 249 } 250 } 251 252 /* Find the best attribute according to the given criterion */ 253 254 BestVal = -Epsilon; 255 BestAtt = None; 256 AvGain = ( Possible ? AvGain / Possible : 1E6 ); 257 258 Verbosity(2) 259 { 260 if ( AvGain < 1E6 ) printf("\taverage gain %.3f\n", AvGain); 261 } 262 263 ForEach(Att, 0, MaxAtt) 264 { 265 if ( Gain[Att] > -Epsilon ) 266 { 267 Val = Worth(Info[Att], Gain[Att], AvGain); 268 if ( Val > BestVal ) 269 { 270 BestAtt = Att; 271 BestVal = Val; 272 } 273 } 274 } 275 276 /* Decide whether to branch or not */ 277 278 if ( BestAtt != None ) 279 { 280 Verbosity(1) 281 { 282 printf("\tbest attribute %s", AttName[BestAtt]); 283 if ( ! MaxAttVal[BestAtt] ) 284 { 285 printf(" cut %.3f", Bar[BestAtt]); 286 } 287 printf(" inf %.3f gain %.3f val %.3f\n", 288 Info[BestAtt], Gain[BestAtt], BestVal); 289 } 290 291 /* Build a node of the selected test */ 292 293 if ( MaxAttVal[BestAtt] ) 294 { 295 /* Discrete valued attribute */ 296 297 if ( SUBSET && MaxAttVal[BestAtt] > 2 ) 298 { 299 SubsetTest(Node, BestAtt); 300 } 301 else 302 { 303 DiscreteTest(Node, BestAtt); 304 } 305 } 306 else 307 { 308 /* Continuous attribute */ 309 310 ContinTest(Node, BestAtt); 311 } 312 313 /* Remove unknown attribute values */ 314 315 PrevAllKnown = AllKnown; 316 317 Kp = Group(0, Fp, Lp, Node) + 1; 318 if ( Kp != Fp ) AllKnown = false; 319 KnownCases = Cases - CountItems(Fp, Kp-1); 320 UnknownRate[BestAtt] = (Cases - KnownCases) / (Cases + 0.001); 321 322 Verbosity(1) 323 { 324 if ( UnknownRate[BestAtt] > 0 ) 325 { 326 printf("\tunknown rate for %s = %.3f\n", 327 AttName[BestAtt], UnknownRate[BestAtt]); 328 } 329 } 330 331 /* Recursive divide and conquer */ 332 333 ++Tested[BestAtt]; 334 335 Ep = Kp - 1; 336 Node->Errors = 0; 337 338 ForEach(v, 1, Node->Forks) 339 { 340 Ep = Group(v, Kp, Lp, Node); 341 342 if ( Kp <= Ep ) 343 { 344 Factor = CountItems(Kp, Ep) / KnownCases; 345 346 ForEach(i, Fp, Kp-1) 347 { 348 Weight[i] *= Factor; 349 } 350 351 Node->Branch[v] = FormTree(Fp, Ep); 352 Node->Errors += Node->Branch[v]->Errors; 353 354 Group(0, Fp, Ep, Node); 355 ForEach(i, Fp, Kp-1) 356 { 357 Weight[i] /= Factor; 358 } 359 } 360 else 361 { 362 Node->Branch[v] = Leaf(Node->ClassDist, BestClass, 0.0, 0.0); 363 } 364 } 365 366 --Tested[BestAtt]; 367 AllKnown = PrevAllKnown; 368 369 /* See whether we would have been no worse off with a leaf */ 370 371 if ( Node->Errors >= Cases - NoBestClass - Epsilon ) 372 { 373 Verbosity(1) 374 printf("Collapse tree for %d items to leaf %s\n", 375 Lp - Fp + 1, ClassName[BestClass]); 376 377 Node->NodeType = 0; 378 } 379 } 380 else 381 { 382 Verbosity(1) 383 printf("\tno sensible splits %.1f/%.1f\n", 384 Cases, Cases - NoBestClass); 385 } 386 387 return Node; 388 } 389 390 391 414 415 #pragma omp for 416 ForEach(Att, 0, MaxAtt) { 417 Gain[Att] = -Epsilon; 418 419 if (SpecialStatus[Att] == IGNORE) 420 continue; 421 422 if (MaxAttVal[Att]) { 423 /* discrete valued attribute */ 424 425 if (SUBSET && MaxAttVal[Att] > 2) { 426 EvalSubset(Att, Fp, Lp, Cases); 427 } else if (!Tested[Att]) { 428 EvalDiscreteAtt_Discr(Att, Fp, Lp, Cases, (ItemCount**)Freq_discr, (ItemCount*)ValFreq_discr, (float*)UnknownRate_discr); 429 } 430 } else { 431 /* continuous attribute */ 432 433 EvalContinuousAtt(Att, Fp, Lp); 434 } 435 436 /* Update average gain, excluding attributes with very many values */ 437 #pragma omp critical 438 { 439 if (Gain[Att] > -Epsilon && (MultiVal || MaxAttVal[Att] < 0.3 440 * (MaxItem + 1))) { 441 Possible++; 442 AvGain += Gain[Att]; 443 } 444 } 445 } 446 447 } 448 449 /* Find the best attribute according to the given criterion */ 450 //#pragma omp single 451 //{ 452 BestVal = -Epsilon; 453 BestAtt = None; 454 AvGain = (Possible ? AvGain / Possible : 1E6); 455 456 Verbosity(2) { 457 if (AvGain < 1E6) 458 printf("\taverage gain %.3f\n", AvGain); 459 } 460 461 ForEach(Att, 0, MaxAtt) { 462 if (Gain[Att] > -Epsilon) { 463 Val = Worth(Info[Att], Gain[Att], AvGain); 464 if (Val > BestVal) { 465 BestAtt = Att; 466 BestVal = Val; 467 } 468 } 469 } 470 //} 471 //} 472 /* Decide whether to branch or not */ 473 474 if (BestAtt != None) { 475 Verbosity(1) { 476 printf("\tbest attribute %s", AttName[BestAtt]); 477 if (!MaxAttVal[BestAtt]) { 478 printf(" cut %.3f", Bar[BestAtt]); 479 } 480 printf(" inf %.3f gain %.3f val %.3f\n", Info[BestAtt], 481 Gain[BestAtt], BestVal); 482 } 483 484 /* Build a node of the selected test */ 485 486 if (MaxAttVal[BestAtt]) { 487 /* Discrete valued attribute */ 488 489 if (SUBSET && MaxAttVal[BestAtt] > 2) { 490 SubsetTest(Node, BestAtt); 491 } else { 492 DiscreteTest(Node, BestAtt); 493 } 494 } else { 495 /* Continuous attribute */ 496 497 ContinTest(Node, BestAtt); 498 } 499 500 /* Remove unknown attribute values */ 501 502 PrevAllKnown = AllKnown; 503 504 Kp = Group(0, Fp, Lp, Node) + 1; 505 if (Kp != Fp) 506 AllKnown = false; 507 KnownCases = Cases - CountItems(Fp, Kp - 1); 508 UnknownRate[BestAtt] = (Cases - KnownCases) / (Cases + 0.001); 509 510 Verbosity(1) { 511 if (UnknownRate[BestAtt] > 0) { 512 printf("\tunknown rate for %s = %.3f\n", AttName[BestAtt], 513 UnknownRate[BestAtt]); 514 } 515 } 516 517 /* Recursive divide and conquer */ 518 519 ++Tested[BestAtt]; 520 521 Ep = Kp - 1; 522 Node->Errors = 0; 523 524 ForEach(v, 1, Node->Forks) { 525 Ep = Group(v, Kp, Lp, Node); 526 527 if (Kp <= Ep) { 528 Factor = CountItems(Kp, Ep) / KnownCases; 529 530 ForEach(i, Fp, Kp-1) { 531 Weight[i] *= Factor; 532 } 533 534 Node->Branch[v] = FormTree(Fp, Ep); 535 Node->Errors += Node->Branch[v]->Errors; 536 537 Group(0, Fp, Ep, Node); 538 ForEach(i, Fp, Kp-1) { 539 Weight[i] /= Factor; 540 } 541 } else { 542 Node->Branch[v] = Leaf(Node->ClassDist, BestClass, 0.0, 0.0); 543 } 544 } 545 546 --Tested[BestAtt]; 547 AllKnown = PrevAllKnown; 548 549 /* See whether we would have been no worse off with a leaf */ 550 551 if (Node->Errors >= Cases - NoBestClass - Epsilon) { 552 Verbosity(1) 553 printf("Collapse tree for %d items to leaf %s\n", Lp - Fp + 1, 554 ClassName[BestClass]); 555 556 Node->NodeType = 0; 557 } 558 } 559 else { 560 Verbosity(1) 561 printf("\tno sensible splits %.1f/%.1f\n", Cases, Cases 562 - NoBestClass); 563 } 564 565 return Node; 566 } 392 567 /*************************************************************************/ 393 568 /* */ … … 399 574 /*************************************************************************/ 400 575 401 402 576 ItemNo Group(V, Fp, Lp, TestNode) 403 /* ----- */ 404 DiscrValue V; 405 ItemNo Fp, Lp; 406 Tree TestNode; 407 { 408 ItemNo i; 409 Attribute Att; 410 float Thresh; 411 Set SS; 412 void Swap(); 413 414 Att = TestNode->Tested; 415 416 if ( V ) 417 { 418 /* Group items on the value of attribute Att, and depending 419 on the type of branch */ 420 421 switch ( TestNode->NodeType ) 422 { 423 case BrDiscr: 424 425 ForEach(i, Fp, Lp) 426 { 427 if ( DVal(Item[i], Att) == V ) Swap(Fp++, i); 428 } 429 break; 430 431 case ThreshContin: 432 433 Thresh = TestNode->Cut; 434 ForEach(i, Fp, Lp) 435 { 436 if ( (CVal(Item[i], Att) <= Thresh) == (V == 1) ) Swap(Fp++, i); 437 } 438 break; 439 440 case BrSubset: 441 442 SS = TestNode->Subset[V]; 443 ForEach(i, Fp, Lp) 444 { 445 if ( In(DVal(Item[i], Att), SS) ) Swap(Fp++, i); 446 } 447 break; 448 } 449 } 450 else 451 { 452 /* Group together unknown values */ 453 454 switch ( TestNode->NodeType ) 455 { 456 case BrDiscr: 457 case BrSubset: 458 459 ForEach(i, Fp, Lp) 460 { 461 if ( ! DVal(Item[i], Att) ) Swap(Fp++, i); 462 } 463 break; 464 465 case ThreshContin: 466 467 ForEach(i, Fp, Lp) 468 { 469 if ( CVal(Item[i], Att) == Unknown ) Swap(Fp++, i); 470 } 471 break; 472 } 473 } 474 475 return Fp - 1; 577 /* ----- */ 578 DiscrValue V;ItemNo Fp, Lp;Tree TestNode; { 579 ItemNo i; 580 Attribute Att; 581 float Thresh; 582 Set SS; 583 void Swap(); 584 585 Att = TestNode->Tested; 586 587 if (V) { 588 /* Group items on the value of attribute Att, and depending 589 on the type of branch */ 590 591 switch (TestNode->NodeType) { 592 case BrDiscr: 593 594 ForEach(i, Fp, Lp) { 595 if (DVal(Item[i], Att) == V) 596 Swap(Fp++, i); 597 } 598 break; 599 600 case ThreshContin: 601 602 Thresh = TestNode->Cut; 603 ForEach(i, Fp, Lp) { 604 if ((CVal(Item[i], Att) <= Thresh) == (V == 1)) 605 Swap(Fp++, i); 606 } 607 break; 608 609 case BrSubset: 610 611 SS = TestNode->Subset[V]; 612 ForEach(i, Fp, Lp) { 613 if (In(DVal(Item[i], Att), SS)) 614 Swap(Fp++, i); 615 } 616 break; 617 } 618 } else { 619 /* Group together unknown values */ 620 621 switch (TestNode->NodeType) { 622 case BrDiscr: 623 case BrSubset: 624 625 ForEach(i, Fp, Lp) { 626 if (!DVal(Item[i], Att)) 627 Swap(Fp++, i); 628 } 629 break; 630 631 case ThreshContin: 632 633 ForEach(i, Fp, Lp) { 634 if (CVal(Item[i], Att) == Unknown) 635 Swap(Fp++, i); 636 } 637 break; 638 } 639 } 640 641 return Fp - 1; 476 642 } 477 643 478 479 480 644 /*************************************************************************/ 481 645 /* */ … … 484 648 /*************************************************************************/ 485 649 486 487 650 ItemCount CountItems(Fp, Lp) 488 /* ---------- */ 489 ItemNo Fp, Lp; 490 { 491 register ItemCount Sum=0.0, *Wt, *LWt; 492 493 if ( AllKnown ) return Lp - Fp + 1; 494 495 for ( Wt = Weight + Fp, LWt = Weight + Lp ; Wt <= LWt ; ) 496 { 497 Sum += *Wt++; 498 } 499 500 return Sum; 651 /* ---------- */ 652 ItemNo Fp, Lp; { 653 register ItemCount Sum = 0.0, *Wt, *LWt; 654 ItemNo i; 655 656 if (AllKnown) 657 return Lp - Fp + 1; 658 659 //Lwt = Weight + Lp; 660 661 #pragma omp parallel for reduction(+:Sum) 662 for (i = Fp; i <= Lp; i++) { 663 Sum += Weight[i]; 664 } 665 666 return Sum; 501 667 } 502 503 504 668 505 669 /*************************************************************************/ … … 509 673 /*************************************************************************/ 510 674 511 512 void Swap(a,b) 513 /* ---- */ 514 ItemNo a, b; 515 { 516 register Description Hold; 517 register ItemCount HoldW; 518 519 Hold = Item[a]; 520 Item[a] = Item[b]; 521 Item[b] = Hold; 522 523 HoldW = Weight[a]; 524 Weight[a] = Weight[b]; 525 Weight[b] = HoldW; 675 void Swap(a, b) 676 /* ---- */ 677 ItemNo a, b; { 678 register Description Hold; 679 register ItemCount HoldW; 680 681 Hold = Item[a]; 682 Item[a] = Item[b]; 683 Item[b] = Hold; 684 685 HoldW = Weight[a]; 686 Weight[a] = Weight[b]; 687 Weight[b] = HoldW; 526 688 }
Note: See TracChangeset
for help on using the changeset viewer.