[26] | 1 | /*************************************************************************/ |
---|
| 2 | /* */ |
---|
| 3 | /* Classify items interactively using a decision tree */ |
---|
| 4 | /* -------------------------------------------------- */ |
---|
| 5 | /* */ |
---|
| 6 | /*************************************************************************/ |
---|
| 7 | |
---|
| 8 | |
---|
| 9 | #include "defns.i" |
---|
| 10 | #include "types.i" |
---|
| 11 | |
---|
| 12 | |
---|
| 13 | /* External data -- see c4.5.c for meanings */ |
---|
| 14 | |
---|
| 15 | short MaxAtt, MaxClass, MaxDiscrVal; |
---|
| 16 | |
---|
| 17 | ItemNo MaxItem; |
---|
| 18 | |
---|
| 19 | Description *Item; |
---|
| 20 | |
---|
| 21 | DiscrValue *MaxAttVal; |
---|
| 22 | |
---|
| 23 | String *ClassName, |
---|
| 24 | *AttName, |
---|
| 25 | **AttValName, |
---|
| 26 | FileName = "DF"; |
---|
| 27 | |
---|
| 28 | char *SpecialStatus; |
---|
| 29 | |
---|
| 30 | short VERBOSITY = 0, |
---|
| 31 | TRACE = 0; |
---|
| 32 | |
---|
| 33 | |
---|
| 34 | /* The interview module uses a more complex description of an |
---|
| 35 | case called a "Range Description". The value of an |
---|
| 36 | attribute is given by |
---|
| 37 | - lower and upper bounds (continuous attribute) |
---|
| 38 | - probability of each possible value (discrete attribute) */ |
---|
| 39 | |
---|
| 40 | |
---|
| 41 | typedef struct ValRange *RangeDescRec; |
---|
| 42 | |
---|
| 43 | struct ValRange |
---|
| 44 | { |
---|
| 45 | Boolean Known, /* is range known? */ |
---|
| 46 | Asked; /* has it been asked? */ |
---|
| 47 | float LowerBound, /* lower bound given */ |
---|
| 48 | UpperBound, /* upper ditto */ |
---|
| 49 | *Probability; /* prior prob of each discr value */ |
---|
| 50 | }; |
---|
| 51 | |
---|
| 52 | RangeDescRec RangeDesc; |
---|
| 53 | |
---|
| 54 | Tree DecisionTree, /* tree being used */ |
---|
| 55 | GetTree(); |
---|
| 56 | |
---|
| 57 | float *LowClassSum, /* accumulated lower estimates */ |
---|
| 58 | *ClassSum = Nil; /* accumulated central estimates */ |
---|
| 59 | |
---|
| 60 | #define Fuzz 0.01 /* minimum weight */ |
---|
| 61 | |
---|
| 62 | |
---|
| 63 | |
---|
| 64 | /*************************************************************************/ |
---|
| 65 | /* */ |
---|
| 66 | /* Classify the extended case description in RangeDesc using the */ |
---|
| 67 | /* given subtree, by adjusting the values ClassSum and LowClassSum */ |
---|
| 68 | /* for each class, indicating the likelihood of the case being */ |
---|
| 69 | /* of that class. */ |
---|
| 70 | /* */ |
---|
| 71 | /*************************************************************************/ |
---|
| 72 | |
---|
| 73 | |
---|
| 74 | ClassifyCase(Subtree, Weight) |
---|
| 75 | /* ------------ */ |
---|
| 76 | Tree Subtree; |
---|
| 77 | float Weight; |
---|
| 78 | { |
---|
| 79 | DiscrValue v; |
---|
| 80 | float BranchWeight, Area(), Interpolate(); |
---|
| 81 | Attribute a; |
---|
| 82 | short s; |
---|
| 83 | ClassNo c; |
---|
| 84 | |
---|
| 85 | /* A leaf */ |
---|
| 86 | |
---|
| 87 | if ( ! Subtree->NodeType ) |
---|
| 88 | { |
---|
| 89 | Verbosity(1) |
---|
| 90 | printf("\tClass %s weight %g cases %g\n", |
---|
| 91 | ClassName[Subtree->Leaf], Weight, Subtree->Items); |
---|
| 92 | |
---|
| 93 | if ( Subtree->Items > 0 ) |
---|
| 94 | { |
---|
| 95 | /* Adjust class sum of ALL classes, but adjust low class sum |
---|
| 96 | of leaf class only */ |
---|
| 97 | |
---|
| 98 | ForEach(c, 0, MaxClass) |
---|
| 99 | { |
---|
| 100 | ClassSum[c] += Weight * Subtree->ClassDist[c] / Subtree->Items; |
---|
| 101 | } |
---|
| 102 | |
---|
| 103 | LowClassSum[Subtree->Leaf] += |
---|
| 104 | Weight * (1 - Subtree->Errors / Subtree->Items); |
---|
| 105 | } |
---|
| 106 | else |
---|
| 107 | { |
---|
| 108 | ClassSum[Subtree->Leaf] += Weight; |
---|
| 109 | } |
---|
| 110 | |
---|
| 111 | return; |
---|
| 112 | } |
---|
| 113 | |
---|
| 114 | a = Subtree->Tested; |
---|
| 115 | |
---|
| 116 | CheckValue(a, Subtree); |
---|
| 117 | |
---|
| 118 | /* Unknown value */ |
---|
| 119 | |
---|
| 120 | if ( ! RangeDesc[a].Known ) |
---|
| 121 | { |
---|
| 122 | ForEach(v, 1, Subtree->Forks) |
---|
| 123 | { |
---|
| 124 | ClassifyCase(Subtree->Branch[v], |
---|
| 125 | (Weight * Subtree->Branch[v]->Items) / Subtree->Items); |
---|
| 126 | } |
---|
| 127 | return; |
---|
| 128 | } |
---|
| 129 | |
---|
| 130 | /* Known value */ |
---|
| 131 | |
---|
| 132 | switch ( Subtree->NodeType ) |
---|
| 133 | { |
---|
| 134 | case BrDiscr: /* test of discrete attribute */ |
---|
| 135 | |
---|
| 136 | ForEach(v, 1, MaxAttVal[a]) |
---|
| 137 | { |
---|
| 138 | BranchWeight = RangeDesc[a].Probability[v]; |
---|
| 139 | if ( BranchWeight > 0 ) |
---|
| 140 | { |
---|
| 141 | Verbosity(1) |
---|
| 142 | printf("\tWeight %g: test att %s (val %s = %g)\n", |
---|
| 143 | Weight, AttName[a], AttValName[a][v], |
---|
| 144 | BranchWeight); |
---|
| 145 | |
---|
| 146 | ClassifyCase(Subtree->Branch[v], Weight * BranchWeight); |
---|
| 147 | } |
---|
| 148 | } |
---|
| 149 | break; |
---|
| 150 | |
---|
| 151 | case ThreshContin: /* test of continuous attribute */ |
---|
| 152 | |
---|
| 153 | BranchWeight = |
---|
| 154 | RangeDesc[a].UpperBound <= Subtree->Lower ? 1.0 : |
---|
| 155 | RangeDesc[a].LowerBound > Subtree->Upper ? 0.0 : |
---|
| 156 | RangeDesc[a].LowerBound != RangeDesc[a].UpperBound ? |
---|
| 157 | (Area(Subtree, RangeDesc[a].LowerBound) - |
---|
| 158 | Area(Subtree, RangeDesc[a].UpperBound)) / |
---|
| 159 | (RangeDesc[a].UpperBound - RangeDesc[a].LowerBound) : |
---|
| 160 | Interpolate(Subtree, RangeDesc[a].LowerBound) ; |
---|
| 161 | |
---|
| 162 | Verbosity(1) |
---|
| 163 | printf("\tWeight %g: test att %s (branch weight=%g)\n", |
---|
| 164 | Weight, AttName[a], BranchWeight); |
---|
| 165 | |
---|
| 166 | if ( BranchWeight > Fuzz ) |
---|
| 167 | { |
---|
| 168 | ClassifyCase(Subtree->Branch[1], Weight * BranchWeight); |
---|
| 169 | } |
---|
| 170 | if ( BranchWeight < 1-Fuzz ) |
---|
| 171 | { |
---|
| 172 | ClassifyCase(Subtree->Branch[2], Weight * (1 - BranchWeight)); |
---|
| 173 | } |
---|
| 174 | break; |
---|
| 175 | |
---|
| 176 | case BrSubset: /* subset test on discrete attribute */ |
---|
| 177 | |
---|
| 178 | ForEach(s, 1, Subtree->Forks) |
---|
| 179 | { |
---|
| 180 | BranchWeight = 0.0; |
---|
| 181 | ForEach(v, 1, MaxAttVal[a]) |
---|
| 182 | { |
---|
| 183 | if ( In(v, Subtree->Subset[s]) ) |
---|
| 184 | { |
---|
| 185 | BranchWeight += RangeDesc[a].Probability[v]; |
---|
| 186 | } |
---|
| 187 | } |
---|
| 188 | if ( BranchWeight > 0 ) |
---|
| 189 | { |
---|
| 190 | Verbosity(1) |
---|
| 191 | printf("\tWeight %g: test att %s (val %s = %g)\n", |
---|
| 192 | Weight, AttName[a], AttValName[a][v], |
---|
| 193 | BranchWeight); |
---|
| 194 | |
---|
| 195 | ClassifyCase(Subtree->Branch[s], Weight * BranchWeight); |
---|
| 196 | } |
---|
| 197 | } |
---|
| 198 | break; |
---|
| 199 | } |
---|
| 200 | } |
---|
| 201 | |
---|
| 202 | |
---|
| 203 | |
---|
| 204 | /*************************************************************************/ |
---|
| 205 | /* */ |
---|
| 206 | /* Interpolate a single value between Lower, Cut and Upper */ |
---|
| 207 | /* */ |
---|
| 208 | /*************************************************************************/ |
---|
| 209 | |
---|
| 210 | |
---|
| 211 | float Interpolate(t, v) |
---|
| 212 | /* ---- */ |
---|
| 213 | Tree t; |
---|
| 214 | float v; |
---|
| 215 | { |
---|
| 216 | float Sum=Epsilon; |
---|
| 217 | |
---|
| 218 | if ( v <= t->Lower ) |
---|
| 219 | { |
---|
| 220 | return 1.0; |
---|
| 221 | } |
---|
| 222 | |
---|
| 223 | if ( v <= t->Cut ) |
---|
| 224 | { |
---|
| 225 | return 1 - 0.5 * (v - t->Lower) / (t->Cut - t->Lower + Epsilon); |
---|
| 226 | } |
---|
| 227 | |
---|
| 228 | if ( v < t->Upper ) |
---|
| 229 | { |
---|
| 230 | return 0.5 - 0.5 * (v - t->Cut) / (t->Upper - t->Cut + Epsilon); |
---|
| 231 | } |
---|
| 232 | |
---|
| 233 | return 0.0; |
---|
| 234 | } |
---|
| 235 | |
---|
| 236 | |
---|
| 237 | |
---|
| 238 | /*************************************************************************/ |
---|
| 239 | /* */ |
---|
| 240 | /* Compute the area under a soft threshold curve to the right of a */ |
---|
| 241 | /* given value. */ |
---|
| 242 | /* */ |
---|
| 243 | /*************************************************************************/ |
---|
| 244 | |
---|
| 245 | |
---|
| 246 | float Area(t, v) |
---|
| 247 | /* ---- */ |
---|
| 248 | Tree t; |
---|
| 249 | float v; |
---|
| 250 | { |
---|
| 251 | float Sum=Epsilon, F; |
---|
| 252 | |
---|
| 253 | if ( v < t->Lower ) |
---|
| 254 | { |
---|
| 255 | Sum += t->Lower - v; |
---|
| 256 | v = t->Lower; |
---|
| 257 | } |
---|
| 258 | |
---|
| 259 | if ( v < t->Cut ) |
---|
| 260 | { |
---|
| 261 | F = (t->Cut - v ) / (t->Cut - t->Lower + Epsilon); |
---|
| 262 | |
---|
| 263 | Sum += 0.5 * (t->Cut - v) + 0.25 * F * (t->Cut - v); |
---|
| 264 | v = t->Cut; |
---|
| 265 | } |
---|
| 266 | |
---|
| 267 | if ( v < t->Upper ) |
---|
| 268 | { |
---|
| 269 | F = (t->Upper - v ) / (t->Upper - t->Cut + Epsilon); |
---|
| 270 | |
---|
| 271 | Sum += 0.25 * (t->Upper - v) * F; |
---|
| 272 | } |
---|
| 273 | |
---|
| 274 | Verbosity(1) printf("lower=%g cut=%g upper=%g area=%g\n", |
---|
| 275 | t->Lower, t->Cut, t->Upper, Sum); |
---|
| 276 | |
---|
| 277 | return Sum; |
---|
| 278 | } |
---|
| 279 | |
---|
| 280 | |
---|
| 281 | |
---|
| 282 | /*************************************************************************/ |
---|
| 283 | /* */ |
---|
| 284 | /* Process a single case */ |
---|
| 285 | /* */ |
---|
| 286 | /*************************************************************************/ |
---|
| 287 | |
---|
| 288 | |
---|
| 289 | InterpretTree() |
---|
| 290 | /* ------------- */ |
---|
| 291 | { |
---|
| 292 | ClassNo c, BestClass; |
---|
| 293 | float Uncertainty=1.0; |
---|
| 294 | char Reply; |
---|
| 295 | Attribute a; |
---|
| 296 | |
---|
| 297 | /* Initialise */ |
---|
| 298 | |
---|
| 299 | ForEach(a, 0, MaxAtt) |
---|
| 300 | { |
---|
| 301 | RangeDesc[a].Asked = false; |
---|
| 302 | } |
---|
| 303 | |
---|
| 304 | if ( ! ClassSum ) |
---|
| 305 | { |
---|
| 306 | /* The first time through .. allocate class sums */ |
---|
| 307 | |
---|
| 308 | ClassSum = (float *) malloc((MaxClass+1) * sizeof(float)); |
---|
| 309 | LowClassSum = (float *) malloc((MaxClass+1) * sizeof(float)); |
---|
| 310 | |
---|
| 311 | printf("\n"); |
---|
| 312 | } |
---|
| 313 | else |
---|
| 314 | { |
---|
| 315 | printf("\n-------------------------------------------\n\n"); |
---|
| 316 | } |
---|
| 317 | |
---|
| 318 | ForEach(c, 0, MaxClass) |
---|
| 319 | { |
---|
| 320 | LowClassSum[c] = ClassSum[c] = 0; |
---|
| 321 | } |
---|
| 322 | |
---|
| 323 | /* Find the likelihood of an item's being of each class */ |
---|
| 324 | |
---|
| 325 | ClassifyCase(DecisionTree, 1.0); |
---|
| 326 | |
---|
| 327 | /* Find the best class and show decision made */ |
---|
| 328 | |
---|
| 329 | BestClass = 0; |
---|
| 330 | ForEach(c, 0, MaxClass) |
---|
| 331 | { |
---|
| 332 | Verbosity(1) printf("class %d weight %.2f\n", c, ClassSum[c]); |
---|
| 333 | |
---|
| 334 | Uncertainty -= LowClassSum[c]; |
---|
| 335 | if ( ClassSum[c] > ClassSum[BestClass] ) BestClass = c; |
---|
| 336 | } |
---|
| 337 | |
---|
| 338 | printf("\nDecision:\n"); |
---|
| 339 | Decision(BestClass, ClassSum[BestClass], |
---|
| 340 | LowClassSum[BestClass], |
---|
| 341 | Uncertainty + LowClassSum[BestClass]); |
---|
| 342 | |
---|
| 343 | /* Show the other significant classes, if more than two classes */ |
---|
| 344 | |
---|
| 345 | if ( MaxClass > 1 ) |
---|
| 346 | { |
---|
| 347 | while ( true ) |
---|
| 348 | { |
---|
| 349 | ClassSum[BestClass] = 0; |
---|
| 350 | BestClass = 0; |
---|
| 351 | ForEach(c, 0, MaxClass) |
---|
| 352 | { |
---|
| 353 | if ( ClassSum[c] > ClassSum[BestClass] ) BestClass = c; |
---|
| 354 | } |
---|
| 355 | |
---|
| 356 | if ( ClassSum[BestClass] < Fuzz ) break; |
---|
| 357 | |
---|
| 358 | Decision(BestClass, ClassSum[BestClass], |
---|
| 359 | LowClassSum[BestClass], |
---|
| 360 | Uncertainty + LowClassSum[BestClass]); |
---|
| 361 | } |
---|
| 362 | } |
---|
| 363 | |
---|
| 364 | /* Prompt for what to do next */ |
---|
| 365 | |
---|
| 366 | while ( true ) |
---|
| 367 | { |
---|
| 368 | printf("\nRetry, new case or quit [r,n,q]: "); |
---|
| 369 | Reply = getchar(); |
---|
| 370 | SkipLine(Reply); |
---|
| 371 | switch ( Reply ) |
---|
| 372 | { |
---|
| 373 | case 'r': return; |
---|
| 374 | case 'n': Clear(); return; |
---|
| 375 | case 'q': exit(0); |
---|
| 376 | default: printf("Please enter 'r', 'n' or 'q'"); |
---|
| 377 | } |
---|
| 378 | } |
---|
| 379 | } |
---|
| 380 | |
---|
| 381 | |
---|
| 382 | |
---|
| 383 | /*************************************************************************/ |
---|
| 384 | /* */ |
---|
| 385 | /* Print the chosen class with certainty factor and range */ |
---|
| 386 | /* */ |
---|
| 387 | /*************************************************************************/ |
---|
| 388 | |
---|
| 389 | |
---|
| 390 | Decision(c, p, lb, ub) |
---|
| 391 | /* -------- */ |
---|
| 392 | ClassNo c; |
---|
| 393 | float p, lb, ub; |
---|
| 394 | { |
---|
| 395 | printf("\t%s", ClassName[c]); |
---|
| 396 | |
---|
| 397 | if ( p < 1-Fuzz || lb < ub - Fuzz ) |
---|
| 398 | { |
---|
| 399 | printf(" CF = %.2f", p); |
---|
| 400 | if ( lb < ub - Fuzz ) |
---|
| 401 | { |
---|
| 402 | printf(" [ %.2f - %.2f ]", lb, ub); |
---|
| 403 | } |
---|
| 404 | } |
---|
| 405 | |
---|
| 406 | printf("\n"); |
---|
| 407 | } |
---|
| 408 | |
---|
| 409 | |
---|
| 410 | |
---|
| 411 | /*************************************************************************/ |
---|
| 412 | /* */ |
---|
| 413 | /* Main routine for classifying items using a decision tree */ |
---|
| 414 | /* */ |
---|
| 415 | /*************************************************************************/ |
---|
| 416 | |
---|
| 417 | |
---|
| 418 | main(Argc, Argv) |
---|
| 419 | /* ---- */ |
---|
| 420 | int Argc; |
---|
| 421 | char *Argv[]; |
---|
| 422 | { |
---|
| 423 | int o; |
---|
| 424 | extern char *optarg; |
---|
| 425 | extern int optind; |
---|
| 426 | Attribute a; |
---|
| 427 | |
---|
| 428 | PrintHeader("decision tree interpreter"); |
---|
| 429 | |
---|
| 430 | /* Process options */ |
---|
| 431 | |
---|
| 432 | while ( (o = getopt(Argc, Argv, "tvf:")) != EOF ) |
---|
| 433 | { |
---|
| 434 | switch (o) |
---|
| 435 | { |
---|
| 436 | case 't': TRACE = 1; |
---|
| 437 | break; |
---|
| 438 | case 'v': VERBOSITY = 1; |
---|
| 439 | break; |
---|
| 440 | case 'f': FileName = optarg; |
---|
| 441 | break; |
---|
| 442 | case '?': printf("unrecognised option\n"); |
---|
| 443 | exit(1); |
---|
| 444 | } |
---|
| 445 | } |
---|
| 446 | |
---|
| 447 | /* Initialise */ |
---|
| 448 | |
---|
| 449 | GetNames(); |
---|
| 450 | |
---|
| 451 | DecisionTree = GetTree(".tree"); |
---|
| 452 | if ( TRACE ) PrintTree(DecisionTree); |
---|
| 453 | |
---|
| 454 | /* Allocate value ranges */ |
---|
| 455 | |
---|
| 456 | RangeDesc = (struct ValRange *) calloc(MaxAtt+1, sizeof(struct ValRange)); |
---|
| 457 | |
---|
| 458 | ForEach(a, 0, MaxAtt) |
---|
| 459 | { |
---|
| 460 | if ( MaxAttVal[a] ) |
---|
| 461 | { |
---|
| 462 | RangeDesc[a].Probability = |
---|
| 463 | (float *) calloc(MaxAttVal[a]+1, sizeof(float)); |
---|
| 464 | } |
---|
| 465 | } |
---|
| 466 | |
---|
| 467 | /* Consult */ |
---|
| 468 | |
---|
| 469 | Clear(); |
---|
| 470 | while ( true ) |
---|
| 471 | { |
---|
| 472 | InterpretTree(); |
---|
| 473 | } |
---|
| 474 | } |
---|