[26] | 1 | /*************************************************************************/ |
---|
| 2 | /* */ |
---|
| 3 | /* Pruning single rules */ |
---|
| 4 | /* -------------------- */ |
---|
| 5 | /* */ |
---|
| 6 | /*************************************************************************/ |
---|
| 7 | |
---|
| 8 | |
---|
| 9 | #include "defns.i" |
---|
| 10 | #include "types.i" |
---|
| 11 | #include "extern.i" |
---|
| 12 | #include "rulex.i" |
---|
| 13 | |
---|
| 14 | |
---|
| 15 | /* External data structures used in building rules */ |
---|
| 16 | |
---|
| 17 | extern ItemNo *TargetClassFreq, /* [Boolean] */ |
---|
| 18 | *Errors, /* [Condition] */ |
---|
| 19 | *Total; /* [Condition] */ |
---|
| 20 | |
---|
| 21 | extern float *Pessimistic, /* [Condition] */ |
---|
| 22 | *Actual, /* [Condition] */ |
---|
| 23 | *CondSigLevel; /* [Condition] */ |
---|
| 24 | |
---|
| 25 | extern Boolean **CondSatisfiedBy, /* [Condition][ItemNo] */ |
---|
| 26 | *Deleted; |
---|
| 27 | |
---|
| 28 | #define Before(n1,n2) (n1->Tested < n2->Tested ||\ |
---|
| 29 | n1->NodeType < n2->NodeType ||\ |
---|
| 30 | n1->Tested == n2->Tested && n1->Cut < n2->Cut) |
---|
| 31 | |
---|
| 32 | #define IsTarget(case) (Class(case) == TargetClass ? 1 : 0) |
---|
| 33 | |
---|
| 34 | |
---|
| 35 | |
---|
| 36 | /*************************************************************************/ |
---|
| 37 | /* */ |
---|
| 38 | /* Prune the rule given by the conditions Cond, and the number of */ |
---|
| 39 | /* conditions NCond, and add the resulting rule to the current */ |
---|
| 40 | /* ruleset if it is sufficiently accurate */ |
---|
| 41 | /* */ |
---|
| 42 | /*************************************************************************/ |
---|
| 43 | |
---|
| 44 | |
---|
| 45 | PruneRule(Cond, NCond, TargetClass) |
---|
| 46 | /* --------- */ |
---|
| 47 | Condition Cond[]; |
---|
| 48 | short NCond; |
---|
| 49 | ClassNo TargetClass; |
---|
| 50 | { |
---|
| 51 | short d, dd, id, Bestd, Bestid, Remaining=NCond; |
---|
| 52 | float DefaultError, Extra, AddErrs(), TableProb(); |
---|
| 53 | Boolean Alter, Satisfies(), NewRule(), Redundant(); |
---|
| 54 | Condition Hold; |
---|
| 55 | ItemNo i; |
---|
| 56 | |
---|
| 57 | ForEach(d, 0, NCond) |
---|
| 58 | { |
---|
| 59 | Deleted[d] = false; |
---|
| 60 | } |
---|
| 61 | |
---|
| 62 | /* Evaluate the satisfaction matrix */ |
---|
| 63 | |
---|
| 64 | TargetClassFreq[0] = TargetClassFreq[1] = 0; |
---|
| 65 | |
---|
| 66 | ForEach(i, 0, MaxItem) |
---|
| 67 | { |
---|
| 68 | ForEach(d, 1, NCond) |
---|
| 69 | { |
---|
| 70 | CondSatisfiedBy[d][i] = Satisfies(Item[i], Cond[d]); |
---|
| 71 | } |
---|
| 72 | TargetClassFreq[IsTarget(Item[i])]++; |
---|
| 73 | } |
---|
| 74 | |
---|
| 75 | DefaultError = 1.0 - (TargetClassFreq[true] + 1.0) / (MaxItem + 3.0); |
---|
| 76 | |
---|
| 77 | /* Find conditions to delete */ |
---|
| 78 | |
---|
| 79 | Verbosity(1) printf("\n pruning rule for %s", ClassName[TargetClass]); |
---|
| 80 | |
---|
| 81 | do |
---|
| 82 | { |
---|
| 83 | Alter = false; |
---|
| 84 | |
---|
| 85 | FindTables(NCond, TargetClass); |
---|
| 86 | |
---|
| 87 | /* Find the condition, deleting which would most improve |
---|
| 88 | the accuracy of the rule. |
---|
| 89 | Notes: The pessimistic error rate, and not the actual |
---|
| 90 | error rate, is currently used. |
---|
| 91 | When d is 0, we are dealing with all conditions. */ |
---|
| 92 | |
---|
| 93 | Bestd = id = 0; |
---|
| 94 | |
---|
| 95 | Verbosity(1) |
---|
| 96 | printf("\n Err Used Pess\tAbsent condition\n"); |
---|
| 97 | |
---|
| 98 | ForEach(d, 0, NCond) |
---|
| 99 | { |
---|
| 100 | if ( Deleted[d] ) continue; |
---|
| 101 | |
---|
| 102 | if ( Total[d] ) |
---|
| 103 | { |
---|
| 104 | Actual[d] = Errors[d] / (float) Total[d]; |
---|
| 105 | Extra = AddErrs((float) Total[d], (float) Errors[d]); |
---|
| 106 | Pessimistic[d] = (Errors[d] + Extra) / Total[d]; |
---|
| 107 | } |
---|
| 108 | else |
---|
| 109 | { |
---|
| 110 | Actual[d] = 0; |
---|
| 111 | Pessimistic[d] = DefaultError; |
---|
| 112 | } |
---|
| 113 | |
---|
| 114 | Verbosity(1) |
---|
| 115 | printf(" %5d%5d %4.1f%%", |
---|
| 116 | Errors[d], Total[d], 100 * Pessimistic[d]); |
---|
| 117 | |
---|
| 118 | if ( ! d ) |
---|
| 119 | { |
---|
| 120 | Verbosity(1) printf("\t<base rule>\n"); |
---|
| 121 | } |
---|
| 122 | else |
---|
| 123 | { |
---|
| 124 | id++; |
---|
| 125 | |
---|
| 126 | /* If significance testing option used, invoke Fisher's |
---|
| 127 | exact test here to assess probability that division |
---|
| 128 | by d arises from chance. */ |
---|
| 129 | |
---|
| 130 | if ( SIGTEST ) |
---|
| 131 | { |
---|
| 132 | CondSigLevel[d] = |
---|
| 133 | TableProb(Errors[0], |
---|
| 134 | Errors[d]-Errors[0], |
---|
| 135 | Total[0]-Errors[0], |
---|
| 136 | Total[d]-Total[0]-Errors[d]+Errors[0]); |
---|
| 137 | |
---|
| 138 | Verbosity(1) printf(" Sig=%.3f", CondSigLevel[d]); |
---|
| 139 | } |
---|
| 140 | |
---|
| 141 | Verbosity(1) PrintCondition(Cond[d]); |
---|
| 142 | |
---|
| 143 | /* Bestd identifies the condition with lowest pessimistic |
---|
| 144 | error estimate */ |
---|
| 145 | |
---|
| 146 | if ( ! Bestd || Pessimistic[d] <= Pessimistic[Bestd] ) |
---|
| 147 | { |
---|
| 148 | Bestd = d; |
---|
| 149 | Bestid = id; |
---|
| 150 | } |
---|
| 151 | |
---|
| 152 | /* Alter is set true if we are going to drop a condition |
---|
| 153 | (either because we get lower pessimistic est, or |
---|
| 154 | because one of the conditions fails a significance test) */ |
---|
| 155 | |
---|
| 156 | if ( Pessimistic[d] <= Pessimistic[0] || |
---|
| 157 | Actual[d] <= Actual[0] || |
---|
| 158 | SIGTEST && CondSigLevel[d] > SIGTHRESH ) |
---|
| 159 | { |
---|
| 160 | Alter = true; |
---|
| 161 | } |
---|
| 162 | } |
---|
| 163 | } |
---|
| 164 | |
---|
| 165 | if ( Alter ) |
---|
| 166 | { |
---|
| 167 | Verbosity(1) printf("\teliminate test %d\n", Bestid); |
---|
| 168 | |
---|
| 169 | Deleted[Bestd] = true; |
---|
| 170 | Remaining--; |
---|
| 171 | } |
---|
| 172 | |
---|
| 173 | } while ( Alter && Remaining ); |
---|
| 174 | |
---|
| 175 | if ( ! Remaining || ! Total[0] ) |
---|
| 176 | { |
---|
| 177 | return; |
---|
| 178 | } |
---|
| 179 | |
---|
| 180 | if ( Pessimistic[0] >= DefaultError ) |
---|
| 181 | { |
---|
| 182 | Verbosity(1) printf("\ttoo inaccurate\n"); |
---|
| 183 | return; |
---|
| 184 | } |
---|
| 185 | |
---|
| 186 | /* Sort the conditions */ |
---|
| 187 | |
---|
| 188 | ForEach(d, 1, Remaining) |
---|
| 189 | { |
---|
| 190 | dd = 0; |
---|
| 191 | ForEach(id, d, NCond) |
---|
| 192 | { |
---|
| 193 | if ( ! Deleted[id] && |
---|
| 194 | ( ! dd || |
---|
| 195 | Before(Cond[id]->CondTest, Cond[dd]->CondTest) ) ) |
---|
| 196 | { |
---|
| 197 | dd = id; |
---|
| 198 | } |
---|
| 199 | } |
---|
| 200 | if ( dd != d ) |
---|
| 201 | { |
---|
| 202 | Hold = Cond[d]; |
---|
| 203 | Cond[d] = Cond[dd]; |
---|
| 204 | Cond[dd] = Hold; |
---|
| 205 | Deleted[dd] = Deleted[d]; |
---|
| 206 | } |
---|
| 207 | Deleted[d] = true; |
---|
| 208 | } |
---|
| 209 | |
---|
| 210 | NewRule(Cond, Remaining, TargetClass, Pessimistic[0]); |
---|
| 211 | } |
---|
| 212 | |
---|
| 213 | |
---|
| 214 | |
---|
| 215 | /*************************************************************************/ |
---|
| 216 | /* */ |
---|
| 217 | /* See whether condition R is redundant */ |
---|
| 218 | /* */ |
---|
| 219 | /*************************************************************************/ |
---|
| 220 | |
---|
| 221 | |
---|
| 222 | Boolean Redundant(R, Cond, NCond) |
---|
| 223 | /* --------- */ |
---|
| 224 | Condition Cond[]; |
---|
| 225 | short R, NCond; |
---|
| 226 | { |
---|
| 227 | short d, v, vv; |
---|
| 228 | Test t, Rt; |
---|
| 229 | Boolean IsSubset(); |
---|
| 230 | |
---|
| 231 | Rt = Cond[R]->CondTest; |
---|
| 232 | v = Cond[R]->TestValue; |
---|
| 233 | |
---|
| 234 | ForEach(d, 1, NCond) |
---|
| 235 | { |
---|
| 236 | if ( Deleted[d] || d == R ) continue; |
---|
| 237 | |
---|
| 238 | t = Cond[d]->CondTest; |
---|
| 239 | vv = Cond[d]->TestValue; |
---|
| 240 | |
---|
| 241 | if ( t->Tested != Rt->Tested ) continue; |
---|
| 242 | |
---|
| 243 | switch ( t->NodeType ) |
---|
| 244 | { |
---|
| 245 | case BrDiscr: /* test of discrete attribute */ |
---|
| 246 | |
---|
| 247 | return false; |
---|
| 248 | |
---|
| 249 | case ThreshContin: /* test of continuous attribute */ |
---|
| 250 | |
---|
| 251 | if ( vv == v && |
---|
| 252 | ( v == 1 ? t->Cut < Rt->Cut : t->Cut > Rt->Cut ) ) |
---|
| 253 | { |
---|
| 254 | return true; |
---|
| 255 | } |
---|
| 256 | |
---|
| 257 | break; |
---|
| 258 | |
---|
| 259 | case BrSubset: /* subset test on discrete attribute */ |
---|
| 260 | |
---|
| 261 | if ( IsSubset(t->Subset[vv], Rt->Subset[v], Rt->Tested) ) |
---|
| 262 | { |
---|
| 263 | return true; |
---|
| 264 | } |
---|
| 265 | } |
---|
| 266 | } |
---|
| 267 | |
---|
| 268 | return false; |
---|
| 269 | } |
---|
| 270 | |
---|
| 271 | |
---|
| 272 | |
---|
| 273 | /*************************************************************************/ |
---|
| 274 | /* */ |
---|
| 275 | /* Decide whether subset S1 of values is contained in subset S2 */ |
---|
| 276 | /* */ |
---|
| 277 | /*************************************************************************/ |
---|
| 278 | |
---|
| 279 | |
---|
| 280 | Boolean IsSubset(S1, S2, Att) |
---|
| 281 | /* -------- */ |
---|
| 282 | Set S1, S2; |
---|
| 283 | Attribute Att; |
---|
| 284 | { |
---|
| 285 | DiscrValue v; |
---|
| 286 | |
---|
| 287 | ForEach(v, 1, MaxAttVal[Att]) |
---|
| 288 | { |
---|
| 289 | if ( In(v, S1) && ! In(v, S2) ) return false; |
---|
| 290 | } |
---|
| 291 | |
---|
| 292 | return true; |
---|
| 293 | } |
---|
| 294 | |
---|
| 295 | |
---|
| 296 | |
---|
| 297 | /*************************************************************************/ |
---|
| 298 | /* */ |
---|
| 299 | /* Find the frequency distribution tables for the current conditions: */ |
---|
| 300 | /* */ |
---|
| 301 | /* Total[0] = items matching all conditions */ |
---|
| 302 | /* Total[d] = items matching all except condition d */ |
---|
| 303 | /* */ |
---|
| 304 | /* Errors[0] = wrong-class items matching all conditions */ |
---|
| 305 | /* Errors[d] = wrong-class items matching all but cond d */ |
---|
| 306 | /* */ |
---|
| 307 | /* This routine is critical to the efficiency of rule pruning. It */ |
---|
| 308 | /* computes the information above in one pass through the data, */ |
---|
| 309 | /* looking at cases that fail to satisfy at most one of the */ |
---|
| 310 | /* non-deleted conditions */ |
---|
| 311 | /* */ |
---|
| 312 | /*************************************************************************/ |
---|
| 313 | |
---|
| 314 | |
---|
| 315 | FindTables(NCond, TargetClass) |
---|
| 316 | /* ----------- */ |
---|
| 317 | short NCond; |
---|
| 318 | ClassNo TargetClass; |
---|
| 319 | { |
---|
| 320 | ItemNo i; |
---|
| 321 | short Misses, Missed[2], d; |
---|
| 322 | Boolean CorrectClass; |
---|
| 323 | |
---|
| 324 | /* Clear distributions */ |
---|
| 325 | |
---|
| 326 | ForEach(d, 0, NCond) |
---|
| 327 | { |
---|
| 328 | Total[d] = Errors[d] = 0; |
---|
| 329 | } |
---|
| 330 | |
---|
| 331 | /* Set distributions */ |
---|
| 332 | |
---|
| 333 | ForEach(i, 0, MaxItem) |
---|
| 334 | { |
---|
| 335 | Misses = 0; |
---|
| 336 | CorrectClass = IsTarget(Item[i]); |
---|
| 337 | |
---|
| 338 | for ( d = 1 ; d <= NCond && Misses <= 1 ; d++ ) |
---|
| 339 | { |
---|
| 340 | if ( ! Deleted[d] && ! CondSatisfiedBy[d][i] ) |
---|
| 341 | { |
---|
| 342 | Missed[Misses++] = d; |
---|
| 343 | } |
---|
| 344 | } |
---|
| 345 | |
---|
| 346 | if ( ! Misses ) |
---|
| 347 | { |
---|
| 348 | UpdateCount(Total, Errors, 0, CorrectClass); |
---|
| 349 | } |
---|
| 350 | else |
---|
| 351 | if ( Misses == 1 ) |
---|
| 352 | { |
---|
| 353 | UpdateCount(Total, Errors, Missed[0], CorrectClass); |
---|
| 354 | } |
---|
| 355 | } |
---|
| 356 | |
---|
| 357 | /* Adjust counts to reflect cases that met all conditions */ |
---|
| 358 | |
---|
| 359 | ForEach(d, 1, NCond) |
---|
| 360 | { |
---|
| 361 | if ( ! Deleted[d] ) |
---|
| 362 | { |
---|
| 363 | Total[d] += Total[0]; |
---|
| 364 | Errors[d] += Errors[0]; |
---|
| 365 | } |
---|
| 366 | } |
---|
| 367 | } |
---|
| 368 | |
---|
| 369 | |
---|
| 370 | |
---|
| 371 | /*************************************************************************/ |
---|
| 372 | /* */ |
---|
| 373 | /* Increment the counts Total[d] and Errors[d] */ |
---|
| 374 | /* */ |
---|
| 375 | /*************************************************************************/ |
---|
| 376 | |
---|
| 377 | |
---|
| 378 | UpdateCount(T, E, d, OK) |
---|
| 379 | /* ----------- */ |
---|
| 380 | ItemNo T[], E[]; |
---|
| 381 | short d; |
---|
| 382 | Boolean OK; |
---|
| 383 | { |
---|
| 384 | T[d]++; |
---|
| 385 | if ( ! OK ) E[d]++; |
---|
| 386 | } |
---|
| 387 | |
---|
| 388 | |
---|
| 389 | |
---|
| 390 | /*************************************************************************/ |
---|
| 391 | /* */ |
---|
| 392 | /* Determine whether the given case description satisfies the given */ |
---|
| 393 | /* condition. */ |
---|
| 394 | /* */ |
---|
| 395 | /*************************************************************************/ |
---|
| 396 | |
---|
| 397 | |
---|
| 398 | Boolean Satisfies(CaseDesc, OneCond) |
---|
| 399 | /* --------- */ |
---|
| 400 | Description CaseDesc; |
---|
| 401 | Condition OneCond; |
---|
| 402 | { |
---|
| 403 | DiscrValue v; |
---|
| 404 | float cv; |
---|
| 405 | Test t; |
---|
| 406 | short s; |
---|
| 407 | Boolean Outcome; |
---|
| 408 | |
---|
| 409 | t = OneCond->CondTest; |
---|
| 410 | |
---|
| 411 | /* Determine the outcome of this test on this item */ |
---|
| 412 | |
---|
| 413 | switch ( t->NodeType ) |
---|
| 414 | { |
---|
| 415 | case BrDiscr: /* test of discrete attribute */ |
---|
| 416 | |
---|
| 417 | v = DVal(CaseDesc, t->Tested); |
---|
| 418 | Outcome = ( v == 0 ? -1 : v ); |
---|
| 419 | break; |
---|
| 420 | |
---|
| 421 | case ThreshContin: /* test of continuous attribute */ |
---|
| 422 | |
---|
| 423 | cv = CVal(CaseDesc, t->Tested); |
---|
| 424 | Outcome = ( cv == Unknown ? -1 : cv <= t->Cut ? 1 : 2 ); |
---|
| 425 | break; |
---|
| 426 | |
---|
| 427 | case BrSubset: /* subset test on discrete attribute */ |
---|
| 428 | |
---|
| 429 | v = DVal(CaseDesc, t->Tested); |
---|
| 430 | Outcome = -1; |
---|
| 431 | ForEach(s, 1, t->Forks) |
---|
| 432 | { |
---|
| 433 | if ( In(v, t->Subset[s]) ) |
---|
| 434 | { |
---|
| 435 | Outcome = s; |
---|
| 436 | break; |
---|
| 437 | } |
---|
| 438 | } |
---|
| 439 | } |
---|
| 440 | |
---|
| 441 | return ( Outcome == OneCond->TestValue ); |
---|
| 442 | } |
---|
| 443 | |
---|
| 444 | |
---|
| 445 | |
---|
| 446 | /*************************************************************************/ |
---|
| 447 | /* */ |
---|
| 448 | /* Hypergeometric distribution (uses tabulated log factorials) */ |
---|
| 449 | /* */ |
---|
| 450 | /*************************************************************************/ |
---|
| 451 | |
---|
| 452 | |
---|
| 453 | double Hypergeom(a, r, A, B) |
---|
| 454 | /* --------- */ |
---|
| 455 | int a, r, A, B; |
---|
| 456 | { |
---|
| 457 | return exp( LogFact[A] + LogFact[B] + LogFact[r] + LogFact[A+B-r] - |
---|
| 458 | ( LogFact[a] + LogFact[r-a] + LogFact[A-a] |
---|
| 459 | + LogFact[B-(r-a)] + LogFact[A+B]) ); |
---|
| 460 | } |
---|
| 461 | |
---|
| 462 | |
---|
| 463 | |
---|
| 464 | /*************************************************************************/ |
---|
| 465 | /* */ |
---|
| 466 | /* TableProb examines the 2x2 contingency table t and computes the */ |
---|
| 467 | /* probability that a random division could have produced a split at */ |
---|
| 468 | /* least as extreme as this. Also known as "Fisher's Exact Test", */ |
---|
| 469 | /* after its inventor, R.A. Fisher. */ |
---|
| 470 | /* */ |
---|
| 471 | /*************************************************************************/ |
---|
| 472 | |
---|
| 473 | |
---|
| 474 | float TableProb(t11, t12, t21, t22) |
---|
| 475 | /* --------- */ |
---|
| 476 | int t11, t12, t21, t22; |
---|
| 477 | { |
---|
| 478 | double Sum=0.0; |
---|
| 479 | int A, B, r, a, k, a0; |
---|
| 480 | |
---|
| 481 | /* First, rearrange the rows and columns of the table to get it into |
---|
| 482 | canonical form */ |
---|
| 483 | |
---|
| 484 | if ( t11 + t12 > t21 + t22 ) |
---|
| 485 | { |
---|
| 486 | A = t11 + t12; |
---|
| 487 | B = t21 + t22; |
---|
| 488 | |
---|
| 489 | if ( t11 * (t21 + t22) > t21 * (t11 + t12) ) |
---|
| 490 | { |
---|
| 491 | a0 = t11; |
---|
| 492 | r = t11 + t21; |
---|
| 493 | } |
---|
| 494 | else |
---|
| 495 | { |
---|
| 496 | a0 = t12; |
---|
| 497 | r = t12 + t22; |
---|
| 498 | } |
---|
| 499 | } |
---|
| 500 | else |
---|
| 501 | { |
---|
| 502 | A = t21 + t22; |
---|
| 503 | B = t11 + t12; |
---|
| 504 | if ( t21 * (t11 + t12) > t11 * (t21 + t22) ) |
---|
| 505 | { |
---|
| 506 | a0 = t21; |
---|
| 507 | r = t21 + t11; |
---|
| 508 | } |
---|
| 509 | else |
---|
| 510 | { |
---|
| 511 | a0 = t22; |
---|
| 512 | r = t22 + t12; |
---|
| 513 | } |
---|
| 514 | } |
---|
| 515 | |
---|
| 516 | /* Now compute the probability */ |
---|
| 517 | |
---|
| 518 | k = Min(r, A); |
---|
| 519 | ForEach(a, a0, k) |
---|
| 520 | { |
---|
| 521 | Sum += Hypergeom(a, r, A, B); |
---|
| 522 | } |
---|
| 523 | |
---|
| 524 | return Sum; |
---|
| 525 | } |
---|