1 | /*************************************************************************/ |
---|
2 | /* */ |
---|
3 | /* Pruning single rules */ |
---|
4 | /* -------------------- */ |
---|
5 | /* */ |
---|
6 | /*************************************************************************/ |
---|
7 | |
---|
8 | |
---|
9 | #include "defns.i" |
---|
10 | #include "types.i" |
---|
11 | #include "extern.i" |
---|
12 | #include "rulex.i" |
---|
13 | |
---|
14 | |
---|
15 | /* External data structures used in building rules */ |
---|
16 | |
---|
17 | extern ItemNo *TargetClassFreq, /* [Boolean] */ |
---|
18 | *Errors, /* [Condition] */ |
---|
19 | *Total; /* [Condition] */ |
---|
20 | |
---|
21 | extern float *Pessimistic, /* [Condition] */ |
---|
22 | *Actual, /* [Condition] */ |
---|
23 | *CondSigLevel; /* [Condition] */ |
---|
24 | |
---|
25 | extern Boolean **CondSatisfiedBy, /* [Condition][ItemNo] */ |
---|
26 | *Deleted; |
---|
27 | |
---|
28 | #define Before(n1,n2) (n1->Tested < n2->Tested ||\ |
---|
29 | n1->NodeType < n2->NodeType ||\ |
---|
30 | n1->Tested == n2->Tested && n1->Cut < n2->Cut) |
---|
31 | |
---|
32 | #define IsTarget(case) (Class(case) == TargetClass ? 1 : 0) |
---|
33 | |
---|
34 | |
---|
35 | |
---|
36 | /*************************************************************************/ |
---|
37 | /* */ |
---|
38 | /* Prune the rule given by the conditions Cond, and the number of */ |
---|
39 | /* conditions NCond, and add the resulting rule to the current */ |
---|
40 | /* ruleset if it is sufficiently accurate */ |
---|
41 | /* */ |
---|
42 | /*************************************************************************/ |
---|
43 | |
---|
44 | |
---|
45 | PruneRule(Cond, NCond, TargetClass) |
---|
46 | /* --------- */ |
---|
47 | Condition Cond[]; |
---|
48 | short NCond; |
---|
49 | ClassNo TargetClass; |
---|
50 | { |
---|
51 | short d, dd, id, Bestd, Bestid, Remaining=NCond; |
---|
52 | float DefaultError, Extra, AddErrs(), TableProb(); |
---|
53 | Boolean Alter, Satisfies(), NewRule(), Redundant(); |
---|
54 | Condition Hold; |
---|
55 | ItemNo i; |
---|
56 | |
---|
57 | ForEach(d, 0, NCond) |
---|
58 | { |
---|
59 | Deleted[d] = false; |
---|
60 | } |
---|
61 | |
---|
62 | /* Evaluate the satisfaction matrix */ |
---|
63 | |
---|
64 | TargetClassFreq[0] = TargetClassFreq[1] = 0; |
---|
65 | |
---|
66 | ForEach(i, 0, MaxItem) |
---|
67 | { |
---|
68 | ForEach(d, 1, NCond) |
---|
69 | { |
---|
70 | CondSatisfiedBy[d][i] = Satisfies(Item[i], Cond[d]); |
---|
71 | } |
---|
72 | TargetClassFreq[IsTarget(Item[i])]++; |
---|
73 | } |
---|
74 | |
---|
75 | DefaultError = 1.0 - (TargetClassFreq[true] + 1.0) / (MaxItem + 3.0); |
---|
76 | |
---|
77 | /* Find conditions to delete */ |
---|
78 | |
---|
79 | Verbosity(1) printf("\n pruning rule for %s", ClassName[TargetClass]); |
---|
80 | |
---|
81 | do |
---|
82 | { |
---|
83 | Alter = false; |
---|
84 | |
---|
85 | FindTables(NCond, TargetClass); |
---|
86 | |
---|
87 | /* Find the condition, deleting which would most improve |
---|
88 | the accuracy of the rule. |
---|
89 | Notes: The pessimistic error rate, and not the actual |
---|
90 | error rate, is currently used. |
---|
91 | When d is 0, we are dealing with all conditions. */ |
---|
92 | |
---|
93 | Bestd = id = 0; |
---|
94 | |
---|
95 | Verbosity(1) |
---|
96 | printf("\n Err Used Pess\tAbsent condition\n"); |
---|
97 | |
---|
98 | ForEach(d, 0, NCond) |
---|
99 | { |
---|
100 | if ( Deleted[d] ) continue; |
---|
101 | |
---|
102 | if ( Total[d] ) |
---|
103 | { |
---|
104 | Actual[d] = Errors[d] / (float) Total[d]; |
---|
105 | Extra = AddErrs((float) Total[d], (float) Errors[d]); |
---|
106 | Pessimistic[d] = (Errors[d] + Extra) / Total[d]; |
---|
107 | } |
---|
108 | else |
---|
109 | { |
---|
110 | Actual[d] = 0; |
---|
111 | Pessimistic[d] = DefaultError; |
---|
112 | } |
---|
113 | |
---|
114 | Verbosity(1) |
---|
115 | printf(" %5d%5d %4.1f%%", |
---|
116 | Errors[d], Total[d], 100 * Pessimistic[d]); |
---|
117 | |
---|
118 | if ( ! d ) |
---|
119 | { |
---|
120 | Verbosity(1) printf("\t<base rule>\n"); |
---|
121 | } |
---|
122 | else |
---|
123 | { |
---|
124 | id++; |
---|
125 | |
---|
126 | /* If significance testing option used, invoke Fisher's |
---|
127 | exact test here to assess probability that division |
---|
128 | by d arises from chance. */ |
---|
129 | |
---|
130 | if ( SIGTEST ) |
---|
131 | { |
---|
132 | CondSigLevel[d] = |
---|
133 | TableProb(Errors[0], |
---|
134 | Errors[d]-Errors[0], |
---|
135 | Total[0]-Errors[0], |
---|
136 | Total[d]-Total[0]-Errors[d]+Errors[0]); |
---|
137 | |
---|
138 | Verbosity(1) printf(" Sig=%.3f", CondSigLevel[d]); |
---|
139 | } |
---|
140 | |
---|
141 | Verbosity(1) PrintCondition(Cond[d]); |
---|
142 | |
---|
143 | /* Bestd identifies the condition with lowest pessimistic |
---|
144 | error estimate */ |
---|
145 | |
---|
146 | if ( ! Bestd || Pessimistic[d] <= Pessimistic[Bestd] ) |
---|
147 | { |
---|
148 | Bestd = d; |
---|
149 | Bestid = id; |
---|
150 | } |
---|
151 | |
---|
152 | /* Alter is set true if we are going to drop a condition |
---|
153 | (either because we get lower pessimistic est, or |
---|
154 | because one of the conditions fails a significance test) */ |
---|
155 | |
---|
156 | if ( Pessimistic[d] <= Pessimistic[0] || |
---|
157 | Actual[d] <= Actual[0] || |
---|
158 | SIGTEST && CondSigLevel[d] > SIGTHRESH ) |
---|
159 | { |
---|
160 | Alter = true; |
---|
161 | } |
---|
162 | } |
---|
163 | } |
---|
164 | |
---|
165 | if ( Alter ) |
---|
166 | { |
---|
167 | Verbosity(1) printf("\teliminate test %d\n", Bestid); |
---|
168 | |
---|
169 | Deleted[Bestd] = true; |
---|
170 | Remaining--; |
---|
171 | } |
---|
172 | |
---|
173 | } while ( Alter && Remaining ); |
---|
174 | |
---|
175 | if ( ! Remaining || ! Total[0] ) |
---|
176 | { |
---|
177 | return; |
---|
178 | } |
---|
179 | |
---|
180 | if ( Pessimistic[0] >= DefaultError ) |
---|
181 | { |
---|
182 | Verbosity(1) printf("\ttoo inaccurate\n"); |
---|
183 | return; |
---|
184 | } |
---|
185 | |
---|
186 | /* Sort the conditions */ |
---|
187 | |
---|
188 | ForEach(d, 1, Remaining) |
---|
189 | { |
---|
190 | dd = 0; |
---|
191 | ForEach(id, d, NCond) |
---|
192 | { |
---|
193 | if ( ! Deleted[id] && |
---|
194 | ( ! dd || |
---|
195 | Before(Cond[id]->CondTest, Cond[dd]->CondTest) ) ) |
---|
196 | { |
---|
197 | dd = id; |
---|
198 | } |
---|
199 | } |
---|
200 | if ( dd != d ) |
---|
201 | { |
---|
202 | Hold = Cond[d]; |
---|
203 | Cond[d] = Cond[dd]; |
---|
204 | Cond[dd] = Hold; |
---|
205 | Deleted[dd] = Deleted[d]; |
---|
206 | } |
---|
207 | Deleted[d] = true; |
---|
208 | } |
---|
209 | |
---|
210 | NewRule(Cond, Remaining, TargetClass, Pessimistic[0]); |
---|
211 | } |
---|
212 | |
---|
213 | |
---|
214 | |
---|
215 | /*************************************************************************/ |
---|
216 | /* */ |
---|
217 | /* See whether condition R is redundant */ |
---|
218 | /* */ |
---|
219 | /*************************************************************************/ |
---|
220 | |
---|
221 | |
---|
222 | Boolean Redundant(R, Cond, NCond) |
---|
223 | /* --------- */ |
---|
224 | Condition Cond[]; |
---|
225 | short R, NCond; |
---|
226 | { |
---|
227 | short d, v, vv; |
---|
228 | Test t, Rt; |
---|
229 | Boolean IsSubset(); |
---|
230 | |
---|
231 | Rt = Cond[R]->CondTest; |
---|
232 | v = Cond[R]->TestValue; |
---|
233 | |
---|
234 | ForEach(d, 1, NCond) |
---|
235 | { |
---|
236 | if ( Deleted[d] || d == R ) continue; |
---|
237 | |
---|
238 | t = Cond[d]->CondTest; |
---|
239 | vv = Cond[d]->TestValue; |
---|
240 | |
---|
241 | if ( t->Tested != Rt->Tested ) continue; |
---|
242 | |
---|
243 | switch ( t->NodeType ) |
---|
244 | { |
---|
245 | case BrDiscr: /* test of discrete attribute */ |
---|
246 | |
---|
247 | return false; |
---|
248 | |
---|
249 | case ThreshContin: /* test of continuous attribute */ |
---|
250 | |
---|
251 | if ( vv == v && |
---|
252 | ( v == 1 ? t->Cut < Rt->Cut : t->Cut > Rt->Cut ) ) |
---|
253 | { |
---|
254 | return true; |
---|
255 | } |
---|
256 | |
---|
257 | break; |
---|
258 | |
---|
259 | case BrSubset: /* subset test on discrete attribute */ |
---|
260 | |
---|
261 | if ( IsSubset(t->Subset[vv], Rt->Subset[v], Rt->Tested) ) |
---|
262 | { |
---|
263 | return true; |
---|
264 | } |
---|
265 | } |
---|
266 | } |
---|
267 | |
---|
268 | return false; |
---|
269 | } |
---|
270 | |
---|
271 | |
---|
272 | |
---|
273 | /*************************************************************************/ |
---|
274 | /* */ |
---|
275 | /* Decide whether subset S1 of values is contained in subset S2 */ |
---|
276 | /* */ |
---|
277 | /*************************************************************************/ |
---|
278 | |
---|
279 | |
---|
280 | Boolean IsSubset(S1, S2, Att) |
---|
281 | /* -------- */ |
---|
282 | Set S1, S2; |
---|
283 | Attribute Att; |
---|
284 | { |
---|
285 | DiscrValue v; |
---|
286 | |
---|
287 | ForEach(v, 1, MaxAttVal[Att]) |
---|
288 | { |
---|
289 | if ( In(v, S1) && ! In(v, S2) ) return false; |
---|
290 | } |
---|
291 | |
---|
292 | return true; |
---|
293 | } |
---|
294 | |
---|
295 | |
---|
296 | |
---|
297 | /*************************************************************************/ |
---|
298 | /* */ |
---|
299 | /* Find the frequency distribution tables for the current conditions: */ |
---|
300 | /* */ |
---|
301 | /* Total[0] = items matching all conditions */ |
---|
302 | /* Total[d] = items matching all except condition d */ |
---|
303 | /* */ |
---|
304 | /* Errors[0] = wrong-class items matching all conditions */ |
---|
305 | /* Errors[d] = wrong-class items matching all but cond d */ |
---|
306 | /* */ |
---|
307 | /* This routine is critical to the efficiency of rule pruning. It */ |
---|
308 | /* computes the information above in one pass through the data, */ |
---|
309 | /* looking at cases that fail to satisfy at most one of the */ |
---|
310 | /* non-deleted conditions */ |
---|
311 | /* */ |
---|
312 | /*************************************************************************/ |
---|
313 | |
---|
314 | |
---|
315 | FindTables(NCond, TargetClass) |
---|
316 | /* ----------- */ |
---|
317 | short NCond; |
---|
318 | ClassNo TargetClass; |
---|
319 | { |
---|
320 | ItemNo i; |
---|
321 | short Misses, Missed[2], d; |
---|
322 | Boolean CorrectClass; |
---|
323 | |
---|
324 | /* Clear distributions */ |
---|
325 | |
---|
326 | ForEach(d, 0, NCond) |
---|
327 | { |
---|
328 | Total[d] = Errors[d] = 0; |
---|
329 | } |
---|
330 | |
---|
331 | /* Set distributions */ |
---|
332 | |
---|
333 | ForEach(i, 0, MaxItem) |
---|
334 | { |
---|
335 | Misses = 0; |
---|
336 | CorrectClass = IsTarget(Item[i]); |
---|
337 | |
---|
338 | for ( d = 1 ; d <= NCond && Misses <= 1 ; d++ ) |
---|
339 | { |
---|
340 | if ( ! Deleted[d] && ! CondSatisfiedBy[d][i] ) |
---|
341 | { |
---|
342 | Missed[Misses++] = d; |
---|
343 | } |
---|
344 | } |
---|
345 | |
---|
346 | if ( ! Misses ) |
---|
347 | { |
---|
348 | UpdateCount(Total, Errors, 0, CorrectClass); |
---|
349 | } |
---|
350 | else |
---|
351 | if ( Misses == 1 ) |
---|
352 | { |
---|
353 | UpdateCount(Total, Errors, Missed[0], CorrectClass); |
---|
354 | } |
---|
355 | } |
---|
356 | |
---|
357 | /* Adjust counts to reflect cases that met all conditions */ |
---|
358 | |
---|
359 | ForEach(d, 1, NCond) |
---|
360 | { |
---|
361 | if ( ! Deleted[d] ) |
---|
362 | { |
---|
363 | Total[d] += Total[0]; |
---|
364 | Errors[d] += Errors[0]; |
---|
365 | } |
---|
366 | } |
---|
367 | } |
---|
368 | |
---|
369 | |
---|
370 | |
---|
371 | /*************************************************************************/ |
---|
372 | /* */ |
---|
373 | /* Increment the counts Total[d] and Errors[d] */ |
---|
374 | /* */ |
---|
375 | /*************************************************************************/ |
---|
376 | |
---|
377 | |
---|
378 | UpdateCount(T, E, d, OK) |
---|
379 | /* ----------- */ |
---|
380 | ItemNo T[], E[]; |
---|
381 | short d; |
---|
382 | Boolean OK; |
---|
383 | { |
---|
384 | T[d]++; |
---|
385 | if ( ! OK ) E[d]++; |
---|
386 | } |
---|
387 | |
---|
388 | |
---|
389 | |
---|
390 | /*************************************************************************/ |
---|
391 | /* */ |
---|
392 | /* Determine whether the given case description satisfies the given */ |
---|
393 | /* condition. */ |
---|
394 | /* */ |
---|
395 | /*************************************************************************/ |
---|
396 | |
---|
397 | |
---|
398 | Boolean Satisfies(CaseDesc, OneCond) |
---|
399 | /* --------- */ |
---|
400 | Description CaseDesc; |
---|
401 | Condition OneCond; |
---|
402 | { |
---|
403 | DiscrValue v; |
---|
404 | float cv; |
---|
405 | Test t; |
---|
406 | short s; |
---|
407 | Boolean Outcome; |
---|
408 | |
---|
409 | t = OneCond->CondTest; |
---|
410 | |
---|
411 | /* Determine the outcome of this test on this item */ |
---|
412 | |
---|
413 | switch ( t->NodeType ) |
---|
414 | { |
---|
415 | case BrDiscr: /* test of discrete attribute */ |
---|
416 | |
---|
417 | v = DVal(CaseDesc, t->Tested); |
---|
418 | Outcome = ( v == 0 ? -1 : v ); |
---|
419 | break; |
---|
420 | |
---|
421 | case ThreshContin: /* test of continuous attribute */ |
---|
422 | |
---|
423 | cv = CVal(CaseDesc, t->Tested); |
---|
424 | Outcome = ( cv == Unknown ? -1 : cv <= t->Cut ? 1 : 2 ); |
---|
425 | break; |
---|
426 | |
---|
427 | case BrSubset: /* subset test on discrete attribute */ |
---|
428 | |
---|
429 | v = DVal(CaseDesc, t->Tested); |
---|
430 | Outcome = -1; |
---|
431 | ForEach(s, 1, t->Forks) |
---|
432 | { |
---|
433 | if ( In(v, t->Subset[s]) ) |
---|
434 | { |
---|
435 | Outcome = s; |
---|
436 | break; |
---|
437 | } |
---|
438 | } |
---|
439 | } |
---|
440 | |
---|
441 | return ( Outcome == OneCond->TestValue ); |
---|
442 | } |
---|
443 | |
---|
444 | |
---|
445 | |
---|
446 | /*************************************************************************/ |
---|
447 | /* */ |
---|
448 | /* Hypergeometric distribution (uses tabulated log factorials) */ |
---|
449 | /* */ |
---|
450 | /*************************************************************************/ |
---|
451 | |
---|
452 | |
---|
453 | double Hypergeom(a, r, A, B) |
---|
454 | /* --------- */ |
---|
455 | int a, r, A, B; |
---|
456 | { |
---|
457 | return exp( LogFact[A] + LogFact[B] + LogFact[r] + LogFact[A+B-r] - |
---|
458 | ( LogFact[a] + LogFact[r-a] + LogFact[A-a] |
---|
459 | + LogFact[B-(r-a)] + LogFact[A+B]) ); |
---|
460 | } |
---|
461 | |
---|
462 | |
---|
463 | |
---|
464 | /*************************************************************************/ |
---|
465 | /* */ |
---|
466 | /* TableProb examines the 2x2 contingency table t and computes the */ |
---|
467 | /* probability that a random division could have produced a split at */ |
---|
468 | /* least as extreme as this. Also known as "Fisher's Exact Test", */ |
---|
469 | /* after its inventor, R.A. Fisher. */ |
---|
470 | /* */ |
---|
471 | /*************************************************************************/ |
---|
472 | |
---|
473 | |
---|
474 | float TableProb(t11, t12, t21, t22) |
---|
475 | /* --------- */ |
---|
476 | int t11, t12, t21, t22; |
---|
477 | { |
---|
478 | double Sum=0.0; |
---|
479 | int A, B, r, a, k, a0; |
---|
480 | |
---|
481 | /* First, rearrange the rows and columns of the table to get it into |
---|
482 | canonical form */ |
---|
483 | |
---|
484 | if ( t11 + t12 > t21 + t22 ) |
---|
485 | { |
---|
486 | A = t11 + t12; |
---|
487 | B = t21 + t22; |
---|
488 | |
---|
489 | if ( t11 * (t21 + t22) > t21 * (t11 + t12) ) |
---|
490 | { |
---|
491 | a0 = t11; |
---|
492 | r = t11 + t21; |
---|
493 | } |
---|
494 | else |
---|
495 | { |
---|
496 | a0 = t12; |
---|
497 | r = t12 + t22; |
---|
498 | } |
---|
499 | } |
---|
500 | else |
---|
501 | { |
---|
502 | A = t21 + t22; |
---|
503 | B = t11 + t12; |
---|
504 | if ( t21 * (t11 + t12) > t11 * (t21 + t22) ) |
---|
505 | { |
---|
506 | a0 = t21; |
---|
507 | r = t21 + t11; |
---|
508 | } |
---|
509 | else |
---|
510 | { |
---|
511 | a0 = t22; |
---|
512 | r = t22 + t12; |
---|
513 | } |
---|
514 | } |
---|
515 | |
---|
516 | /* Now compute the probability */ |
---|
517 | |
---|
518 | k = Min(r, A); |
---|
519 | ForEach(a, a0, k) |
---|
520 | { |
---|
521 | Sum += Hypergeom(a, r, A, B); |
---|
522 | } |
---|
523 | |
---|
524 | return Sum; |
---|
525 | } |
---|