Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

prunerule.c @ 26

Last change on this file since 26 was 26, checked in by (none), 14 years ago
blabla
File size: 11.1 KB

Line
1	/*************************************************************************/
2	/* */
3	/* Pruning single rules */
4	/* -------------------- */
5	/* */
6	/*************************************************************************/
7
8
9	#include "defns.i"
10	#include "types.i"
11	#include "extern.i"
12	#include "rulex.i"
13
14
15	/* External data structures used in building rules */
16
17	extern ItemNo TargetClassFreq, / [Boolean] */
18	Errors, / [Condition] */
19	Total; / [Condition] */
20
21	extern float Pessimistic, / [Condition] */
22	Actual, / [Condition] */
23	CondSigLevel; / [Condition] */
24
25	extern Boolean *CondSatisfiedBy, / [Condition][ItemNo] */
26	*Deleted;
27
28	#define Before(n1,n2) (n1->Tested < n2->Tested \|\|\
29	n1->NodeType < n2->NodeType \|\|\
30	n1->Tested == n2->Tested && n1->Cut < n2->Cut)
31
32	#define IsTarget(case) (Class(case) == TargetClass ? 1 : 0)
33
34
35
36	/*************************************************************************/
37	/* */
38	/* Prune the rule given by the conditions Cond, and the number of */
39	/* conditions NCond, and add the resulting rule to the current */
40	/* ruleset if it is sufficiently accurate */
41	/* */
42	/*************************************************************************/
43
44
45	PruneRule(Cond, NCond, TargetClass)
46	/* --------- */
47	Condition Cond[];
48	short NCond;
49	ClassNo TargetClass;
50	{
51	short d, dd, id, Bestd, Bestid, Remaining=NCond;
52	float DefaultError, Extra, AddErrs(), TableProb();
53	Boolean Alter, Satisfies(), NewRule(), Redundant();
54	Condition Hold;
55	ItemNo i;
56
57	ForEach(d, 0, NCond)
58	{
59	Deleted[d] = false;
60	}
61
62	/* Evaluate the satisfaction matrix */
63
64	TargetClassFreq[0] = TargetClassFreq[1] = 0;
65
66	ForEach(i, 0, MaxItem)
67	{
68	ForEach(d, 1, NCond)
69	{
70	CondSatisfiedBy[d][i] = Satisfies(Item[i], Cond[d]);
71	}
72	TargetClassFreq[IsTarget(Item[i])]++;
73	}
74
75	DefaultError = 1.0 - (TargetClassFreq[true] + 1.0) / (MaxItem + 3.0);
76
77	/* Find conditions to delete */
78
79	Verbosity(1) printf("\n pruning rule for %s", ClassName[TargetClass]);
80
81	do
82	{
83	Alter = false;
84
85	FindTables(NCond, TargetClass);
86
87	/* Find the condition, deleting which would most improve
88	the accuracy of the rule.
89	Notes: The pessimistic error rate, and not the actual
90	error rate, is currently used.
91	When d is 0, we are dealing with all conditions. */
92
93	Bestd = id = 0;
94
95	Verbosity(1)
96	printf("\n Err Used Pess\tAbsent condition\n");
97
98	ForEach(d, 0, NCond)
99	{
100	if ( Deleted[d] ) continue;
101
102	if ( Total[d] )
103	{
104	Actual[d] = Errors[d] / (float) Total[d];
105	Extra = AddErrs((float) Total[d], (float) Errors[d]);
106	Pessimistic[d] = (Errors[d] + Extra) / Total[d];
107	}
108	else
109	{
110	Actual[d] = 0;
111	Pessimistic[d] = DefaultError;
112	}
113
114	Verbosity(1)
115	printf(" %5d%5d %4.1f%%",
116	Errors[d], Total[d], 100 * Pessimistic[d]);
117
118	if ( ! d )
119	{
120	Verbosity(1) printf("\t<base rule>\n");
121	}
122	else
123	{
124	id++;
125
126	/* If significance testing option used, invoke Fisher's
127	exact test here to assess probability that division
128	by d arises from chance. */
129
130	if ( SIGTEST )
131	{
132	CondSigLevel[d] =
133	TableProb(Errors[0],
134	Errors[d]-Errors[0],
135	Total[0]-Errors[0],
136	Total[d]-Total[0]-Errors[d]+Errors[0]);
137
138	Verbosity(1) printf(" Sig=%.3f", CondSigLevel[d]);
139	}
140
141	Verbosity(1) PrintCondition(Cond[d]);
142
143	/* Bestd identifies the condition with lowest pessimistic
144	error estimate */
145
146	if ( ! Bestd \|\| Pessimistic[d] <= Pessimistic[Bestd] )
147	{
148	Bestd = d;
149	Bestid = id;
150	}
151
152	/* Alter is set true if we are going to drop a condition
153	(either because we get lower pessimistic est, or
154	because one of the conditions fails a significance test) */
155
156	if ( Pessimistic[d] <= Pessimistic[0] \|\|
157	Actual[d] <= Actual[0] \|\|
158	SIGTEST && CondSigLevel[d] > SIGTHRESH )
159	{
160	Alter = true;
161	}
162	}
163	}
164
165	if ( Alter )
166	{
167	Verbosity(1) printf("\teliminate test %d\n", Bestid);
168
169	Deleted[Bestd] = true;
170	Remaining--;
171	}
172
173	} while ( Alter && Remaining );
174
175	if ( ! Remaining \|\| ! Total[0] )
176	{
177	return;
178	}
179
180	if ( Pessimistic[0] >= DefaultError )
181	{
182	Verbosity(1) printf("\ttoo inaccurate\n");
183	return;
184	}
185
186	/* Sort the conditions */
187
188	ForEach(d, 1, Remaining)
189	{
190	dd = 0;
191	ForEach(id, d, NCond)
192	{
193	if ( ! Deleted[id] &&
194	( ! dd \|\|
195	Before(Cond[id]->CondTest, Cond[dd]->CondTest) ) )
196	{
197	dd = id;
198	}
199	}
200	if ( dd != d )
201	{
202	Hold = Cond[d];
203	Cond[d] = Cond[dd];
204	Cond[dd] = Hold;
205	Deleted[dd] = Deleted[d];
206	}
207	Deleted[d] = true;
208	}
209
210	NewRule(Cond, Remaining, TargetClass, Pessimistic[0]);
211	}
212
213
214
215	/*************************************************************************/
216	/* */
217	/* See whether condition R is redundant */
218	/* */
219	/*************************************************************************/
220
221
222	Boolean Redundant(R, Cond, NCond)
223	/* --------- */
224	Condition Cond[];
225	short R, NCond;
226	{
227	short d, v, vv;
228	Test t, Rt;
229	Boolean IsSubset();
230
231	Rt = Cond[R]->CondTest;
232	v = Cond[R]->TestValue;
233
234	ForEach(d, 1, NCond)
235	{
236	if ( Deleted[d] \|\| d == R ) continue;
237
238	t = Cond[d]->CondTest;
239	vv = Cond[d]->TestValue;
240
241	if ( t->Tested != Rt->Tested ) continue;
242
243	switch ( t->NodeType )
244	{
245	case BrDiscr: /* test of discrete attribute */
246
247	return false;
248
249	case ThreshContin: /* test of continuous attribute */
250
251	if ( vv == v &&
252	( v == 1 ? t->Cut < Rt->Cut : t->Cut > Rt->Cut ) )
253	{
254	return true;
255	}
256
257	break;
258
259	case BrSubset: /* subset test on discrete attribute */
260
261	if ( IsSubset(t->Subset[vv], Rt->Subset[v], Rt->Tested) )
262	{
263	return true;
264	}
265	}
266	}
267
268	return false;
269	}
270
271
272
273	/*************************************************************************/
274	/* */
275	/* Decide whether subset S1 of values is contained in subset S2 */
276	/* */
277	/*************************************************************************/
278
279
280	Boolean IsSubset(S1, S2, Att)
281	/* -------- */
282	Set S1, S2;
283	Attribute Att;
284	{
285	DiscrValue v;
286
287	ForEach(v, 1, MaxAttVal[Att])
288	{
289	if ( In(v, S1) && ! In(v, S2) ) return false;
290	}
291
292	return true;
293	}
294
295
296
297	/*************************************************************************/
298	/* */
299	/* Find the frequency distribution tables for the current conditions: */
300	/* */
301	/* Total[0] = items matching all conditions */
302	/* Total[d] = items matching all except condition d */
303	/* */
304	/* Errors[0] = wrong-class items matching all conditions */
305	/* Errors[d] = wrong-class items matching all but cond d */
306	/* */
307	/* This routine is critical to the efficiency of rule pruning. It */
308	/* computes the information above in one pass through the data, */
309	/* looking at cases that fail to satisfy at most one of the */
310	/* non-deleted conditions */
311	/* */
312	/*************************************************************************/
313
314
315	FindTables(NCond, TargetClass)
316	/* ----------- */
317	short NCond;
318	ClassNo TargetClass;
319	{
320	ItemNo i;
321	short Misses, Missed[2], d;
322	Boolean CorrectClass;
323
324	/* Clear distributions */
325
326	ForEach(d, 0, NCond)
327	{
328	Total[d] = Errors[d] = 0;
329	}
330
331	/* Set distributions */
332
333	ForEach(i, 0, MaxItem)
334	{
335	Misses = 0;
336	CorrectClass = IsTarget(Item[i]);
337
338	for ( d = 1 ; d <= NCond && Misses <= 1 ; d++ )
339	{
340	if ( ! Deleted[d] && ! CondSatisfiedBy[d][i] )
341	{
342	Missed[Misses++] = d;
343	}
344	}
345
346	if ( ! Misses )
347	{
348	UpdateCount(Total, Errors, 0, CorrectClass);
349	}
350	else
351	if ( Misses == 1 )
352	{
353	UpdateCount(Total, Errors, Missed[0], CorrectClass);
354	}
355	}
356
357	/* Adjust counts to reflect cases that met all conditions */
358
359	ForEach(d, 1, NCond)
360	{
361	if ( ! Deleted[d] )
362	{
363	Total[d] += Total[0];
364	Errors[d] += Errors[0];
365	}
366	}
367	}
368
369
370
371	/*************************************************************************/
372	/* */
373	/* Increment the counts Total[d] and Errors[d] */
374	/* */
375	/*************************************************************************/
376
377
378	UpdateCount(T, E, d, OK)
379	/* ----------- */
380	ItemNo T[], E[];
381	short d;
382	Boolean OK;
383	{
384	T[d]++;
385	if ( ! OK ) E[d]++;
386	}
387
388
389
390	/*************************************************************************/
391	/* */
392	/* Determine whether the given case description satisfies the given */
393	/* condition. */
394	/* */
395	/*************************************************************************/
396
397
398	Boolean Satisfies(CaseDesc, OneCond)
399	/* --------- */
400	Description CaseDesc;
401	Condition OneCond;
402	{
403	DiscrValue v;
404	float cv;
405	Test t;
406	short s;
407	Boolean Outcome;
408
409	t = OneCond->CondTest;
410
411	/* Determine the outcome of this test on this item */
412
413	switch ( t->NodeType )
414	{
415	case BrDiscr: /* test of discrete attribute */
416
417	v = DVal(CaseDesc, t->Tested);
418	Outcome = ( v == 0 ? -1 : v );
419	break;
420
421	case ThreshContin: /* test of continuous attribute */
422
423	cv = CVal(CaseDesc, t->Tested);
424	Outcome = ( cv == Unknown ? -1 : cv <= t->Cut ? 1 : 2 );
425	break;
426
427	case BrSubset: /* subset test on discrete attribute */
428
429	v = DVal(CaseDesc, t->Tested);
430	Outcome = -1;
431	ForEach(s, 1, t->Forks)
432	{
433	if ( In(v, t->Subset[s]) )
434	{
435	Outcome = s;
436	break;
437	}
438	}
439	}
440
441	return ( Outcome == OneCond->TestValue );
442	}
443
444
445
446	/*************************************************************************/
447	/* */
448	/* Hypergeometric distribution (uses tabulated log factorials) */
449	/* */
450	/*************************************************************************/
451
452
453	double Hypergeom(a, r, A, B)
454	/* --------- */
455	int a, r, A, B;
456	{
457	return exp( LogFact[A] + LogFact[B] + LogFact[r] + LogFact[A+B-r] -
458	( LogFact[a] + LogFact[r-a] + LogFact[A-a]
459	+ LogFact[B-(r-a)] + LogFact[A+B]) );
460	}
461
462
463
464	/*************************************************************************/
465	/* */
466	/* TableProb examines the 2x2 contingency table t and computes the */
467	/* probability that a random division could have produced a split at */
468	/* least as extreme as this. Also known as "Fisher's Exact Test", */
469	/* after its inventor, R.A. Fisher. */
470	/* */
471	/*************************************************************************/
472
473
474	float TableProb(t11, t12, t21, t22)
475	/* --------- */
476	int t11, t12, t21, t22;
477	{
478	double Sum=0.0;
479	int A, B, r, a, k, a0;
480
481	/* First, rearrange the rows and columns of the table to get it into
482	canonical form */
483
484	if ( t11 + t12 > t21 + t22 )
485	{
486	A = t11 + t12;
487	B = t21 + t22;
488
489	if ( t11 * (t21 + t22) > t21 * (t11 + t12) )
490	{
491	a0 = t11;
492	r = t11 + t21;
493	}
494	else
495	{
496	a0 = t12;
497	r = t12 + t22;
498	}
499	}
500	else
501	{
502	A = t21 + t22;
503	B = t11 + t12;
504	if ( t21 * (t11 + t12) > t11 * (t21 + t22) )
505	{
506	a0 = t21;
507	r = t21 + t11;
508	}
509	else
510	{
511	a0 = t22;
512	r = t22 + t12;
513	}
514	}
515
516	/* Now compute the probability */
517
518	k = Min(r, A);
519	ForEach(a, a0, k)
520	{
521	Sum += Hypergeom(a, r, A, B);
522	}
523
524	return Sum;
525	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: proiecte/Parallel-DT/R8/Src/prunerule.c @ 26

Download in other formats: