Context Navigation

prunerule.c

Last change on this file was 26, checked in by (none), 15 years ago
blabla
File size: 11.1 KB

Rev	Line
[26]	1	/*************************************************************************/
	2	/* */
	3	/* Pruning single rules */
	4	/* -------------------- */
	5	/* */
	6	/*************************************************************************/
	7
	8
	9	#include "defns.i"
	10	#include "types.i"
	11	#include "extern.i"
	12	#include "rulex.i"
	13
	14
	15	/* External data structures used in building rules */
	16
	17	extern ItemNo TargetClassFreq, / [Boolean] */
	18	Errors, / [Condition] */
	19	Total; / [Condition] */
	20
	21	extern float Pessimistic, / [Condition] */
	22	Actual, / [Condition] */
	23	CondSigLevel; / [Condition] */
	24
	25	extern Boolean *CondSatisfiedBy, / [Condition][ItemNo] */
	26	*Deleted;
	27
	28	#define Before(n1,n2) (n1->Tested < n2->Tested \|\|\
	29	n1->NodeType < n2->NodeType \|\|\
	30	n1->Tested == n2->Tested && n1->Cut < n2->Cut)
	31
	32	#define IsTarget(case) (Class(case) == TargetClass ? 1 : 0)
	33
	34
	35
	36	/*************************************************************************/
	37	/* */
	38	/* Prune the rule given by the conditions Cond, and the number of */
	39	/* conditions NCond, and add the resulting rule to the current */
	40	/* ruleset if it is sufficiently accurate */
	41	/* */
	42	/*************************************************************************/
	43
	44
	45	PruneRule(Cond, NCond, TargetClass)
	46	/* --------- */
	47	Condition Cond[];
	48	short NCond;
	49	ClassNo TargetClass;
	50	{
	51	short d, dd, id, Bestd, Bestid, Remaining=NCond;
	52	float DefaultError, Extra, AddErrs(), TableProb();
	53	Boolean Alter, Satisfies(), NewRule(), Redundant();
	54	Condition Hold;
	55	ItemNo i;
	56
	57	ForEach(d, 0, NCond)
	58	{
	59	Deleted[d] = false;
	60	}
	61
	62	/* Evaluate the satisfaction matrix */
	63
	64	TargetClassFreq[0] = TargetClassFreq[1] = 0;
	65
	66	ForEach(i, 0, MaxItem)
	67	{
	68	ForEach(d, 1, NCond)
	69	{
	70	CondSatisfiedBy[d][i] = Satisfies(Item[i], Cond[d]);
	71	}
	72	TargetClassFreq[IsTarget(Item[i])]++;
	73	}
	74
	75	DefaultError = 1.0 - (TargetClassFreq[true] + 1.0) / (MaxItem + 3.0);
	76
	77	/* Find conditions to delete */
	78
	79	Verbosity(1) printf("\n pruning rule for %s", ClassName[TargetClass]);
	80
	81	do
	82	{
	83	Alter = false;
	84
	85	FindTables(NCond, TargetClass);
	86
	87	/* Find the condition, deleting which would most improve
	88	the accuracy of the rule.
	89	Notes: The pessimistic error rate, and not the actual
	90	error rate, is currently used.
	91	When d is 0, we are dealing with all conditions. */
	92
	93	Bestd = id = 0;
	94
	95	Verbosity(1)
	96	printf("\n Err Used Pess\tAbsent condition\n");
	97
	98	ForEach(d, 0, NCond)
	99	{
	100	if ( Deleted[d] ) continue;
	101
	102	if ( Total[d] )
	103	{
	104	Actual[d] = Errors[d] / (float) Total[d];
	105	Extra = AddErrs((float) Total[d], (float) Errors[d]);
	106	Pessimistic[d] = (Errors[d] + Extra) / Total[d];
	107	}
	108	else
	109	{
	110	Actual[d] = 0;
	111	Pessimistic[d] = DefaultError;
	112	}
	113
	114	Verbosity(1)
	115	printf(" %5d%5d %4.1f%%",
	116	Errors[d], Total[d], 100 * Pessimistic[d]);
	117
	118	if ( ! d )
	119	{
	120	Verbosity(1) printf("\t<base rule>\n");
	121	}
	122	else
	123	{
	124	id++;
	125
	126	/* If significance testing option used, invoke Fisher's
	127	exact test here to assess probability that division
	128	by d arises from chance. */
	129
	130	if ( SIGTEST )
	131	{
	132	CondSigLevel[d] =
	133	TableProb(Errors[0],
	134	Errors[d]-Errors[0],
	135	Total[0]-Errors[0],
	136	Total[d]-Total[0]-Errors[d]+Errors[0]);
	137
	138	Verbosity(1) printf(" Sig=%.3f", CondSigLevel[d]);
	139	}
	140
	141	Verbosity(1) PrintCondition(Cond[d]);
	142
	143	/* Bestd identifies the condition with lowest pessimistic
	144	error estimate */
	145
	146	if ( ! Bestd \|\| Pessimistic[d] <= Pessimistic[Bestd] )
	147	{
	148	Bestd = d;
	149	Bestid = id;
	150	}
	151
	152	/* Alter is set true if we are going to drop a condition
	153	(either because we get lower pessimistic est, or
	154	because one of the conditions fails a significance test) */
	155
	156	if ( Pessimistic[d] <= Pessimistic[0] \|\|
	157	Actual[d] <= Actual[0] \|\|
	158	SIGTEST && CondSigLevel[d] > SIGTHRESH )
	159	{
	160	Alter = true;
	161	}
	162	}
	163	}
	164
	165	if ( Alter )
	166	{
	167	Verbosity(1) printf("\teliminate test %d\n", Bestid);
	168
	169	Deleted[Bestd] = true;
	170	Remaining--;
	171	}
	172
	173	} while ( Alter && Remaining );
	174
	175	if ( ! Remaining \|\| ! Total[0] )
	176	{
	177	return;
	178	}
	179
	180	if ( Pessimistic[0] >= DefaultError )
	181	{
	182	Verbosity(1) printf("\ttoo inaccurate\n");
	183	return;
	184	}
	185
	186	/* Sort the conditions */
	187
	188	ForEach(d, 1, Remaining)
	189	{
	190	dd = 0;
	191	ForEach(id, d, NCond)
	192	{
	193	if ( ! Deleted[id] &&
	194	( ! dd \|\|
	195	Before(Cond[id]->CondTest, Cond[dd]->CondTest) ) )
	196	{
	197	dd = id;
	198	}
	199	}
	200	if ( dd != d )
	201	{
	202	Hold = Cond[d];
	203	Cond[d] = Cond[dd];
	204	Cond[dd] = Hold;
	205	Deleted[dd] = Deleted[d];
	206	}
	207	Deleted[d] = true;
	208	}
	209
	210	NewRule(Cond, Remaining, TargetClass, Pessimistic[0]);
	211	}
	212
	213
	214
	215	/*************************************************************************/
	216	/* */
	217	/* See whether condition R is redundant */
	218	/* */
	219	/*************************************************************************/
	220
	221
	222	Boolean Redundant(R, Cond, NCond)
	223	/* --------- */
	224	Condition Cond[];
	225	short R, NCond;
	226	{
	227	short d, v, vv;
	228	Test t, Rt;
	229	Boolean IsSubset();
	230
	231	Rt = Cond[R]->CondTest;
	232	v = Cond[R]->TestValue;
	233
	234	ForEach(d, 1, NCond)
	235	{
	236	if ( Deleted[d] \|\| d == R ) continue;
	237
	238	t = Cond[d]->CondTest;
	239	vv = Cond[d]->TestValue;
	240
	241	if ( t->Tested != Rt->Tested ) continue;
	242
	243	switch ( t->NodeType )
	244	{
	245	case BrDiscr: /* test of discrete attribute */
	246
	247	return false;
	248
	249	case ThreshContin: /* test of continuous attribute */
	250
	251	if ( vv == v &&
	252	( v == 1 ? t->Cut < Rt->Cut : t->Cut > Rt->Cut ) )
	253	{
	254	return true;
	255	}
	256
	257	break;
	258
	259	case BrSubset: /* subset test on discrete attribute */
	260
	261	if ( IsSubset(t->Subset[vv], Rt->Subset[v], Rt->Tested) )
	262	{
	263	return true;
	264	}
	265	}
	266	}
	267
	268	return false;
	269	}
	270
	271
	272
	273	/*************************************************************************/
	274	/* */
	275	/* Decide whether subset S1 of values is contained in subset S2 */
	276	/* */
	277	/*************************************************************************/
	278
	279
	280	Boolean IsSubset(S1, S2, Att)
	281	/* -------- */
	282	Set S1, S2;
	283	Attribute Att;
	284	{
	285	DiscrValue v;
	286
	287	ForEach(v, 1, MaxAttVal[Att])
	288	{
	289	if ( In(v, S1) && ! In(v, S2) ) return false;
	290	}
	291
	292	return true;
	293	}
	294
	295
	296
	297	/*************************************************************************/
	298	/* */
	299	/* Find the frequency distribution tables for the current conditions: */
	300	/* */
	301	/* Total[0] = items matching all conditions */
	302	/* Total[d] = items matching all except condition d */
	303	/* */
	304	/* Errors[0] = wrong-class items matching all conditions */
	305	/* Errors[d] = wrong-class items matching all but cond d */
	306	/* */
	307	/* This routine is critical to the efficiency of rule pruning. It */
	308	/* computes the information above in one pass through the data, */
	309	/* looking at cases that fail to satisfy at most one of the */
	310	/* non-deleted conditions */
	311	/* */
	312	/*************************************************************************/
	313
	314
	315	FindTables(NCond, TargetClass)
	316	/* ----------- */
	317	short NCond;
	318	ClassNo TargetClass;
	319	{
	320	ItemNo i;
	321	short Misses, Missed[2], d;
	322	Boolean CorrectClass;
	323
	324	/* Clear distributions */
	325
	326	ForEach(d, 0, NCond)
	327	{
	328	Total[d] = Errors[d] = 0;
	329	}
	330
	331	/* Set distributions */
	332
	333	ForEach(i, 0, MaxItem)
	334	{
	335	Misses = 0;
	336	CorrectClass = IsTarget(Item[i]);
	337
	338	for ( d = 1 ; d <= NCond && Misses <= 1 ; d++ )
	339	{
	340	if ( ! Deleted[d] && ! CondSatisfiedBy[d][i] )
	341	{
	342	Missed[Misses++] = d;
	343	}
	344	}
	345
	346	if ( ! Misses )
	347	{
	348	UpdateCount(Total, Errors, 0, CorrectClass);
	349	}
	350	else
	351	if ( Misses == 1 )
	352	{
	353	UpdateCount(Total, Errors, Missed[0], CorrectClass);
	354	}
	355	}
	356
	357	/* Adjust counts to reflect cases that met all conditions */
	358
	359	ForEach(d, 1, NCond)
	360	{
	361	if ( ! Deleted[d] )
	362	{
	363	Total[d] += Total[0];
	364	Errors[d] += Errors[0];
	365	}
	366	}
	367	}
	368
	369
	370
	371	/*************************************************************************/
	372	/* */
	373	/* Increment the counts Total[d] and Errors[d] */
	374	/* */
	375	/*************************************************************************/
	376
	377
	378	UpdateCount(T, E, d, OK)
	379	/* ----------- */
	380	ItemNo T[], E[];
	381	short d;
	382	Boolean OK;
	383	{
	384	T[d]++;
	385	if ( ! OK ) E[d]++;
	386	}
	387
	388
	389
	390	/*************************************************************************/
	391	/* */
	392	/* Determine whether the given case description satisfies the given */
	393	/* condition. */
	394	/* */
	395	/*************************************************************************/
	396
	397
	398	Boolean Satisfies(CaseDesc, OneCond)
	399	/* --------- */
	400	Description CaseDesc;
	401	Condition OneCond;
	402	{
	403	DiscrValue v;
	404	float cv;
	405	Test t;
	406	short s;
	407	Boolean Outcome;
	408
	409	t = OneCond->CondTest;
	410
	411	/* Determine the outcome of this test on this item */
	412
	413	switch ( t->NodeType )
	414	{
	415	case BrDiscr: /* test of discrete attribute */
	416
	417	v = DVal(CaseDesc, t->Tested);
	418	Outcome = ( v == 0 ? -1 : v );
	419	break;
	420
	421	case ThreshContin: /* test of continuous attribute */
	422
	423	cv = CVal(CaseDesc, t->Tested);
	424	Outcome = ( cv == Unknown ? -1 : cv <= t->Cut ? 1 : 2 );
	425	break;
	426
	427	case BrSubset: /* subset test on discrete attribute */
	428
	429	v = DVal(CaseDesc, t->Tested);
	430	Outcome = -1;
	431	ForEach(s, 1, t->Forks)
	432	{
	433	if ( In(v, t->Subset[s]) )
	434	{
	435	Outcome = s;
	436	break;
	437	}
	438	}
	439	}
	440
	441	return ( Outcome == OneCond->TestValue );
	442	}
	443
	444
	445
	446	/*************************************************************************/
	447	/* */
	448	/* Hypergeometric distribution (uses tabulated log factorials) */
	449	/* */
	450	/*************************************************************************/
	451
	452
	453	double Hypergeom(a, r, A, B)
	454	/* --------- */
	455	int a, r, A, B;
	456	{
	457	return exp( LogFact[A] + LogFact[B] + LogFact[r] + LogFact[A+B-r] -
	458	( LogFact[a] + LogFact[r-a] + LogFact[A-a]
	459	+ LogFact[B-(r-a)] + LogFact[A+B]) );
	460	}
	461
	462
	463
	464	/*************************************************************************/
	465	/* */
	466	/* TableProb examines the 2x2 contingency table t and computes the */
	467	/* probability that a random division could have produced a split at */
	468	/* least as extreme as this. Also known as "Fisher's Exact Test", */
	469	/* after its inventor, R.A. Fisher. */
	470	/* */
	471	/*************************************************************************/
	472
	473
	474	float TableProb(t11, t12, t21, t22)
	475	/* --------- */
	476	int t11, t12, t21, t22;
	477	{
	478	double Sum=0.0;
	479	int A, B, r, a, k, a0;
	480
	481	/* First, rearrange the rows and columns of the table to get it into
	482	canonical form */
	483
	484	if ( t11 + t12 > t21 + t22 )
	485	{
	486	A = t11 + t12;
	487	B = t21 + t22;
	488
	489	if ( t11 * (t21 + t22) > t21 * (t11 + t12) )
	490	{
	491	a0 = t11;
	492	r = t11 + t21;
	493	}
	494	else
	495	{
	496	a0 = t12;
	497	r = t12 + t22;
	498	}
	499	}
	500	else
	501	{
	502	A = t21 + t22;
	503	B = t11 + t12;
	504	if ( t21 * (t11 + t12) > t11 * (t21 + t22) )
	505	{
	506	a0 = t21;
	507	r = t21 + t11;
	508	}
	509	else
	510	{
	511	a0 = t22;
	512	r = t22 + t12;
	513	}
	514	}
	515
	516	/* Now compute the probability */
	517
	518	k = Min(r, A);
	519	ForEach(a, a0, k)
	520	{
	521	Sum += Hypergeom(a, r, A, B);
	522	}
	523
	524	return Sum;
	525	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: proiecte/Parallel-DT/R8/Src/prunerule.c

Download in other formats: