/*************************************************************************/
/*                                                                       */
/*  Evaluation of a test on a discrete valued attribute                  */
/*  ---------------------------------------------------                  */
/*                                                                       */
/*  Functions without the _Discr suffix operate on the Freq, ValFreq     */
/*  and UnknownRate tables that are in scope from outside this file.     */
/*  The _Discr variants receive those tables as arguments instead.       */
/*                                                                       */
/*************************************************************************/


#include "buildex.i"


/*************************************************************************/
/*                                                                       */
/*  Set Info[] and Gain[] for discrete partition of items Fp to Lp       */
/*                                                                       */
/*  Att    attribute being evaluated                                     */
/*  Fp,Lp  first and last item of the range to examine                   */
/*  Items  item count used as the denominator for Info[Att]              */
/*                                                                       */
/*  When ValFreq[0] accounts for every item (no known values) the        */
/*  attribute gets Gain = -Epsilon and Info = 0.0.                       */
/*                                                                       */
/*************************************************************************/


    EvalDiscreteAtt(Att, Fp, Lp, Items)
/*  ---------------  */
    Attribute Att;
    ItemNo Fp, Lp;
    ItemCount Items;
{
    ItemCount KnownItems;
    float DiscrKnownBaseInfo(), ComputeGain(), TotalInfo();

    ComputeFrequencies(Att, Fp, Lp);

    /*  ValFreq[0] is the frequency that ComputeFrequencies uses for the
        unknown rate, so the remainder is the count of known values  */

    KnownItems = Items - ValFreq[0];

    /*  Special case when no known values of the attribute  */

    if ( Items <= ValFreq[0] )
    {
        Verbosity(2) printf("\tAtt %s: no known values\n", AttName[Att]);

        Gain[Att] = -Epsilon;
        Info[Att] = 0.0;
        return;
    }

    Gain[Att] = ComputeGain(DiscrKnownBaseInfo(KnownItems, MaxAttVal[Att]),
                            UnknownRate[Att], MaxAttVal[Att], KnownItems);
    Info[Att] = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items;

    Verbosity(2)
    {
        printf("\tAtt %s", AttName[Att]);
        Verbosity(3) PrintDistribution(Att, MaxAttVal[Att], true);
        printf("\tinf %.3f, gain %.3f\n", Info[Att], Gain[Att]);
    }
}


/*************************************************************************/
/*                                                                       */
/*  As EvalDiscreteAtt, but using caller-supplied tables                 */
/*                                                                       */
/*  Freq, ValFreq and UnknownRate are parameters here and therefore      */
/*  hide any outer variables of the same names.  Gain[] and Info[]       */
/*  are not parameters, so the results are still written to the          */
/*  Gain[] and Info[] visible from outside this function.                */
/*                                                                       */
/*************************************************************************/


    EvalDiscreteAtt_Discr(Att, Fp, Lp, Items, Freq, ValFreq, UnknownRate)
/*  ---------------------  */
    Attribute Att;
    ItemNo Fp, Lp;
    ItemCount Items;
    ItemCount** Freq;
    ItemCount* ValFreq;
    float* UnknownRate;
{
    ItemCount KnownItems;
    float DiscrKnownBaseInfo_Discr(), ComputeGain_Discr(), TotalInfo();

    ComputeFrequencies_Discr(Att, Fp, Lp, Freq, ValFreq, UnknownRate);

    KnownItems = Items - ValFreq[0];

    /*  Special case when no known values of the attribute  */

    if ( Items <= ValFreq[0] )
    {
        Verbosity(2) printf("\tAtt %s: no known values\n", AttName[Att]);

        Gain[Att] = -Epsilon;
        Info[Att] = 0.0;
        return;
    }

    Gain[Att] = ComputeGain_Discr(DiscrKnownBaseInfo_Discr(KnownItems,
                                                           MaxAttVal[Att],
                                                           Freq),
                                  UnknownRate[Att], MaxAttVal[Att],
                                  KnownItems, Freq, ValFreq);
    Info[Att] = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items;

    Verbosity(2)
    {
        printf("\tAtt %s", AttName[Att]);
        Verbosity(3) PrintDistribution_Discr(Att, MaxAttVal[Att], true, Freq);
        printf("\tinf %.3f, gain %.3f\n", Info[Att], Gain[Att]);
    }
}


/*************************************************************************/
/*                                                                       */
/*  Compute frequency tables Freq[][] and ValFreq[] for attribute        */
/*  Att from items Fp to Lp, and set the UnknownRate for Att             */
/*                                                                       */
/*  Both loops carry OpenMP work-sharing pragmas; every variable that    */
/*  is written per iteration must therefore be private to the thread     */
/*  or protected by a critical section.                                  */
/*                                                                       */
/*************************************************************************/


    ComputeFrequencies(Att, Fp, Lp)
/*  ------------------  */
    Attribute Att;
    ItemNo Fp, Lp;
{
    Description Case;
    ClassNo c;
    DiscrValue v;
    ItemCount CountItems();
    ItemNo p;

    ResetFreq(MaxAttVal[Att], Freq, ValFreq);

    /*  Determine the frequency of each class amongst cases
        with each possible value for the given attribute  */

#pragma omp parallel for private(p, Case)
    ForEach(p, Fp, Lp)
    {
        Case = Item[p];

        /*  Several items may map to the same Freq cell, so the
            accumulation has to be serialised  */

#pragma omp critical
        Freq[DVal(Case,Att)][Class(Case)] += Weight[p];
    }

    /*  Determine the frequency of each possible value for the
        given attribute.  Each iteration of the outer loop writes a
        distinct ValFreq[v], but the inner index c is declared outside
        the parallel region; it must be listed as private, otherwise all
        threads would share (and race on) a single c  */

#pragma omp parallel for private(v, c)
    ForEach(v, 0, MaxAttVal[Att])
    {
        ForEach(c, 0, MaxClass)
        {
            ValFreq[v] += Freq[v][c];
        }
    }

    /*  Set the rate of unknown values of the attribute  */

    UnknownRate[Att] = ValFreq[0] / CountItems(Fp, Lp);
}


/*************************************************************************/
/*                                                                       */
/*  As ComputeFrequencies, but filling the caller-supplied tables        */
/*  Freq, ValFreq and UnknownRate.  This variant runs serially.          */
/*                                                                       */
/*************************************************************************/


    ComputeFrequencies_Discr(Att, Fp, Lp, Freq, ValFreq, UnknownRate)
/*  ------------------------  */
    Attribute Att;
    ItemNo Fp, Lp;
    ItemCount** Freq;
    ItemCount* ValFreq;
    float* UnknownRate;
{
    Description Case;
    ClassNo c;
    DiscrValue v;
    ItemCount CountItems();
    ItemNo p;

    ResetFreq(MaxAttVal[Att], Freq, ValFreq);

    /*  Determine the frequency of each class amongst cases
        with each possible value for the given attribute  */

    ForEach(p, Fp, Lp)
    {
        Case = Item[p];
        Freq[DVal(Case,Att)][Class(Case)] += Weight[p];
    }

    /*  Determine the frequency of each possible value for the
        given attribute  */

    ForEach(v, 0, MaxAttVal[Att])
    {
        ForEach(c, 0, MaxClass)
        {
            ValFreq[v] += Freq[v][c];
        }
    }

    /*  Set the rate of unknown values of the attribute  */

    UnknownRate[Att] = ValFreq[0] / CountItems(Fp, Lp);
}


/*************************************************************************/
/*                                                                       */
/*  Return the base info for items with known values of a discrete       */
/*  attribute, using the frequency table Freq[][]                        */
/*                                                                       */
/*  The value returned is                                                */
/*                                                                       */
/*    (KnownItems * Log(KnownItems) - Sum) / KnownItems                  */
/*                                                                       */
/*  where Sum adds ClassCount * Log(ClassCount) over classes             */
/*  0..MaxClass and ClassCount totals Freq[v][c] over values             */
/*  1..MaxVal (row 0 is left out).                                       */
/*                                                                       */
/*************************************************************************/


float DiscrKnownBaseInfo(KnownItems, MaxVal)
/*    ------------------  */
    DiscrValue MaxVal;
    ItemCount KnownItems;
{
    ClassNo c;
    ItemCount ClassCount;
    double Sum = 0;
    DiscrValue v;

    /*  This loop is intentionally serial: ClassCount, v and Sum are
        declared outside it, so a plain "omp parallel for" here would
        share them between threads  */

    ForEach(c, 0, MaxClass)
    {
        ClassCount = 0;
        ForEach(v, 1, MaxVal)
        {
            ClassCount += Freq[v][c];
        }
        Sum += ClassCount * Log(ClassCount);
    }

    return (KnownItems * Log(KnownItems) - Sum) / KnownItems;
}


/*************************************************************************/
/*                                                                       */
/*  As DiscrKnownBaseInfo, but reading the caller-supplied table Freq    */
/*                                                                       */
/*************************************************************************/


float DiscrKnownBaseInfo_Discr(KnownItems, MaxVal, Freq)
/*    ------------------------  */
    DiscrValue MaxVal;
    ItemCount KnownItems;
    ItemCount** Freq;
{
    ClassNo c;
    ItemCount ClassCount;
    double Sum = 0;
    DiscrValue v;

    ForEach(c, 0, MaxClass)
    {
        ClassCount = 0;
        ForEach(v, 1, MaxVal)
        {
            ClassCount += Freq[v][c];
        }
        Sum += ClassCount * Log(ClassCount);
    }

    return (KnownItems * Log(KnownItems) - Sum) / KnownItems;
}


/*************************************************************************/
/*                                                                       */
/*  Construct and return a node for a test on a discrete attribute       */
/*                                                                       */
/*  Node is passed to Sprout together with MaxAttVal[Att], then          */
/*  marked as a BrDiscr node testing Att with its error count reset.     */
/*  The node is updated in place; no value is returned explicitly.       */
/*                                                                       */
/*************************************************************************/


    DiscreteTest(Node, Att)
/*  ------------  */
    Tree Node;
    Attribute Att;
{
    Sprout(Node, MaxAttVal[Att]);

    Node->NodeType = BrDiscr;
    Node->Tested   = Att;
    Node->Errors   = 0;
}