/*************************************************************************/ /* */ /* Calculate information, information gain, and print dists */ /* -------------------------------------------------------- */ /* */ /*************************************************************************/ #include "buildex.i" /*************************************************************************/ /* */ /* Determine the worth of a particular split according to the */ /* operative criterion */ /* */ /* Parameters: */ /* SplitInfo: potential info of the split */ /* SplitGain: gain in info of the split */ /* MinGain: gain above which the Gain Ratio */ /* may be used */ /* */ /* If the Gain criterion is being used, the information gain of */ /* the split is returned, but if the Gain Ratio criterion is */ /* being used, the ratio of the information gain of the split to */ /* its potential information is returned. */ /* */ /*************************************************************************/ float Worth(ThisInfo, ThisGain, MinGain) /* ----- */ float ThisInfo, ThisGain, MinGain; { if (GAINRATIO) { if (ThisGain >= MinGain - Epsilon && ThisInfo > Epsilon) { return ThisGain / ThisInfo; } else { return -Epsilon; } } else { return (ThisInfo > 0 && ThisGain > -Epsilon ? ThisGain : -Epsilon); } } /*************************************************************************/ /* */ /* Zero the frequency tables Freq[][] and ValFreq[] up to MaxVal */ /* */ /*************************************************************************/ ResetFreq(MaxVal, Freq, ValFreq) /* --------- */ DiscrValue MaxVal; ItemCount** Freq; ItemCount* ValFreq; { DiscrValue v; ClassNo c; #pragma omp parallel for private(v) ForEach(v, 0, MaxVal) { ForEach(c, 0, MaxClass) { Freq[v][c] = 0; } ValFreq[v] = 0; } } void ResetFreq_discr(MaxVal, Freq_discr, ValFreq_discr) DiscrValue MaxVal; ItemCount** Freq_discr; ItemCount* ValFreq_discr;{ DiscrValue v; ClassNo c; ForEach(v, 0, MaxVal) { ForEach(c, 0, MaxClass) { Freq_discr[v][c] = 0; } ValFreq_discr[v] = 0; } } /*************************************************************************/ /* */ /* Given tables Freq[][] and ValFreq[], compute the information gain. */ /* */ /* Parameters: */ /* BaseInfo: average information for all items with */ /* known values of the test attribute */ /* UnknownRate: fraction of items with unknown ditto */ /* MaxVal: number of forks */ /* TotalItems: number of items with known values of */ /* test att */ /* */ /* where Freq[x][y] contains the no. of cases with value x for a */ /* particular attribute that are members of class y, */ /* and ValFreq[x] contains the no. of cases with value x for a */ /* particular attribute */ /* */ /*************************************************************************/ float ComputeGain(BaseInfo, UnknFrac, MaxVal, TotalItems) /* ----------- */ float BaseInfo, UnknFrac;DiscrValue MaxVal;ItemCount TotalItems; { DiscrValue v; float ThisInfo = 0.0, ThisGain, TotalInfo(); short ReasonableSubsets = 0; /* Check whether all values are unknown or the same */ if (!TotalItems) return -Epsilon; /* There must be at least two subsets with MINOBJS items */ ForEach(v, 1, MaxVal) { if (ValFreq[v] >= MINOBJS) ReasonableSubsets++; } if (ReasonableSubsets < 2) return -Epsilon; /* Compute total info after split, by summing the info of each of the subsets formed by the test */ //#pragma omp parallel for reduction(+:ThisInfo) ForEach(v, 1, MaxVal) { ThisInfo += TotalInfo(Freq[v], 0, MaxClass); } /* Set the gain in information for all items, adjusted for unknowns */ ThisGain = (1 - UnknFrac) * (BaseInfo - ThisInfo / TotalItems); Verbosity(5) printf( "ComputeThisGain: items %.1f info %.3f base %.3f unkn %.3f result %.3f\n", TotalItems + ValFreq[0], ThisInfo, BaseInfo, UnknFrac, ThisGain); return ThisGain; } float ComputeGain_Discr(BaseInfo, UnknFrac, MaxVal, TotalItems, Freq_discr,ValFreq_discr) /* --------- */ float BaseInfo, UnknFrac;DiscrValue MaxVal;ItemCount TotalItems; ItemCount** Freq_discr; ItemCount* ValFreq_discr;{ DiscrValue v; float ThisInfo = 0.0, ThisGain, TotalInfo(); short ReasonableSubsets = 0; /* Check whether all values are unknown or the same */ if (!TotalItems) return -Epsilon; /* There must be at least two subsets with MINOBJS items */ ForEach(v, 1, MaxVal) { if (ValFreq_discr[v] >= MINOBJS) ReasonableSubsets++; } if (ReasonableSubsets < 2) return -Epsilon; /* Compute total info after split, by summing the info of each of the subsets formed by the test */ ForEach(v, 1, MaxVal) { ThisInfo += TotalInfo(Freq_discr[v], 0, MaxClass); } /* Set the gain in information for all items, adjusted for unknowns */ ThisGain = (1 - UnknFrac) * (BaseInfo - ThisInfo / TotalItems); Verbosity(5) printf( "ComputeThisGain: items %.1f info %.3f base %.3f unkn %.3f result %.3f\n", TotalItems + ValFreq_discr[0], ThisInfo, BaseInfo, UnknFrac, ThisGain); return ThisGain; } /*************************************************************************/ /* */ /* Compute the total information in V[ MinVal..MaxVal ] */ /* */ /*************************************************************************/ float TotalInfo(V, MinVal, MaxVal) /* --------- */ ItemCount* V;DiscrValue MinVal, MaxVal; { DiscrValue v; float Sum = 0.0; ItemCount N, TotalItems = 0; ForEach(v, MinVal, MaxVal) { N = V[v]; Sum += N * Log(N); TotalItems += N; } return TotalItems * Log(TotalItems) - Sum; } /*************************************************************************/ /* */ /* Print distribution table for given attribute */ /* */ /*************************************************************************/ PrintDistribution(Att, MaxVal, ShowNames) /* ----------------- */ Attribute Att;DiscrValue MaxVal;Boolean ShowNames; { DiscrValue v; ClassNo c; String Val; printf("\n\t\t\t "); ForEach(c, 0, MaxClass) { printf("%7.6s", ClassName[c]); } printf("\n"); ForEach(v, 0, MaxVal) { if (ShowNames) { Val = (!v ? "unknown" : MaxAttVal[Att] ? AttValName[Att][v] : v == 1 ? "below" : "above"); printf("\t\t[%-7.7s:", Val); } else { printf("\t\t[%-7d:", v); } ForEach(c, 0, MaxClass) { printf(" %6.1f", Freq[v][c]); } printf("]\n"); } } PrintDistribution_Discr(Att, MaxVal, ShowNames, Freq_discr) /* ----------------- */ Attribute Att;DiscrValue MaxVal;Boolean ShowNames; ItemCount** Freq_discr;{ DiscrValue v; ClassNo c; String Val; printf("\n\t\t\t "); ForEach(c, 0, MaxClass) { printf("%7.6s", ClassName[c]); } printf("\n"); ForEach(v, 0, MaxVal) { if (ShowNames) { Val = (!v ? "unknown" : MaxAttVal[Att] ? AttValName[Att][v] : v == 1 ? "below" : "above"); printf("\t\t[%-7.7s:", Val); } else { printf("\t\t[%-7d:", v); } ForEach(c, 0, MaxClass) { printf(" %6.1f", Freq_discr[v][c]); } printf("]\n"); } }