[26] | 1 | /*************************************************************************/ |
---|
| 2 | /* */ |
---|
| 3 | /* Evaluation of a test on a discrete valued attribute */ |
---|
| 4 | /* --------------------------------------------------- */ |
---|
| 5 | /* */ |
---|
| 6 | /*************************************************************************/ |
---|
| 7 | |
---|
| 8 | #include "buildex.i" |
---|
| 9 | |
---|
| 10 | /*************************************************************************/ |
---|
| 11 | /* */ |
---|
| 12 | /* Set Info[] and Gain[] for discrete partition of items Fp to Lp */ |
---|
| 13 | /* */ |
---|
| 14 | /*************************************************************************/ |
---|
| 15 | |
---|
[68] | 16 | EvalDiscreteAtt(Att, Fp, Lp, Items) |
---|
| 17 | /* --------------- */ |
---|
| 18 | Attribute Att;ItemNo Fp, Lp;ItemCount Items; { |
---|
| 19 | ItemCount KnownItems; |
---|
| 20 | float DiscrKnownBaseInfo(), ComputeGain(), TotalInfo(); |
---|
[26] | 21 | |
---|
[68] | 22 | ComputeFrequencies(Att, Fp, Lp); |
---|
[26] | 23 | |
---|
[68] | 24 | KnownItems = Items - ValFreq[0]; |
---|
[26] | 25 | |
---|
[68] | 26 | /* Special case when no known values of the attribute */ |
---|
[26] | 27 | |
---|
[68] | 28 | if (Items <= ValFreq[0]) { |
---|
| 29 | Verbosity(2) |
---|
| 30 | printf("\tAtt %s: no known values\n", AttName[Att]); |
---|
[26] | 31 | |
---|
[68] | 32 | Gain[Att] = -Epsilon; |
---|
| 33 | Info[Att] = 0.0; |
---|
| 34 | return; |
---|
| 35 | } |
---|
[26] | 36 | |
---|
[68] | 37 | Gain[Att] = ComputeGain(DiscrKnownBaseInfo(KnownItems, MaxAttVal[Att]), |
---|
| 38 | UnknownRate[Att], MaxAttVal[Att], KnownItems); |
---|
| 39 | Info[Att] = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items; |
---|
[26] | 40 | |
---|
[68] | 41 | Verbosity(2) { |
---|
| 42 | printf("\tAtt %s", AttName[Att]); |
---|
| 43 | Verbosity(3) |
---|
| 44 | PrintDistribution(Att, MaxAttVal[Att], true); |
---|
| 45 | printf("\tinf %.3f, gain %.3f\n", Info[Att], Gain[Att]); |
---|
| 46 | } |
---|
[26] | 47 | |
---|
[68] | 48 | } |
---|
[26] | 49 | |
---|
[84] | 50 | EvalDiscreteAtt_Discr(Att, Fp, Lp, Items, Freq_discr, ValFreq_discr, UnknownRate_discr) |
---|
[68] | 51 | /* --------------- */ |
---|
[84] | 52 | Attribute Att;ItemNo Fp, Lp;ItemCount Items; ItemCount** Freq_discr; ItemCount* ValFreq_discr; float* UnknownRate_discr; { |
---|
[68] | 53 | ItemCount KnownItems; |
---|
| 54 | float DiscrKnownBaseInfo_Discr(), ComputeGain_Discr(), TotalInfo(); |
---|
[26] | 55 | |
---|
[84] | 56 | ComputeFrequencies_Discr(Att, Fp, Lp, Freq_discr, ValFreq_discr, UnknownRate_discr); |
---|
[26] | 57 | |
---|
[84] | 58 | KnownItems = Items - ValFreq_discr[0]; |
---|
[26] | 59 | |
---|
[68] | 60 | /* Special case when no known values of the attribute */ |
---|
| 61 | |
---|
[84] | 62 | if (Items <= ValFreq_discr[0]) { |
---|
[68] | 63 | Verbosity(2) |
---|
| 64 | printf("\tAtt %s: no known values\n", AttName[Att]); |
---|
| 65 | |
---|
| 66 | Gain[Att] = -Epsilon; |
---|
| 67 | Info[Att] = 0.0; |
---|
| 68 | return; |
---|
| 69 | } |
---|
| 70 | |
---|
[84] | 71 | Gain[Att] = ComputeGain_Discr(DiscrKnownBaseInfo_Discr(KnownItems, MaxAttVal[Att], Freq_discr), |
---|
| 72 | UnknownRate_discr[Att], MaxAttVal[Att], KnownItems, Freq_discr, ValFreq_discr); |
---|
| 73 | Info[Att] = TotalInfo(ValFreq_discr, 0, MaxAttVal[Att]) / Items; |
---|
[68] | 74 | |
---|
| 75 | Verbosity(2) { |
---|
| 76 | printf("\tAtt %s", AttName[Att]); |
---|
| 77 | Verbosity(3) |
---|
[84] | 78 | PrintDistribution_Discr(Att, MaxAttVal[Att], true, Freq_discr); |
---|
[68] | 79 | printf("\tinf %.3f, gain %.3f\n", Info[Att], Gain[Att]); |
---|
| 80 | } |
---|
| 81 | |
---|
| 82 | } |
---|
[26] | 83 | /*************************************************************************/ |
---|
| 84 | /* */ |
---|
| 85 | /* Compute frequency tables Freq[][] and ValFreq[] for attribute */ |
---|
| 86 | /* Att from items Fp to Lp, and set the UnknownRate for Att */ |
---|
| 87 | /* */ |
---|
| 88 | /*************************************************************************/ |
---|
| 89 | |
---|
[68] | 90 | ComputeFrequencies(Att, Fp, Lp) |
---|
| 91 | /* ------------------ */ |
---|
| 92 | Attribute Att;ItemNo Fp, Lp; { |
---|
| 93 | Description Case; |
---|
| 94 | ClassNo c; |
---|
| 95 | DiscrValue v; |
---|
| 96 | ItemCount CountItems(); |
---|
| 97 | ItemNo p; |
---|
[26] | 98 | |
---|
[68] | 99 | ResetFreq(MaxAttVal[Att], Freq, ValFreq); |
---|
[26] | 100 | |
---|
[68] | 101 | /* Determine the frequency of each class amongst cases |
---|
| 102 | with each possible value for the given attribute */ |
---|
[26] | 103 | |
---|
[68] | 104 | #pragma omp parallel for private(p, Case) |
---|
| 105 | ForEach(p, Fp, Lp) { |
---|
| 106 | Case = Item[p]; |
---|
| 107 | #pragma omp critical |
---|
| 108 | Freq[DVal(Case,Att)][Class(Case)] += Weight[p]; |
---|
| 109 | } |
---|
[26] | 110 | |
---|
[68] | 111 | /* Determine the frequency of each possible value for the |
---|
| 112 | given attribute */ |
---|
[26] | 113 | |
---|
[68] | 114 | #pragma omp parallel for private(v) |
---|
| 115 | ForEach(v, 0, MaxAttVal[Att]) { |
---|
| 116 | ForEach(c, 0, MaxClass) { |
---|
| 117 | ValFreq[v] += Freq[v][c]; |
---|
| 118 | } |
---|
[26] | 119 | } |
---|
| 120 | |
---|
[68] | 121 | /* Set the rate of unknown values of the attribute */ |
---|
| 122 | |
---|
| 123 | UnknownRate[Att] = ValFreq[0] / CountItems(Fp, Lp); |
---|
[26] | 124 | } |
---|
| 125 | |
---|
[84] | 126 | ComputeFrequencies_Discr(Att, Fp, Lp, Freq_discr, ValFreq_discr, UnknownRate_discr) |
---|
[68] | 127 | /* ------------------ */ |
---|
[84] | 128 | Attribute Att;ItemNo Fp, Lp; ItemCount** Freq_discr; ItemCount* ValFreq_discr; float* UnknownRate_discr;{ |
---|
[68] | 129 | Description Case; |
---|
| 130 | ClassNo c; |
---|
| 131 | DiscrValue v; |
---|
| 132 | ItemCount CountItems(); |
---|
| 133 | ItemNo p; |
---|
[26] | 134 | |
---|
[84] | 135 | ResetFreq_discr(MaxAttVal[Att], Freq_discr, ValFreq_discr); |
---|
[26] | 136 | |
---|
[68] | 137 | /* Determine the frequency of each class amongst cases |
---|
| 138 | with each possible value for the given attribute */ |
---|
| 139 | |
---|
| 140 | ForEach(p, Fp, Lp) { |
---|
| 141 | Case = Item[p]; |
---|
[84] | 142 | Freq_discr[DVal(Case,Att)][Class(Case)] += Weight[p]; |
---|
[68] | 143 | } |
---|
| 144 | |
---|
| 145 | /* Determine the frequency of each possible value for the |
---|
| 146 | given attribute */ |
---|
| 147 | |
---|
| 148 | ForEach(v, 0, MaxAttVal[Att]) { |
---|
| 149 | ForEach(c, 0, MaxClass) { |
---|
[84] | 150 | ValFreq_discr[v] += Freq_discr[v][c]; |
---|
[68] | 151 | } |
---|
| 152 | } |
---|
| 153 | |
---|
| 154 | /* Set the rate of unknown values of the attribute */ |
---|
| 155 | |
---|
[84] | 156 | UnknownRate_discr[Att] = ValFreq_discr[0] / CountItems(Fp, Lp); |
---|
[68] | 157 | } |
---|
[26] | 158 | /*************************************************************************/ |
---|
| 159 | /* */ |
---|
| 160 | /* Return the base info for items with known values of a discrete */ |
---|
| 161 | /* attribute, using the frequency table Freq[][] */ |
---|
| 162 | /* */ |
---|
| 163 | /*************************************************************************/ |
---|
| 164 | |
---|
| 165 | float DiscrKnownBaseInfo(KnownItems, MaxVal) |
---|
[68] | 166 | /* ------------------ */ |
---|
| 167 | DiscrValue MaxVal;ItemCount KnownItems; { |
---|
| 168 | ClassNo c; |
---|
| 169 | ItemCount ClassCount; |
---|
| 170 | double Sum = 0; |
---|
| 171 | DiscrValue v; |
---|
[26] | 172 | |
---|
[68] | 173 | //#pragma omp parallel for |
---|
| 174 | ForEach(c, 0, MaxClass) { |
---|
| 175 | ClassCount = 0; |
---|
| 176 | ForEach(v, 1, MaxVal) { |
---|
| 177 | ClassCount += Freq[v][c]; |
---|
| 178 | } |
---|
| 179 | Sum += ClassCount * Log(ClassCount); |
---|
[26] | 180 | } |
---|
| 181 | |
---|
[68] | 182 | return (KnownItems * Log(KnownItems) - Sum) / KnownItems; |
---|
[26] | 183 | } |
---|
| 184 | |
---|
[84] | 185 | float DiscrKnownBaseInfo_Discr(KnownItems, MaxVal, Freq_discr) |
---|
[68] | 186 | /* ------------------ */ |
---|
[84] | 187 | DiscrValue MaxVal;ItemCount KnownItems; ItemCount** Freq_discr;{ |
---|
[68] | 188 | ClassNo c; |
---|
| 189 | ItemCount ClassCount; |
---|
| 190 | double Sum = 0; |
---|
| 191 | DiscrValue v; |
---|
[26] | 192 | |
---|
[68] | 193 | ForEach(c, 0, MaxClass) { |
---|
| 194 | ClassCount = 0; |
---|
| 195 | ForEach(v, 1, MaxVal) { |
---|
[84] | 196 | ClassCount += Freq_discr[v][c]; |
---|
[68] | 197 | } |
---|
| 198 | Sum += ClassCount * Log(ClassCount); |
---|
| 199 | } |
---|
[26] | 200 | |
---|
[68] | 201 | return (KnownItems * Log(KnownItems) - Sum) / KnownItems; |
---|
| 202 | } |
---|
| 203 | |
---|
[26] | 204 | /*************************************************************************/ |
---|
| 205 | /* */ |
---|
| 206 | /* Construct and return a node for a test on a discrete attribute */ |
---|
| 207 | /* */ |
---|
| 208 | /*************************************************************************/ |
---|
| 209 | |
---|
[68] | 210 | DiscreteTest(Node, Att) |
---|
| 211 | /* ---------- */ |
---|
| 212 | Tree Node;Attribute Att; { |
---|
| 213 | ItemCount CountItems(); |
---|
[26] | 214 | |
---|
[68] | 215 | Sprout(Node, MaxAttVal[Att]); |
---|
[26] | 216 | |
---|
[68] | 217 | Node->NodeType = BrDiscr; |
---|
| 218 | Node->Tested = Att; |
---|
| 219 | Node->Errors = 0; |
---|
[26] | 220 | } |
---|