C++的问题,有关数据挖掘C4.5算法

来源:百度知道 编辑:UC知道 时间:2024/07/08 00:18:08
#include <iostream>
#include <fstream>
#include <sstream>
#include "DTree.h"

using namespace std;

// Pointer to a DTree node; presumably the root of the decision tree
// (NOTE(review): confirm against the code that builds the tree).
DTree* root;

// Samples in the StoreData representation.
vector<StoreData> trainAll; // all of the training data
vector<StoreData> testAll;  // all of the test data
vector<StoreData> train;    // the selected training data
vector<StoreData> test;     // the test data

// Samples in the original (OriganData) representation.
vector<OriganData> OtrainAll; // original training data
vector<OriganData> OtestAll;  // original test data
vector<OriganData> Otrain;    // original form of the selected training data

vector<int> attributes; // range of the attributes
ifstream fin;
set<int> trainSet;      // set of indices of the selected training records
int sortKind;           // sort mode (which attribute the sort comparator uses)
double conSpit[6];      // thresholds of the continuous-valued attributes, obtained the C4.5 way
int size{0};

/**
 * Loads the data files and prepares the training sample.
 *
 * Steps, in order:
 *   1. readData() is called for "crx.train" into OtrainAll and for "crx.test"
 *      into OtestAll.
 *   2. selectData() is called with OtrainAll, Otrain, the requested count (350)
 *      and the size of OtrainAll; presumably it fills Otrain with a subset of
 *      OtrainAll (confirm against selectData's definition).
 *   3. processConValue() runs on the selected sample (it sorts Otrain and
 *      works out the thresholds for the continuous attributes).
 *   4. changeData() is called with Otrain and train; presumably it converts
 *      the selected records into the StoreData form (confirm against its
 *      definition).
 *
 * The order matters: processConValue() reads Otrain, so it must run after
 * selectData() and before changeData().
 */
void init()
{
    readData(OtrainAll, "crx.train");
    readData(OtestAll, "crx.test");

    // Number of training records requested from selectData().
    unsigned int selectDataNum = 350;
    selectData(OtrainAll, Otrain, selectDataNum, static_cast<int>(OtrainAll.size()));

    processConValue();
    changeData(Otrain, train);

    // NOTE(review): the original listing had no closing brace here, which made
    // the following function definitions nested inside init() and therefore
    // ill-formed. Statements may have been lost where the listing was cut off
    // (e.g. preparing the test data or building the tree) -- confirm against
    // the complete source.
}

/**
 * Entropy, in bits, of a sample split into two classes.
 *
 * @param p  number of records belonging to one class
 * @param s  total number of records
 * @return   -(p/s)*log2(p/s) - ((s-p)/s)*log2((s-p)/s); a class with a zero
 *           count contributes nothing, so Entropy(0, 0) is 0.
 */
double Entropy(double p, double s)
{
    // The complementary class is handled first, then p.
    const double counts[2] = { s - p, p };

    double bits = 0;
    for (int k = 0; k < 2; ++k)
    {
        const double c = counts[k];
        if (c == 0)
            continue; // an empty class adds no entropy (and log(0) is avoided)
        // log(x) / log(2.0) converts the natural logarithm to base 2.
        bits += -c / s * log(c / s) / log(2.0);
    }
    return bits;
}

/**
 * Information gain of splitting a two-class sample into two partitions.
 *
 *   Gain = H(S) - (|S1| / |S|) * H(S1) - (|S2| / |S|) * H(S2)
 *
 * @param p1  number of records of the counted class in the first partition
 * @param s1  size of the first partition
 * @param p2  number of records of the counted class in the second partition
 * @param s2  size of the second partition
 * @return    the entropy of the whole sample minus the size-weighted entropy
 *            of the two partitions; 0 when the sample is empty.
 *
 * Each partition's entropy is weighted by that partition's share of the
 * sample (s_i / (s1 + s2)). The previous code weighted by p_i / s_i, the
 * class ratio inside the partition, which could produce negative gains and
 * NaN for an empty partition.
 */
double Gain(double p1, double s1, double p2, double s2)
{
    const double total = s1 + s2;
    if (total == 0)
        return 0; // no records at all: nothing can be gained

    // An empty partition gets weight 0 and Entropy(0, 0) is 0, so it drops out.
    return Entropy(p1 + p2, total)
         - s1 / total * Entropy(p1, s1)
         - s2 / total * Entropy(p2, s2);
}

void processConValue()
{
int con[6] = {2, 3, 8, 11, 14, 15};
for (int i = 0; i < 6; i++)
{
sortKind = con[i];
stable_sort(Otrain.begin(), Otrain.end(), header);
/*
for (vector<OriganData>::iterator it = Otrain.begin(); it != Otrain.end(); it++)
cout << (*it).A2 << (*it).label << '\t';
cout << endl;
*/
double bestGain = 0; //记录最佳的Gain。
double gain;
vector<OriganData&