00001
00036
00037
00038 #ifndef _Dataset_H_
00039 #define _Dataset_H_
00040
00041
00042 #include <Array/Array.h>
00043
00044
00062 class DataSource
00063 {
00064 public:
00066 DataSource();
00067
00069 virtual ~DataSource();
00070
00071
00073 inline int getDataDimension()
00074 {
00075 return dataDim;
00076 }
00077
00079 inline int getTargetDimension()
00080 {
00081 return targetDim;
00082 }
00083
00105 virtual bool GetData(Array<double>& data, Array<double>& target, int count) = 0;
00106
00107 protected:
00108 int dataDim;
00109 int targetDim;
00110 };
00111
00112
00149 class DataFile : public DataSource
00150 {
00151 public:
00153 DataFile(const char* filename);
00154
00156 ~DataFile();
00157
00158
00160 bool GetData(Array<double>& data, Array<double>& target, int count);
00161
00163 bool GetData(Array<double>& training_data, Array<double>& training_target, int training,
00164 Array<double>& test_data, Array<double>& test_target, int test,
00165 bool shuffle = false);
00166
00168 inline int getNumberOfExamples()
00169 {
00170 return numberOfExamples;
00171 }
00172
00173 protected:
00174 bool ReadHeaderLine();
00175 bool ReadExample(Array<double>& data, Array<double>& target, int number);
00176 int ReadToken(char* buffer, int maxlength, const char* separators);
00177 int DiscardUntil(const char* separators);
00178
00180 FILE* file;
00181
00183 int numberOfExamples;
00184
00186 int format;
00187
00189 int currentExample;
00190 };
00191
00192
00203 class Dataset
00204 {
00205 public:
00207 Dataset(const Dataset & dataset);
00208
00210 Dataset(DataSource& source, int train, int test);
00211
00213 Dataset(const char* filename, int train, int test = 0);
00214
00216 Dataset(const char* filename, double train);
00217
00219 Dataset(const char* trainfile, const char* testfile);
00220
00224 Dataset(const char* trainfile, const char* testfile, int train);
00225
00227 Dataset(const char* datafile, const char* splitfile, double disambiguation);
00228
00230 Dataset(const Array<double>& trainingData, const Array<double>& trainingTarget, const Array<double>& testData, const Array<double>& testTarget);
00231
00232
00234 void ShuffleTraining();
00235
00237 void ShuffleTest();
00238
00242 void ShuffleAll();
00243
00245 inline const Array<double> & getTrainingData() const
00246 {
00247 return trainingData;
00248 }
00249
00251 inline const Array<double> & getTrainingTarget() const
00252 {
00253 return trainingTarget;
00254 }
00255
00257 inline const Array<double> & getTestData() const
00258 {
00259 return testData;
00260 }
00261
00263 inline const Array<double> & getTestTarget() const
00264 {
00265 return testTarget;
00266 }
00267
00291 bool Save(const char* filename, bool training = true, bool test = true, const char* format = "ascii");
00292
00301 bool SaveLIBSVM(const char* filename, bool training = true, bool test = true);
00302
00312 void NormalizeComponents();
00313
00322 void NormalizeComponent( int d );
00323
00324 protected:
00325 bool ReadSplitFile(const char* filename, std::vector<unsigned int>& train, std::vector<unsigned int>& test);
00326 bool ReadLine(FILE* file, char* buffer, int bufferlength);
00327
00328 Array<double> trainingData;
00329 Array<double> trainingTarget;
00330 Array<double> testData;
00331 Array<double> testTarget;
00332 };
00333
00334
00335 #endif
00336