Shark Machine Learning Library
  • About Shark
  • Sourceforge
    • Project Summary
    • Downloads
    • Subversion Repository
  • Getting Started
  • Tutorials
  • FAQ
  • Main Modules
    • ReClaM
    • EALib
    • MOO-EALib
    • Fuzzy
  • Tools
    • Mixture
    • Array
    • Rng
    • LinAlg
    • FileUtil
  • Main Page
  • Related Pages
  • Classes

Dataset.h

Go to the documentation of this file.
00001 //===========================================================================
00036 //===========================================================================
00037 
00038 #ifndef _Dataset_H_
00039 #define _Dataset_H_
00040 
00041 
00042 #include <Array/Array.h>
00043 
00044 
00062 class DataSource
00063 {
00064 public:
00066     DataSource();
00067 
00069     virtual ~DataSource();
00070 
00071 
00073     inline int getDataDimension()
00074     {
00075         return dataDim;
00076     }
00077 
00079     inline int getTargetDimension()
00080     {
00081         return targetDim;
00082     }
00083 
00105     virtual bool GetData(Array<double>& data, Array<double>& target, int count) = 0;
00106 
00107 protected:
00108     int dataDim;
00109     int targetDim;
00110 };
00111 
00112 
00149 class DataFile : public DataSource
00150 {
00151 public:
00153     DataFile(const char* filename);
00154 
00156     ~DataFile();
00157 
00158 
00160     bool GetData(Array<double>& data, Array<double>& target, int count);
00161 
00163     bool GetData(Array<double>& training_data, Array<double>& training_target, int training,
00164             Array<double>& test_data, Array<double>& test_target, int test,
00165             bool shuffle = false);
00166 
00168     inline int getNumberOfExamples()
00169     {
00170         return numberOfExamples;
00171     }
00172 
00173 protected:
00174     bool ReadHeaderLine();
00175     bool ReadExample(Array<double>& data, Array<double>& target, int number);
00176     int ReadToken(char* buffer, int maxlength, const char* separators);
00177     int DiscardUntil(const char* separators);
00178 
00180     FILE* file;
00181 
00183     int numberOfExamples;
00184 
00186     int format;
00187 
00189     int currentExample;
00190 };
00191 
00192 
00203 class Dataset
00204 {
00205 public:
00207     Dataset(const Dataset & dataset);
00208 
00210     Dataset(DataSource& source, int train, int test);
00211 
00213     Dataset(const char* filename, int train, int test = 0);
00214 
00216     Dataset(const char* filename, double train);
00217 
00219     Dataset(const char* trainfile, const char* testfile);
00220 
00224     Dataset(const char* trainfile, const char* testfile, int train);
00225 
00227     Dataset(const char* datafile, const char* splitfile, double disambiguation);
00228 
00230     Dataset(const Array<double>& trainingData, const Array<double>& trainingTarget, const Array<double>& testData, const Array<double>& testTarget);
00231 
00232 
00234     void ShuffleTraining();
00235 
00237     void ShuffleTest();
00238 
00242     void ShuffleAll();
00243 
00245     inline const Array<double> & getTrainingData() const
00246     {
00247         return trainingData;
00248     }
00249 
00251     inline const Array<double> & getTrainingTarget() const
00252     {
00253         return trainingTarget;
00254     }
00255 
00257     inline const Array<double> & getTestData() const
00258     {
00259         return testData;
00260     }
00261 
00263     inline const Array<double> & getTestTarget() const
00264     {
00265         return testTarget;
00266     }
00267 
00291     bool Save(const char* filename, bool training = true, bool test = true, const char* format = "ascii");
00292 
00301     bool SaveLIBSVM(const char* filename, bool training = true, bool test = true);
00302 
00312     void NormalizeComponents();
00313 
00322     void NormalizeComponent( int d );
00323 
00324 protected:
00325     bool ReadSplitFile(const char* filename, std::vector<unsigned int>& train, std::vector<unsigned int>& test);
00326     bool ReadLine(FILE* file, char* buffer, int bufferlength);
00327 
00328     Array<double> trainingData;
00329     Array<double> trainingTarget;
00330     Array<double> testData;
00331     Array<double> testTarget;
00332 };
00333 
00334 
00335 #endif
00336