Shark Machine Learning Library
  • About Shark
  • Sourceforge
    • Project Summary
    • Downloads
    • Subversion Repository
  • Getting Started
  • Tutorials
  • FAQ
  • Main Modules
    • ReClaM
    • EALib
    • MOO-EALib
    • Fuzzy
  • Tools
    • Mixture
    • Array
    • Rng
    • LinAlg
    • FileUtil
  • Main Page
  • Related Pages
  • Classes

Dataset.cpp

Go to the documentation of this file.
00001 //===========================================================================
00037 //===========================================================================
00038 
00039 
00040 #include <fstream>
00041 #include <vector>
00042 #include <algorithm>
00043 #include <SharkDefs.h>
00044 #include <Rng/GlobalRng.h>
00045 #include <ReClaM/Dataset.h>
00046 
00047 
00048 
// Expands to the code that reads one example stored as raw values of type T:
// first dataDim values, then targetDim values. Each value is read with
// fread() and assigned to row `number` of `data` / `target`.
// The expansion site has to provide `i`, `file`, `dataDim`, `targetDim`,
// `data`, `target` and `number`, and has to be inside a function returning
// bool, because a short read makes the expansion execute `return false`.
#define DataFile_ReadType(T) \
    T raw; \
    for (i = 0; i < dataDim; ++i) \
    { \
        if (fread(&raw, sizeof(T), 1, file) != 1) return false; \
        data(number, i) = raw; \
    } \
    for (i = 0; i < targetDim; ++i) \
    { \
        if (fread(&raw, sizeof(T), 1, file) != 1) return false; \
        target(number, i) = raw; \
    }
00059 
00060 
// Expands to the code that writes the selected sets as raw values of type T.
// For every example the ic data values are written first, then the oc target
// values; each value is converted to T before fwrite().
// The expansion site has to provide `t`, `i`, `o`, `ttr`, `tte`, `ic`, `oc`,
// `training`, `test`, `file` and the four data arrays. On a failed write the
// expansion closes `file` and executes `return false`.
#define Dataset_WriteType(T) \
    T encoded; \
    if (training) \
    { \
        for (t = 0; t < ttr; ++t) \
        { \
            for (i = 0; i < ic; ++i) \
            { \
                encoded = static_cast<T>(trainingData(t, i)); \
                if (fwrite(&encoded, sizeof(T), 1, file) != 1) \
                { \
                    fclose(file); \
                    return false; \
                } \
            } \
            for (o = 0; o < oc; ++o) \
            { \
                encoded = static_cast<T>(trainingTarget(t, o)); \
                if (fwrite(&encoded, sizeof(T), 1, file) != 1) \
                { \
                    fclose(file); \
                    return false; \
                } \
            } \
        } \
    } \
    if (test) \
    { \
        for (t = 0; t < tte; ++t) \
        { \
            for (i = 0; i < ic; ++i) \
            { \
                encoded = static_cast<T>(testData(t, i)); \
                if (fwrite(&encoded, sizeof(T), 1, file) != 1) \
                { \
                    fclose(file); \
                    return false; \
                } \
            } \
            for (o = 0; o < oc; ++o) \
            { \
                encoded = static_cast<T>(testTarget(t, o)); \
                if (fwrite(&encoded, sizeof(T), 1, file) != 1) \
                { \
                    fclose(file); \
                    return false; \
                } \
            } \
        } \
    }
00095 
00096 
00098 
00099 
00100 DataSource::DataSource()
00101 {
00102 }
00103 
00104 DataSource::~DataSource()
00105 {
00106 }
00107 
00108 
00110 
00111 
00112 DataFile::DataFile(const char* filename)
00113 {
00114     file = fopen(filename, "r");
00115     if (file == NULL) throw SHARKEXCEPTION("[DataFile::DataFile] cannot open file");
00116 
00117     if (! ReadHeaderLine())
00118     {
00119         fclose(file);
00120         file = NULL;
00121         throw SHARKEXCEPTION("[DataFile::DataFile] error in file header");
00122     }
00123 }
00124 
00125 DataFile::~DataFile()
00126 {
00127     if (file != NULL)
00128     {
00129         fclose(file);
00130         file = NULL;
00131     }
00132 }
00133 
00134 
00135 bool DataFile::GetData(Array<double>& data, Array<double>& target, int count)
00136 {
00137     if (currentExample + count > numberOfExamples) return false;
00138 
00139     data.resize(count, dataDim, false);
00140     target.resize(count, targetDim, false);
00141 
00142     int i;
00143     for (i = 0; i < count; i++)
00144     {
00145         if (! ReadExample(data, target, i)) return false;
00146         currentExample++;
00147     }
00148 
00149     return true;
00150 }
00151 
00152 bool DataFile::GetData(Array<double>& training_data, Array<double>& training_target, int training,
00153         Array<double>& test_data, Array<double>& test_target, int test,
00154         bool shuffle)
00155 {
00156     if (currentExample + training + test > numberOfExamples) return false;
00157 
00158     if (shuffle)
00159     {
00160         training_data.resize(training, dataDim, false);
00161         training_target.resize(training, targetDim, false);
00162         test_data.resize(test, dataDim, false);
00163         test_target.resize(test, targetDim, false);
00164 
00165         int i_train = 0;
00166         int i_test = 0;
00167         while (training + test > 0)
00168         {
00169             int r = Rng::discrete(0, training + test - 1);
00170             if (r < training)
00171             {
00172                 if (! ReadExample(training_data, training_target, i_train)) return false;
00173                 i_train++;
00174             }
00175             else
00176             {
00177                 if (! ReadExample(test_data, test_target, i_test)) return false;
00178                 i_test++;
00179             }
00180             currentExample++;
00181         }
00182 
00183         return true;
00184     }
00185     else
00186     {
00187         return (GetData(training_data, training_target, training)
00188                 && GetData(test_data, test_target, test));
00189     }
00190 }
00191 
00192 bool DataFile::ReadHeaderLine()
00193 {
00194     int res;
00195     char buffer[256];
00196 
00197     if (fread(buffer, 1, 1, file) != 1) return false;
00198     if (buffer[0] != '#') return false;
00199 
00200     res = ReadToken(buffer, sizeof(buffer), " ");
00201     if (res != ' ') return false;
00202     numberOfExamples = atoi(buffer);
00203 
00204     res = ReadToken(buffer, sizeof(buffer), " ");
00205     if (res != ' ') return false;
00206     dataDim = atoi(buffer);
00207 
00208     res = ReadToken(buffer, sizeof(buffer), " ");
00209     if (res != ' ') return false;
00210     targetDim = atoi(buffer);
00211 
00212     res = ReadToken(buffer, sizeof(buffer), " \n");
00213     if (res > 1000) return false;
00214     if (res != '\n')
00215     {
00216         res = DiscardUntil("\n");
00217         if (res > 1000) return false;
00218     }
00219 
00220     if (strcmp(buffer, "ascii") == 0) format = 0;
00221     else if (strcmp(buffer, "sparse") == 0) format = 1;
00222     else if (strcmp(buffer, "float") == 0) format = 2;
00223     else if (strcmp(buffer, "double") == 0) format = 3;
00224     else if (strcmp(buffer, "int8") == 0) format = 4;
00225     else if (strcmp(buffer, "int16") == 0) format = 5;
00226     else if (strcmp(buffer, "int32") == 0) format = 6;
00227     else if (strcmp(buffer, "uint8") == 0) format = 7;
00228     else if (strcmp(buffer, "uint16") == 0) format = 8;
00229     else if (strcmp(buffer, "uint32") == 0) format = 9;
00230     else return false;
00231 
00232     currentExample = 0;
00233 
00234     return true;
00235 }
00236 
00237 bool DataFile::ReadExample(Array<double>& data, Array<double>& target, int number)
00238 {
00239     int i, index;
00240     int res;
00241     char buffer[256];
00242 
00243     if (format == 0)
00244     {
00245         // "ascii" format
00246         for (i = 0; i < dataDim; i++)
00247         {
00248             res = ReadToken(buffer, sizeof(buffer), " ");
00249             if (res != ' ') return false;
00250             data(number, i) = atof(buffer);
00251         }
00252         for (i = 0; i < targetDim; i++)
00253         {
00254             res = ReadToken(buffer, sizeof(buffer), " \n");
00255             if (res > 1000) return false;
00256             if (i == targetDim - 1)
00257             {
00258                 if (res == ' ')
00259                 {
00260                     if (DiscardUntil("\n") > 1000) return false;
00261                 }
00262             }
00263             else if (res != ' ') return false;
00264             target(number, i) = atof(buffer);
00265         }
00266     }
00267     else if (format == 1)
00268     {
00269         // "sparse" format
00270         for (i = 0; i < dataDim; i++) data(number, i) = 0.0;
00271         while (true)
00272         {
00273             res = ReadToken(buffer, sizeof(buffer), " :\n");
00274             if (buffer[0] == ';') break;
00275             if (res == '\n')
00276             {
00277                 printf("[number=%d --1--]", number); return false;
00278             }
00279             if (res > 1000)
00280             {
00281                 printf("[2]"); return false;
00282             }
00283             index = atoi(buffer);
00284             if (res == ' ') res = DiscardUntil(":;\n");
00285             if (res != ':')
00286             {
00287                 printf("[3]"); return false;
00288             }
00289             res = ReadToken(buffer, sizeof(buffer), " :;\n");
00290             if (res == ':')
00291             {
00292                 printf("[4]"); return false;
00293             }
00294             if (res == '\n')
00295             {
00296                 printf("[5]"); return false;
00297             }
00298             if (res > 1000)
00299             {
00300                 printf("[6]"); return false;
00301             }
00302             data(number, index) = atof(buffer);
00303             if (res == ';') break;
00304         }
00305         for (i = 0; i < targetDim; i++)
00306         {
00307             res = ReadToken(buffer, sizeof(buffer), " \n");
00308             if (res > 1000) return false;
00309             if (i == targetDim - 1)
00310             {
00311                 if (res == ' ')
00312                 {
00313                     if (DiscardUntil("\n") > 1000) return false;
00314                 }
00315             }
00316             else if (res != ' ') return false;
00317             target(number, i) = atof(buffer);
00318         }
00319     }
00320     else if (format == 2)
00321     {
00322         DataFile_ReadType(float);
00323     }
00324     else if (format == 3)
00325     {
00326         DataFile_ReadType(double);
00327     }
00328     else if (format == 4)
00329     {
00330         DataFile_ReadType(char);
00331     }
00332     else if (format == 5)
00333     {
00334         DataFile_ReadType(short);
00335     }
00336     else if (format == 6)
00337     {
00338         DataFile_ReadType(int);
00339     }
00340     else if (format == 7)
00341     {
00342         DataFile_ReadType(unsigned char);
00343     }
00344     else if (format == 8)
00345     {
00346         DataFile_ReadType(unsigned short);
00347     }
00348     else if (format == 9)
00349     {
00350         DataFile_ReadType(unsigned int);
00351     }
00352 
00353     return true;
00354 }
00355 
00356 int DataFile::ReadToken(char* buffer, int maxlength, const char* separators)
00357 {
00358     int i;
00359     int s, sc = strlen(separators);
00360     char c;
00361     bool start = true;
00362     for (i = 0; i < maxlength - 1; i++)
00363     {
00364         if (fread(&c, 1, 1, file) == 0) return 1001;
00365         for (s = 0; s < sc; s++)
00366         {
00367             if (separators[s] == c) break;
00368             if (separators[s] == '\n' && c == '\r')
00369             {
00370                 // assume CR/LF end of line
00371                 if (fread(&c, 1, 1, file) == 0) return 1001;
00372                 break;
00373             }
00374         }
00375         if (s < sc)
00376         {
00377             if (start)
00378             {
00379                 i--;
00380                 continue;
00381             }
00382             else
00383             {
00384                 buffer[i] = 0;
00385                 return separators[s];
00386             }
00387         }
00388         buffer[i] = c;
00389         start = false;
00390     }
00391     buffer[i] = 0;
00392     return 1003;
00393 }
00394 
00395 int DataFile::DiscardUntil(const char* separators)
00396 {
00397     int s, sc = strlen(separators);
00398     char c;
00399     while (true)
00400     {
00401         if (fread(&c, 1, 1, file) == 0) return 1001;
00402         for (s = 0; s < sc; s++)
00403         {
00404             if (separators[s] == c) return c;
00405             if (separators[s] == '\n' && c == '\r')
00406             {
00407                 // assume CR/LF end of line
00408                 if (fread(&c, 1, 1, file) == 0) return 1001;
00409                 return c;
00410             }
00411         }
00412     }
00413 }
00414 
00415 
00417 
00418 Dataset::Dataset(const Dataset& dataset)
00419 {
00420     this->trainingData   = dataset.getTrainingData();
00421     this->trainingTarget = dataset.getTrainingTarget();
00422     this->testData       = dataset.getTestData();
00423     this->testTarget     = dataset.getTestTarget();
00424 }
00425 
00426 Dataset::Dataset(DataSource& source, int train, int test)
00427 {
00428     if (! source.GetData(trainingData, trainingTarget, train))
00429         throw SHARKEXCEPTION("[Dataset::Dataset] error generating the dataset");
00430     if (! source.GetData(testData, testTarget, test))
00431         throw SHARKEXCEPTION("[Dataset::Dataset] error generating the dataset");
00432 }
00433 
00434 Dataset::Dataset(const char* filename, int train, int test)
00435 {
00436     DataFile file(filename);
00437     if (test == 0) test = file.getNumberOfExamples() - train;
00438     if (train + test > file.getNumberOfExamples() || train <= 0 || test < 0)
00439         throw SHARKEXCEPTION("[Dataset::Dataset] invalid split into training and test set");
00440     if (! file.GetData(trainingData, trainingTarget, train))
00441         throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset)");
00442     if (! file.GetData(testData, testTarget, test))
00443         throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset)");
00444 }
00445 
00446 Dataset::Dataset(const char* filename, double train)
00447 {
00448     DataFile file(filename);
00449     int n_train = (int)(file.getNumberOfExamples() * train);
00450     int n_test = (int)(file.getNumberOfExamples() * (1.0 - train));
00451     if (! file.GetData(trainingData, trainingTarget, n_train))
00452         throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00453     if (! file.GetData(testData, testTarget, n_test))
00454         throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00455 }
00456 
00457 Dataset::Dataset(const char* trainfile, const char* testfile)
00458 {
00459     DataFile file1(trainfile);
00460     DataFile file2(testfile);
00461     if (! file1.GetData(trainingData, trainingTarget, file1.getNumberOfExamples()))
00462         throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00463     if (! file2.GetData(testData, testTarget, file2.getNumberOfExamples()))
00464         throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00465 }
00466 
00467 Dataset::Dataset(const char* trainfile, const char* testfile, int train)
00468 {
00469     Array<double> dataTrain;
00470     Array<double> targetTrain;
00471     Array<double> dataTest;
00472     Array<double> targetTest;
00473     DataFile file1(trainfile);
00474     DataFile file2(testfile);
00475     if (! file1.GetData(dataTrain, targetTrain, file1.getNumberOfExamples()))
00476         throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00477     if (! file2.GetData(dataTest, targetTest, file2.getNumberOfExamples()))
00478         throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00479 
00480     int tfs = dataTrain.dim(0);
00481     int all = tfs + dataTest.dim(0);
00482     int test = all - train;
00483     int dim = dataTrain.dim(1);
00484     int i, j, k;
00485     if (train <= 0 || test <= 0) throw SHARKEXCEPTION("[Dataset::Dataset] invalid split into training and test set");
00486 
00487     trainingData.resize(train, dim, false);
00488     trainingTarget.resize(train, 1, false);
00489     testData.resize(test, dim, false);
00490     testTarget.resize(test, 1, false);
00491     std::vector<int> entry(all);
00492     for (i = 0; i < all; i++) entry[i] = i;
00493 
00494     for (i = 0; i < train; i++)
00495     {
00496         j = Rng::discrete(0, entry.size() - 1);
00497         k = entry[j];
00498         entry.erase(entry.begin() + j);
00499 
00500         if (k < tfs)
00501         {
00502             trainingData[i] = dataTrain[k];
00503             trainingTarget(i, 0) = targetTrain(k, 0);
00504         }
00505         else
00506         {
00507             trainingData[i] = dataTest[k - tfs];
00508             trainingTarget(i, 0) = targetTest(k - tfs, 0);
00509         }
00510     }
00511 
00512     for (i = 0; i < test; i++)
00513     {
00514         k = entry[i];
00515 
00516         if (k < tfs)
00517         {
00518             testData[i] = dataTrain[k];
00519             testTarget(i, 0) = targetTrain(k, 0);
00520         }
00521         else
00522         {
00523             testData[i] = dataTest[k - tfs];
00524             testTarget(i, 0) = targetTest(k - tfs, 0);
00525         }
00526     }
00527 }
00528 
00529 Dataset::Dataset(const char* datafile, const char* splitfile, double disambiguation)
00530 {
00531     DataFile data(datafile);
00532 
00533     std::vector<unsigned int> train;
00534     std::vector<unsigned int> test;
00535     if (! ReadSplitFile(splitfile, train, test)) throw SHARKEXCEPTION("[Dataset::Dataset] error reading the split file");
00536 
00537     int n_train = train.size();
00538     int n_test = test.size();
00539     if (data.getNumberOfExamples() != n_train + n_test) throw SHARKEXCEPTION("[Dataset::Dataset] data file and split file do not match");
00540     int dim_data = data.getDataDimension();
00541     int dim_target = data.getTargetDimension();
00542 
00543     trainingData.resize(n_train, dim_data, false);
00544     trainingTarget.resize(n_train, dim_target, false);
00545     testData.resize(n_test, dim_data, false);
00546     testTarget.resize(n_test, dim_target, false);
00547 
00548     std::sort(train.begin(), train.end());
00549     std::sort(test.begin(), test.end());
00550 
00551     int i;
00552     int tr = 0;
00553     int te = 0;
00554     for (i=0; i<n_train + n_test; i++)
00555     {
00556         Array<double> tmp_data;
00557         Array<double> tmp_target;
00558         data.GetData(tmp_data, tmp_target, 1);
00559         if (tr < n_train && (int)train[tr] == i)
00560         {
00561             trainingData[tr] = tmp_data[0];
00562             trainingTarget[tr] = tmp_target[0];
00563             tr++;
00564         }
00565         else if (te < n_test && (int)test[te] == i)
00566         {
00567             testData[te] = tmp_data[0];
00568             testTarget[te] = tmp_target[0];
00569             te++;
00570         }
00571         else throw SHARKEXCEPTION("[Dataset::Dataset] split file is inconsistent");
00572     }
00573 }
00574 
00575 Dataset::Dataset(const Array<double>& trainingData, const Array<double>& trainingTarget, const Array<double>& testData, const Array<double>& testTarget)
00576 {
00577     this->trainingData = trainingData;
00578     this->trainingTarget = trainingTarget;
00579     this->testData = testData;
00580     this->testTarget = testTarget;
00581 }
00582 
00583 
00584 void Dataset::ShuffleTraining()
00585 {
00586     Array<double> tmp1;
00587     Array<double> tmp2;
00588     unsigned int i, ic = trainingData.dim(0);
00589     for (i=1; i<ic; i++)
00590     {
00591         unsigned int j = Rng::discrete(0, i);
00592         if (i != j)
00593         {
00594             tmp1 = trainingData[i];
00595             trainingData[i] = trainingData[j];
00596             trainingData[j] = tmp1;
00597             tmp2 = trainingTarget[i];
00598             trainingTarget[i] = trainingTarget[j];
00599             trainingTarget[j] = tmp2;
00600         }
00601     }
00602 }
00603 
00604 void Dataset::ShuffleTest()
00605 {
00606     Array<double> tmp1;
00607     Array<double> tmp2;
00608     unsigned int i, ic = testData.dim(0);
00609     for (i=1; i<ic; i++)
00610     {
00611         unsigned int j = Rng::discrete(0, i);
00612         if (i != j)
00613         {
00614             tmp1 = testData[i];
00615             testData[i] = testData[j];
00616             testData[j] = tmp1;
00617             tmp2 = testTarget[i];
00618             testTarget[i] = testTarget[j];
00619             testTarget[j] = tmp2;
00620         }
00621     }
00622 }
00623 
00624 void Dataset::ShuffleAll()
00625 {
00626     Array<double> tmp1;
00627     Array<double> tmp2;
00628     unsigned int i, ic = trainingData.dim(0) + testData.dim(0);
00629     unsigned int c = trainingData.dim(0);
00630     for (i=1; i<ic; i++)
00631     {
00632         unsigned int j = Rng::discrete(0, i);
00633         if (i != j)
00634         {
00635             if (i < c)
00636             {
00637                 if (j < c)
00638                 {
00639                     tmp1 = trainingData[i];   trainingData[i]   = trainingData[j];   trainingData[j]   = tmp1;
00640                     tmp2 = trainingTarget[i]; trainingTarget[i] = trainingTarget[j]; trainingTarget[j] = tmp2;
00641                 }
00642                 else
00643                 {
00644                     tmp1 = trainingData[i];   trainingData[i]   = testData[j-c];   testData[j-c]   = tmp1;
00645                     tmp2 = trainingTarget[i]; trainingTarget[i] = testTarget[j-c]; testTarget[j-c] = tmp2;
00646                 }
00647             }
00648             else
00649             {
00650                 if (j < c)
00651                 {
00652                     tmp1 = testData[i-c];   testData[i-c]   = trainingData[j];   trainingData[j]   = tmp1;
00653                     tmp2 = testTarget[i-c]; testTarget[i-c] = trainingTarget[j]; trainingTarget[j] = tmp2;
00654                 }
00655                 else
00656                 {
00657                     tmp1 = testData[i-c];   testData[i-c]   = testData[j-c];   testData[j-c]   = tmp1;
00658                     tmp2 = testTarget[i-c]; testTarget[i-c] = testTarget[j-c]; testTarget[j-c] = tmp2;
00659                 }
00660             }
00661         }
00662     }
00663 }
00664 
00665 bool Dataset::Save(const char* filename, bool training, bool test, const char* format)
00666 {
00667     FILE* file = fopen(filename, "w+");
00668     if (file == NULL) return false;
00669 
00670     int i, ic = 0;
00671     int o, oc = 0;
00672     int t, ttr = 0, tte = 0, total = 0;
00673     if (training)
00674     {
00675         ttr = trainingData.dim(0);
00676         ic = trainingData.dim(1);
00677         oc = trainingTarget.dim(1);
00678         total += ttr;
00679     }
00680     if (test)
00681     {
00682         tte = testData.dim(0);
00683         ic = testData.dim(1);
00684         oc = testTarget.dim(1);
00685         total += tte;
00686     }
00687     if (total == 0) return false;       // does not make any sense --> failure.
00688 
00689     fprintf(file, "# %d %d %d %s\n", total, ic, oc, format);
00690 
00691     if (strcmp(format, "ascii") == 0)
00692     {
00693         if (training)
00694         {
00695             for (t=0; t<ttr; t++)
00696             {
00697                 for (i=0; i<ic; i++)
00698                 {
00699                     fprintf(file, "%g ", trainingData(t, i));
00700                 }
00701                 for (o=0; o<oc-1; o++)
00702                 {
00703                     fprintf(file, "%g ", trainingTarget(t, o));
00704                 }
00705                 fprintf(file, "%g\n", trainingTarget(t, oc-1));
00706             }
00707         }
00708         if (test)
00709         {
00710             for (t=0; t<tte; t++)
00711             {
00712                 for (i=0; i<ic; i++)
00713                 {
00714                     fprintf(file, "%g ", testData(t, i));
00715                 }
00716                 for (o=0; o<oc-1; o++)
00717                 {
00718                     fprintf(file, "%g ", testTarget(t, o));
00719                 }
00720                 fprintf(file, "%g\n", testTarget(t, oc-1));
00721             }
00722         }
00723     }
00724     else if (strcmp(format, "sparse") == 0)
00725     {
00726         if (training)
00727         {
00728             for (t=0; t<ttr; t++)
00729             {
00730                 for (i=0; i<ic; i++)
00731                 {
00732                     if (trainingData(t, i) != 0.0)
00733                     {
00734                         fprintf(file, "%d:%g ", i, trainingData(t, i));
00735                     }
00736                 }
00737                 fprintf(file, "; ");
00738                 for (o=0; o<oc-1; o++)
00739                 {
00740                     fprintf(file, "%g ", trainingTarget(t, o));
00741                 }
00742                 fprintf(file, "%g\n", trainingTarget(t, oc-1));
00743             }
00744         }
00745         if (test)
00746         {
00747             for (t=0; t<tte; t++)
00748             {
00749                 for (i=0; i<ic; i++)
00750                 {
00751                     if (testData(t, i) != 0.0)
00752                     {
00753                         fprintf(file, "%d:%g ", i, testData(t, i));
00754                     }
00755                 }
00756                 fprintf(file, "; ");
00757                 for (o=0; o<oc-1; o++)
00758                 {
00759                     fprintf(file, "%g ", testTarget(t, o));
00760                 }
00761                 fprintf(file, "%g\n", testTarget(t, oc-1));
00762             }
00763         }
00764     }
00765     else if (strcmp(format, "float") == 0)
00766     {
00767         Dataset_WriteType(float);
00768     }
00769     else if (strcmp(format, "double") == 0)
00770     {
00771         Dataset_WriteType(double);
00772     }
00773     else if (strcmp(format, "int8") == 0)
00774     {
00775         Dataset_WriteType(char);
00776     }
00777     else if (strcmp(format, "int16") == 0)
00778     {
00779         Dataset_WriteType(short);
00780     }
00781     else if (strcmp(format, "int32") == 0)
00782     {
00783         Dataset_WriteType(int);
00784     }
00785     else if (strcmp(format, "uint8") == 0)
00786     {
00787         Dataset_WriteType(unsigned char);
00788     }
00789     else if (strcmp(format, "uint16") == 0)
00790     {
00791         Dataset_WriteType(unsigned short);
00792     }
00793     else if (strcmp(format, "uint32") == 0)
00794     {
00795         Dataset_WriteType(unsigned int);
00796     }
00797     else return false;
00798 
00799     fclose(file);
00800     return true;
00801 }
00802 
00803 bool Dataset::SaveLIBSVM(const char* filename, bool training, bool test)
00804 {
00805     std::ofstream f(filename);
00806     if (! f.is_open()) return false;
00807 
00808     if (training)
00809     {
00810         int i, ic = trainingData.dim(0);
00811         int d, dim = trainingData.dim(1);
00812         SIZE_CHECK(trainingTarget.dim(1) == 1);
00813 
00814         for (i=0; i<ic; i++)
00815         {
00816             double label = trainingTarget(i, 0);
00817             RANGE_CHECK (label == 1.0 || label == -1.0);
00818             f << label;
00819             for (d=0; d<dim; d++)
00820             {
00821                 double value = trainingData(i, d);
00822                 if (value != 0.0) f << " " << d << ":" << value;
00823             }
00824             f << "\n";
00825         }
00826     }
00827 
00828     if (test)
00829     {
00830         int i, ic = testData.dim(0);
00831         int d, dim = testData.dim(1);
00832         SIZE_CHECK(testTarget.dim(1) == 1);
00833 
00834         for (i=0; i<ic; i++)
00835         {
00836             double label = testTarget(i, 0);
00837             RANGE_CHECK (label == 1.0 || label == -1.0);
00838             f << label;
00839             for (d=0; d<dim; d++)
00840             {
00841                 double value = testData(i, d);
00842                 if (value != 0.0) f << " " << d << ":" << value;
00843             }
00844             f << "\n";
00845         }
00846     }
00847 
00848     f.close();
00849 
00850     return true;
00851 }
00852 
00853 void Dataset::NormalizeComponents()
00854 {
00855     int i, ic = trainingData.dim(0);
00856     int j, jc = testData.dim(0);
00857     int d, dim = trainingData.dim(1);
00858     for (d=0; d<dim; d++)
00859     {
00860         double sum = 0.0;
00861         for (i=0; i<ic; i++)
00862         {
00863             sum += trainingData(i, d);
00864         }
00865         double mean = sum / (double)ic;
00866         double var = 0.0;
00867         for (i=0; i<ic; i++)
00868         {
00869             double diff = trainingData(i, d) - mean;
00870             var += diff * diff;
00871         }
00872         var /= (double)ic;
00873         double stddev = sqrt(var);
00874         if (stddev == 0.0) continue;
00875 
00876         for (i=0; i<ic; i++)
00877         {
00878             trainingData(i, d) = (trainingData(i, d) - mean) / stddev;
00879         }
00880         for (j=0; j<jc; j++)
00881         {
00882             testData(j, d) = (testData(j, d) - mean) / stddev;
00883         }
00884     }
00885 }
00886 
00887 void Dataset::NormalizeComponent( int d )
00888 {
00889     int i, ic = trainingData.dim(0);
00890     int j, jc = testData.dim(0);
00891     int dim = trainingData.dim(1);
00892 
00893     if( d < 0 || d >= dim )
00894         return;
00895 
00896     double sum = 0.0;
00897     for (i=0; i<ic; i++)
00898     {
00899         sum += trainingData(i, d);
00900     }
00901     double mean = sum / (double)ic;
00902     double var = 0.0;
00903     for (i=0; i<ic; i++)
00904     {
00905         double diff = trainingData(i, d) - mean;
00906         var += diff * diff;
00907     }
00908     var /= (double)ic;
00909     double stddev = sqrt(var);
00910     if (stddev == 0.0) return;
00911 
00912     for (i=0; i<ic; i++)
00913     {
00914         trainingData(i, d) = (trainingData(i, d) - mean) / stddev;
00915     }
00916     for (j=0; j<jc; j++)
00917     {
00918         testData(j, d) = (testData(j, d) - mean) / stddev;
00919     }
00920 
00921 }
00922 
00923 bool Dataset::ReadLine(FILE* file, char* buffer, int bufferlength)
00924 {
00925     int pos = 0;
00926     while (true)
00927     {
00928         if (pos == bufferlength) return false;
00929         if (fread(&buffer[pos], 1, 1, file) != 1) return false;
00930         if (buffer[pos] == '\n')
00931         {
00932             buffer[pos] = 0;
00933             return true;
00934         }
00935         pos++;
00936     }
00937 }
00938 
00939 bool Dataset::ReadSplitFile(const char* filename, std::vector<unsigned int>& train, std::vector<unsigned int>& test)
00940 {
00941     FILE* file = fopen(filename, "r");
00942     if (file == NULL) return false;
00943 
00944     int i;
00945     char buffer[256];
00946     char* end;
00947 
00948     // read the header line
00949     if (! ReadLine(file, buffer, 256)) return false;
00950     if (buffer[0] != '#' || buffer[1] != ' ') return false;
00951     int n_train = strtol(buffer+2, &end, 10);
00952     if (n_train <= 0) return false;
00953     if (*end != ' ') return false;
00954     int n_test = strtol(end+1, &end, 10);
00955     if (n_test <= 0) return false;
00956 
00957     // read the split
00958     train.resize(n_train);
00959     test.resize(n_test);
00960     for (i=0; i<n_train; i++)
00961     {
00962         if (! ReadLine(file, buffer, 256)) return false;
00963         train[i] = atoi(buffer);
00964     }
00965     for (i=0; i<n_test; i++)
00966     {
00967         if (! ReadLine(file, buffer, 256)) return false;
00968         test[i] = atoi(buffer);
00969     }
00970 
00971     fclose(file);
00972     return true;
00973 }