00001
00037
00038
00039
00040 #include <fstream>
00041 #include <vector>
00042 #include <algorithm>
00043 #include <SharkDefs.h>
00044 #include <Rng/GlobalRng.h>
00045 #include <ReClaM/Dataset.h>
00046
00047
00048
// Reads one example stored as consecutive raw values of type T.
// The first dataDim values go to row `number` of `data`, the following
// targetDim values to row `number` of `target`. The macro expands inside
// DataFile::ReadExample and uses that function's i, number, data, target,
// dataDim, targetDim and file; a short read makes the enclosing function
// return false.
#define DataFile_ReadType(T) \
    T value; \
    for (i = 0; i < dataDim; ++i) \
    { \
        if (1 != fread(&value, sizeof(T), 1, file)) return false; \
        data(number, i) = value; \
    } \
    for (i = 0; i < targetDim; ++i) \
    { \
        if (1 != fread(&value, sizeof(T), 1, file)) return false; \
        target(number, i) = value; \
    }
00059
00060
// Writes the selected sets as consecutive raw values of type T, one example
// after the other (ic inputs followed by oc targets). The macro expands inside
// Dataset::Save and uses that function's training, test, t, ttr, tte, i, ic,
// o, oc and file; on a failed fwrite it closes the file and makes the
// enclosing function return false.
#define Dataset_WriteType(T) \
    T value; \
    if (training) \
    { \
        for (t = 0; t < ttr; ++t) \
        { \
            for (i = 0; i < ic; ++i) \
            { \
                value = static_cast<T>(trainingData(t, i)); \
                if (1 != fwrite(&value, sizeof(T), 1, file)) { fclose(file); return false; } \
            } \
            for (o = 0; o < oc; ++o) \
            { \
                value = static_cast<T>(trainingTarget(t, o)); \
                if (1 != fwrite(&value, sizeof(T), 1, file)) { fclose(file); return false; } \
            } \
        } \
    } \
    if (test) \
    { \
        for (t = 0; t < tte; ++t) \
        { \
            for (i = 0; i < ic; ++i) \
            { \
                value = static_cast<T>(testData(t, i)); \
                if (1 != fwrite(&value, sizeof(T), 1, file)) { fclose(file); return false; } \
            } \
            for (o = 0; o < oc; ++o) \
            { \
                value = static_cast<T>(testTarget(t, o)); \
                if (1 != fwrite(&value, sizeof(T), 1, file)) { fclose(file); return false; } \
            } \
        } \
    }
00095
00096
00098
00099
00100 DataSource::DataSource()
00101 {
00102 }
00103
00104 DataSource::~DataSource()
00105 {
00106 }
00107
00108
00110
00111
00112 DataFile::DataFile(const char* filename)
00113 {
00114 file = fopen(filename, "r");
00115 if (file == NULL) throw SHARKEXCEPTION("[DataFile::DataFile] cannot open file");
00116
00117 if (! ReadHeaderLine())
00118 {
00119 fclose(file);
00120 file = NULL;
00121 throw SHARKEXCEPTION("[DataFile::DataFile] error in file header");
00122 }
00123 }
00124
00125 DataFile::~DataFile()
00126 {
00127 if (file != NULL)
00128 {
00129 fclose(file);
00130 file = NULL;
00131 }
00132 }
00133
00134
00135 bool DataFile::GetData(Array<double>& data, Array<double>& target, int count)
00136 {
00137 if (currentExample + count > numberOfExamples) return false;
00138
00139 data.resize(count, dataDim, false);
00140 target.resize(count, targetDim, false);
00141
00142 int i;
00143 for (i = 0; i < count; i++)
00144 {
00145 if (! ReadExample(data, target, i)) return false;
00146 currentExample++;
00147 }
00148
00149 return true;
00150 }
00151
00152 bool DataFile::GetData(Array<double>& training_data, Array<double>& training_target, int training,
00153 Array<double>& test_data, Array<double>& test_target, int test,
00154 bool shuffle)
00155 {
00156 if (currentExample + training + test > numberOfExamples) return false;
00157
00158 if (shuffle)
00159 {
00160 training_data.resize(training, dataDim, false);
00161 training_target.resize(training, targetDim, false);
00162 test_data.resize(test, dataDim, false);
00163 test_target.resize(test, targetDim, false);
00164
00165 int i_train = 0;
00166 int i_test = 0;
00167 while (training + test > 0)
00168 {
00169 int r = Rng::discrete(0, training + test - 1);
00170 if (r < training)
00171 {
00172 if (! ReadExample(training_data, training_target, i_train)) return false;
00173 i_train++;
00174 }
00175 else
00176 {
00177 if (! ReadExample(test_data, test_target, i_test)) return false;
00178 i_test++;
00179 }
00180 currentExample++;
00181 }
00182
00183 return true;
00184 }
00185 else
00186 {
00187 return (GetData(training_data, training_target, training)
00188 && GetData(test_data, test_target, test));
00189 }
00190 }
00191
00192 bool DataFile::ReadHeaderLine()
00193 {
00194 int res;
00195 char buffer[256];
00196
00197 if (fread(buffer, 1, 1, file) != 1) return false;
00198 if (buffer[0] != '#') return false;
00199
00200 res = ReadToken(buffer, sizeof(buffer), " ");
00201 if (res != ' ') return false;
00202 numberOfExamples = atoi(buffer);
00203
00204 res = ReadToken(buffer, sizeof(buffer), " ");
00205 if (res != ' ') return false;
00206 dataDim = atoi(buffer);
00207
00208 res = ReadToken(buffer, sizeof(buffer), " ");
00209 if (res != ' ') return false;
00210 targetDim = atoi(buffer);
00211
00212 res = ReadToken(buffer, sizeof(buffer), " \n");
00213 if (res > 1000) return false;
00214 if (res != '\n')
00215 {
00216 res = DiscardUntil("\n");
00217 if (res > 1000) return false;
00218 }
00219
00220 if (strcmp(buffer, "ascii") == 0) format = 0;
00221 else if (strcmp(buffer, "sparse") == 0) format = 1;
00222 else if (strcmp(buffer, "float") == 0) format = 2;
00223 else if (strcmp(buffer, "double") == 0) format = 3;
00224 else if (strcmp(buffer, "int8") == 0) format = 4;
00225 else if (strcmp(buffer, "int16") == 0) format = 5;
00226 else if (strcmp(buffer, "int32") == 0) format = 6;
00227 else if (strcmp(buffer, "uint8") == 0) format = 7;
00228 else if (strcmp(buffer, "uint16") == 0) format = 8;
00229 else if (strcmp(buffer, "uint32") == 0) format = 9;
00230 else return false;
00231
00232 currentExample = 0;
00233
00234 return true;
00235 }
00236
00237 bool DataFile::ReadExample(Array<double>& data, Array<double>& target, int number)
00238 {
00239 int i, index;
00240 int res;
00241 char buffer[256];
00242
00243 if (format == 0)
00244 {
00245
00246 for (i = 0; i < dataDim; i++)
00247 {
00248 res = ReadToken(buffer, sizeof(buffer), " ");
00249 if (res != ' ') return false;
00250 data(number, i) = atof(buffer);
00251 }
00252 for (i = 0; i < targetDim; i++)
00253 {
00254 res = ReadToken(buffer, sizeof(buffer), " \n");
00255 if (res > 1000) return false;
00256 if (i == targetDim - 1)
00257 {
00258 if (res == ' ')
00259 {
00260 if (DiscardUntil("\n") > 1000) return false;
00261 }
00262 }
00263 else if (res != ' ') return false;
00264 target(number, i) = atof(buffer);
00265 }
00266 }
00267 else if (format == 1)
00268 {
00269
00270 for (i = 0; i < dataDim; i++) data(number, i) = 0.0;
00271 while (true)
00272 {
00273 res = ReadToken(buffer, sizeof(buffer), " :\n");
00274 if (buffer[0] == ';') break;
00275 if (res == '\n')
00276 {
00277 printf("[number=%d --1--]", number); return false;
00278 }
00279 if (res > 1000)
00280 {
00281 printf("[2]"); return false;
00282 }
00283 index = atoi(buffer);
00284 if (res == ' ') res = DiscardUntil(":;\n");
00285 if (res != ':')
00286 {
00287 printf("[3]"); return false;
00288 }
00289 res = ReadToken(buffer, sizeof(buffer), " :;\n");
00290 if (res == ':')
00291 {
00292 printf("[4]"); return false;
00293 }
00294 if (res == '\n')
00295 {
00296 printf("[5]"); return false;
00297 }
00298 if (res > 1000)
00299 {
00300 printf("[6]"); return false;
00301 }
00302 data(number, index) = atof(buffer);
00303 if (res == ';') break;
00304 }
00305 for (i = 0; i < targetDim; i++)
00306 {
00307 res = ReadToken(buffer, sizeof(buffer), " \n");
00308 if (res > 1000) return false;
00309 if (i == targetDim - 1)
00310 {
00311 if (res == ' ')
00312 {
00313 if (DiscardUntil("\n") > 1000) return false;
00314 }
00315 }
00316 else if (res != ' ') return false;
00317 target(number, i) = atof(buffer);
00318 }
00319 }
00320 else if (format == 2)
00321 {
00322 DataFile_ReadType(float);
00323 }
00324 else if (format == 3)
00325 {
00326 DataFile_ReadType(double);
00327 }
00328 else if (format == 4)
00329 {
00330 DataFile_ReadType(char);
00331 }
00332 else if (format == 5)
00333 {
00334 DataFile_ReadType(short);
00335 }
00336 else if (format == 6)
00337 {
00338 DataFile_ReadType(int);
00339 }
00340 else if (format == 7)
00341 {
00342 DataFile_ReadType(unsigned char);
00343 }
00344 else if (format == 8)
00345 {
00346 DataFile_ReadType(unsigned short);
00347 }
00348 else if (format == 9)
00349 {
00350 DataFile_ReadType(unsigned int);
00351 }
00352
00353 return true;
00354 }
00355
00356 int DataFile::ReadToken(char* buffer, int maxlength, const char* separators)
00357 {
00358 int i;
00359 int s, sc = strlen(separators);
00360 char c;
00361 bool start = true;
00362 for (i = 0; i < maxlength - 1; i++)
00363 {
00364 if (fread(&c, 1, 1, file) == 0) return 1001;
00365 for (s = 0; s < sc; s++)
00366 {
00367 if (separators[s] == c) break;
00368 if (separators[s] == '\n' && c == '\r')
00369 {
00370
00371 if (fread(&c, 1, 1, file) == 0) return 1001;
00372 break;
00373 }
00374 }
00375 if (s < sc)
00376 {
00377 if (start)
00378 {
00379 i--;
00380 continue;
00381 }
00382 else
00383 {
00384 buffer[i] = 0;
00385 return separators[s];
00386 }
00387 }
00388 buffer[i] = c;
00389 start = false;
00390 }
00391 buffer[i] = 0;
00392 return 1003;
00393 }
00394
00395 int DataFile::DiscardUntil(const char* separators)
00396 {
00397 int s, sc = strlen(separators);
00398 char c;
00399 while (true)
00400 {
00401 if (fread(&c, 1, 1, file) == 0) return 1001;
00402 for (s = 0; s < sc; s++)
00403 {
00404 if (separators[s] == c) return c;
00405 if (separators[s] == '\n' && c == '\r')
00406 {
00407
00408 if (fread(&c, 1, 1, file) == 0) return 1001;
00409 return c;
00410 }
00411 }
00412 }
00413 }
00414
00415
00417
00418 Dataset::Dataset(const Dataset& dataset)
00419 {
00420 this->trainingData = dataset.getTrainingData();
00421 this->trainingTarget = dataset.getTrainingTarget();
00422 this->testData = dataset.getTestData();
00423 this->testTarget = dataset.getTestTarget();
00424 }
00425
00426 Dataset::Dataset(DataSource& source, int train, int test)
00427 {
00428 if (! source.GetData(trainingData, trainingTarget, train))
00429 throw SHARKEXCEPTION("[Dataset::Dataset] error generating the dataset");
00430 if (! source.GetData(testData, testTarget, test))
00431 throw SHARKEXCEPTION("[Dataset::Dataset] error generating the dataset");
00432 }
00433
00434 Dataset::Dataset(const char* filename, int train, int test)
00435 {
00436 DataFile file(filename);
00437 if (test == 0) test = file.getNumberOfExamples() - train;
00438 if (train + test > file.getNumberOfExamples() || train <= 0 || test < 0)
00439 throw SHARKEXCEPTION("[Dataset::Dataset] invalid split into training and test set");
00440 if (! file.GetData(trainingData, trainingTarget, train))
00441 throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset)");
00442 if (! file.GetData(testData, testTarget, test))
00443 throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset)");
00444 }
00445
00446 Dataset::Dataset(const char* filename, double train)
00447 {
00448 DataFile file(filename);
00449 int n_train = (int)(file.getNumberOfExamples() * train);
00450 int n_test = (int)(file.getNumberOfExamples() * (1.0 - train));
00451 if (! file.GetData(trainingData, trainingTarget, n_train))
00452 throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00453 if (! file.GetData(testData, testTarget, n_test))
00454 throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00455 }
00456
00457 Dataset::Dataset(const char* trainfile, const char* testfile)
00458 {
00459 DataFile file1(trainfile);
00460 DataFile file2(testfile);
00461 if (! file1.GetData(trainingData, trainingTarget, file1.getNumberOfExamples()))
00462 throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00463 if (! file2.GetData(testData, testTarget, file2.getNumberOfExamples()))
00464 throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00465 }
00466
00467 Dataset::Dataset(const char* trainfile, const char* testfile, int train)
00468 {
00469 Array<double> dataTrain;
00470 Array<double> targetTrain;
00471 Array<double> dataTest;
00472 Array<double> targetTest;
00473 DataFile file1(trainfile);
00474 DataFile file2(testfile);
00475 if (! file1.GetData(dataTrain, targetTrain, file1.getNumberOfExamples()))
00476 throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00477 if (! file2.GetData(dataTest, targetTest, file2.getNumberOfExamples()))
00478 throw SHARKEXCEPTION("[Dataset::Dataset] error loading the dataset");
00479
00480 int tfs = dataTrain.dim(0);
00481 int all = tfs + dataTest.dim(0);
00482 int test = all - train;
00483 int dim = dataTrain.dim(1);
00484 int i, j, k;
00485 if (train <= 0 || test <= 0) throw SHARKEXCEPTION("[Dataset::Dataset] invalid split into training and test set");
00486
00487 trainingData.resize(train, dim, false);
00488 trainingTarget.resize(train, 1, false);
00489 testData.resize(test, dim, false);
00490 testTarget.resize(test, 1, false);
00491 std::vector<int> entry(all);
00492 for (i = 0; i < all; i++) entry[i] = i;
00493
00494 for (i = 0; i < train; i++)
00495 {
00496 j = Rng::discrete(0, entry.size() - 1);
00497 k = entry[j];
00498 entry.erase(entry.begin() + j);
00499
00500 if (k < tfs)
00501 {
00502 trainingData[i] = dataTrain[k];
00503 trainingTarget(i, 0) = targetTrain(k, 0);
00504 }
00505 else
00506 {
00507 trainingData[i] = dataTest[k - tfs];
00508 trainingTarget(i, 0) = targetTest(k - tfs, 0);
00509 }
00510 }
00511
00512 for (i = 0; i < test; i++)
00513 {
00514 k = entry[i];
00515
00516 if (k < tfs)
00517 {
00518 testData[i] = dataTrain[k];
00519 testTarget(i, 0) = targetTrain(k, 0);
00520 }
00521 else
00522 {
00523 testData[i] = dataTest[k - tfs];
00524 testTarget(i, 0) = targetTest(k - tfs, 0);
00525 }
00526 }
00527 }
00528
00529 Dataset::Dataset(const char* datafile, const char* splitfile, double disambiguation)
00530 {
00531 DataFile data(datafile);
00532
00533 std::vector<unsigned int> train;
00534 std::vector<unsigned int> test;
00535 if (! ReadSplitFile(splitfile, train, test)) throw SHARKEXCEPTION("[Dataset::Dataset] error reading the split file");
00536
00537 int n_train = train.size();
00538 int n_test = test.size();
00539 if (data.getNumberOfExamples() != n_train + n_test) throw SHARKEXCEPTION("[Dataset::Dataset] data file and split file do not match");
00540 int dim_data = data.getDataDimension();
00541 int dim_target = data.getTargetDimension();
00542
00543 trainingData.resize(n_train, dim_data, false);
00544 trainingTarget.resize(n_train, dim_target, false);
00545 testData.resize(n_test, dim_data, false);
00546 testTarget.resize(n_test, dim_target, false);
00547
00548 std::sort(train.begin(), train.end());
00549 std::sort(test.begin(), test.end());
00550
00551 int i;
00552 int tr = 0;
00553 int te = 0;
00554 for (i=0; i<n_train + n_test; i++)
00555 {
00556 Array<double> tmp_data;
00557 Array<double> tmp_target;
00558 data.GetData(tmp_data, tmp_target, 1);
00559 if (tr < n_train && (int)train[tr] == i)
00560 {
00561 trainingData[tr] = tmp_data[0];
00562 trainingTarget[tr] = tmp_target[0];
00563 tr++;
00564 }
00565 else if (te < n_test && (int)test[te] == i)
00566 {
00567 testData[te] = tmp_data[0];
00568 testTarget[te] = tmp_target[0];
00569 te++;
00570 }
00571 else throw SHARKEXCEPTION("[Dataset::Dataset] split file is inconsistent");
00572 }
00573 }
00574
00575 Dataset::Dataset(const Array<double>& trainingData, const Array<double>& trainingTarget, const Array<double>& testData, const Array<double>& testTarget)
00576 {
00577 this->trainingData = trainingData;
00578 this->trainingTarget = trainingTarget;
00579 this->testData = testData;
00580 this->testTarget = testTarget;
00581 }
00582
00583
00584 void Dataset::ShuffleTraining()
00585 {
00586 Array<double> tmp1;
00587 Array<double> tmp2;
00588 unsigned int i, ic = trainingData.dim(0);
00589 for (i=1; i<ic; i++)
00590 {
00591 unsigned int j = Rng::discrete(0, i);
00592 if (i != j)
00593 {
00594 tmp1 = trainingData[i];
00595 trainingData[i] = trainingData[j];
00596 trainingData[j] = tmp1;
00597 tmp2 = trainingTarget[i];
00598 trainingTarget[i] = trainingTarget[j];
00599 trainingTarget[j] = tmp2;
00600 }
00601 }
00602 }
00603
00604 void Dataset::ShuffleTest()
00605 {
00606 Array<double> tmp1;
00607 Array<double> tmp2;
00608 unsigned int i, ic = testData.dim(0);
00609 for (i=1; i<ic; i++)
00610 {
00611 unsigned int j = Rng::discrete(0, i);
00612 if (i != j)
00613 {
00614 tmp1 = testData[i];
00615 testData[i] = testData[j];
00616 testData[j] = tmp1;
00617 tmp2 = testTarget[i];
00618 testTarget[i] = testTarget[j];
00619 testTarget[j] = tmp2;
00620 }
00621 }
00622 }
00623
00624 void Dataset::ShuffleAll()
00625 {
00626 Array<double> tmp1;
00627 Array<double> tmp2;
00628 unsigned int i, ic = trainingData.dim(0) + testData.dim(0);
00629 unsigned int c = trainingData.dim(0);
00630 for (i=1; i<ic; i++)
00631 {
00632 unsigned int j = Rng::discrete(0, i);
00633 if (i != j)
00634 {
00635 if (i < c)
00636 {
00637 if (j < c)
00638 {
00639 tmp1 = trainingData[i]; trainingData[i] = trainingData[j]; trainingData[j] = tmp1;
00640 tmp2 = trainingTarget[i]; trainingTarget[i] = trainingTarget[j]; trainingTarget[j] = tmp2;
00641 }
00642 else
00643 {
00644 tmp1 = trainingData[i]; trainingData[i] = testData[j-c]; testData[j-c] = tmp1;
00645 tmp2 = trainingTarget[i]; trainingTarget[i] = testTarget[j-c]; testTarget[j-c] = tmp2;
00646 }
00647 }
00648 else
00649 {
00650 if (j < c)
00651 {
00652 tmp1 = testData[i-c]; testData[i-c] = trainingData[j]; trainingData[j] = tmp1;
00653 tmp2 = testTarget[i-c]; testTarget[i-c] = trainingTarget[j]; trainingTarget[j] = tmp2;
00654 }
00655 else
00656 {
00657 tmp1 = testData[i-c]; testData[i-c] = testData[j-c]; testData[j-c] = tmp1;
00658 tmp2 = testTarget[i-c]; testTarget[i-c] = testTarget[j-c]; testTarget[j-c] = tmp2;
00659 }
00660 }
00661 }
00662 }
00663 }
00664
00665 bool Dataset::Save(const char* filename, bool training, bool test, const char* format)
00666 {
00667 FILE* file = fopen(filename, "w+");
00668 if (file == NULL) return false;
00669
00670 int i, ic = 0;
00671 int o, oc = 0;
00672 int t, ttr = 0, tte = 0, total = 0;
00673 if (training)
00674 {
00675 ttr = trainingData.dim(0);
00676 ic = trainingData.dim(1);
00677 oc = trainingTarget.dim(1);
00678 total += ttr;
00679 }
00680 if (test)
00681 {
00682 tte = testData.dim(0);
00683 ic = testData.dim(1);
00684 oc = testTarget.dim(1);
00685 total += tte;
00686 }
00687 if (total == 0) return false;
00688
00689 fprintf(file, "# %d %d %d %s\n", total, ic, oc, format);
00690
00691 if (strcmp(format, "ascii") == 0)
00692 {
00693 if (training)
00694 {
00695 for (t=0; t<ttr; t++)
00696 {
00697 for (i=0; i<ic; i++)
00698 {
00699 fprintf(file, "%g ", trainingData(t, i));
00700 }
00701 for (o=0; o<oc-1; o++)
00702 {
00703 fprintf(file, "%g ", trainingTarget(t, o));
00704 }
00705 fprintf(file, "%g\n", trainingTarget(t, oc-1));
00706 }
00707 }
00708 if (test)
00709 {
00710 for (t=0; t<tte; t++)
00711 {
00712 for (i=0; i<ic; i++)
00713 {
00714 fprintf(file, "%g ", testData(t, i));
00715 }
00716 for (o=0; o<oc-1; o++)
00717 {
00718 fprintf(file, "%g ", testTarget(t, o));
00719 }
00720 fprintf(file, "%g\n", testTarget(t, oc-1));
00721 }
00722 }
00723 }
00724 else if (strcmp(format, "sparse") == 0)
00725 {
00726 if (training)
00727 {
00728 for (t=0; t<ttr; t++)
00729 {
00730 for (i=0; i<ic; i++)
00731 {
00732 if (trainingData(t, i) != 0.0)
00733 {
00734 fprintf(file, "%d:%g ", i, trainingData(t, i));
00735 }
00736 }
00737 fprintf(file, "; ");
00738 for (o=0; o<oc-1; o++)
00739 {
00740 fprintf(file, "%g ", trainingTarget(t, o));
00741 }
00742 fprintf(file, "%g\n", trainingTarget(t, oc-1));
00743 }
00744 }
00745 if (test)
00746 {
00747 for (t=0; t<tte; t++)
00748 {
00749 for (i=0; i<ic; i++)
00750 {
00751 if (testData(t, i) != 0.0)
00752 {
00753 fprintf(file, "%d:%g ", i, testData(t, i));
00754 }
00755 }
00756 fprintf(file, "; ");
00757 for (o=0; o<oc-1; o++)
00758 {
00759 fprintf(file, "%g ", testTarget(t, o));
00760 }
00761 fprintf(file, "%g\n", testTarget(t, oc-1));
00762 }
00763 }
00764 }
00765 else if (strcmp(format, "float") == 0)
00766 {
00767 Dataset_WriteType(float);
00768 }
00769 else if (strcmp(format, "double") == 0)
00770 {
00771 Dataset_WriteType(double);
00772 }
00773 else if (strcmp(format, "int8") == 0)
00774 {
00775 Dataset_WriteType(char);
00776 }
00777 else if (strcmp(format, "int16") == 0)
00778 {
00779 Dataset_WriteType(short);
00780 }
00781 else if (strcmp(format, "int32") == 0)
00782 {
00783 Dataset_WriteType(int);
00784 }
00785 else if (strcmp(format, "uint8") == 0)
00786 {
00787 Dataset_WriteType(unsigned char);
00788 }
00789 else if (strcmp(format, "uint16") == 0)
00790 {
00791 Dataset_WriteType(unsigned short);
00792 }
00793 else if (strcmp(format, "uint32") == 0)
00794 {
00795 Dataset_WriteType(unsigned int);
00796 }
00797 else return false;
00798
00799 fclose(file);
00800 return true;
00801 }
00802
00803 bool Dataset::SaveLIBSVM(const char* filename, bool training, bool test)
00804 {
00805 std::ofstream f(filename);
00806 if (! f.is_open()) return false;
00807
00808 if (training)
00809 {
00810 int i, ic = trainingData.dim(0);
00811 int d, dim = trainingData.dim(1);
00812 SIZE_CHECK(trainingTarget.dim(1) == 1);
00813
00814 for (i=0; i<ic; i++)
00815 {
00816 double label = trainingTarget(i, 0);
00817 RANGE_CHECK (label == 1.0 || label == -1.0);
00818 f << label;
00819 for (d=0; d<dim; d++)
00820 {
00821 double value = trainingData(i, d);
00822 if (value != 0.0) f << " " << d << ":" << value;
00823 }
00824 f << "\n";
00825 }
00826 }
00827
00828 if (test)
00829 {
00830 int i, ic = testData.dim(0);
00831 int d, dim = testData.dim(1);
00832 SIZE_CHECK(testTarget.dim(1) == 1);
00833
00834 for (i=0; i<ic; i++)
00835 {
00836 double label = testTarget(i, 0);
00837 RANGE_CHECK (label == 1.0 || label == -1.0);
00838 f << label;
00839 for (d=0; d<dim; d++)
00840 {
00841 double value = testData(i, d);
00842 if (value != 0.0) f << " " << d << ":" << value;
00843 }
00844 f << "\n";
00845 }
00846 }
00847
00848 f.close();
00849
00850 return true;
00851 }
00852
00853 void Dataset::NormalizeComponents()
00854 {
00855 int i, ic = trainingData.dim(0);
00856 int j, jc = testData.dim(0);
00857 int d, dim = trainingData.dim(1);
00858 for (d=0; d<dim; d++)
00859 {
00860 double sum = 0.0;
00861 for (i=0; i<ic; i++)
00862 {
00863 sum += trainingData(i, d);
00864 }
00865 double mean = sum / (double)ic;
00866 double var = 0.0;
00867 for (i=0; i<ic; i++)
00868 {
00869 double diff = trainingData(i, d) - mean;
00870 var += diff * diff;
00871 }
00872 var /= (double)ic;
00873 double stddev = sqrt(var);
00874 if (stddev == 0.0) continue;
00875
00876 for (i=0; i<ic; i++)
00877 {
00878 trainingData(i, d) = (trainingData(i, d) - mean) / stddev;
00879 }
00880 for (j=0; j<jc; j++)
00881 {
00882 testData(j, d) = (testData(j, d) - mean) / stddev;
00883 }
00884 }
00885 }
00886
00887 void Dataset::NormalizeComponent( int d )
00888 {
00889 int i, ic = trainingData.dim(0);
00890 int j, jc = testData.dim(0);
00891 int dim = trainingData.dim(1);
00892
00893 if( d < 0 || d >= dim )
00894 return;
00895
00896 double sum = 0.0;
00897 for (i=0; i<ic; i++)
00898 {
00899 sum += trainingData(i, d);
00900 }
00901 double mean = sum / (double)ic;
00902 double var = 0.0;
00903 for (i=0; i<ic; i++)
00904 {
00905 double diff = trainingData(i, d) - mean;
00906 var += diff * diff;
00907 }
00908 var /= (double)ic;
00909 double stddev = sqrt(var);
00910 if (stddev == 0.0) return;
00911
00912 for (i=0; i<ic; i++)
00913 {
00914 trainingData(i, d) = (trainingData(i, d) - mean) / stddev;
00915 }
00916 for (j=0; j<jc; j++)
00917 {
00918 testData(j, d) = (testData(j, d) - mean) / stddev;
00919 }
00920
00921 }
00922
00923 bool Dataset::ReadLine(FILE* file, char* buffer, int bufferlength)
00924 {
00925 int pos = 0;
00926 while (true)
00927 {
00928 if (pos == bufferlength) return false;
00929 if (fread(&buffer[pos], 1, 1, file) != 1) return false;
00930 if (buffer[pos] == '\n')
00931 {
00932 buffer[pos] = 0;
00933 return true;
00934 }
00935 pos++;
00936 }
00937 }
00938
00939 bool Dataset::ReadSplitFile(const char* filename, std::vector<unsigned int>& train, std::vector<unsigned int>& test)
00940 {
00941 FILE* file = fopen(filename, "r");
00942 if (file == NULL) return false;
00943
00944 int i;
00945 char buffer[256];
00946 char* end;
00947
00948
00949 if (! ReadLine(file, buffer, 256)) return false;
00950 if (buffer[0] != '#' || buffer[1] != ' ') return false;
00951 int n_train = strtol(buffer+2, &end, 10);
00952 if (n_train <= 0) return false;
00953 if (*end != ' ') return false;
00954 int n_test = strtol(end+1, &end, 10);
00955 if (n_test <= 0) return false;
00956
00957
00958 train.resize(n_train);
00959 test.resize(n_test);
00960 for (i=0; i<n_train; i++)
00961 {
00962 if (! ReadLine(file, buffer, 256)) return false;
00963 train[i] = atoi(buffer);
00964 }
00965 for (i=0; i<n_test; i++)
00966 {
00967 if (! ReadLine(file, buffer, 256)) return false;
00968 test[i] = atoi(buffer);
00969 }
00970
00971 fclose(file);
00972 return true;
00973 }