Commit 58ff6a2f authored by Michael Zbyszyński's avatar Michael Zbyszyński
Browse files

Handling labels in a smarter way in RapidLib

parent dd605c5d
......@@ -10,14 +10,14 @@ RAPIDMIX_BEGIN_NAMESPACE
void trainingData2rapidLib (const trainingData &newTrainingData, std::vector<trainingExample> &trainingSet) {
for (int h = 0; h < newTrainingData.trainingSet.size(); ++h) { //Go through every phrase
for (int i = 0; i < newTrainingData.trainingSet[h].elements.size(); ++i) { //...and every element
trainingExample tempExample;
tempExample.input = newTrainingData.trainingSet[h].elements[i].input;
if (newTrainingData.trainingSet[h].elements[i].output.size() > 0) {
tempExample.output = newTrainingData.trainingSet[h].elements[i].output;
} else {
std::unordered_map<std::string, int>::const_iterator mappedLabel = newTrainingData.labels.find(newTrainingData.trainingSet[h].label);
tempExample.output.push_back(double(mappedLabel->second));
tempExample.output.push_back(double(h));
}
trainingSet.push_back(tempExample);
}
......@@ -27,6 +27,10 @@ void trainingData2rapidLib (const trainingData &newTrainingData, std::vector<tra
/**
 * Train the classifier from a recorded training data set.
 *
 * Before training, the string label of every phrase is copied into `labels`
 * in phrase order, so that labels[i] is the label of trainingSet[i].
 *
 * @param newTrainingData  Phrases (with their labels and elements) to train on.
 * @return Whatever classification::train() reports for the converted examples.
 */
template<>
bool machineLearning<classification>::train(const trainingData &newTrainingData) {
    // Rebuild the label table from scratch: one entry per phrase.
    labels.clear();
    for (const auto &recordedPhrase : newTrainingData.trainingSet) {
        labels.push_back(recordedPhrase.label);
    }

    // Flatten the phrases into rapidLib training examples and hand them over.
    std::vector<trainingExample> rapidLibExamples;
    trainingData2rapidLib(newTrainingData, rapidLibExamples);
    return classification::train(rapidLibExamples);
}
......@@ -41,7 +45,7 @@ bool machineLearning<regression>::train(const trainingData &newTrainingData) {
template<>
bool machineLearning<seriesClassification>::train(const trainingData &newTrainingData) {
std::vector<trainingSeries> seriesSet;
for (int i = 1; i < newTrainingData.trainingSet.size(); ++i) { //each phrase
for (int i = 0; i < newTrainingData.trainingSet.size(); ++i) { //each phrase
trainingSeries tempSeries;
tempSeries.label = newTrainingData.trainingSet[i].label;
for (int j = 0; j < newTrainingData.trainingSet[i].elements.size(); ++j) { //each element
......@@ -52,6 +56,12 @@ bool machineLearning<seriesClassification>::train(const trainingData &newTrainin
return seriesClassification::trainLabel(seriesSet);
}
/**
 * Classify one input vector and return the result as a string label.
 *
 * The first value produced by classification::run() is truncated to an
 * integer and used as an index into `labels`.
 *
 * @param inputVector  Feature vector to classify.
 * @param label        Not read by this function; presumably present only to
 *                     distinguish this overload from run(inputVector) --
 *                     TODO confirm.
 * @return The label stored at the predicted class index, or an empty string
 *         when the classifier produces no output or the index falls outside
 *         `labels` (for example when the model has not been trained).
 */
template<>
std::string machineLearning<classification>::run(const std::vector<double> &inputVector, const std::string &label) {
    (void)label; // intentionally unused

    const auto classOutput = classification::run(inputVector);
    if (classOutput.empty()) {
        return std::string();
    }

    // Validate before converting: the negated comparison also rejects NaN, and
    // values in (-1, 0) still truncate to index 0 as they did previously.
    const double rawClass = classOutput[0];
    if (!(rawClass > -1.0 && rawClass < static_cast<double>(labels.size()))) {
        return std::string();
    }
    return labels[static_cast<std::size_t>(rawClass)];
}
template<>
std::string machineLearning<seriesClassification>::run(const std::vector<std::vector<double> > &inputSeries) {
return seriesClassification::runLabel(inputSeries);
......
......@@ -56,6 +56,9 @@ public:
return MachineLearningModule::run(inputVector);
}
// This is a hack while I think about how to do this. -MZ //
std::string run(const std::vector<double> &inputVector, const std::string &label);
//* This is the one I'm using for DTW */
std::string run(const std::vector<std::vector<double> > &inputSeries);
......@@ -65,6 +68,11 @@ public:
private:
MachineLearningModule module;
//this holds string labels
std::vector<std::string> labels; //FIXME: This probably should be pushed down into rapidLib?
std::string getLabel(int value);
};
////////// typedefs for calling different algorithms
......
......@@ -10,9 +10,7 @@
RAPIDMIX_BEGIN_NAMESPACE
// Default constructor: sets up the initial phrase list, the id counter and
// the phrase that elements are directed to.
trainingData::trainingData () {
// Seed the set with a phrase whose id is 0 and whose label is "default".
phrase defaultPhrase = {0, "default"};
trainingSet.push_back(defaultPhrase);
// NOTE(review): currentId is assigned twice in a row, so only the second
// value (0) takes effect. This looks like the old and new lines of a diff
// shown together -- confirm which starting id (and whether the default
// phrase above) is actually intended.
currentId = 1;
currentId = 0;
// Index into trainingSet of the phrase currently targeted.
targetPhrase = 0;
};
......@@ -22,7 +20,6 @@ uint32_t trainingData::assignCurrentId() {
return returnVal;
}
uint32_t trainingData::startRecording() {
phrase tempPhrase = { assignCurrentId(), std::to_string(tempPhrase.uniqueId) }; //TODO: Is this label helpful? -MZ
trainingSet.push_back(tempPhrase);
......@@ -30,16 +27,15 @@ uint32_t trainingData::startRecording() {
return tempPhrase.uniqueId;
};
uint32_t trainingData::startRecording(std::string label) {
uint32_t trainingData::startRecording(const std::string &label) {
phrase tempPhrase = { assignCurrentId(), label };
labels.insert(std::make_pair(label, labels.size()));
trainingSet.push_back(tempPhrase);
targetPhrase = int(trainingSet.size() - 1);
return tempPhrase.uniqueId;
};
uint32_t trainingData::addElement(std::vector<double>input, std::vector<double> output) {
uint32_t trainingData::addElement(const std::vector<double> &input, const std::vector<double> &output) {
element newElement;
newElement.uniqueId = assignCurrentId();
newElement.input = input;
......@@ -49,7 +45,7 @@ uint32_t trainingData::addElement(std::vector<double>input, std::vector<double>
return newElement.uniqueId;
}
uint32_t trainingData::addElement(std::vector<double>input) {
uint32_t trainingData::addElement(const std::vector<double> &input) {
element newElement;
newElement.uniqueId = assignCurrentId();
newElement.input = input;
......@@ -60,24 +56,28 @@ uint32_t trainingData::addElement(std::vector<double>input) {
// Stop recording into the current phrase by resetting the target index.
void trainingData::stopRecording() {
targetPhrase = 0; // point back at trainingSet[0], described here as the default phrase
//TODO: This doesn't do much. -MZ
}
/**
 * Reverse lookup in `labels`: find the label string mapped to a number.
 *
 * @param value  Mapped integer to search for.
 * @return The matching label, or "not found" when no entry maps to `value`.
 */
std::string trainingData::getLabel(int value) {
    std::string match("not found");
    for (auto entry = labels.begin(); entry != labels.end(); ++entry) {
        if (entry->second != value) {
            continue;
        }
        // Deliberately no early exit: every entry is visited, so if several
        // entries shared a value the last one visited would win.
        match = entry->first;
    }
    return match;
}
/**
 * Record a phrase that contains exactly one input-only element.
 *
 * Starts a new phrase with `label`, adds `input` to it, then stops recording.
 *
 * @param label  Label given to the newly created phrase.
 * @param input  Input vector stored in the phrase's single element.
 * @return The id returned by addElement() for the stored element. The phrase
 *         id returned by startRecording() is discarded.
 */
uint32_t trainingData::recordSingleElement(const std::string &label, const std::vector<double> &input) {
    startRecording(label);
    // Keep the id in its own unsigned type instead of passing it through int.
    const uint32_t elementId = addElement(input);
    stopRecording();
    return elementId;
}
/**
 * Record a phrase that contains exactly one element with input and output.
 *
 * Starts a new phrase with `label`, adds the input/output pair to it, then
 * stops recording.
 *
 * @param label   Label given to the newly created phrase.
 * @param input   Input vector stored in the phrase's single element.
 * @param output  Output vector stored alongside the input.
 * @return The id returned by addElement() for the stored element. The phrase
 *         id returned by startRecording() is discarded.
 */
uint32_t trainingData::recordSingleElement(const std::string &label, const std::vector<double> &input, const std::vector<double> &output) {
    startRecording(label);
    // Keep the id in its own unsigned type instead of passing it through int.
    const uint32_t elementId = addElement(input, output);
    stopRecording();
    return elementId;
}
/**
 * Get the column names of the phrase currently targeted.
 *
 * @return A copy of trainingSet[targetPhrase].columnNames, or an empty vector
 *         when targetPhrase does not refer to an existing phrase.
 */
std::vector<std::string> trainingData::getColumnNames() {
    // trainingSet is publicly writable, so the target index is validated
    // rather than trusted; indexing past the end would be undefined behaviour.
    if (targetPhrase < 0 || static_cast<std::size_t>(targetPhrase) >= trainingSet.size()) {
        return std::vector<std::string>();
    }
    return trainingSet[targetPhrase].columnNames;
}
void trainingData::setColumnNames(std::vector<std::string> column_names) {
/**
 * Set the column names of the phrase currently targeted.
 *
 * @param column_names  Names to store on trainingSet[targetPhrase].
 *
 * Does nothing when targetPhrase does not refer to an existing phrase.
 */
void trainingData::setColumnNames(const std::vector<std::string> &column_names) {
    // trainingSet is publicly writable, so the target index is validated
    // rather than trusted; writing past the end would be undefined behaviour.
    if (targetPhrase < 0 || static_cast<std::size_t>(targetPhrase) >= trainingSet.size()) {
        return;
    }
    trainingSet[targetPhrase].columnNames = column_names;
}
......
......@@ -42,23 +42,29 @@ public:
std::vector<phrase> trainingSet;
/** Create a new phrase that can be recorded into. Returns phrase id */
uint32_t startRecording();
uint32_t startRecording(); //FIXME: this should go away. -MZ
/** Create new phrase, with a label, that can be recorded into. Returns phrase id */
uint32_t startRecording(std::string label);
uint32_t startRecording(const std::string &label);
/** Add an element with input and output to the phrase that is recording,
or to the default phrase if recording is stopped. Returns the new element's id. */
uint32_t addElement(std::vector<double> input, std::vector<double> output);
uint32_t addElement(const std::vector<double> &input, const std::vector<double> &output);
/** Add an element with just input to the phrase that is recording,
or to the default phrase if recording is stopped. Returns the new element's id. */
uint32_t addElement(std::vector<double> input);
uint32_t addElement(const std::vector<double> &input);
void stopRecording();
/** Create a phrase with a single element that has a label and input. Returns the element's id (the phrase gets its own, separate id). */
uint32_t recordSingleElement(const std::string &label, const std::vector<double> &input);
/** Create a phrase with a single element that has a label, input, and output. Returns the element's id (the phrase gets its own, separate id). */
uint32_t recordSingleElement(const std::string &label, const std::vector<double> &input, const std::vector<double> &output);
std::vector<std::string> getColumnNames();
void setColumnNames(std::vector<std::string> columnNames);
void setColumnNames(const std::vector<std::string> &columnNames);
/** Get a JSON representation of the data set in the form of a styled string */
......@@ -69,11 +75,6 @@ public:
bool putJSON(const std::string &jsonMessage);
/** read a JSON file at file path and build a training set from it */
bool readJSON(const std::string &filepath);
//this holds string labels
std::unordered_map<std::string, int> labels;
std::string getLabel(int value);
private:
int targetPhrase;
......
......@@ -20,11 +20,11 @@ SCENARIO("Test NN Regression", "[machineLearning]")
rapidmix::trainingData myData;
std::vector<double> input = { 0.2, 0.7 };
std::vector<double> output = { 3.0 };
REQUIRE(myData.addElement(input, output) == 1);
REQUIRE(myData.recordSingleElement("label", input, output) == 1); //FIXME: Label is stupid here. -MZ
input = { 2.0, 44.2 };
output = { 20.14 };
REQUIRE(myData.addElement(input, output) == 2);
REQUIRE(myData.recordSingleElement("label", input, output) == 3);
myData.writeJSON("/var/tmp/testTrainingData.json");
......@@ -67,14 +67,8 @@ SCENARIO("Test kNN classification", "[machineLearning]")
rapidmix::staticClassification myKnn;
rapidmix::trainingData myData;
std::vector<double> input = { 0.2, 0.7 };
std::vector<double> output = { 3.0 };
REQUIRE(myData.addElement(input, output) == 1);
input = { 2.0, 44.2 };
output = { 20.14 };
REQUIRE(myData.addElement(input, output) == 2);
REQUIRE(myData.recordSingleElement("cat", { 0.2, 0.7 }) == 1);
REQUIRE(myData.recordSingleElement("dog", { 2.0, 44.2 }) == 3); // This is not 2, because phrases get numbers, too.
REQUIRE(myKnn.train(myData) == true);
std::string filepath2 = "/var/tmp/modelSetDescription_knn.json";
......@@ -89,7 +83,7 @@ SCENARIO("Test kNN classification", "[machineLearning]")
std::vector<double> inputVec = { 2.0, 44.2 };
REQUIRE(myKnn.run(inputVec)[0] == 20.0); //FIXME: kNN truncates to ints!
REQUIRE(myKnn.run(inputVec, "label") == "dog");
WHEN("when kNN model is read from file")
{
......@@ -99,7 +93,7 @@ SCENARIO("Test kNN classification", "[machineLearning]")
}
}
WHEN("when NN model is read from JSON stream")
WHEN("when kNN model is read from JSON stream")
{
THEN("run models and compare")
{
......@@ -119,15 +113,15 @@ SCENARIO("Test DTW classification", "[machineLearning]")
myData.startRecording("setOne");
std::vector<double> input = { 0.1, 0.5 };
std::vector<double> output = {};
REQUIRE(myData.addElement(input, output) == 2); //TODO: Shouldn't this be 1?
REQUIRE(myData.addElement(input, output) == 1);
input = { 0.2, 0.4 };
REQUIRE(myData.addElement(input, output) == 3);
REQUIRE(myData.addElement(input, output) == 2);
input = { 0.3, 0.3 };
REQUIRE(myData.addElement(input, output) == 4);
REQUIRE(myData.addElement(input, output) == 3);
input = { 0.4, 0.2 };
REQUIRE(myData.addElement(input, output) == 5);
REQUIRE(myData.addElement(input, output) == 4);
input = { 0.5, 0.1 };
REQUIRE(myData.addElement(input, output) == 6);
REQUIRE(myData.addElement(input, output) == 5);
myData.stopRecording();
myData.startRecording("setTwo");
......@@ -169,7 +163,7 @@ SCENARIO("Test both classes reject bad data", "[machineLearning]") {
std::vector<double> input = { 0.1, 0.2, 0.3};
std::vector<double> output = { 1.0 };
myBadData.addElement(input,output);
myBadData.recordSingleElement("label", input,output); //FIXME: This label is useless? -MZ
input = { 1.0, 2.0, 3.0, 4.0 };
myBadData.addElement(input, output);
......@@ -179,14 +173,3 @@ SCENARIO("Test both classes reject bad data", "[machineLearning]") {
//TODO: These should return false with empty data set. I think it just crashes now. -mz
}
/*
rapidmix::staticClassification labelKnn;
labelKnn.train(myXmmData);
std::vector<double> input = { 0.8, 0.1 };
std::cout << "knn test: " << myXmmData.getLabel(labelKnn.run(input)[0]) << std::endl;
return 0;
*/
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment