/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef PADDLE_NO_PYTHON #include #include #include #include #include #include #include #include #include #include #include "picojson.h" void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual); void checkValue(std::vector& arguments, picojson::array& arr); const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/"; TEST(PyDataProviderWrapper, NoSequenceData) { paddle::DataConfig conf; conf.set_type("py"); conf.set_load_data_module(std::string("testPyDataWrapper")); conf.set_load_data_object(std::string("processNonSequenceData")); conf.set_async_load_data(false); conf.clear_files(); conf.set_files(kDir + "test_pydata_provider_wrapper.list"); paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false)); provider->setSkipShuffle(); provider->reset(); paddle::DataBatch batchFromPy; provider->getNextBatch(100, &batchFromPy); paddle::DataConfig conf2; conf2.set_type("proto"); conf2.set_async_load_data(false); conf2.clear_files(); conf2.set_files(kDir + "test_pydata_provider_wrapper.protolist"); provider.reset(paddle::DataProvider::create(conf2, false)); provider->setSkipShuffle(); provider->reset(); paddle::DataBatch batchFromProto; provider->getNextBatch(100, &batchFromProto); std::vector& pyArguments = batchFromPy.getStreams(); std::vector& protoArguments = batchFromProto.getStreams(); EXPECT_EQ(pyArguments.size(), protoArguments.size()); for (size_t i = 0; i < pyArguments.size(); ++i) { checkEqual(protoArguments[i], pyArguments[i]); } } TEST(PyDataProviderWrapper, SequenceData) { paddle::DataConfig conf; conf.set_type("py"); conf.set_load_data_module("testPyDataWrapper"); conf.set_load_data_object("processSeqAndGenerateData"); conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json"); conf.clear_files(); conf.set_files(kDir + "test_pydata_provider_wrapper.list"); paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false)); provider->setSkipShuffle(); provider->reset(); paddle::DataBatch batchFromPy; provider->getNextBatch(100, &batchFromPy); picojson::value val; std::fstream fin; fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in); EXPECT_TRUE(fin.is_open()); if (fin.is_open()) { std::string err = picojson::parse(val, fin); EXPECT_TRUE(err.empty()); EXPECT_TRUE(val.is()); picojson::array& arr = val.get(); std::vector& arguments = batchFromPy.getStreams(); // CHECK Value checkValue(arguments, arr); // CHECK sequenceStartPositions for (size_t i = 0; i < arr.size(); i++) { int row_id = arr[i].get().size(); EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]); EXPECT_EQ((int)row_id, arguments[i].sequenceStartPositions->getData(false)[1]); } fin.close(); } } TEST(PyDataProviderWrapper, HasSubSequenceData) { paddle::DataConfig conf; conf.set_type("py"); conf.set_load_data_module("testPyDataWrapper"); conf.set_load_data_object("processSubSeqAndGenerateData"); conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json"); conf.clear_files(); conf.set_files(kDir + "test_pydata_provider_wrapper.list"); paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false)); provider->setSkipShuffle(); provider->reset(); paddle::DataBatch batchFromPy; provider->getNextBatch(1, &batchFromPy); picojson::value val; std::fstream fin; fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in); EXPECT_TRUE(fin.is_open()); if (fin.is_open()) { std::string err = picojson::parse(val, fin); EXPECT_TRUE(err.empty()); EXPECT_TRUE(val.is()); picojson::array& arr = val.get(); std::vector& arguments = batchFromPy.getStreams(); // CHECK Value checkValue(arguments, arr); // CHECK sequenceStartPositions and subSequenceStartPositions for (size_t i = 0; i < arr.size(); i++) { int row_id = arr[i].get().size(); EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]); EXPECT_EQ((int)row_id, arguments[i].sequenceStartPositions->getData(false)[1]); EXPECT_EQ(0, arguments[i].subSequenceStartPositions->getData(false)[0]); EXPECT_EQ((int)row_id, arguments[i].subSequenceStartPositions->getData(false)[1]); } fin.close(); } } int main(int argc, char** argv) { paddle::initMain(argc, argv); paddle::initPython(argc, argv); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual) { if (expect.value) { EXPECT_TRUE(actual.value != nullptr); paddle::Matrix* e = expect.value.get(); paddle::Matrix* a = actual.value.get(); EXPECT_EQ(e->getWidth(), a->getWidth()); EXPECT_EQ(e->getHeight(), a->getHeight()); if (dynamic_cast(e)) { paddle::CpuSparseMatrix* se = dynamic_cast(e); paddle::CpuSparseMatrix* sa = dynamic_cast(a); EXPECT_EQ(se->getFormat(), sa->getFormat()); EXPECT_EQ(se->getElementCnt(), sa->getElementCnt()); size_t rowSize = se->getFormat() == paddle::SPARSE_CSC ? se->getElementCnt() : se->getHeight() + 1; size_t colSize = se->getFormat() == paddle::SPARSE_CSC ? se->getWidth() + 1 : se->getElementCnt(); for (size_t i = 0; i < rowSize; ++i) { EXPECT_EQ(se->getRows()[i], sa->getRows()[i]); } for (size_t i = 0; i < colSize; ++i) { EXPECT_EQ(se->getCols()[i], sa->getCols()[i]); } if (se->getValueType() == paddle::FLOAT_VALUE) { EXPECT_EQ(paddle::FLOAT_VALUE, sa->getValueType()); for (size_t i = 0; i < se->getElementCnt(); ++i) { EXPECT_EQ(se->getValue()[i], sa->getValue()[i]); } } } else if (dynamic_cast(e)) { EXPECT_EQ(e->getElementCnt(), a->getElementCnt()); for (size_t i = 0; i < e->getElementCnt(); ++i) { EXPECT_EQ(e->getData()[i], a->getData()[i]); } } } if (expect.ids) { EXPECT_TRUE(actual.ids != nullptr); paddle::VectorT* e = expect.ids.get(); paddle::VectorT* a = actual.ids.get(); EXPECT_EQ(e->getSize(), a->getSize()); for (size_t i = 0; i < e->getSize(); ++i) { EXPECT_EQ(e->getData()[i], a->getData()[i]); } } if (expect.strs) { EXPECT_TRUE(actual.strs != nullptr); std::vector* e = expect.strs.get(); std::vector* a = actual.strs.get(); EXPECT_EQ(e->size(), a->size()); for (size_t i = 0; i < e->size(); ++i) { EXPECT_EQ((*e)[i], (*a)[i]); } } } void checkValue(std::vector& arguments, picojson::array& arr) { // CHECK SLOT 0, Sparse Value. paddle::Argument& sparse_values_seq = arguments[0]; paddle::MatrixPtr& sparse_values_seq_rawmatrix = sparse_values_seq.value; EXPECT_TRUE(sparse_values_seq_rawmatrix != nullptr); paddle::CpuSparseMatrix* sparse_val_seq_sparse_mat = dynamic_cast(sparse_values_seq_rawmatrix.get()); EXPECT_TRUE(sparse_val_seq_sparse_mat != nullptr); EXPECT_EQ(arr.size(), arguments.size()); EXPECT_TRUE(arr[0].is()); size_t row_id = 0; for (picojson::value& sparse_val_seq : arr[0].get()) { std::unordered_map cols; for (picojson::value& kv : sparse_val_seq.get()) { EXPECT_TRUE(kv.get(0).is()); EXPECT_TRUE(kv.get(1).is()); int col = (int)(kv.get(0).get()); real val = (real)(kv.get(1).get()); cols.insert({col, val}); } size_t colNum = sparse_val_seq_sparse_mat->getColNum(row_id); EXPECT_EQ(cols.size(), colNum); int* rowIds = sparse_val_seq_sparse_mat->getRowCols(row_id); real* rowBuf = sparse_val_seq_sparse_mat->getRowValues(row_id); for (size_t i = 0; i < colNum; ++i) { int id = rowIds[i]; auto it = cols.find(id); EXPECT_NE(cols.end(), it); real expect = it->second; EXPECT_NEAR(expect, *rowBuf, 1e-5); ++rowBuf; } ++row_id; } // CHECK SLOT 1, Dense Value. paddle::Argument& dense_arg = arguments[1]; paddle::MatrixPtr& dense_mat = dense_arg.value; EXPECT_NE(nullptr, dense_mat); EXPECT_TRUE(arr[1].is()); row_id = 0; for (picojson::value& dense_seq : arr[1].get()) { EXPECT_TRUE(dense_seq.is()); picojson::array& row = dense_seq.get(); EXPECT_EQ(row.size(), dense_mat->getWidth()); real* rowBuf = dense_mat->getRowBuf(row_id++); for (picojson::value& val : row) { EXPECT_TRUE(val.is()); real expect = val.get(); EXPECT_NEAR(expect, *rowBuf++, 1e-5); } } // CHECK SLOT 2, Sparse Non Value. paddle::Argument& sparse_non_val_arg = arguments[2]; paddle::MatrixPtr& sparse_non_val_rawm = sparse_non_val_arg.value; EXPECT_NE(nullptr, sparse_non_val_rawm); paddle::CpuSparseMatrix* sparse_non_val_m = dynamic_cast(sparse_non_val_rawm.get()); EXPECT_NE(nullptr, sparse_non_val_m); row_id = 0; for (picojson::value& row : arr[2].get()) { EXPECT_TRUE(row.is()); std::unordered_set ids; for (picojson::value& id : row.get()) { EXPECT_TRUE(id.is()); ids.insert((int)(id.get())); } size_t colNum = sparse_non_val_m->getColNum(row_id); EXPECT_EQ(ids.size(), colNum); for (size_t i = 0; i < colNum; ++i) { int col = sparse_non_val_m->getRowCols(row_id)[i]; EXPECT_TRUE(ids.find(col) != ids.end()); } ++row_id; } // CHECK SLOT 3, Index. paddle::Argument& index_arg = arguments[3]; paddle::IVectorPtr indices = index_arg.ids; EXPECT_NE(nullptr, indices); int* idPtr = indices->getData(); for (picojson::value& id : arr[3].get()) { EXPECT_TRUE(id.is()); int _id = (int)(id.get()); EXPECT_EQ(_id, *idPtr++); } // CHECK SLOT 4, String. paddle::Argument& strArg = arguments[4]; std::vector* strPtr = strArg.strs.get(); EXPECT_NE(nullptr, strPtr); size_t vecIndex = 0; for (picojson::value& str : arr[4].get()) { EXPECT_TRUE(str.is()); std::string _str = str.get(); EXPECT_EQ(_str, (*strPtr)[vecIndex++]); } } #else int main() { return 0; } #endif