diff --git a/paddle/function/Function.h b/paddle/function/Function.h index afbd4911b0a2b9bed3bf254f8bafcaf77867db19..b0c6ba0facc7f83796464e6324b6fb0c2eca868d 100644 --- a/paddle/function/Function.h +++ b/paddle/function/Function.h @@ -75,8 +75,17 @@ public: // Tensor can be Matrix, Vector, IVector. // For inputs, do not need argType. // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO. - template - void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) { + void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) { + _args_.push_back(new BufferArg(arg, argType)); + addArg(*_args_.back()); + } + + void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) { + _args_.push_back(new BufferArg(arg, argType)); + addArg(*_args_.back()); + } + + void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) { _args_.push_back(new BufferArg(arg, argType)); addArg(*_args_.back()); } diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h index 2847188fd67989c612e01e8188e4049709fe31e5..412e3a7d1b30a775ee329da2dcfe82462f156984 100644 --- a/paddle/function/FunctionTest.h +++ b/paddle/function/FunctionTest.h @@ -19,6 +19,8 @@ limitations under the License. */ namespace paddle { +typedef std::shared_ptr BufferArgPtr; + /** * \brief A class for comparing CPU and GPU implementations of Function. * @@ -45,143 +47,121 @@ namespace paddle { class FunctionCompare { public: FunctionCompare(const std::string& name, const FuncConfig& config) - : cpu(FunctionBase::funcRegistrar_.createByType(name + "-CPU")), - gpu(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) { - cpu->init(config); - gpu->init(config); + : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")), + gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) { + cpuFunc_->init(config); + gpuFunc_->init(config); + } + + ~FunctionCompare() {} + + // input need only contains shape, do not contains data. + void addInputs(const BufferArg& input) { + size_t size = + input.shape().getElements() * sizeOfValuType(input.valueType()); + cpuMemory_.emplace_back(std::make_shared(size)); + gpuMemory_.emplace_back(std::make_shared(size)); + + cpuInputs_.emplace_back(std::make_shared( + cpuMemory_.back()->getBuf(), input.valueType(), input.shape())); + gpuInputs_.emplace_back(std::make_shared( + gpuMemory_.back()->getBuf(), input.valueType(), input.shape())); + } + + // output need only contains shape, do not contains data. + void addOutputs(const BufferArg& output) { + size_t size = + output.shape().getElements() * sizeOfValuType(output.valueType()); + cpuMemory_.emplace_back(std::make_shared(size)); + gpuMemory_.emplace_back(std::make_shared(size)); + + cpuOutputs_.emplace_back( + std::make_shared(cpuMemory_.back()->getBuf(), + output.valueType(), + output.shape(), + ASSIGN_TO)); + gpuOutputs_.emplace_back( + std::make_shared(gpuMemory_.back()->getBuf(), + output.valueType(), + output.shape(), + ASSIGN_TO)); } - void addInputs(const BufferArg& input) { inputs.push_back(input); } + void addInputs(const SequenceArg& input) { + size_t batchSize = input.shape()[0]; + size_t numSeqs = batchSize / 10 + 1; + + size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32); + cpuMemory_.emplace_back(std::make_shared(sizeId)); + gpuMemory_.emplace_back(std::make_shared(sizeId)); - void addOutputs(const BufferArg& output) { outputs.push_back(output); } + TensorShape seqsId({numSeqs + 1}); + // void* cpuBuffer = cpuMemory_.back()->getBuf(); + // void* gpuBuffer = gpuMemory_.back()->getBuf(); + + size_t size = + input.shape().getElements() * sizeOfValuType(input.valueType()); + cpuMemory_.emplace_back(std::make_shared(size)); + gpuMemory_.emplace_back(std::make_shared(size)); + + // TODO: need be implemented. + } void run() { // prepare cpu/gpu arguments - prepareArgs(); + initInputs(); // function calculate - cpu->calc(cpuInputs, cpuOutputs); - gpu->calc(gpuInputs, gpuOutputs); - - // check outputs and inouts - auto checkArgs = [=](const BufferArgs& cpuArgs, const BufferArgs& gpuArgs) { - for (size_t i = 0; i < cpuArgs.size(); i++) { - auto cpu = cpuArgs[i]; - auto gpu = gpuArgs[i]; - CpuVector cpuVector(cpu.shape().getElements(), (real*)cpu.getData()); - GpuVector gpuVector(cpu.shape().getElements(), (real*)gpu.getData()); - - autotest::TensorCheckErr(cpuVector, gpuVector); + auto callFunction = [](FunctionBase* function, + std::vector& inputs, + std::vector& outputs) { + BufferArgs inArgs; + BufferArgs outArgs; + for (auto arg : inputs) { + inArgs.addArg(*arg); } - }; - checkArgs(cpuOutputs, gpuOutputs); - } -#if 0 - void cmpWithArg(const Arguments& inputs, - const Arguments& outputs, - const Arguments& inouts) { - // init cpu and gpu arguments - auto initArgs = [=]( - Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) { - for (const auto arg : inArgs) { - size_t size = sizeof(real); - for (const auto dim : arg.dims_) { - size *= dim; - } - if (arg.getData()) { - // todo(tianbing), waste unnecessary mem here - cpuMemory.emplace_back(std::make_shared(size)); - gpuMemory.emplace_back(std::make_shared(size)); - cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_)); - gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_)); - // already init outside - } else { - cpuMemory.emplace_back(std::make_shared(size)); - gpuMemory.emplace_back(std::make_shared(size)); - cpuArgs.emplace_back( - Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_)); - gpuArgs.emplace_back( - Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_)); - // will use an api to refactor this code. - CpuVector cpuVector(size / sizeof(real), - (real*)cpuArgs.back().getData()); - GpuVector gpuVector(size / sizeof(real), - (real*)gpuArgs.back().getData()); - cpuVector.uniform(0.001, 1); - gpuVector.copyFrom(cpuVector); - } + for (auto arg : outputs) { + outArgs.addArg(*arg); } + function->calc(inArgs, outArgs); }; - initArgs(cpuInputs, gpuInputs, inputs); - initArgs(cpuOutputs, gpuOutputs, outputs); - // function calculate - cpu->calc(cpuInputs, cpuOutputs); - gpu->calc(gpuInputs, gpuOutputs); + callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_); + callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_); // check outputs and inouts - auto checkArgs = [=](const Arguments& cpuArgs, const Arguments& gpuArgs) { - for (size_t i = 0; i < cpuArgs.size(); i++) { - auto cpu = cpuArgs[i]; - auto gpu = gpuArgs[i]; - size_t size = 1; - for (auto dim : cpu.dims_) { - size *= dim; - } - CpuVector cpuVector(size, (real*)cpu.getData()); - GpuVector gpuVector(size, (real*)gpu.getData()); - - autotest::TensorCheckErr(cpuVector, gpuVector); - } - }; - checkArgs(cpuOutputs, gpuOutputs); + compareOutputs(); } -#endif - std::shared_ptr getCpuFunction() const { return cpu; } + std::shared_ptr getCpuFunction() const { return cpuFunc_; } - std::shared_ptr getGpuFunction() const { return gpu; } + std::shared_ptr getGpuFunction() const { return gpuFunc_; } protected: - void prepareArgs() { - // TODO, if inputs has data - } + void initInputs() { + for (size_t i = 0; i < cpuInputs_.size(); i++) { + initArg(*cpuInputs_[i]); - void createArg(BufferArgs& cpuArgs, BufferArgs& gpuArgs, BufferArg& arg) { - size_t size = arg.shape().getElements() * sizeOfValuType(arg.valueType()); - cpuMemory_.emplace_back(std::make_shared(size)); - gpuMemory_.emplace_back(std::make_shared(size)); + // TODO: Need a BufferCopy used to copy from one BufferArg to another. + CpuVector cpuVector(cpuInputs_[i]->shape().getElements(), + (real*)cpuInputs_[i]->data()); + GpuVector gpuVector(gpuInputs_[i]->shape().getElements(), + (real*)gpuInputs_[i]->data()); - cpuArgs.emplace_back( - BufferArg(cpuMemory_.back()->getBuf()), arg.valueType(), arg.shape()); - gpuArgs.emplace_back( - BufferArg(gpuMemory_.back()->getBuf()), arg.valueType(), arg.shape()); + gpuVector.copyFrom(cpuVector); + } } - void createArg(BufferArgs& cpuArgs, BufferArgs& gpuArgs, SequenceArg& arg) { - size_t batchSize = arg.shape()[0]; - size_t numSeqs = batchSize / 10 + 1; + void compareOutputs() { + for (size_t i = 0; i < cpuOutputs_.size(); i++) { + // TODO, Need a BufferCheck used to compare the two buffers. + auto cpu = cpuOutputs_[i]; + auto gpu = gpuOutputs_[i]; + CpuVector cpuVector(cpu->shape().getElements(), (real*)cpu->data()); + GpuVector gpuVector(cpu->shape().getElements(), (real*)gpu->data()); - size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32); - cpuMemory_.emplace_back(std::make_shared(size)); - gpuMemory_.emplace_back(std::make_shared(size)); - - TensorShape seqsId({numSeqs + 1}); - void* cpuBuffer = cpuMemory_.back()->getBuf(); - void* gpuBuffer = gpuMemory_.back()->getBuf(); - - size_t size = arg.shape().getElements() * sizeOfValuType(arg.valueType()); - cpuMemory_.emplace_back(std::make_shared(size)); - gpuMemory_.emplace_back(std::make_shared(size)); - - cpuArgs.emplace_back(SequenceArg(cpuMemory_.back()->getBuf(), - arg.valueType(), - arg.shape(), - SequenceIdArg(cpuBuffer, seqsId))); - gpuArgs.emplace_back(SequenceArg(gpuMemory_.back()->getBuf(), - arg.valueType(), - arg.shape(), - SequenceIdArg(gpuBuffer, seqsId))); + autotest::TensorCheckErr(cpuVector, gpuVector); + } } // only init cpu argument, gpu argument copy from cpu argument. @@ -192,10 +172,10 @@ protected: void initArg(SequenceIdArg& arg, size_t batchSize) { size_t numSeqs = arg.numSeqs(); - int* buf = arg.data(); + int* buf = (int*)arg.data(); int pos = 0; size_t maxLen = 2 * batchSize / numSeqs; - for (int i = 0; i < numSeqs; ++i) { + for (int i = 0; i < (int)numSeqs; ++i) { int len = uniformRandom( std::min(maxLen, batchSize - pos - numSeqs + i)) + 1; @@ -207,17 +187,14 @@ protected: } protected: - std::shared_ptr cpu; - std::shared_ptr gpu; + std::shared_ptr cpuFunc_; + std::shared_ptr gpuFunc_; std::vector cpuMemory_; std::vector gpuMemory_; - // inputs and outputs - BufferArgs inputs; - BufferArgs outputs; - BufferArgs cpuInputs_; - BufferArgs cpuOutputs_; - BufferArgs gpuInputs_; - BufferArgs gpuOutputs_; + std::vector cpuInputs_; + std::vector cpuOutputs_; + std::vector gpuInputs_; + std::vector gpuOutputs_; }; } // namespace paddle