Commit 4ebb3eb7 authored by hedaoyuan

improve Function

Parent ce1d98e0
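This commit routes the CrossMapNormal forward pass through the new Function abstraction: CMRProjectionNormLayer now looks up a registered function by name in init() and invokes it through calc() with Tensor arguments, while the backward pass still calls the device-templated CrossMapNormalGrad functor directly. FuncConfig::set() also changes to return FuncConfig&, so configuration calls chain. A minimal sketch of the calling convention introduced here, assuming example LRN hyperparameters (size 5, scale 1.5e-3, pow 0.75) and caller-provided buffers:

    // Look up the CPU implementation by its registered name, "CrossMapNormal-CPU".
    FunctionBase* func =
        FunctionBase::funcRegistrar_.createByType(FUNC_NAME(CrossMapNormal, CPU));
    func->init(FuncConfig()
                   .set("size", (size_t)5)
                   .set("scale", (real)1.5e-3)
                   .set("pow", (real)0.75));

    // One input, two outputs (values and per-element denominators), all NCHW;
    // batchSize/channels/height/width are assumed to be size_t here, and the
    // third argument (in/out tensors) is empty for this function.
    Dims dims{batchSize, channels, height, width};
    func->calc({Tensor(inputData, dims)},
               {Tensor(outputData, dims), Tensor(denomsData, dims)},
               {});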
@@ -14,6 +14,7 @@ limitations under the License. */

 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
+#include "paddle/math/cross_map_normal_op.h"
 #include "NormProjectionLayer.h"

 namespace paddle {
@@ -45,6 +46,16 @@ bool CMRProjectionNormLayer::init(const LayerMap& layerMap,
   /* the size of inputs for norm-layer is 1 */
   CHECK_EQ(config_.inputs_size(), 1);

+  if (useGpu_) {
+    normal_ = FunctionBase::funcRegistrar_.createByType(
+        FUNC_NAME(CrossMapNormal, GPU));
+  } else {
+    normal_ = FunctionBase::funcRegistrar_.createByType(
+        FUNC_NAME(CrossMapNormal, CPU));
+  }
+  normal_->init(
+      FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_));
+
   return true;
 }
@@ -62,10 +73,14 @@ void CMRProjectionNormLayer::forward(PassType passType) {
   Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);

-  denoms_->zeroMem();
-
-  outV->crossMapNormalFwd(
-      *input, imgSizeH_, imgSizeW_, *denoms_, channels_, size_, scale_, pow_);
+  Dims dims{(size_t)batchSize,
+            (size_t)channels_,
+            (size_t)imgSizeH_,
+            (size_t)imgSizeW_};
+  normal_->calc(
+      {Tensor(input->getData(), dims)},
+      {Tensor(outV->getData(), dims), Tensor(denoms_->getData(), dims)},
+      {});
 }

 void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
@@ -80,15 +95,32 @@ void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
   MatrixPtr localOutV = getOutputValue();
   MatrixPtr preOutV = inputLayers_[0]->getOutputValue();

-  preOutGrad->crossMapNormalBwd(*localGrad,
-                                *denoms_,
-                                *preOutV,
-                                *localOutV,
-                                channels_,
-                                imgSizeH_,
-                                imgSizeW_,
-                                size_,
-                                scale_,
-                                pow_);
+  if (useGpu_) {
+    CrossMapNormalGrad<DEVICE_TYPE_GPU> crossGrad;
+    crossGrad(dynamic_cast<GpuMatrix&>(*preOutGrad),
+              dynamic_cast<GpuMatrix&>(*preOutV),
+              dynamic_cast<GpuMatrix&>(*localGrad),
+              dynamic_cast<GpuMatrix&>(*localOutV),
+              dynamic_cast<GpuMatrix&>(*denoms_),
+              channels_,
+              imgSizeH_,
+              imgSizeW_,
+              size_,
+              scale_,
+              pow_);
+  } else {
+    CrossMapNormalGrad<DEVICE_TYPE_CPU> crossGrad;
+    crossGrad(dynamic_cast<CpuMatrix&>(*preOutGrad),
+              dynamic_cast<CpuMatrix&>(*preOutV),
+              dynamic_cast<CpuMatrix&>(*localGrad),
+              dynamic_cast<CpuMatrix&>(*localOutV),
+              dynamic_cast<CpuMatrix&>(*denoms_),
+              channels_,
+              imgSizeH_,
+              imgSizeW_,
+              size_,
+              scale_,
+              pow_);
+  }
 }

 } // namespace paddle

@@ -16,6 +16,7 @@ limitations under the License. */

 #include "NormLayer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/math/Function.h"
 #include <vector>

 namespace paddle {
@@ -39,5 +40,8 @@ public:
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
   void forward(PassType passType);
   void backward(const UpdateCallback& callback = nullptr);
+
+protected:
+  FunctionBase* normal_;
 };

 } // namespace paddle

@@ -31,15 +31,17 @@ real FuncConfig::get<real>(const std::string& key) const {
 }

 template <>
-void FuncConfig::set<size_t>(const std::string& key, size_t v) {
+FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
   CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
   valueMap_[key].s = v;
+  return *this;
 }

 template <>
-void FuncConfig::set<real>(const std::string& key, real v) {
+FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
   CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
   valueMap_[key].r = v;
+  return *this;
 }

 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
...
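Returning *this is what enables the fluent, single-expression configuration used at the call sites in this commit (FuncConfig().set(...).set(...) in NormProjectionLayer.cpp above). A small usage sketch, assuming the get<size_t>/get<real> specializations truncated at the top of this hunk mirror the setters:

    FuncConfig config =
        FuncConfig().set("size", (size_t)5).set("pow", (real)0.75);
    size_t size = config.get<size_t>("size");  // reads valueMap_[key].s
    real pow = config.get<real>("pow");        // reads valueMap_[key].r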
@@ -46,6 +46,8 @@ class Tensor {
 public:
   Tensor(real* data, const Dims& dim) : buf_(data), dims_(dim) {}

+  real* getData() const { return buf_; }
+
   real* buf_;
   Dims dims_;
 };
@@ -63,7 +65,7 @@ public:
   T get(const std::string& key) const;

   template <typename T>
-  void set(const std::string& key, T v);
+  FuncConfig& set(const std::string& key, T v);

 protected:
   std::map<std::string, value> valueMap_;
@@ -84,11 +86,11 @@ public:
 #define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName

 #define REGISTER_TYPED_FUNC(typeName, deviceName, className)   \
-  static InitFunction __reg_type_##typeName([]() {             \
+  static InitFunction __reg_type_##typeName##deviceName([]() { \
     FunctionBase::funcRegistrar_                               \
         .registerClass<className<DEVICE_TYPE_##deviceName>>(   \
             FUNC_NAME(typeName, deviceName));                  \
   })

 } // namespace paddle
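The extra ##deviceName token-pasting in REGISTER_TYPED_FUNC is what allows cross_map_normal_op.cpp (below) to register both a CPU and a GPU variant of the same typeName in one translation unit; previously the two expansions would have declared the same static object:

    // Before: both registrations expanded to the same identifier and collided.
    //   static InitFunction __reg_type_CrossMapNormal([]() { ... });  // CPU
    //   static InitFunction __reg_type_CrossMapNormal([]() { ... });  // GPU -> redefinition
    // After: the device name is pasted into the identifier.
    //   static InitFunction __reg_type_CrossMapNormalCPU([]() { ... });
    //   static InitFunction __reg_type_CrossMapNormalGPU([]() { ... });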
@@ -18,45 +18,41 @@ namespace paddle {

 // NCHW
 template <>
-void CrossMapNormal<DEVICE_TYPE_CPU>::operator()(CpuMatrix& outputs,
-                                                 CpuMatrix& denoms,
-                                                 CpuMatrix& inputs,
-                                                 size_t channels,
-                                                 size_t imgSizeH,
-                                                 size_t imgSizeW,
-                                                 size_t sizeX,
-                                                 real scale,
-                                                 real pow) {
-  CHECK(outputs.isContiguous());
-  CHECK(inputs.isContiguous());
-  CHECK(denoms.isContiguous());
-  CHECK_EQ(outputs.getHeight(), inputs.getHeight());
-  CHECK_EQ(outputs.getWidth(), inputs.getWidth());
-  CHECK_EQ(outputs.getHeight(), denoms.getHeight());
-  CHECK_EQ(outputs.getWidth(), denoms.getWidth());
-  size_t numSample = inputs.getHeight();
-  size_t numCols = inputs.getWidth();
-  size_t imageSize = imgSizeH * imgSizeW;
-  CHECK(imageSize * channels == numCols);
-
-  denoms = denoms.constant(1.0);
-  const int start = -((int)sizeX - 1) / 2;
-  const int end = (int)sizeX + start;
-  for (size_t i = 0; i < numSample; i++) {
-    real* denomsData = denoms.getData() + i * numCols;
-    real* inputData = inputs.getData() + i * numCols;
+void CrossMapNormal<DEVICE_TYPE_CPU>(real* outputs,
+                                     real* denoms,
+                                     real* inputs,
+                                     size_t numSamples,
+                                     size_t channels,
+                                     size_t height,
+                                     size_t width,
+                                     size_t size,
+                                     real scale,
+                                     real pow) {
+  size_t oneImage = height * width;
+  size_t oneSample = channels * oneImage;
+
+  CpuVector outputsV(numSamples * oneSample, outputs);
+  CpuVector inputsV(numSamples * oneSample, inputs);
+  CpuVector denomsV(numSamples * oneSample, denoms);
+
+  denomsV = denomsV.constant(1.0);
+  const int start = -((int)size - 1) / 2;
+  const int end = (int)size + start;
+  for (size_t i = 0; i < numSamples; i++) {
+    real* oneDenom = denoms + i * oneSample;
+    real* oneInput = inputs + i * oneSample;
     for (int c = 0; c < (int)channels; c++) {
-      CpuVector denom(imageSize, denomsData + c * imageSize);
+      CpuVector denom(oneImage, oneDenom + c * oneImage);
       for (int s = start; s < end; s++) {
         if (c + s >= 0 && c + s < (int)channels) {
-          CpuVector input(imageSize, inputData + (c + s) * imageSize);
+          CpuVector input(oneImage, oneInput + (c + s) * oneImage);
           denom += input.square() * scale;
         }
       }
     }
   }
-  outputs = inputs * denoms.pow(-pow);
+
+  outputsV = inputsV * denomsV.pow(-pow);
 }

 template <>
@@ -154,13 +150,17 @@ public:
     size_t channels = inputs[0].dims_[1];
     size_t height = inputs[0].dims_[2];
     size_t width = inputs[0].dims_[3];
-    size_t imageSize = channels * height * width;
-    CpuMatrix input(inputs[0].buf_, samples, imageSize);
-    CpuMatrix output(outputs[0].buf_, samples, imageSize);
-    CpuMatrix denom(outputs[1].buf_, samples, imageSize);
-
-    CrossMapNormal<Device> cross;
-    cross(output, denom, input, channels, height, width, size_, scale_, pow_);
+    CrossMapNormal<Device>(outputs[0].getData(),
+                           outputs[1].getData(),
+                           inputs[0].getData(),
+                           samples,
+                           channels,
+                           height,
+                           width,
+                           size_,
+                           scale_,
+                           pow_);
   }

 private:
@@ -170,5 +170,6 @@ private:
 };

 REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
+REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);

 } // namespace paddle
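For reference, the CPU loop above computes cross-map (across-channel) response normalization over an NCHW batch. In the code's notation, with start = -(size - 1) / 2 and end = size + start, and with out-of-range channels c + s simply skipped:

    denoms[n][c][h][w]  = 1 + scale * sum over s in [start, end) of inputs[n][c+s][h][w]^2
    outputs[n][c][h][w] = inputs[n][c][h][w] * denoms[n][c][h][w]^(-pow)

The GPU path in the .cu file below computes the same thing in two kernels: KeCMRNormFillScale fills the denominators, then KeCMRNormOutput applies the element-wise power.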
@@ -19,6 +19,18 @@ limitations under the License. */

 namespace paddle {

+template <DeviceType Device>
+void CrossMapNormal(real* outputs,
+                    real* denoms,
+                    real* inputs,
+                    size_t numSamples,
+                    size_t channels,
+                    size_t height,
+                    size_t width,
+                    size_t size,
+                    real scale,
+                    real pow);
+#if 0
 template <DeviceType Device>
 struct CrossMapNormal {
   void operator()(typename MatrixT<Device>::type& outputs,
@@ -31,6 +43,7 @@ struct CrossMapNormal {
                   real scale,
                   real pow);
 };
+#endif

 template <DeviceType Device>
 struct CrossMapNormalGrad {
...

@@ -61,45 +61,29 @@ __global__ void KeCMRNormOutput(size_t inputSize, const real* in,
 }

 template <>
-void CrossMapNormal<DEVICE_TYPE_GPU>::operator()(GpuMatrix& outputs,
-                                                 GpuMatrix& denoms,
-                                                 GpuMatrix& inputs,
-                                                 size_t channels,
-                                                 size_t imgSizeH,
-                                                 size_t imgSizeW,
-                                                 size_t sizeX,
-                                                 real scale,
-                                                 real pow) {
-  CHECK(outputs.isContiguous());
-  CHECK(inputs.isContiguous());
-  CHECK(denoms.isContiguous());
-  CHECK_EQ(outputs.getHeight(), inputs.getHeight());
-  CHECK_EQ(outputs.getWidth(), inputs.getWidth());
-  CHECK_EQ(outputs.getHeight(), denoms.getHeight());
-  CHECK_EQ(outputs.getWidth(), denoms.getWidth());
-  size_t numSample = inputs.getHeight();
-  size_t numCols = inputs.getWidth();
-  CHECK(imgSizeH * imgSizeW * channels == numCols);
-
-  real* inputsData = inputs.getData();
-  real* denomsData = denoms.getData();
-  real* outputsData = outputs.getData();
-
-  size_t imageSize = numSample * imgSizeH * imgSizeW;
+void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
+                                     real* denoms,
+                                     real* inputs,
+                                     size_t numSamples,
+                                     size_t channels,
+                                     size_t height,
+                                     size_t width,
+                                     size_t size,
+                                     real scale,
+                                     real pow) {
+  size_t imageSize = numSamples * height * width;
   int blockSize = 1024;
   int gridSize = (imageSize + 1024 - 1) / 1024;
   KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (imageSize, inputsData, denomsData,
-     channels, imgSizeH, imgSizeW, sizeX, scale);
+    (imageSize, inputs, denoms, channels, height, width, size, scale);

-  size_t inputSize = numSample * imgSizeH * imgSizeW * channels;
+  size_t inputSize = numSamples * height * width * channels;
   blockSize = 1024;
   gridSize = (inputSize + 1024 - 1) / 1024;
   KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (inputSize, inputsData, denomsData, -pow, outputsData);
-  CHECK_SYNC("CrossMapNormalFwd");
+    (inputSize, inputs, denoms, -pow, outputs);
+  CHECK_SYNC("CrossMapNormal");
 }

 __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
...
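The launch configuration above is the standard one-thread-per-element pattern: (n + 1024 - 1) / 1024 is the integer ceiling of n / 1024, so enough 1024-thread blocks are launched to cover every element, and each kernel is expected to bounds-check its global index. A generic sketch of the pattern (the kernel body is illustrative, not from this diff):

    __global__ void KeExample(size_t n, const real* in, real* out) {
      size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {  // guard: the last block may be only partially full
        out[idx] = in[idx];
      }
    }

    int blockSize = 1024;
    int gridSize = (n + blockSize - 1) / blockSize;  // ceil(n / blockSize)
    KeExample<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(n, in, out);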
@@ -1281,24 +1281,40 @@ void testCrossMapNormalFwd(
   inputsGpu.copyFrom(inputs);
   outputsGpu.copyFrom(outputs);

+#if 0
   FuncConfig config;
   config.set("size", (size_t)sizeX);
   config.set("scale", scale);
   config.set("pow", pow);
+#endif

   FunctionBase* cpu =
       FunctionBase::funcRegistrar_.createByType(FUNC_NAME(CrossMapNormal, CPU));
-  cpu->init(config);
+  FunctionBase* gpu =
+      FunctionBase::funcRegistrar_.createByType(FUNC_NAME(CrossMapNormal, GPU));
+  cpu->init(FuncConfig()
+                .set("size", (size_t)sizeX)
+                .set("scale", scale)
+                .set("pow", pow));
+  gpu->init(FuncConfig()
+                .set("size", (size_t)sizeX)
+                .set("scale", scale)
+                .set("pow", pow));

   Dims dims{
       (size_t)numSamples, (size_t)channels, (size_t)imgSizeH, (size_t)imgSizeW};
   cpu->calc({Tensor(inputs.getData(), dims)},
             {Tensor(outputs.getData(), dims), Tensor(denoms.getData(), dims)},
             {});
+  gpu->calc(
+      {Tensor(inputsGpu.getData(), dims)},
+      {Tensor(outputsGpu.getData(), dims), Tensor(denomsGpu.getData(), dims)},
+      {});

 #if 0
   CrossMapNormal<DEVICE_TYPE_CPU> cpuCross;
   cpuCross(
       outputs, denoms, inputs, channels, imgSizeH, imgSizeW, sizeX, scale, pow);
-#endif

   CrossMapNormal<DEVICE_TYPE_GPU> gpuCross;
   gpuCross(outputsGpu,
            denomsGpu,
@@ -1309,6 +1325,7 @@ void testCrossMapNormalFwd(
            sizeX,
            scale,
            pow);
+#endif

   TensorCheckErr(outputs, outputsGpu);
   TensorCheckErr(denoms, denomsGpu);
...
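Both device paths now produce same-shaped buffers through the same registered interface, so the parity check reduces to element-wise comparisons of outputs and denoms. A rough sketch of what such a check amounts to, assuming (not shown in this diff) that TensorCheckErr copies the GPU result back and compares within a small floating-point tolerance:

    // Sketch only; the real TensorCheckErr lives in the test utilities.
    void checkClose(const real* a, const real* b, size_t n, real eps = 1e-5) {
      for (size_t i = 0; i < n; ++i) {
        CHECK_LE(std::fabs(a[i] - b[i]), eps);  // fails loudly on divergence
      }
    }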