diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu index bb7b97df5a6d353d5ea27fe6418af756c7bdb39a..28e6aa4a0105b35755f85f4de6743b053d1278bf 100644 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "DepthwiseConvOp.h" #include "GemmFunctor.h" #include "paddle/math/BaseMatrix.h" @@ -93,29 +94,32 @@ void ConvolutionDepthwiseInputBackward(const int nthreads, const int c_in = (index / inputHeight / inputWidth) % inputChannels; const int h_in = (index / inputWidth) % inputHeight; const int w_in = index % inputWidth; + const int c_out_start = c_in * filterMultiplier; + + int h_out_start = (h_in - filterHeight + paddingH + strideH)/strideH; + h_out_start = 0 > h_out_start ? 0 : h_out_start; + int h_out_end = (h_in + paddingH)/strideH; + h_out_end = outputHeight - 1 < h_out_end? outputHeight - 1 : h_out_end; + int w_out_start = (w_in - filterWidth + paddingW + strideW)/strideW; + w_out_start = 0 > w_out_start ? 0 : w_out_start; + int w_out_end = (w_in + paddingW)/strideW; + w_out_end = outputWidth - 1 < w_out_end? outputWidth - 1 : w_out_end; + T value = 0; + for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier; c_out ++) { - const T* weight = weight_data + c_out * filterHeight * filterWidth; - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_out_s = h_in + paddingH - kh; - const int w_out_s = w_in + paddingW - kw; - if (((h_out_s % strideH) == 0) && ((w_out_s % strideW) == 0)) { - const int h_out = h_out_s / strideH; - const int w_out = w_out_s / strideW; - // TODO(zhaolong) : the 'if' affect the effectiveness, - // it needs to optimize - if ((h_out >= 0) && (h_out < outputHeight) - && (w_out >= 0) && (w_out < outputWidth)) { - const int offset = ((batch * outputChannels + c_out) - * outputHeight + h_out) * outputWidth + w_out; - value += (*weight) * top_diff[offset]; - } - } - ++weight; - } + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + paddingH - h_out * strideH; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + paddingW - w_out * strideW; + const int filter_offset = c_out * filterHeight * filterWidth + + filter_h * filterWidth + filter_w; + const int top_diff_offset = ((batch * outputChannels + c_out) * + outputHeight + h_out)* outputWidth + w_out; + value += top_diff[top_diff_offset] * weight_data[filter_offset]; + } } } bottom_diff[index] += value;