conv_kernel.inc.cl 19.0 KB
Newer Older
L
liuruilong 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

L
liuruilong 已提交
15 16 17 18 19 20 21 22 23 24
/*
conv
conv_bn
conv_add
conv_relu
conv_bn_relu
conv_add_relu
conv_add_bn_relu
*/

L
liuruilong 已提交
25
#include "cl_common.h"
L
liuruilong 已提交
26 27 28 29

__kernel void conv_3x3(__private const int global_size_dim0,
                                              __private const int global_size_dim1,
                                              __private const int global_size_dim2,
L
liuruilong 已提交
30
                                              __read_only image2d_t input_image,
L
liuruilong 已提交
31
                                              __read_only image2d_t filter,
L
liuruilong 已提交
32

L
liuruilong 已提交
33
#ifdef BIASE
L
liuruilong 已提交
34
                                              __read_only image2d_t bias,
L
liuruilong 已提交
35 36 37 38 39 40
#endif

#ifdef BATCH_NORM
                                              __read_only image2d_t new_scale,
                                              __read_only image2d_t new_biase,
#endif
L
liuruilong 已提交
41

L
liuruilong 已提交
42 43 44 45 46 47
                                              __write_only image2d_t output_image,
                                              __private const int stride,
                                              __private const int offset,
                                              __private const int input_c,
                                              __private const int dilation,
                                              __private const int input_width,/* of one block */
L
liuruilong 已提交
48 49 50
                                              __private const int input_height,/* of one block */
                                              __private const int output_width,
                                              __private const int output_height) {
L
liuruilong 已提交
51

L
liuruilong 已提交
52 53 54 55
    const int out_c = get_global_id(0);
    const int out_w = get_global_id(1);
    const int out_nh = get_global_id(2);

L
liuruilong 已提交
56 57 58 59 60 61 62 63
    if (out_c >= global_size_dim0 ||
        out_w >= global_size_dim1 ||
        out_nh >= global_size_dim2) {
        printf(" out of range ");
        return;
    }


L
liuruilong 已提交
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
    int2 stride_xy;
    stride_xy.x = stride;
    stride_xy.y = stride;

    int2 ouput_pos_in_one_block;
    ouput_pos_in_one_block.x = out_w;
    ouput_pos_in_one_block.y = out_nh;


    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                              CLK_ADDRESS_CLAMP          |
                              CLK_FILTER_NEAREST;

    int2 in_pos_in_one_block;
    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;

L
liuruilong 已提交
81
#ifdef BIASE
L
liuruilong 已提交
82
    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
L
liuruilong 已提交
83
#else
L
liuruilong 已提交
84
    half4 output = 0.0f;
L
liuruilong 已提交
85
#endif
L
liuruilong 已提交
86

L
liuruilong 已提交
87
   half4 input[9];
L
liuruilong 已提交
88

L
liuruilong 已提交
89 90 91 92 93
   for (int i = 0; i < input_c; ++i) {
        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
        input[0] = select(read_imageh(input_image, sampler,
                            (int2)(pos_in.x - dilation, pos_in.y - dilation)),
                            (half4)(0.0f),
L
liuruilong 已提交
94
                            (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
L
liuruilong 已提交
95

L
liuruilong 已提交
96 97 98
        input[1] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x, pos_in.y - dilation)),
                          (half4)(0.0f),
L
liuruilong 已提交
99
                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
L
liuruilong 已提交
100

L
liuruilong 已提交
101 102 103
        input[2] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x + dilation, pos_in.y - dilation)),
                          (half4)(0.0f),
L
liuruilong 已提交
104
                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
L
liuruilong 已提交
105

L
liuruilong 已提交
106 107 108
        input[3] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x - dilation, pos_in.y)),
                          (half4)(0.0f),
L
liuruilong 已提交
109
                          (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
L
liuruilong 已提交
110

L
liuruilong 已提交
111 112 113
        input[4] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x, pos_in.y)),
                          (half4)(0.0f),
L
liuruilong 已提交
114
                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
L
liuruilong 已提交
115

L
liuruilong 已提交
116 117 118
        input[5] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x + dilation, pos_in.y)),
                          (half4)(0.0f),
L
liuruilong 已提交
119
                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
L
liuruilong 已提交
120

L
liuruilong 已提交
121 122 123
        input[6] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x - dilation, pos_in.y + dilation)),
                          (half4)(0.0f),
L
liuruilong 已提交
124
                          (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
L
liuruilong 已提交
125

L
liuruilong 已提交
126 127 128
        input[7] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x, pos_in.y + dilation)),
                          (half4)(0.0f),
L
liuruilong 已提交
129
                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
L
liuruilong 已提交
130

L
liuruilong 已提交
131 132 133
        input[8] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
                          (half4)(0.0f),
L
liuruilong 已提交
134
                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
L
liuruilong 已提交
135 136

        for (int j = 0; j < 9; ++j) {
L
liuruilong 已提交
137 138
            int2 fuck;
            fuck.x = i * 3 + j % 3;
L
liuruilong 已提交
139
            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
L
liuruilong 已提交
140
            half4 weight_x = read_imageh(filter, sampler, fuck);
L
liuruilong 已提交
141 142
            output.x += dot(input[j], weight_x);

L
liuruilong 已提交
143
            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
L
liuruilong 已提交
144
            half4 weight_y = read_imageh(filter, sampler, fuck);
L
liuruilong 已提交
145 146
            output.y += dot(input[j], weight_y);

L
liuruilong 已提交
147
            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
L
liuruilong 已提交
148
            half4 weight_z = read_imageh(filter, sampler, fuck);
L
liuruilong 已提交
149 150
            output.z += dot(input[j], weight_z);

L
liuruilong 已提交
151
            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
L
liuruilong 已提交
152
            half4 weight_w = read_imageh(filter, sampler, fuck);
L
liuruilong 已提交
153 154 155 156
            output.w += dot(input[j], weight_w);
        }
    }

L
liuruilong 已提交
157
#ifdef BATCH_NORM
L
liuruilong 已提交
158
    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
L
liuruilong 已提交
159 160 161
#endif

#ifdef RELU
L
liuruilong 已提交
162 163 164
    output = activation(output);
#endif

L
liuruilong 已提交
165
    write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
L
liuruilong 已提交
166 167
}

L
liuruilong 已提交
168 169 170



L
liuruilong 已提交
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196
/*
 * Depthwise 3x3 convolution over half-precision image2d data: each lane of
 * the output texel is computed only from the same lane of the input texels.
 * Optional stages are selected at compile time: BIASE seeds the accumulator
 * from `bias`, BATCH_NORM applies `output * new_scale + new_biase`, and RELU
 * passes the result through activation() (declared outside this file).
 *
 * Work-item mapping:
 *   get_global_id(0) -> out_c   (channel block)
 *   get_global_id(1) -> out_w   (output column inside a block)
 *   get_global_id(2) -> out_nh  (batch index * output_height + output row)
 *
 * Indexing used below:
 *   - input: the block origin is (out_c * input_width,
 *     batch_index * input_height); the centre tap inside the block is
 *     (out_w * stride + offset, row * stride + offset) and neighbours are
 *     exactly one texel away.
 *   - filter: for tap (kx, ky) and lane l (0..3) only the .x component of
 *     the texel at (kx, out_c * 12 + l * 3 + ky) is used.
 *   - output_image: written at (out_c * global_size_dim1 + out_w, out_nh).
 *
 * Taps that fall outside [0, input_width) x [0, input_height) contribute
 * zero. global_size_dim0, global_size_dim2, input_c, dilation and
 * output_width are not read by this kernel.
 *
 * NOTE(review): unlike conv_3x3 there is no early return for ids beyond the
 * global_size_dim* values, and `dilation` is ignored (taps are always +/-1)
 * -- confirm both are intentional.
 */
__kernel void depth_conv_3x3(__private const int global_size_dim0,
                             __private const int global_size_dim1,
                             __private const int global_size_dim2,
                             __read_only image2d_t input,
                             __read_only image2d_t filter,
#ifdef BIASE
                             __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                             __read_only image2d_t new_scale,
                             __read_only image2d_t new_biase,
#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int offset,
                             __private const int input_c,
                             __private const int dilation,
                             __private const int input_width,  /* of one block */
                             __private const int input_height, /* of one block */
                             __private const int output_width,
                             __private const int output_height) {
    const int out_c = get_global_id(0);
    const int out_w = get_global_id(1);
    const int out_nh = get_global_id(2);

    const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);

    /* Integer coordinates are only defined with unnormalized-coordinate,
       nearest-filter samplers. */
    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                              CLK_ADDRESS_CLAMP           |
                              CLK_FILTER_NEAREST;

    /* Split the flattened row id into batch index and row inside the batch. */
    const int batch_index = out_nh / output_height;
    const int out_nh_in_one_batch = out_nh % output_height;

    /* Centre tap of the 3x3 window, relative to one input block. */
    const int center_x = out_w * stride + offset;
    const int center_y = out_nh_in_one_batch * stride + offset;

#ifdef BIASE
    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
    half4 output = 0.0f;
#endif

    /* Top-left texel of this work-item's input block. */
    const int block_x = out_c * input_width;
    const int block_y = batch_index * input_height;

    /* First filter row for this channel block: 4 lanes x 3 rows each. */
    const int weight_y_to = out_c * 12;

    /* Taps are visited row-major: j = ky * 3 + kx. */
    for (int j = 0; j < 9; ++j) {
        const int kx = j % 3;
        const int ky = j / 3;
        const int x = center_x + kx - 1;
        const int y = center_y + ky - 1;

        /* select() picks the zero vector when the MSB of the mask is set,
           i.e. when the tap lies in the padding region. */
        const int is_padding =
            x < 0 || y < 0 || x >= input_width || y >= input_height;
        const half4 tap = select(
            read_imageh(input, sampler, (int2)(block_x + x, block_y + y)),
            (half4)(0.0f),
            (ushort4)(is_padding << 15));

        const half4 weight0 = read_imageh(filter, sampler, (int2)(kx, weight_y_to + ky));
        const half4 weight1 = read_imageh(filter, sampler, (int2)(kx, weight_y_to + 3 + ky));
        const half4 weight2 = read_imageh(filter, sampler, (int2)(kx, weight_y_to + 6 + ky));
        const half4 weight3 = read_imageh(filter, sampler, (int2)(kx, weight_y_to + 9 + ky));

        output.x += tap.x * weight0.x;
        output.y += tap.y * weight1.x;
        output.z += tap.z * weight2.x;
        output.w += tap.w * weight3.x;
    }

#ifdef BATCH_NORM
    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
             read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
    output = activation(output);
#endif

    write_imageh(output_image, output_pos, output);
}

/*
 * 1x1 (pointwise) convolution over half-precision image2d data. Optional
 * stages are selected at compile time: BIASE seeds the accumulator from
 * `bias`, BATCH_NORM applies `output * new_scale + new_biase`, and RELU
 * passes the result through activation() (declared outside this file).
 *
 * Work-item mapping:
 *   get_global_id(0) -> out_c   (output channel block)
 *   get_global_id(1) -> out_w   (output column inside a block)
 *   get_global_id(2) -> out_nh  (output row)
 *
 * Indexing used below:
 *   - input_image: for block i the texel at
 *     (i * input_width + out_w * stride + offset, out_nh * stride + offset).
 *   - filter: for input block i and output lane l (0..3) the texel at
 *     (i, out_c * 4 + l) is dotted with the input texel and accumulated
 *     into lane l of the result.
 *   - output_image: written at (out_c * global_size_dim1 + out_w, out_nh).
 *
 * global_size_dim0, global_size_dim2, dilation, input_height, output_width
 * and output_height are not read by this kernel.
 *
 * NOTE(review): there is no early return for ids beyond the global_size_dim*
 * values and no zero-padding test on the input position -- confirm the host
 * only launches this kernel with in-range coordinates.
 */
__kernel void conv_1x1(__private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2,
                       __read_only image2d_t input_image,
                       __read_only image2d_t filter,
#ifdef BIASE
                       __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                       __read_only image2d_t new_scale,
                       __read_only image2d_t new_biase,
#endif
                       __write_only image2d_t output_image,
                       __private const int stride,
                       __private const int offset,
                       __private const int input_c,
                       __private const int dilation,
                       __private const int input_width,  /* of one block */
                       __private const int input_height, /* of one block */
                       __private const int output_width,
                       __private const int output_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  /* Integer coordinates are only defined with unnormalized-coordinate,
     nearest-filter samplers. */
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP           |
                            CLK_FILTER_NEAREST;

  /* Input position inside one block for this output texel. */
  const int in_x = out_w * stride + offset;
  const int in_y = out_nh * stride + offset;

#ifdef BIASE
  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
  half4 output = 0.0f;
#endif

  /* Four consecutive filter rows belong to this output block, one per lane. */
  const int filter_row = out_c * 4;

  for (int i = 0; i < input_c; ++i) {
    const half4 texel =
        read_imageh(input_image, sampler, (int2)(i * input_width + in_x, in_y));

    output.x += dot(texel, read_imageh(filter, sampler, (int2)(i, filter_row + 0)));
    output.y += dot(texel, read_imageh(filter, sampler, (int2)(i, filter_row + 1)));
    output.z += dot(texel, read_imageh(filter, sampler, (int2)(i, filter_row + 2)));
    output.w += dot(texel, read_imageh(filter, sampler, (int2)(i, filter_row + 3)));
  }

#ifdef BATCH_NORM
  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
           read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
  output = activation(output);
#endif

  const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
  write_imageh(output_image, output_pos, output);
}