conv_kernel.inc.cl 19.5 KB
Newer Older
L
liuruilong 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

L
liuruilong 已提交
15 16 17 18 19 20 21 22 23 24
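/*
Operator variants served by the kernels in this file (presumably selected at
build time through the BIASE / BATCH_NORM / RELU macros -- confirm against the
host code):
  conv
  conv_bn
  conv_add
  conv_relu
  conv_bn_relu
  conv_add_relu
  conv_add_bn_relu
*/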

L
liuruilong 已提交
25
#include "cl_common.h"
L
liuruilong 已提交
26 27 28 29

/*
 * conv_3x3: 3x3 convolution. Each work-item produces one half4 output pixel.
 *
 * Work-item ids:
 *   dim 0 - output channel block (out_c)
 *   dim 1 - output column (out_w)
 *   dim 2 - output row across batches (out_nh = batch * output_height + row)
 *
 * Compile-time switches:
 *   BIASE      - accumulation starts from bias at (out_c, 0) instead of zero.
 *   BATCH_NORM - result becomes output * new_scale + new_biase, both read at
 *                (out_c, 0).
 *   RELU       - result is passed through activation() (from cl_common.h).
 *
 * Parameters:
 *   global_size_dim0/1/2 - logical extent of each NDRange dimension; work-items
 *                          at or beyond it return without writing.
 *   input_image          - read at x = block * input_width + column,
 *                          y = batch * input_height + row.
 *   filter               - read at x = block * 3 + kx,
 *                          y = out_c * 12 + lane * 3 + ky (lane = 0..3 selects
 *                          the .x/.y/.z/.w component of the output).
 *   output_image         - written at (out_c * global_size_dim1 + out_w, out_nh).
 *   stride, offset       - input position = output position * stride + offset,
 *                          applied to both axes.
 *   input_c              - number of input channel blocks to accumulate over.
 *   dilation             - distance between neighbouring filter taps.
 *   input_width/height   - size of one input block; taps outside it read as 0.
 *   output_width         - not used by this kernel.
 *   output_height        - rows per batch in the output; used to split out_nh.
 */
__kernel void conv_3x3(__private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2,
                       __read_only image2d_t input_image,
                       __read_only image2d_t filter,
#ifdef BIASE
                       __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                       __read_only image2d_t new_scale,
                       __read_only image2d_t new_biase,
#endif
                       __write_only image2d_t output_image,
                       __private const int stride,
                       __private const int offset,
                       __private const int input_c,
                       __private const int dilation,
                       __private const int input_width,  /* of one block */
                       __private const int input_height, /* of one block */
                       __private const int output_width,
                       __private const int output_height) {
    const int out_c = get_global_id(0);
    const int out_w = get_global_id(1);
    const int out_nh = get_global_id(2);

    /* Work-items outside the logical range have nothing to compute. */
    if (out_c >= global_size_dim0 ||
        out_w >= global_size_dim1 ||
        out_nh >= global_size_dim2) {
        return;
    }

    /* Image reads with integer coordinates are only defined for samplers
       that use unnormalized coordinates. */
    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                              CLK_ADDRESS_CLAMP           |
                              CLK_FILTER_NEAREST;

    /* Split the flattened row index so that the border test below is done
       relative to the current batch image (same scheme as depth_conv_3x3). */
    const int batch_index = out_nh / output_height;
    const int out_h = out_nh % output_height;
    const int batch_row = batch_index * input_height;

    const int in_x = out_w * stride + offset;
    const int in_y = out_h * stride + offset;

    /* The tap positions and their border masks do not depend on the input
       channel block, so they are computed once up front. */
    int tap_x[9];
    int tap_y[9];
    ushort4 tap_mask[9];
    for (int j = 0; j < 9; ++j) {
        const int x = in_x + (j % 3 - 1) * dilation;
        const int y = in_y + (j / 3 - 1) * dilation;
        const int outside = (x < 0 || y < 0 ||
                             x >= input_width || y >= input_height);
        tap_x[j] = x;
        tap_y[j] = batch_row + y;
        /* select() tests the most significant bit of each mask component. */
        tap_mask[j] = (ushort4)(outside << 15);
    }

#ifdef BIASE
    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
    half4 output = (half4)(0.0f);
#endif

    /* First filter row belonging to this output channel block:
       4 output lanes x 3 filter rows each. */
    const int filter_row = out_c * 4 * 3;

    for (int i = 0; i < input_c; ++i) {
        const int block_x = i * input_width;
        const int filter_col = i * 3;

        for (int j = 0; j < 9; ++j) {
            /* Taps that fall outside the current block are forced to zero. */
            const half4 pixel = select(
                read_imageh(input_image, sampler,
                            (int2)(block_x + tap_x[j], tap_y[j])),
                (half4)(0.0f),
                tap_mask[j]);

            const int weight_x = filter_col + j % 3;
            const int weight_y = filter_row + j / 3;

            output.x += dot(pixel, read_imageh(filter, sampler,
                                               (int2)(weight_x, weight_y + 0 * 3)));
            output.y += dot(pixel, read_imageh(filter, sampler,
                                               (int2)(weight_x, weight_y + 1 * 3)));
            output.z += dot(pixel, read_imageh(filter, sampler,
                                               (int2)(weight_x, weight_y + 2 * 3)));
            output.w += dot(pixel, read_imageh(filter, sampler,
                                               (int2)(weight_x, weight_y + 3 * 3)));
        }
    }

#ifdef BATCH_NORM
    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
             read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
    output = activation(output);
#endif

    write_imageh(output_image,
                 (int2)(out_c * global_size_dim1 + out_w, out_nh),
                 output);
}

L
liuruilong 已提交
168 169 170



L
liuruilong 已提交
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196
/*
 * depth_conv_3x3: 3x3 depthwise convolution. Each work-item produces one
 * half4 output pixel by multiplying nine input texels component-wise with
 * nine filter texels and summing the products.
 *
 * Work-item ids:
 *   dim 0 - channel block (out_c); the same block index is used for the
 *           input, the filter and the output.
 *   dim 1 - output column (out_w)
 *   dim 2 - output row across batches (out_nh = batch * output_height + row)
 *
 * Compile-time switches:
 *   BIASE      - accumulation starts from bias at (out_c, 0) instead of zero.
 *   BATCH_NORM - result becomes output * new_scale + new_biase, both read at
 *                (out_c, 0).
 *   RELU       - result is passed through activation() (from cl_common.h).
 *
 * Parameters:
 *   global_size_dim0/1/2 - logical extent of each NDRange dimension; work-items
 *                          at or beyond it return without writing.
 *   input                - read at x = out_c * input_width + column,
 *                          y = batch * input_height + row.
 *   filter               - read at x = out_c * 3 + kx, y = batch * 3 + ky.
 *   output_image         - written at (out_c * global_size_dim1 + out_w, out_nh).
 *   stride, offset       - input position = output position * stride + offset,
 *                          applied to both axes.
 *   input_width/height   - size of one input block; taps outside it read as 0.
 *   output_height        - rows per batch in the output; used to split out_nh.
 *   input_c, dilation, output_width - accepted but not used; neighbouring taps
 *                          are always exactly one texel apart.
 */
__kernel void depth_conv_3x3(__private const int global_size_dim0,
                             __private const int global_size_dim1,
                             __private const int global_size_dim2,
                             __read_only image2d_t input,
                             __read_only image2d_t filter,
#ifdef BIASE
                             __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                             __read_only image2d_t new_scale,
                             __read_only image2d_t new_biase,
#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int offset,
                             __private const int input_c,
                             __private const int dilation,
                             __private const int input_width,  /* of one block */
                             __private const int input_height, /* of one block */
                             __private const int output_width,
                             __private const int output_height) {
    const int out_c = get_global_id(0);
    const int out_w = get_global_id(1);
    const int out_nh = get_global_id(2);

    /* Work-items outside the logical range have nothing to compute
       (same guard as conv_3x3). */
    if (out_c >= global_size_dim0 ||
        out_w >= global_size_dim1 ||
        out_nh >= global_size_dim2) {
        return;
    }

    /* Image reads with integer coordinates are only defined for samplers
       that use unnormalized coordinates. */
    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                              CLK_ADDRESS_CLAMP           |
                              CLK_FILTER_NEAREST;

    const int batch_index = out_nh / output_height;
    const int out_h = out_nh % output_height;

    /* Centre tap position inside one input block. */
    const int in_x = out_w * stride + offset;
    const int in_y = out_h * stride + offset;

#ifdef BIASE
    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
    half4 output = (half4)(0.0f);
#endif

    const int filter_width = 3;
    const int filter_height = 3;

    /* Top-left corner of this work-item's block in the input image. */
    const int input_origin_x = out_c * input_width;
    const int input_origin_y = batch_index * input_height;

    /* Top-left corner of this work-item's 3x3 weights in the filter image.
       NOTE(review): the filter row depends on batch_index, which looks
       unintended for weights shared across a batch -- confirm against the
       host-side filter layout before changing it. */
    const int filter_origin_x = out_c * filter_width;
    const int filter_origin_y = batch_index * filter_height;

    for (int j = 0; j < 9; ++j) {
        const int kx = j % 3;
        const int ky = j / 3;
        const int x = in_x + kx - 1;
        const int y = in_y + ky - 1;
        const int outside = (x < 0 || y < 0 ||
                             x >= input_width || y >= input_height);

        /* Taps that fall outside the current block are forced to zero;
           select() tests the most significant bit of each mask component. */
        const half4 pixel = select(
            read_imageh(input, sampler,
                        (int2)(input_origin_x + x, input_origin_y + y)),
            (half4)(0.0f),
            (ushort4)(outside << 15));

        const half4 weight = read_imageh(
            filter, sampler, (int2)(filter_origin_x + kx, filter_origin_y + ky));

        output += pixel * weight;
    }

#ifdef BATCH_NORM
    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
             read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
    output = activation(output);
#endif

    write_imageh(output_image,
                 (int2)(out_c * global_size_dim1 + out_w, out_nh),
                 output);
}

/*
 * conv_1x1: 1x1 convolution. Each work-item produces one half4 output pixel.
 *
 * Work-item ids:
 *   dim 0 - output channel block (out_c)
 *   dim 1 - output column (out_w)
 *   dim 2 - output row (out_nh)
 *
 * Compile-time switches:
 *   BIASE      - accumulation starts from bias at (out_c, 0) instead of zero.
 *   BATCH_NORM - result becomes output * new_scale + new_biase, both read at
 *                (out_c, 0).
 *   RELU       - result is passed through activation() (from cl_common.h).
 *
 * Parameters:
 *   global_size_dim0/1/2 - logical extent of each NDRange dimension; work-items
 *                          at or beyond it return without writing.
 *   input_image          - read at x = block * input_width + column, y = row.
 *   filter               - read at x = block, y = out_c * 4 + lane (lane = 0..3
 *                          selects the .x/.y/.z/.w component of the output).
 *   output_image         - written at (out_c * global_size_dim1 + out_w, out_nh).
 *   stride, offset       - input position = output position * stride + offset,
 *                          applied to both axes.
 *   input_c              - number of input channel blocks to accumulate over.
 *   input_width          - width of one input block.
 *   dilation, input_height, output_width, output_height - accepted but not used.
 *
 * NOTE(review): unlike the 3x3 kernels, out_nh is not split into batch and row
 * and reads are not masked at block borders. That looks fine for stride 1 /
 * offset 0, but should be confirmed for other configurations.
 */
__kernel void conv_1x1(__private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2,
                       __read_only image2d_t input_image,
                       __read_only image2d_t filter,
#ifdef BIASE
                       __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                       __read_only image2d_t new_scale,
                       __read_only image2d_t new_biase,
#endif
                       __write_only image2d_t output_image,
                       __private const int stride,
                       __private const int offset,
                       __private const int input_c,
                       __private const int dilation,
                       __private const int input_width,  /* of one block */
                       __private const int input_height, /* of one block */
                       __private const int output_width,
                       __private const int output_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  /* Work-items outside the logical range have nothing to compute
     (same guard as conv_3x3). */
  if (out_c >= global_size_dim0 ||
      out_w >= global_size_dim1 ||
      out_nh >= global_size_dim2) {
    return;
  }

  /* Image reads with integer coordinates are only defined for samplers
     that use unnormalized coordinates. */
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP           |
                            CLK_FILTER_NEAREST;

  const int in_x = out_w * stride + offset;
  const int in_y = out_nh * stride + offset;

#ifdef BIASE
  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
  half4 output = (half4)(0.0f);
#endif

  /* First of the four filter rows (one per output lane) for this block. */
  const int filter_row = out_c * 4;

  for (int i = 0; i < input_c; ++i) {
    const half4 pixel = read_imageh(input_image, sampler,
                                    (int2)(i * input_width + in_x, in_y));

    output.x += dot(pixel, read_imageh(filter, sampler, (int2)(i, filter_row + 0)));
    output.y += dot(pixel, read_imageh(filter, sampler, (int2)(i, filter_row + 1)));
    output.z += dot(pixel, read_imageh(filter, sampler, (int2)(i, filter_row + 2)));
    output.w += dot(pixel, read_imageh(filter, sampler, (int2)(i, filter_row + 3)));
  }

#ifdef BATCH_NORM
  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
           read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
  output = activation(output);
#endif

  write_imageh(output_image,
               (int2)(out_c * global_size_dim1 + out_w, out_nh),
               output);
}