/**
 * \file example/cpp_example/device_io.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include <thread>
#include "../example.h"
#if LITE_BUILD_WITH_MGE

using namespace lite;
using namespace example;

#if LITE_WITH_CUDA

bool lite::example::device_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! configure the network to run on the CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "data";
    bool is_host = false;
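    //! is_host = false marks this IO as device-resident, so the caller is
    //! expected to provide a CUDA device pointer for it instead of host memory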
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);

    //! create and load the network
    std::shared_ptr<Network> network =
            std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! Now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);
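    //! note: reset() lets the input tensor borrow this memory instead of
    //! copying it, so tensor_device must stay alive until inference finishes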

    //! forward
    network->forward();
    network->wait();

    //! get the output tensor; by default it resides in host memory, so its
    //! data can be read directly
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
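    //! element count = total size in bytes / size of one element in bytes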
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

bool lite::example::device_input_output(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! configure the network to run on the CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set up NetworkIO with both a device input and a device output
    NetworkIO network_io;
    std::string input_name = "data";
    std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
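    //! this output node name belongs to the bundled example model; substitute
    //! the output name of your own model when reusing this code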
    bool is_host = false;
    IO device_input{input_name, is_host};
    IO device_output{output_name, is_host};
    network_io.inputs.push_back(device_input);
    network_io.outputs.push_back(device_output);

    //! create and load the network
    std::shared_ptr<Network> network =
            std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0);
    Layout input_layout = input_tensor_device->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! Now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! the output is in device memory, so copy it to host before reading
    std::shared_ptr<Tensor> output_tensor_device =
            network->get_io_tensor(output_name);

    auto output_tensor = std::make_shared<Tensor>();
    output_tensor->copy_from(*output_tensor_device);
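    //! a default-constructed Tensor lives in host memory, so this copy_from
    //! performs the device-to-host transfer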

    //! read the output data from the host copy
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

bool lite::example::pinned_host_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! configure the network to run on the CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);
    //! malloc the pinned host memory
    bool is_pinned_host = true;
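    //! pinned (page-locked) host memory can be copied to the GPU faster than
    //! ordinary pageable memory and allows asynchronous transfers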
    auto tensor_pinned_input =
            Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);
    //! copy to the pinned memory
    tensor_pinned_input.copy_from(*src_tensor);
    //! set the pinned host memory to the network as input
    input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output tensor; by default it resides in host memory, so its
    //! data can be read directly
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

#endif
#endif
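
//! A minimal usage sketch (an assumption, not part of the example suite
//! itself): these entry points are meant to be driven by the runner declared
//! in ../example.h, with Args carrying at least model_path and input_path:
//!
//!     lite::example::Args args;
//!     args.model_path = "./shufflenet.mge";  // hypothetical model file
//!     args.input_path = "./input_data.npy";  // hypothetical numpy input
//!     lite::example::device_input(args);
//!     lite::example::device_input_output(args);
//!     lite::example::pinned_host_input(args);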

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}