// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef BCLOUD
#include <base/atomicops.h>
#else
#include <butil/atomicops.h>
#endif

#include <sys/syscall.h>
#include <unistd.h>

#include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/memory.h"

// this file is included by bsf.h
namespace im {
namespace bsf {

template <typename InItemT, typename OutItemT>
bool Task<InItemT, OutItemT>::task_fetch_init(BatchTasks<TaskT>& batchTask) {
  // Double-checked locking, to keep the locked region as small as possible.
  if (!fetch_init) {
    if (total_taskmeta_num > 1) {
      // The task was split into several taskmetas, so a lock is required.
      AutoMutex lock(task_mut);
      task_fetch_create(batchTask);
    } else {
      // The task has only one taskmeta, so no lock is needed.
      task_fetch_create(batchTask);
    }
  }
  return true;
}

template <typename InItemT, typename OutItemT>
bool Task<InItemT, OutItemT>::task_fetch_create(BatchTasks<TaskT>& batchTask) {
  if (!fetch_init) {
    vector_fetch_lod_index = batchTask.vector_fetch_lod_index;
    set_fetch_nobatch_index = batchTask.set_fetch_nobatch_index;
    OutVectorT taskMetaOutLodTensor;
    size_t fetchvar_num = batchTask._batch_out.size();
    for (size_t fetchvar_index = 0; fetchvar_index < fetchvar_num;
         ++fetchvar_index) {
      size_t fetchvar_bytesize_index =
          batchTask.fetchvar_bytesize(fetchvar_index);
      size_t fetchvar_batch = 0;
      // 1. nobatch fetchvar: the batch dimension is fixed to 1.
      if (set_fetch_nobatch_index.size() > 0 &&
          set_fetch_nobatch_index.find(fetchvar_index) !=
              set_fetch_nobatch_index.end()) {
        fetchvar_batch = 1;
      } else if (vector_fetch_lod_index.size() > 0 &&
                 std::find(vector_fetch_lod_index.begin(),
                           vector_fetch_lod_index.end(),
                           fetchvar_index) != vector_fetch_lod_index.end()) {
        // 2. lod fetchvar: the total shape[0] cannot be determined yet.
        // Allocate task_num temporary buffers, one per taskmeta of this task;
        // each lod fetchvar is copied into its own temporary buffer.
        // At the end the total size of the buffers is computed and the
        // fetchvar data and lod are merged.
        fetchvar_batch = 0;
      } else {
        // 3. Ordinary fetchvar: the task's total fetchvar_batch equals the
        // total input batch_size().
        fetchvar_batch = batch_size();
      }
      paddle::PaddleTensor tensor_out;
      tensor_out.name = batchTask._batch_out[fetchvar_index].name;
      tensor_out.dtype =
          paddle::PaddleDType(batchTask._batch_out[fetchvar_index].dtype);
      tensor_out.shape = batchTask._batch_out[fetchvar_index].shape;
      tensor_out.shape[0] = fetchvar_batch;
      if (fetchvar_batch != 0) {
        // The lod is empty in this case.
        tensor_out.lod = batchTask._batch_out[fetchvar_index].lod;
        // resize all batch memory at one time
        size_t databuf_size = fetchvar_batch * fetchvar_bytesize_index;
        void* databuf_data =
            MempoolWrapper::instance().malloc(databuf_size, memoryPtr);
        paddle::PaddleBuf paddleBuf(databuf_data, databuf_size);
        tensor_out.data = paddleBuf;
        // tensor_out.data.Resize(databuf_size);
      } else {
        // When taskmeta_num == 1 only a single taskMeta operates on the task
        // at any time, so there is no thread-safety issue and the data can go
        // directly through taskMeta -> task -> resize -> copy.
        // When the task is split into several taskMetas, temporary objects
        // record the partial results, which are merged once all of them have
        // arrived.
        if (total_taskmeta_num > 1) {
          taskMetaOutLodTensor.push_back(tensor_out);
        }
      }
      outVectorT_ptr->push_back(tensor_out);
    }
    // outLodTensorVector is effectively a two-level vector with shape
    // taskmeta_num * vector_fetch_lod_index.size().
    outLodTensorVector.resize(total_taskmeta_num, taskMetaOutLodTensor);
    fetch_init = true;
  }
  return true;
}
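// The double-checked pattern above, shown in isolation (a minimal sketch with
// placeholder names, not part of this header's API): the flag is read once
// without the lock, and the lock is taken only on the slow path where several
// worker threads could race to create the fetch tensors.
//
//   if (!initialized) {            // cheap unlocked check
//     AutoMutex lock(mutex);       // contended only on first use
//     if (!initialized) {          // re-check under the lock
//       create_fetch_tensors();
//       initialized = true;
//     }
//   }
//
// task_fetch_init() skips the lock entirely when total_taskmeta_num == 1,
// since a single taskmeta means no concurrent callers; the flag itself is
// re-checked inside task_fetch_create().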
template <typename TaskT>
void* TaskExecutor<TaskT>::thread_entry(void* args) {
  ThreadContext<TaskT>* context = static_cast<ThreadContext<TaskT>*>(args);
  TaskExecutor<TaskT>* executor =
      static_cast<TaskExecutor<TaskT>*>(context->executor);
  executor->work(context);
  return nullptr;
}

template <typename TaskT>
int TaskExecutor<TaskT>::start(uint32_t thread_num,
                               uint32_t init_timeout_sec) {
  _stop = false;
  if (!_thread_contexts.empty()) {
    LOG(WARNING) << "BSF has started";
    return 0;
  }

  if (thread_num == 0) {
    LOG(ERROR) << "cannot init BSF with zero thread";
    return -1;
  }

  ThreadContext<TaskT>* contexts = new ThreadContext<TaskT>[thread_num];
  for (uint32_t i = 0; i < thread_num; ++i) {
    contexts[i].executor = this;
    if (_user_thread_contexts != NULL) {
      contexts[i].user_thread_context = _user_thread_contexts[i];
    }

    int rc = THREAD_CREATE(
        &contexts[i].tid, NULL, &TaskExecutor::thread_entry, &contexts[i]);
    if (rc != 0) {
      LOG(ERROR) << "failed to create BSF worker thread: index=" << i
                 << ", rc=" << rc << ", errno=" << errno << ":"
                 << strerror(errno);
      return -1;
    }

    _thread_contexts.push_back(&contexts[i]);
  }

  size_t init_timeout = init_timeout_sec * 1000 * 1000;
  bool has_error = false;

  bool has_timeout = true;
  if (init_timeout == 0) {
    has_timeout = false;
  }

  while (!has_timeout || init_timeout > 0) {
    bool done = true;
    for (size_t i = 0; i < _thread_contexts.size(); ++i) {
      if (_thread_contexts[i]->init_status < 0) {
        has_error = true;
        break;
      }

      if (_thread_contexts[i]->init_status == 0) {
        done = false;
      }
    }

    if (has_error) {
      LOG(ERROR) << "BSF thread init error";
      return -1;
    }

    if (done) {
      LOG(INFO) << "BSF thread init done";
      return 0;
    }

    // sleep 100ms between polls of the worker threads' init status
    const size_t sleep_interval = 100 * 1000;
    usleep(sleep_interval);
    init_timeout -= sleep_interval;
  }

  LOG(ERROR) << "BSF thread init timed out";
  return -1;
}

template <typename TaskT>
void TaskExecutor<TaskT>::stop() {
  _stop = true;
  for (size_t i = 0; i < _thread_contexts.size(); ++i) {
    THREAD_CANCEL(_thread_contexts[i]->tid);
  }

  for (size_t i = 0; i < _thread_contexts.size(); ++i) {
    THREAD_JOIN(_thread_contexts[i]->tid, NULL);
  }
  _thread_contexts.clear();
}

template <typename TaskT>
TaskHandler<TaskT> TaskExecutor<TaskT>::schedule(
    const void* inVectorT_ptr,
    void* outVectorT_ptr,
    MempoolRegion* memoryPtr) {  // NOLINT
  TaskT* task = butil::get_object<TaskT>();
  if (!task) {
    LOG(ERROR) << "Failed to get TaskT from object pool";
    return TaskHandler<TaskT>::valid_handle();
  }
  task->clear();

  /*
  if (!BatchTasks<TaskT>::check_valid(in, out, _overrun)) {
    LOG(ERROR) << "Invalid input & output";
    return TaskHandler<TaskT>::valid_handle();
  }
  */

  int fds[2];
  int rc = pipe(fds);
  if (rc != 0) {
    LOG(ERROR) << "call pipe() failed, errno=" << errno << ":"
               << strerror(errno);
    return TaskHandler<TaskT>::valid_handle();
  }

  task->read_fd = fds[0];
  task->write_fd = fds[1];
  task->owner_tid = ::syscall(SYS_gettid);
  task->memoryPtr = memoryPtr;
  // task->_bspec_key = _bspec_key;
  task->inVectorT_ptr = (const InVectorT*)inVectorT_ptr;
  task->outVectorT_ptr = (OutVectorT*)outVectorT_ptr;
  if (!task->task_init()) {
    LOG(ERROR) << "task->init() failed";
  }

  task->rem = task->batch_size();
  task->index.store(0, butil::memory_order_relaxed);
  AutoMutex lock(_mut);
  _task_queue.push_back(task);
  THREAD_COND_SIGNAL(&_cond);

  return TaskHandler<TaskT>(*task);
}
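// How the caller gets unblocked (a sketch of the handshake set up above; the
// worker-side write lives in bsf.h and is assumed here, not shown in this
// file): schedule() gives every task a private pipe. The caller keeps the
// read end through the returned TaskHandler and blocks on it in
// TaskManager::wait(); once the worker threads have filled in all of the
// task's outputs, notify_tasks() is expected to write to the write end,
// which wakes the reader.
//
//   caller thread                        worker thread(s)
//   -------------                        ----------------
//   schedule(in, out)   --pipe(fds)-->   move_task_to_batch()
//   read(read_fd, ...)  blocks           merge_tasks(); _fn(&in, &out)
//                                        notify_tasks()  -> write(write_fd)
//   read() returns; results are in out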
// This function is accessed by multiple threads, so it takes AutoMutex first.
// That is what makes batchTask.append_task() thread safe: no extra lock is
// needed inside append_task().
// The task is already initialized at this point.
template <typename TaskT>
bool TaskExecutor<TaskT>::move_task_to_batch(
    BatchTasks<TaskT>& batchTask) {  // NOLINT
  AutoMutex lock(_mut);
  while (_task_queue.empty()) {
    THREAD_COND_WAIT(&_cond, &_mut);
  }

  if (_task_queue.empty()) {
    LOG(ERROR) << "invalid task queue!";
    return false;
  }

  TaskT* previous_task = nullptr;
  int padding_task_count = 0;
  while (!_task_queue.empty()) {
    TaskT* task = _task_queue.front();

    // We cannot know in advance whether a fetchvar is lod (even if the input
    // is non-lod, the output may still be lod).
    // The simple approach is to forbid splitting a task: user requests may be
    // merged and predicted together, but a single request is never split into
    // two smaller parts. This only requires setting the engine attribute
    // allow_split_request = false.
    // The complex approach is to allow splitting a Task whether or not it
    // contains lod.
    // The difficulty: before prediction we know how many taskmetas the task
    // was split into, but only after prediction do we know how many fetchvars
    // there are and how many of them are lod.
    // Therefore the task must first create taskmeta_num * fetchvar_num
    // (lod-typed) temporary PaddleTensors (holding the data and lod).
    // Since the scheduling unit of the worker threads is the taskmeta, these
    // tensors can only be created in notify_task via taskmeta->task.
    // At that point several taskmetas map to one task, so there is
    // multi-thread contention and the task must hold a lock.
    // An atomic operation alone is not sufficient, because every thread must
    // wait until the PaddleTensors above have been created before it can
    // continue.
    // Ordinary fetchvars also need the lock to create their PaddleTensor
    // before data can be copied into it.

    // _overrun controls whether the asynchronous BatchTasks may temporarily
    // exceed its limit for a single batch.
    // When _overrun is true, even if only 1 batch slot remains in BatchTasks,
    // a complete Task is still put in, temporarily exceeding the limit.
    // When _overrun is false, this is not allowed.
    // If the model itself has a maximum batch limit, set it to false
    // (false is the default).
    // If the model has no maximum batch limit but you set a maximum batch for
    // BatchTasks yourself, consider setting it to true.

    // _allow_split_request == true allows a task to be split: if BatchTasks
    // has 1 batch slot left, 1 batch is split off from the next Task.
    // _allow_split_request == false means tasks are never split, and a
    // remaining 1-batch slot of BatchTasks is wasted.
    // The default is true, allowing tasks to be split so that capacity
    // utilization is maximized.
    if (!batchTask.get_allow_split_request()) {
      if (task->batch_size() > batchTask.get_rem_size() &&
          !batchTask.get_overrun()) {
        break;
      }
    }

    // combine_task_valid decides whether the tasks can be merged.
    // Apart from the outermost dimension, the inner shapes must be identical
    // (or padding must be allowed) for a merge; otherwise we break out of the
    // loop and the task goes into the next batchTask.
    // This guarantees that every task passed to batch.append_task(task)
    // shares the same inner shape.
    // For a feedvar whose shape[0] == 1 while != batch, the merge keeps the
    // value of only one task, so that feedvar must be identical across tasks
    // for them to be merged; otherwise we break out of the loop and the task
    // goes into the next batchTask.
    // PaddleTensor and PaddleBuf currently do not overload operator==, so the
    // only option is to compare memory.
    if (previous_task != nullptr) {
      if (task->combine_task_valid(previous_task) == 0) {
        break;
      }
    }
    if (batchTask.padding(task) != 2) {
      break;
    }
    ++padding_task_count;

    size_t rem = batchTask.append_task(task);
    previous_task = task;
    if (task->rem <= 0) {
      _task_queue.pop_front();
    }
    if (rem <= 0) break;
  }

  if (padding_task_count > 1) {
    LOG(INFO) << "Hit auto padding, merge " << padding_task_count
              << " tasks into 1 batch.";
  }
  LOG(INFO) << "Number of tasks remaining in _task_queue is "
            << _task_queue.size();
  return true;
}
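// Summary of the admission checks above for the task at the queue head
// (illustrative only, restating the branches of move_task_to_batch()):
//
//   condition                                        result
//   -----------------------------------------------  ------------------------------------
//   !allow_split_request && batch_size > rem
//                        && !overrun                  stop; task goes to the next batch
//   combine_task_valid(previous_task) == 0            stop; inner shapes not mergeable
//   batchTask.padding(task) != 2                      stop; padding not possible
//   otherwise                                         append_task(); pop when fully consumed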
// This function is accessed by multiple threads.
// move_task_to_batch() takes the lock inside the function.
// It packages 1 TaskT as 1 or several TaskMetas.
// The TaskT comes from the singleton TaskExecutor's _task_queue.
// Although a TaskMeta is a local variable, several TaskMetas may point to the
// same TaskT taken from the singleton TaskExecutor's _task_queue.
// The TaskMetas are put into the local variable BatchTasks batchTask.
// batchTask.merge_tasks() and batchTask.notify_tasks() hold no lock.
// BatchTasks batchTask itself is a local variable, so it is thread safe.
// If batchTask.merge_tasks() or batchTask.notify_tasks() does something to a
// TaskMeta, you need to pay attention to that.
// Each thread deals with different TaskMetas (because they are created as
// local variables), but different TaskMetas may point to the same TaskT taken
// from the singleton TaskExecutor's _task_queue.
template <typename TaskT>
int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) {
  if (MempoolWrapper::instance().thread_initialize() != 0) {
    LOG(ERROR) << "Failed thread initialize mempool";
    return -1;
  }

  if (_thread_init_fn != NULL) {
    if (_thread_init_fn(context->user_thread_context) != 0) {
      LOG(ERROR) << "execute thread init thunk failed, BSF thread will exit";
      context->init_status = -1;
      return -1;
    } else {
      LOG(INFO) << "execute thread init thunk succeed";
    }
  }

  context->init_status = 1;
  while (!_stop) {
    if (_thread_reset_fn != NULL) {
      if (_thread_reset_fn(context->user_thread_context) != 0) {
        LOG(ERROR) << "execute user thread reset failed";
      }
    }

    if (MempoolWrapper::instance().thread_clear() != 0) {
      LOG(ERROR) << "Failed thread clear mempool";
      return -1;
    }

    // move_task_to_batch() takes the original tasks from `_task_queue` and
    // puts them into the batch's own vector, whose capacity is decided by
    // `_batch_size` and `_overrun`.
    // merge_tasks() moves the input data from that vector into `_batch_in`,
    // because the predictor's input is `_batch_in`.
    // notify_tasks() moves the output data from `_batch_out` into every
    // single taskmeta, because the predictor's output is `_batch_out`.
    BatchTasks<TaskT> batchTask(_batch_size, _overrun, _allow_split_request);
    if (move_task_to_batch(batchTask)) {
      batchTask.merge_tasks();
      _fn(&batchTask.in(), &batchTask.out());
      batchTask.notify_tasks();
    }
  }

  return 0;
}

template <typename InItemT, typename OutItemT>
bool TaskManager<InItemT, OutItemT>::schedule(const void* in,
                                              void* out,
                                              MempoolRegion* memoryPtr) {  // NOLINT
  TaskHandler<TaskT> handler =
      TaskExecutorVector<TaskT>::instance()[_model_index].schedule(in, out,
                                                                   memoryPtr);

  if (handler.valid()) {
    _task_owned = handler;
    return true;
  } else {
    LOG(ERROR) << "failed to schedule task";
    return false;
  }
}

template <typename InItemT, typename OutItemT>
void TaskManager<InItemT, OutItemT>::wait() {
  char buffer[128];
  // Retry the read only when it is interrupted by a signal; a successful read
  // means the worker side has signalled completion through the pipe.
  while (read(_task_owned.read_fd, buffer, sizeof(buffer)) < 0 &&
         errno == EINTR) {
  }

  close(_task_owned.read_fd);
  close(_task_owned.write_fd);

  _task_owned.read_fd = -1;
  _task_owned.write_fd = -1;
  return;
}
}  // namespace bsf
}  // namespace im
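// Typical caller-side flow (a hedged sketch; the constructor argument and the
// item types below are placeholders and are not guaranteed by this header):
//
//   im::bsf::TaskManager<InItemT, OutItemT> manager(model_index);
//   if (manager.schedule(&feed_vector, &fetch_vector, memory_region)) {
//     manager.wait();  // blocks on the task's pipe until the batch finishes
//     // fetch_vector now holds the outputs copied back by notify_tasks()
//   }
//
// schedule() only enqueues the request on the singleton TaskExecutor; the
// actual prediction runs on one of the worker threads started by
// TaskExecutor::start(), which batch requests together via
// move_task_to_batch() and merge_tasks().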