Commit a7d0ac50 authored by: S sneaxiy

Merge develop

......@@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost")
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
......
......@@ -44,7 +44,7 @@ ExternalProject_Add(
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party
# 4. remove .git, and package the directory.
URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
......
......@@ -34,7 +34,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
SET(TIME_VERSION "2019.0.1.20181227")
IF(WIN32)
SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
......@@ -43,7 +43,7 @@ ELSE()
#TODO(intel-huying):
# Now enable Erf function in mklml library temporarily; it will be updated in the official version later.
SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
......
......@@ -10,6 +10,9 @@ paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=No
paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5430f54ab4895f9f47db6bebbaf71659'))
paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6'))
paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2'))
paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3'))
paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210'))
paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912'))
paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d'))
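The three place helpers added above (cuda_places, cpu_places, cuda_pinned_places) are new in this change. A minimal usage sketch, assuming a CUDA-enabled fluid build; the device ids and counts below are illustrative only:

    import paddle.fluid as fluid

    gpu_places = fluid.cuda_places()       # all visible GPUs (device_ids=None)
    first_gpu = fluid.cuda_places([0])     # only GPU 0
    cpu_places = fluid.cpu_places(4)       # 4 CPU places for data-parallel CPU runs
    pinned = fluid.cuda_pinned_places(2)   # 2 CUDA-pinned host places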
......@@ -44,7 +47,7 @@ paddle.fluid.AsyncExecutor.run (ArgSpec(args=['self', 'program', 'data_feed', 'f
paddle.fluid.AsyncExecutor.save_model (ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None), ('document', 'c8ac0dfcb3b187aba25d03af7fea56b2'))
paddle.fluid.AsyncExecutor.stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5f23d043607bb5d55e466ec3f578e093'))
paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'e1af7fd53cf868554f312779fc803864'))
paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a8c7793803cf976680d9478e378fa356'))
paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8'))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
......@@ -58,6 +61,12 @@ paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program'
paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2'))
paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '70f4f53f13572436ac72d1c8b5efeb9d'))
paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb'))
paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable'], varargs=None, keywords=None, defaults=(True, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.io.PyReader.decorate_batch_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a3fefec8bacd6ce83f49906a9d05e779'))
paddle.fluid.io.PyReader.decorate_sample_generator (ArgSpec(args=['self', 'sample_generator', 'batch_size', 'drop_last', 'places'], varargs=None, keywords=None, defaults=(True, None)), ('document', '7abd9cf7d695bab5bb6cf7ded5903cb2'))
paddle.fluid.io.PyReader.decorate_sample_list_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'faef298f73e91aedcfaf5d184f3109b7'))
paddle.fluid.io.PyReader.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ff1cc1e2beb8824d453656c72c28ddfb'))
paddle.fluid.io.PyReader.start (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'b7ea0a548991924e4cfe61a577b8e56d'))
paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
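The paddle.fluid.io.PyReader entries above are also new in this commit. A hedged sketch of the iterable mode, assuming a user-defined sample_list_reader that yields lists of numpy samples; the reader, layer names, and batch size are placeholders, not from this diff:

    import numpy as np
    import paddle.fluid as fluid

    image = fluid.layers.data(name='image', shape=[784], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    reader = fluid.io.PyReader(feed_list=[image, label], capacity=64,
                               use_double_buffer=True, iterable=True)

    def sample_list_reader():
        # Hypothetical generator: each yield is one mini-batch, i.e. a list of
        # (image, label) samples.
        for _ in range(10):
            yield [(np.random.random([784]).astype('float32'),
                    np.random.randint(0, 10, size=[1]).astype('int64'))
                   for _ in range(32)]

    reader.decorate_sample_list_generator(sample_list_reader,
                                          places=fluid.cpu_places(1))

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    for data in reader():      # iterable=True: loop over the reader directly
        exe.run(feed=data[0])  # one feed dict per place; a single CPU place here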
......@@ -222,6 +231,7 @@ paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label'
paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7'))
paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e'))
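fluid.layers.fsp_matrix above is the new Python entry point for the FSP op added later in this commit. A minimal sketch; the variable names and shapes are illustrative:

    import paddle.fluid as fluid

    # Two 4-D feature maps sharing the same spatial size [N, C, H, W].
    feat_a = fluid.layers.data(name='feat_a', shape=[16, 28, 28], dtype='float32')
    feat_b = fluid.layers.data(name='feat_b', shape=[32, 28, 28], dtype='float32')

    # Output shape: [N, 16, 32] -- one FSP matrix per sample.
    fsp = fluid.layers.fsp_matrix(feat_a, feat_b)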
......@@ -229,7 +239,7 @@ paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=Non
paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'f563d376d35e1a4c4db100fd11b381a0'))
paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07e5b796674796eb1ef3fee9c10d24e3'))
paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '9b7f0f86ec24bbc97643cadcb6499cff'))
paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '13dabc57863f62ab3141586784ee356b'))
paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '4357643685cfd65454ba5a15f0151709'))
paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '350f74d93fab9adb2ac4950f1c26416b'))
paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
......@@ -510,6 +520,7 @@ paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, ke
paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310'))
paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7'))
paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c'))
paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))
paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d'))
......
......@@ -27,7 +27,7 @@ if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then
fi
PREFIX=inference-vis-demos%2F
URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX}
URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
# download vis_demo data
function download() {
......
......@@ -115,14 +115,14 @@ inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_test
# ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR})
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
endif()
inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
# mobilenet with transpose op
set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
endif()
inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
......
include(ExternalProject)
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
......
......@@ -61,4 +61,6 @@ nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocat
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_int64(gpu_allocator_retry_time);
#endif
namespace paddle {
namespace memory {
namespace allocation {
//! Run allocate test cases for different places
void AllocateTestCases() {
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), size);
}
#endif
}
TEST(Allocator, SpecifyGpuMemory) {
#ifdef PADDLE_WITH_CUDA
// Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and
// FLAGS_reallocate_gpu_memory_in_mb
FLAGS_fraction_of_gpu_memory_to_use = 0.0;
// 512 MB
FLAGS_initial_gpu_memory_in_mb = 512;
// 4 MB
FLAGS_reallocate_gpu_memory_in_mb = 4;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
AllocateTestCases();
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -19,6 +19,8 @@
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_int64(gpu_allocator_retry_time);
#endif
......@@ -26,13 +28,8 @@ namespace paddle {
namespace memory {
namespace allocation {
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
//! Run allocate test cases for different places
void AllocateTestCases() {
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
......@@ -82,6 +79,16 @@ TEST(allocator, allocator) {
#endif
}
TEST(Allocator, Allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
AllocateTestCases();
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -37,6 +37,8 @@ DEFINE_bool(init_allocated_mem, false,
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(benchmark);
namespace paddle {
......@@ -153,12 +155,18 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for GPU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
}
});
......
......@@ -9,3 +9,5 @@ endif(${WITH_GPU})
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
cc_test(buddy_allocator_test SRCS buddy_allocator_test.cc DEPS buddy_allocator)
......@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include <algorithm>
#include <utility>
#include "glog/logging.h"
DEFINE_bool(free_idle_memory, false,
......@@ -36,9 +40,10 @@ BuddyAllocator::~BuddyAllocator() {
"have actually been freed";
while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
VLOG(10) << "Free from block (" << block << ", " << block->size(cache_)
<< ")";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, block->size(cache_), block->index(cache_));
cache_.invalidate(block);
pool_.erase(pool_.begin());
}
......@@ -71,7 +76,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// refill the pool if failure
if (it == pool_.end()) {
it = RefillPool();
it = RefillPool(size);
// if still failure, fail fatally
if (it == pool_.end()) {
return nullptr;
......@@ -184,19 +189,28 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
return static_cast<MemoryBlock*>(p)->data();
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
size_t request_bytes) {
size_t allocate_bytes = max_chunk_size_;
size_t index = 0;
#ifdef PADDLE_WITH_CUDA
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation.
max_chunk_size_ = platform::GpuMaxChunkSize();
// Compute the GPU allocation size for the first allocation.
allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
} else {
// Reallocation size
if (realloc_size_ == 0) {
realloc_size_ = platform::GpuReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
#endif
// Allocate a new maximum sized block
size_t index = 0;
void* p = system_allocator_->Alloc(&index, max_chunk_size_);
// Allocate a new block
void* p = system_allocator_->Alloc(&index, allocate_bytes);
if (p == nullptr) return pool_.end();
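A minimal sketch (illustration only, not part of the diff) of the chunk-size rule RefillPool now follows for GPU allocations; the helper name and the byte values below are hypothetical:

    def refill_chunk_bytes(first_refill, request_bytes, init_bytes, realloc_bytes):
        # First GPU chunk grows to at least GpuInitAllocSize(); every later chunk
        # grows to at least GpuReallocSize(); an oversized request always wins.
        if first_refill:
            return max(init_bytes, request_bytes)
        return max(realloc_bytes, request_bytes)

    MB = 1 << 20
    assert refill_chunk_bytes(True, 10 * MB, 100 * MB, 50 * MB) == 100 * MB
    assert refill_chunk_bytes(False, 80 * MB, 100 * MB, 50 * MB) == 80 * MB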
......@@ -204,7 +218,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
<< " from system allocator";
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr);
allocate_bytes, nullptr, nullptr);
// gpu fallback allocation
if (system_allocator_->UseGpu() &&
......@@ -212,10 +226,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
fallback_alloc_count_++;
}
total_free_ += max_chunk_size_;
total_free_ += allocate_bytes;
// dump the block into pool
return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
......@@ -286,12 +300,12 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
VLOG(10) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, block->size(cache_), block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_;
total_free_ -= block->size(cache_);
fallback_alloc_count_--;
// If no fallback allocation exists, return directly
......@@ -322,12 +336,12 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
VLOG(10) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, block->size(cache_), block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_;
total_free_ -= block->size(cache_);
if (!shall_free_alloc()) return;
}
......
......@@ -60,7 +60,7 @@ class BuddyAllocator {
void* SystemAlloc(size_t size);
/*! \brief If existing chunks are not suitable, refill pool */
PoolSet::iterator RefillPool();
PoolSet::iterator RefillPool(size_t request_bytes);
/**
* \brief Find the suitable chunk from existing pool and split
......@@ -89,6 +89,8 @@ class BuddyAllocator {
size_t min_chunk_size_; // the minimum size of each chunk
size_t max_chunk_size_; // the maximum size of each chunk
size_t realloc_size_ = 0; // the size of re-allocated chunk
private:
/**
* \brief A list of free allocation
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include <memory>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
namespace paddle {
namespace memory {
namespace detail {
constexpr static int test_gpu_id = 0;
void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
bool freed = false;
size_t used_bytes = allocator->Used();
if (size_bytes > 0) {
void* p = allocator->Alloc(size_bytes);
EXPECT_NE(p, nullptr);
#ifdef PADDLE_WITH_CUDA
if (size_bytes < platform::GpuMaxChunkSize()) {
#else
if (size_bytes < platform::CpuMaxChunkSize()) {
#endif
// Not allocated from SystemAllocator
EXPECT_GE(allocator->Used(), used_bytes + size_bytes);
} else {
// Allocations from SystemAllocator don't count in Used()
EXPECT_EQ(allocator->Used(), used_bytes);
}
int* intp = static_cast<int*>(p);
std::shared_ptr<int> ptr(intp, [&](void* p) {
allocator->Free(intp);
freed = true;
});
} else {
freed = true;
}
EXPECT_EQ(used_bytes, allocator->Used());
EXPECT_TRUE(freed);
}
#ifdef PADDLE_WITH_CUDA
TEST(BuddyAllocator, GpuFraction) {
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
}
TEST(BuddyAllocator, InitRealloc) {
FLAGS_initial_gpu_memory_in_mb = 100;
FLAGS_reallocate_gpu_memory_in_mb = 50;
EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
// Less than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 10 << 20);
// Between initial size and reallocate size, does not exceed pool
TestBuddyAllocator(&buddy_allocator, 80 << 20);
// Less than reallocate size and exceeds pool
TestBuddyAllocator(&buddy_allocator, 40 << 20);
// Greater than reallocate size and exceeds pool
TestBuddyAllocator(&buddy_allocator, 80 << 20);
// Greater than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
}
TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
FLAGS_initial_gpu_memory_in_mb = 5;
FLAGS_reallocate_gpu_memory_in_mb = 10;
EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
// Less than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 1 << 20);
// Between initial size and reallocate size, does not exceed pool
TestBuddyAllocator(&buddy_allocator, 3 << 20);
// Less than initial size and exceeds pool
TestBuddyAllocator(&buddy_allocator, 3 << 20);
// Less than reallocate size and does not exceed pool (now pool is 15 MB,
// used 7 MB)
TestBuddyAllocator(&buddy_allocator, 7 << 20);
// Less than reallocate size and exceeds pool
TestBuddyAllocator(&buddy_allocator, 8 << 20);
// Greater than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -32,6 +32,9 @@ limitations under the License. */
DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
namespace paddle {
namespace memory {
namespace detail {
......@@ -119,11 +122,18 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
gpu_alloc_size_ += size;
return p;
} else {
LOG(WARNING)
<< "Cannot malloc " << size / 1024.0 / 1024.0
<< " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
"environment variable to a lower value. Current value is "
<< FLAGS_fraction_of_gpu_memory_to_use;
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
<< " MB GPU memory. Please shrink "
"FLAGS_fraction_of_gpu_memory_to_use or "
"FLAGS_initial_gpu_memory_in_mb or "
"FLAGS_reallocate_gpu_memory_in_mb"
"environment variable to a lower value. "
<< "Current FLAGS_fraction_of_gpu_memory_to_use value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current FLAGS_initial_gpu_memory_in_mb value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current FLAGS_reallocate_gpu_memory_in_mb value is "
<< FLAGS_reallocate_gpu_memory_in_mb;
return nullptr;
}
}
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fsp_op.h"
namespace paddle {
namespace operators {
class FSPOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FSPOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of FSPOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FSPOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE(
x_dims.size() == 4,
"The Input(X) must have shape [batch_size, channel, height, width].");
PADDLE_ENFORCE(
y_dims.size() == 4,
"The Input(Y) must have shape [batch_size, channel, height, width].");
PADDLE_ENFORCE(
(x_dims[2] == y_dims[2]) && (x_dims[3] == y_dims[3]),
"The Input(X) and Input(Y) should have the same height and width.");
ctx->SetOutputDim("Out", {x_dims[0], x_dims[1], y_dims[1]});
ctx->ShareLoD("X", "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context(), layout_, library_);
}
};
class FSPOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor) The input of FSP op with shape [batch_size, x_channel, "
"height, width]");
AddInput("Y",
"(Tensor) The input of FSP op with shape"
"[batch_size, y_channel, height, width]."
"The y_channel can be different with the x_channel of Input(X)"
" while the other dimensions must be the same with Input(X)'s.");
AddOutput(
"Out",
"(Tensor) The output of FSP op with shape "
"[batch_size, x_channel, y_channel]. The x_channel is the channel "
"of Input(X) and the y_channel is the channel of Input(Y).");
AddComment(R"DOC(
This op is used to calculate the flow of solution procedure (FSP) matrix of two feature maps.
Given feature map x with shape [x_channel, h, w] and feature map y with shape
[y_channel, h, w], we can get the fsp matrix of x and y in two steps:
step 1: reshape x into matrix with shape [x_channel, h * w] and reshape and
transpose y into matrix with shape [h * w, y_channel]
step 2: multiply x and y to get fsp matrix with shape [x_channel, y_channel]
The output is a batch of fsp matrices.
)DOC");
}
};
class FSPOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
}
if (ctx->HasOutput(y_grad_name)) {
ctx->SetOutputDim(y_grad_name, y_dims);
}
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
ctx.device_context());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fsp, ops::FSPOp, ops::FSPOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(fsp_grad, ops::FSPOpGrad);
REGISTER_OP_CPU_KERNEL(
fsp, ops::FSPOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::FSPOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
fsp_grad, ops::FSPGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::FSPGradOpKernel<paddle::platform::CPUDeviceContext, double>);
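For reference, the computation described in the DOC string above and implemented by FSPOpKernel (a batched MatMul scaled by 1/(h*w)) can be written per sample as

\[ \mathrm{FSP}(x, y) \;=\; \frac{1}{h\,w}\, X Y^{\top}, \qquad \mathrm{FSP}(x, y)_{ij} \;=\; \frac{1}{h\,w} \sum_{k=1}^{h w} X_{ik}\, Y_{jk}, \]

where X (of shape [x_channel, h*w]) and Y (of shape [y_channel, h*w]) denote the two feature maps with their spatial dimensions flattened, so each output has shape [x_channel, y_channel].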
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/fsp_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(fsp, ops::FSPOpKernel<plat::CUDADeviceContext, float>,
ops::FSPOpKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(fsp_grad,
ops::FSPGradOpKernel<plat::CUDADeviceContext, float>,
ops::FSPGradOpKernel<plat::CUDADeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class FSPOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* y = context.Input<Tensor>("Y");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
auto x_dims = x->dims();
auto y_dims = y->dims();
auto batch_size = x_dims[0];
auto x_channel = x_dims[1];
auto y_channel = y_dims[1];
auto height = x_dims[2];
auto width = x_dims[3];
auto blas = math::GetBlas<DeviceContext, T>(context);
math::MatDescriptor x_mat_desc;
x_mat_desc.height_ = x_channel;
x_mat_desc.width_ = height * width;
x_mat_desc.batch_size_ = batch_size;
x_mat_desc.stride_ = x_channel * height * width;
math::MatDescriptor y_mat_desc;
y_mat_desc.height_ = height * width;
y_mat_desc.width_ = y_channel;
y_mat_desc.batch_size_ = batch_size;
y_mat_desc.stride_ = y_channel * height * width;
y_mat_desc.trans_ = true;
blas.MatMul(*x, x_mat_desc, *y, y_mat_desc,
static_cast<T>(1.0 / (height * width)), output,
static_cast<T>(0.0));
}
};
template <typename DeviceContext, typename T>
class FSPGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
auto* d_y = context.Output<Tensor>(framework::GradVarName("Y"));
if (d_x == nullptr && d_y == nullptr) {
return;
}
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
auto d_out_dims = d_out->dims();
auto batch_size = d_out_dims[0];
auto x_channel = d_out_dims[1];
auto y_channel = d_out_dims[2];
int64_t h = 0;
int64_t w = 0;
auto blas = math::GetBlas<DeviceContext, T>(context);
math::SetConstant<DeviceContext, T> set_zero;
if (d_x != nullptr) {
d_x->mutable_data<T>(context.GetPlace());
set_zero(context.template device_context<DeviceContext>(), d_x,
static_cast<T>(0));
auto* y = context.Input<Tensor>("Y");
auto y_dims = y->dims();
h = y_dims[2];
w = y_dims[3];
math::MatDescriptor d_out_mat_desc;
d_out_mat_desc.height_ = x_channel;
d_out_mat_desc.width_ = y_channel;
d_out_mat_desc.batch_size_ = batch_size;
d_out_mat_desc.stride_ = x_channel * y_channel;
math::MatDescriptor y_mat_desc;
y_mat_desc.height_ = y_channel;
y_mat_desc.width_ = h * w;
y_mat_desc.batch_size_ = batch_size;
y_mat_desc.stride_ = y_channel * h * w;
blas.MatMul(*d_out, d_out_mat_desc, *y, y_mat_desc,
static_cast<T>(1.0 / (h * w)), d_x, static_cast<T>(0.0));
}
if (d_y != nullptr) {
d_y->mutable_data<T>(context.GetPlace());
set_zero(context.template device_context<DeviceContext>(), d_y,
static_cast<T>(0));
auto* x = context.Input<Tensor>("X");
auto x_dims = x->dims();
h = x_dims[2];
w = x_dims[3];
math::MatDescriptor d_out_mat_desc;
d_out_mat_desc.height_ = y_channel;
d_out_mat_desc.width_ = x_channel;
d_out_mat_desc.batch_size_ = batch_size;
d_out_mat_desc.stride_ = x_channel * y_channel;
d_out_mat_desc.trans_ = true;
math::MatDescriptor x_mat_desc;
x_mat_desc.height_ = x_channel;
x_mat_desc.width_ = h * w;
x_mat_desc.batch_size_ = batch_size;
x_mat_desc.stride_ = x_channel * h * w;
blas.MatMul(*d_out, d_out_mat_desc, *x, x_mat_desc,
static_cast<T>(1.0 / (h * w)), d_y, static_cast<T>(0.0));
}
}
};
} // namespace operators
} // namespace paddle
......@@ -17,7 +17,9 @@ function(reader_library TARGET_NAME)
PARENT_SCOPE)
endfunction()
cc_library(py_reader SRCS py_reader.cc DEPS reader)
cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
......@@ -26,7 +28,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o
reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader)
if (NOT WIN32 AND NOT ON_INFER)
cc_library(ctr_reader SRCS ctr_reader.cc DEPS gzstream reader zlib)
......@@ -38,7 +40,7 @@ cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
# Export local libraries to parent
# set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
op_library(read_op)
op_library(read_op DEPS py_reader buffered_reader)
foreach(src ${LOCAL_READER_LIBS})
set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")
......
......@@ -16,6 +16,7 @@
#include <condition_variable> // NOLINT
#include <deque>
#include <utility>
#include "paddle/fluid/platform/enforce.h"
......@@ -34,7 +35,7 @@ class BlockingQueue {
explicit BlockingQueue(size_t capacity, bool speed_test_mode = false)
: capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) {
PADDLE_ENFORCE_GT(
capacity_, 0,
capacity_, static_cast<size_t>(0),
"The capacity of a reader::BlockingQueue must be greater than 0.");
}
......
......@@ -30,8 +30,10 @@ BufferedReader::~BufferedReader() {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamDestroy(stream));
for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event));
PADDLE_ENFORCE(cudaStreamDestroy(stream_));
for (auto &event : events_) {
PADDLE_ENFORCE(cudaEventDestroy(event));
}
}
#endif
}
......@@ -46,15 +48,15 @@ BufferedReader::BufferedReader(
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
compute_stream =
compute_stream_ =
((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
.Get(place_)))
->stream();
events.resize(buffer_size);
for (auto &event : events) {
events_.resize(buffer_size);
for (auto &event : events_) {
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
}
PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
}
#endif
cpu_buffer_.resize(buffer_size);
......@@ -73,7 +75,7 @@ void BufferedReader::ReadAsync(size_t i) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream));
PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_));
}
#endif
position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
......@@ -91,7 +93,7 @@ void BufferedReader::ReadAsync(size_t i) {
// commands from different streams cannot run concurrently.
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0));
TensorVec &gpu = gpu_buffer_[i];
gpu.resize(cpu.size());
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
......@@ -106,12 +108,14 @@ void BufferedReader::ReadAsync(size_t i) {
if (platform::is_cuda_pinned_place(cpu_place)) {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPinnedPlace>(cpu_place),
cpu_ptr, size, stream);
cpu_ptr, size, stream_);
} else if ((platform::is_gpu_place(cpu_place))) {
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
size, stream);
size, stream_);
} else {
// if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead.
// TODO(zcd): The default stream should not be used here.
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
......@@ -119,7 +123,7 @@ void BufferedReader::ReadAsync(size_t i) {
}
gpu[i].set_lod(cpu[i].lod());
}
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
}
#endif
return i;
......
......@@ -15,6 +15,7 @@
#pragma once
#include <list>
#include <memory>
#include <queue>
#include <vector>
#include "ThreadPool.h"
......@@ -63,9 +64,9 @@ class BufferedReader : public framework::DecoratedReader {
std::vector<TensorVec> gpu_buffer_;
size_t prev_pos_{-1UL};
#ifdef PADDLE_WITH_CUDA
cudaStream_t stream;
cudaStream_t compute_stream;
std::vector<cudaEvent_t> events;
cudaStream_t stream_;
cudaStream_t compute_stream_;
std::vector<cudaEvent_t> events_;
#endif
};
......
......@@ -12,37 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/operators/reader/py_reader.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class PyReader : public framework::FileReader {
public:
explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
: framework::FileReader() {
PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
queue_ = queue;
}
void ReadNext(std::vector<framework::LoDTensor>* out) override {
bool success;
*out = queue_->Pop(&success);
if (!success) out->clear();
}
~PyReader() { queue_->Close(); }
void Shutdown() override { queue_->Close(); }
void Start() override { queue_->ReOpen(); }
private:
std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
class CreatePyReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/py_reader.h"
#include <memory>
namespace paddle {
namespace operators {
namespace reader {
PyReader::PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
: framework::FileReader() {
PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
queue_ = queue;
}
void PyReader::ReadNext(std::vector<framework::LoDTensor>* out) {
bool success;
*out = queue_->Pop(&success);
if (!success) out->clear();
}
PyReader::~PyReader() { queue_->Close(); }
void PyReader::Shutdown() { queue_->Close(); }
void PyReader::Start() { queue_->ReOpen(); }
} // namespace reader
} // namespace operators
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <atomic>
#include <memory>
#include <vector>
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
namespace paddle {
namespace operators {
namespace reader {
class PyReader : public framework::FileReader {
public:
explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);
void ReadNext(std::vector<framework::LoDTensor>* out) override;
~PyReader();
void Shutdown() override;
void Start() override;
private:
std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
} // namespace reader
} // namespace operators
} // namespace paddle
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#include <algorithm>
#include <cstdlib>
#include <string>
......@@ -31,6 +30,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
#endif
constexpr static float fraction_reserve_gpu_memory = 0.05f;
DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"Allocate a trunk of gpu memory that is this fraction of the "
"total gpu memory size. Future memory usage will be allocated "
......@@ -38,6 +39,24 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk.");
DEFINE_uint64(
initial_gpu_memory_in_mb, 0ul,
"Allocate a trunk of gpu memory whose byte size is specified by "
"the flag. Future memory usage will be allocated from the "
"truck. If the trunk doesn't have enough gpu memory, additional "
"trunks of the gpu memory will be requested from gpu with size "
"specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
"no memory left for the additional trunk. Note: if you set this "
"flag, the memory size set by "
"FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
"flag. If you don't set this flag, PaddlePaddle will use "
"FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
"If this flag is set, Paddle will reallocate the gpu memory with "
"size specified by this flag. Else Paddle will reallocate by "
"FLAGS_fraction_of_gpu_memory_to_use");
DEFINE_bool(
enable_cublas_tensor_op_math, false,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
......@@ -180,13 +199,43 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
}
size_t GpuMaxAllocSize() {
return std::max(GpuInitAllocSize(), GpuReallocSize());
}
size_t GpuInitAllocSize() {
if (FLAGS_initial_gpu_memory_in_mb > 0ul) {
// Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb
return static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb << 20);
}
// FLAGS_initial_gpu_memory_in_mb is 0, initial memory will be allocated by
// fraction
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(&available, &total);
size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
// Reserve the rest for page tables, etc.
return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
return static_cast<size_t>((total - reserving) *
FLAGS_fraction_of_gpu_memory_to_use);
}
size_t GpuReallocSize() {
if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) {
// Additional memory will be allocated by FLAGS_reallocate_gpu_memory_in_mb
return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20);
}
// FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be allocated
// by fraction
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(&available, &total);
size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
return static_cast<size_t>((total - reserving) *
FLAGS_fraction_of_gpu_memory_to_use);
}
size_t GpuMinChunkSize() {
......@@ -201,16 +250,13 @@ size_t GpuMaxChunkSize() {
GpuMemoryUsage(&available, &total);
VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M";
size_t reserving = static_cast<size_t>(0.05 * total);
size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
// If available less than minimum chunk size, no usable memory exists.
available =
std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
total - reserving);
// Reserving the rest memory for page tables, etc.
size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
(total - reserving));
size_t allocating = GpuMaxAllocSize();
PADDLE_ENFORCE_LE(allocating, available,
"Insufficient GPU memory to allocation.");
......
......@@ -60,6 +60,12 @@ void GpuMemoryUsage(size_t *available, size_t *total);
//! Get the maximum allocation size of current GPU device.
size_t GpuMaxAllocSize();
//! Get the initial allocation size of current GPU device.
size_t GpuInitAllocSize();
//! Get the re-allocation size of current GPU device.
size_t GpuReallocSize();
//! Get the minimum chunk size for GPU buddy allocator.
size_t GpuMinChunkSize();
......
......@@ -5,7 +5,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op)
endif()
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc)
if(WITH_PYTHON)
if(WITH_AMD_GPU)
......
......@@ -56,6 +56,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/recordio.h"
#include "paddle/fluid/pybind/tensor_py.h"
......@@ -133,6 +134,11 @@ static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
return paddle::platform::Place(p1) == paddle::platform::Place(p2);
}
template <typename PlaceType>
static inline int PlaceIndex(const PlaceType &p) {
return static_cast<int>(paddle::platform::Place(p).which());
}
PYBIND11_MODULE(core, m) {
// Not used, just make sure cpu_info.cc is linked.
paddle::platform::CpuTotalPhysicalMemory();
......@@ -550,6 +556,7 @@ PYBIND11_MODULE(core, m) {
All parameter, weight, gradient are variables in Paddle.
)DOC")
.def(py::init<>())
.def("is_int", [](const Variable &var) { return var.IsType<int>(); })
.def("set_int",
[](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
......@@ -591,14 +598,13 @@ All parameter, weight, gradient are variables in Paddle.
},
py::return_value_policy::reference);
py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("start", &framework::ReaderHolder::Start)
.def("reset", &framework::ReaderHolder::ResetAll);
BindReader(&m);
using LoDTensorBlockingQueue =
::paddle::operators::reader::LoDTensorBlockingQueue;
using LoDTensorBlockingQueueHolder =
::paddle::operators::reader::LoDTensorBlockingQueueHolder;
py::class_<LoDTensorBlockingQueue, std::shared_ptr<LoDTensorBlockingQueue>>(
m, "LoDTensorBlockingQueue", "")
.def("push",
......@@ -796,6 +802,7 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_THROW("Cannot use CUDAPlace in CPU only version");
#endif
})
.def("_type", &PlaceIndex<platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
......@@ -805,6 +812,7 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
.def(py::init<>())
.def("_type", &PlaceIndex<platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
......@@ -820,6 +828,7 @@ All parameter, weight, gradient are variables in Paddle.
#endif
new (&self) platform::CUDAPinnedPlace();
})
.def("_type", &PlaceIndex<platform::CUDAPinnedPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
......@@ -831,16 +840,25 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<platform::Place>(m, "Place")
.def(py::init<>())
.def("_type", &PlaceIndex<platform::Place>)
.def("_equals", &IsSamePlace<platform::Place, platform::Place>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); })
.def("is_cpu_place",
[](platform::Place &self) { return platform::is_cpu_place(self); })
.def("is_cuda_pinned_place",
[](platform::Place &self) {
return platform::is_cuda_pinned_place(self);
})
.def("gpu_device_id",
[](platform::Place &self) {
return boost::get<platform::CUDAPlace>(self).device;
})
.def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; })
.def("set_place",
[](platform::Place &self, const platform::CPUPlace &cpu_place) {
self = cpu_place;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/pybind/reader_py.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include "paddle/fluid/operators/reader/py_reader.h"
#include "paddle/fluid/platform/place.h"
#include "pybind11/stl.h"
namespace paddle {
namespace pybind {
class MultiDeviceFeedReader {
public:
using ResultDictList =
std::vector<std::unordered_map<std::string, framework::LoDTensor>>;
MultiDeviceFeedReader(
const std::shared_ptr<operators::reader::LoDTensorBlockingQueue> &queue,
const std::vector<std::string> &names,
const std::vector<platform::Place> &dst_places, bool use_double_buffer)
: queue_(queue),
names_(names),
pool_(new ::ThreadPool(dst_places.size())) {
std::shared_ptr<framework::ReaderBase> reader(
new operators::reader::PyReader(queue));
readers_.reserve(dst_places.size());
for (auto &p : dst_places) {
auto *holder = new framework::ReaderHolder();
if (use_double_buffer) {
holder->Reset(
framework::MakeDecoratedReader<operators::reader::BufferedReader>(
reader, p, 2));
} else {
if (platform::is_gpu_place(p)) {
PADDLE_THROW(
"Place cannot be CUDAPlace when use_double_buffer is False");
}
holder->Reset(reader);
}
readers_.emplace_back(holder);
}
futures_.resize(dst_places.size());
ret_.resize(dst_places.size());
ReadAsync();
}
ResultDictList ReadNext() {
bool success = WaitFutures();
if (!success) {
return {};
}
ResultDictList result(ret_.size());
for (size_t i = 0; i < ret_.size(); ++i) {
for (size_t j = 0; j < names_.size(); ++j) {
result[i].emplace(names_[j], std::move(ret_[i][j]));
}
}
ReadAsync();
return result;
}
void Reset() {
Shutdown();
Start();
ReadAsync();
}
~MultiDeviceFeedReader() {
queue_->Close();
pool_.reset();
}
private:
bool WaitFutures() {
bool success = true;
for (auto &f : futures_) {
success &= f.get();
}
return success;
}
void Shutdown() {
for (auto &r : readers_) r->Shutdown();
}
void Start() {
for (auto &r : readers_) r->Start();
}
void ReadAsync() {
for (size_t i = 0; i < readers_.size(); ++i) {
futures_[i] = pool_->enqueue([this, i] {
readers_[i]->ReadNext(&ret_[i]);
return !ret_[i].empty();
});
}
}
std::shared_ptr<operators::reader::LoDTensorBlockingQueue> queue_;
std::vector<std::string> names_;
std::unique_ptr<::ThreadPool> pool_;
std::vector<std::unique_ptr<framework::ReaderHolder>> readers_;
std::vector<std::future<bool>> futures_;
std::vector<std::vector<framework::LoDTensor>> ret_;
};
namespace py = pybind11;
void BindReader(py::module *module) {
auto &m = *module;
namespace reader = ::paddle::operators::reader;
py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("start", &framework::ReaderHolder::Start)
.def("reset", &framework::ReaderHolder::ResetAll);
py::class_<MultiDeviceFeedReader>(m, "MultiDeviceFeedReader", "")
.def("read_next", &MultiDeviceFeedReader::ReadNext,
py::call_guard<py::gil_scoped_release>())
.def("reset", &MultiDeviceFeedReader::Reset,
py::call_guard<py::gil_scoped_release>());
m.def("create_py_reader",
[](const std::shared_ptr<operators::reader::LoDTensorBlockingQueue>
&queue,
const std::vector<std::string> &names,
const std::vector<platform::Place> &dst_places,
bool use_double_buffer) {
return new MultiDeviceFeedReader(queue, names, dst_places,
use_double_buffer);
},
py::return_value_policy::take_ownership);
}
} // namespace pybind
} // namespace paddle
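For context, a hedged Python-side sketch of how a MultiDeviceFeedReader created through the `create_py_reader` binding above might be drained. `reader` and `places` are assumed to come from a higher-level wrapper (such as the `PyReader` class added later in this commit); `read_next()` yields one {variable name: LoDTensor} dict per destination place until the underlying queue is exhausted:
def drain_multi_device_reader(reader, places):
    # `reader` is assumed to be a core.MultiDeviceFeedReader; an empty result
    # from read_next() means the blocking queue was closed or ran dry.
    while True:
        result = reader.read_next()
        if not result:
            break
        for place, feed in zip(places, result):
            # each `feed` dict can be passed to an executor bound to `place`
            pass
    reader.reset()  # Shutdown + Start + ReadAsync, ready for the next pass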
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
namespace paddle {
namespace pybind {
void BindReader(pybind11::module *module);
} // namespace pybind
} // namespace paddle
......@@ -41,6 +41,8 @@ int main(int argc, char** argv) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
envs.push_back("fraction_of_gpu_memory_to_use");
envs.push_back("initial_gpu_memory_in_mb");
envs.push_back("reallocate_gpu_memory_in_mb");
envs.push_back("allocator_strategy");
#elif __clang__
envs.push_back("use_mkldnn");
......
......@@ -46,9 +46,9 @@ import six
from six.moves import cPickle as pickle
__all__ = ['train', 'test', 'valid']
DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
LABEL_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/imagelabels.mat'
SETID_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/setid.mat'
DATA_URL = 'http://paddlemodels.bj.bcebos.com/flowers/102flowers.tgz'
LABEL_URL = 'http://paddlemodels.bj.bcebos.com/flowers/imagelabels.mat'
SETID_URL = 'http://paddlemodels.bj.bcebos.com/flowers/setid.mat'
DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
......
......@@ -163,7 +163,8 @@ def __bootstrap__():
if core.is_compiled_with_cuda():
read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb',
'reallocate_gpu_memory_in_mb', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
'sync_nccl_allreduce', 'limit_of_tmp_allocation',
......
......@@ -17,9 +17,10 @@ import os
import six
import sys
from .. import compat as cpt
from . import framework
from .framework import cuda_places, cpu_places
from . import core
from . import framework
__all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
......@@ -44,21 +45,6 @@ def _is_pserver_mode(main_program):
return False
def get_available_places(use_cuda):
if use_cuda:
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
gpus = [i for i in six.moves.range(core.get_cuda_device_count())]
places = [core.CUDAPlace(i) for i in gpus]
else:
cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
assert places, "no place for execution"
return places
class CompiledProgram(object):
"""
Compiles to Graph for execution.
......@@ -117,7 +103,8 @@ class CompiledProgram(object):
loss_name=None,
build_strategy=None,
exec_strategy=None,
share_vars_from=None):
share_vars_from=None,
places=None):
"""Configs the program to run in data parallel way.
Args:
......@@ -132,10 +119,18 @@ class CompiledProgram(object):
threads are used, how many iterations to clean up the temp
variables. For more information, please refer
to fluid.ExecutionStrategy. Default None.
share_vars_from(CompiledProgram): If provide, this CompiledProgram
share_vars_from(CompiledProgram): If provided, this CompiledProgram
will share variables from `share_vars_from`. `share_vars_from`
must be run by the executor before this CompiledProgram so that
vars are ready.
places(list(CUDAPlace)|list(CPUPlace)|None): If provided, the program is only compiled
for the given places. Otherwise, the places used at compile time are
determined by the Executor and are controlled by the environment variables
FLAGS_selected_gpus or CUDA_VISIBLE_DEVICES when using GPU, or CPU_NUM
when using CPU. For example, if you want to
run on GPU 0 and 1, set places=[fluid.CUDAPlace(0), fluid.CUDAPlace(1)].
If you want to run on 2 CPU cores, set places=[fluid.CPUPlace()]*2.
Returns:
self
"""
......@@ -150,6 +145,12 @@ class CompiledProgram(object):
self._exec_strategy = ExecutionStrategy()
if self._build_strategy is None:
self._build_strategy = BuildStrategy()
if places is not None:
if not isinstance(places, (list, tuple)):
places = [places]
self._places = places
else:
self._places = None
self._build_strategy.is_distribution = _is_pserver_mode(self._program)
return self
......@@ -192,7 +193,15 @@ class CompiledProgram(object):
self._local_scopes = []
self._exec_strategy.use_cuda = use_cuda
self._places = get_available_places(self._exec_strategy.use_cuda)
has_set_place = (self._places is not None)
if has_set_place:
for p in self._places:
assert p._type() == self._place._type(), \
"Place type not match. You may set the wrong type of places"
else:
self._places = cuda_places(
) if self._exec_strategy.use_cuda else cpu_places()
assert self._places, "no place for execution"
if self._exec_strategy.num_threads == 0:
if self._exec_strategy.use_cuda:
......@@ -200,9 +209,7 @@ class CompiledProgram(object):
# performance. Worth tuning for other models in the future.
self._exec_strategy.num_threads = len(self._places) * 4
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self._exec_strategy.num_threads = cpu_num * 2
self._exec_strategy.num_threads = len(self._places) * 2
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
......
......@@ -45,28 +45,41 @@ You can load INT8 model by load_inference_model [API](https://github.com/PaddleP
```
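For reference, a minimal, hedged sketch of loading the saved INT8 model with the `load_inference_model` API mentioned above (the model directory name is a placeholder):
```
import paddle.fluid as fluid

place = fluid.CPUPlace()
exe = fluid.Executor(place)

# "int8_model" is a placeholder for the directory that holds the saved INT8 model.
inference_program, feed_target_names, fetch_targets = \
    fluid.io.load_inference_model(dirname="int8_model", executor=exe)
```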
## 3. Result
We provide the results of accuracy measured on [Intel® Xeon® Gold 6148 Processor](https://ark.intel.com/products/120489/Intel-Xeon-Gold-6148-Processor-27-5M-Cache-2-40-GHz- "Intel® Xeon® Gold 6148 Processor") (also known as Intel® Xeon® Skylake 6148).
We provide the results of accuracy and performance measured on Intel(R) Xeon(R) Gold 6271 (single core).
**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
| Model | Dataset | FP32 Accuracy | INT8 Accuracy | Accuracy Diff |
| ------------ | ------------ | ------------ | ------------ | ------------ |
| ResNet-50 | Small | 72.00% | 72.00% | 0.00% |
| MobileNet-V1 | Small | 62.00% | 62.00% | 0.00% |
| ResNet-50 | Full ImageNet Val | 76.63% | 76.17% | 0.46% |
| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.49% | 0.29% |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 76.63% | 76.23% | 0.40% |
| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.47% | 0.31% |
**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
| Model | Dataset | FP32 Throughput | INT8 Throughput | Ratio(INT8/FP32) |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 11.54 images/s | 32.2 images/s | 2.79 |
| MobileNet-V1 | Full ImageNet Val | 49.21 images/s | 108.37 images/s | 2.2 |
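The ratio column is simply the INT8 throughput divided by the FP32 throughput; for example, ResNet-50 gives 32.2 / 11.54 ≈ 2.79.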
Please note that [Small](http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz "Small") is a subset of [full ImageNet validation dataset](http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar "full ImageNet validation dataset").
Please note that [full ImageNet validation dataset](http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar "full ImageNet validation dataset") can be downloaded by script `test_calibration.py` with `DATASET=full`.
Notes:
* The accuracy measurement requires the model with `label`.
* The INT8 theoretical speedup is ~1.33X on Intel® Xeon® Skylake Server (please refer to `This allows for 4x more input at the cost of 3x more instructions or 33.33% more compute` in [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")).
* The INT8 theoretical speedup is 4X on Intel® Xeon® Cascadelake Server (please refer to `providing a theoretical peak compute gain of 4x int8 OPS over fp32 OPS` in [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")). However, the actual test results at the model level will be less than 4X, and in general the average is about 2X. In addition, the calculation library optimization of batch size 1 is not as good as the large batch size.
## 4. How to reproduce the results
* Small dataset
* Small dataset (Single core)
```bash
FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py
```
* Full dataset
* Full dataset (Single core)
```bash
FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
```
* Full dataset (Multi-core)
```bash
FLAGS_use_mkldnn=true OMP_NUM_THREADS=20 DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
```
> Note: This is an example command that uses 20 cores by setting the `OMP_NUM_THREADS` value.
......@@ -271,7 +271,7 @@ class Compressor(object):
self.eval_reader = eval_reader
self.teacher_graphs = []
for teacher in teacher_programs:
self.teacher_graphs.append(ImitationGraph(teacher, scope=scope))
self.teacher_graphs.append(GraphWrapper(teacher))
self.checkpoint = None
self.checkpoint_path = checkpoint_path
......
......@@ -19,6 +19,7 @@ from collections import OrderedDict
from ..prune import *
from ..quantization import *
from .strategy import *
from ..distillation import *
__all__ = ['ConfigFactory']
"""This factory is used to create instances by loading and parsing configure file with yaml format.
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,3 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import distiller
from .distiller import *
from . import distillation_strategy
from .distillation_strategy import *
__all__ = distiller.__all__
__all__ += distillation_strategy.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ..core.strategy import Strategy
from ....framework import Program, program_guard
from .... import Executor
import logging
__all__ = ['DistillationStrategy']
logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)
class DistillationStrategy(Strategy):
def __init__(self, distillers=None, start_epoch=0, end_epoch=0):
"""
Args:
distillers(list): A list of distillers used to combine the student graph and the teacher graph
by adding distillation losses.
start_epoch(int): The epoch at which to merge the student graph and the teacher graph for
distillation training. default: 0
end_epoch(int): The epoch at which to finish distillation training. default: 0
"""
super(DistillationStrategy, self).__init__(start_epoch, end_epoch)
self.distillers = distillers
def on_compression_begin(self, context):
# load from checkpoint
if context.epoch_id > 0:
if context.epoch_id > self.start_epoch and context.epoch_id < self.end_epoch:
_logger.info('Restore DistillationStrategy')
self._create_distillation_graph(context)
_logger.info('Restore DistillationStrategy finish.')
def on_epoch_begin(self, context):
if self.start_epoch == context.epoch_id:
_logger.info('DistillationStrategy::on_epoch_begin.')
self._create_distillation_graph(context)
_logger.info('DistillationStrategy set optimize_graph.')
def _create_distillation_graph(self, context):
"""
step 1: Merge student graph and teacher graph into distillation graph.
step 2: Add loss into distillation graph by distillers.
step 3: Append backward ops and optimize ops into distillation graph for training.
"""
# step 1
teacher = context.teacher_graphs[0]
for var in teacher.program.list_vars():
var.stop_gradient = True
graph = context.train_graph.clone()
graph.merge(teacher)
graph.out_nodes['student_loss'] = graph.out_nodes['loss']
# step 2
for distiller in self.distillers:
graph = distiller.distiller_loss(graph)
# step 3
startup_program = Program()
with program_guard(graph.program, startup_program):
context.distiller_optimizer._name = 'distillation_optimizer'
context.distiller_optimizer.minimize(
graph.var(graph.out_nodes['loss'])._var)
exe = Executor(context.place)
exe.run(startup_program, scope=context.scope)
# backup graph for fine-tune after distillation
context.put('distillation_backup_optimize_graph',
context.optimize_graph)
context.optimize_graph = graph
def on_epoch_end(self, context):
if context.epoch_id == (self.end_epoch - 1):
_logger.info('DistillationStrategy::on_epoch_end.')
# restore optimize_graph for fine-tune or other strategy in next stage.
context.optimize_graph = context.get(
'distillation_backup_optimize_graph')
_logger.info(
'DistillationStrategy set context.optimize_graph to None.')
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .... import layers
from .... import optimizer
from .... import Executor
from .... import Program
from .... import program_guard
from .... import regularizer
__all__ = ['FSPDistiller', 'L2Distiller']
class L2Distiller(object):
"""
Combine two layers from the student net and the teacher net by l2-loss,
and add the loss into the total loss used for distillation training.
"""
def __init__(self,
student_feature_map,
teacher_feature_map,
distillation_loss_weight=1):
"""
Args:
student_feature_map(str): The name of feature map from student network.
teacher_feature_map(str): The name of feature map from teacher network.
Its shape should be the same as that of the student feature map.
distillation_loss_weight(float): The weight of the l2-loss.
"""
self.student_feature_map = student_feature_map
self.teacher_feature_map = teacher_feature_map
self.distillation_loss_weight = distillation_loss_weight
def distiller_loss(self, graph):
"""
Modify graph inplace to add l2-loss.
Args:
graph(GraphWrapper): The graph to be modified.
Returns:
GraphWrapper: The modified graph.
"""
distiller_pass = L2DistillerPass(self.student_feature_map,
self.teacher_feature_map,
self.distillation_loss_weight)
dis_graph = distiller_pass.apply(graph)
return dis_graph
class L2DistillerPass(object):
"""
The pass used to add l2-loss.
"""
def __init__(self,
student_feature_map,
teacher_feature_map,
distillation_loss_weight=1):
"""
Args:
student_feature_map(str): The name of feature map from student network.
teacher_feature_map(str): The name of feature map from teacher network.
Its shape should be the same as that of the student feature map.
distillation_loss_weight(float): The weight of the l2-loss.
"""
self.student_feature_map = student_feature_map
self.teacher_feature_map = teacher_feature_map
self.distillation_loss_weight = distillation_loss_weight
def apply(self, graph):
ret_graph = graph
with program_guard(ret_graph.program):
student_feature_map = ret_graph.var(self.student_feature_map)._var
teacher_feature_map = ret_graph.var(self.teacher_feature_map)._var
l2loss = layers.reduce_mean(
layers.square(student_feature_map - teacher_feature_map))
distillation_loss = l2loss * self.distillation_loss_weight
student_loss = ret_graph.var(ret_graph.out_nodes['loss'])._var
loss = distillation_loss + student_loss
ret_graph.out_nodes[
'l2loss_' + self.student_feature_map + "_" +
self.teacher_feature_map] = distillation_loss.name
ret_graph.out_nodes['loss'] = loss.name
return ret_graph
class FSPDistiller(object):
"""
Combine layers from student net and teacher net by fsp-loss.
"""
def __init__(self, student_pairs, teacher_pairs,
distillation_loss_weight=1):
"""
Args:
student_pairs(list<tuple>): Each tuple of two variable names in student_pairs indicates
a section in the student network. The variables in a tuple should
have the same feature map size.
teacher_pairs(list<tuple>): Each tuple of two variable names in teacher_pairs indicates
a section in the teacher network. The variables in a tuple should
have the same feature map size. The variable named teacher_pairs[i][j]
should have the same channel number as the variable named
student_pairs[i][j].
distillation_loss_weight(float): The weight of the fsp-loss. default: 1.
"""
self.student_pairs = student_pairs
self.teacher_pairs = teacher_pairs
self.distillation_loss_weight = distillation_loss_weight
def distiller_loss(self, graph):
"""
Modify graph inplace to add fsp-loss.
Args:
graph(GraphWrapper): The graph to be modified.
Returns:
GraphWrapper: The modified graph.
"""
distiller_pass = FSPDistillerPass(self.student_pairs,
self.teacher_pairs,
self.distillation_loss_weight)
dis_graph = distiller_pass.apply(graph)
return dis_graph
class FSPDistillerPass(object):
'''
Combine layers from student net and teacher net by fsp-loss.
'''
def __init__(self, s_pairs, t_pairs, distillation_loss_weight=1):
"""
Args:
s_pairs(list<tuple>): Each tuple of two variable names in s_pairs indicates
a section in the student network. The variables in a tuple should
have the same feature map size.
t_pairs(list<tuple>): Each tuple of two variable names in t_pairs indicates
a section in the teacher network. The variables in a tuple should
have the same feature map size. The variable named t_pairs[i][j]
should have the same channel number as the variable named
s_pairs[i][j].
distillation_loss_weight(float): The weight of the fsp-loss. default: 1.
"""
self.s_pairs = s_pairs
self.t_pairs = t_pairs
self.distillation_loss_weight = distillation_loss_weight
def apply(self, graph):
ret_graph = graph
with program_guard(ret_graph.program):
losses = []
for s_pair, t_pair in zip(self.s_pairs, self.t_pairs):
s_pair_start = ret_graph.var(s_pair[0])._var
s_pair_end = ret_graph.var(s_pair[1])._var
s_fsp_matrix = self._fsp_matrix(s_pair_start, s_pair_end)
t_pair_start = ret_graph.var(t_pair[0])._var
t_pair_end = ret_graph.var(t_pair[1])._var
t_fsp_matrix = self._fsp_matrix(t_pair_start, t_pair_end)
l2_loss = layers.reduce_mean(
layers.square(s_fsp_matrix - t_fsp_matrix))
losses.append(l2_loss)
distillation_loss = layers.sum(
losses) * self.distillation_loss_weight
student_loss = ret_graph.var(ret_graph.out_nodes['loss'])._var
loss = distillation_loss + student_loss
ret_graph.out_nodes[
'fsp_distillation_loss'] = distillation_loss.name
ret_graph.out_nodes['loss'] = loss.name
return ret_graph
def _fsp_matrix(self, fea_map_0, fea_map_1):
return layers.fsp_matrix(fea_map_0, fea_map_1)
......@@ -300,7 +300,9 @@ class GraphWrapper(object):
graph(GraphWrapper): The graph to be merged by current graph.
"""
for var in graph.program.list_vars():
self.program.global_block()._clone_variable(var)
new_var = self.program.global_block()._clone_variable(
var, force_persistable=False)
new_var.stop_gradient = var.stop_gradient
# TODO: parameters should be cloned
for op in graph.ops():
op = op._op
......@@ -309,12 +311,12 @@ class GraphWrapper(object):
attrs = {}
for input_name in op.input_names:
inputs[input_name] = [
self.var(in_var_name)
for in_var_name in op.inputs(input_name)
self.var(in_var_name)._var
for in_var_name in op.input(input_name)
]
for output_name in op.output_names:
outputs[output_name] = [
self.var(out_var_name)
self.var(out_var_name)._var
for out_var_name in op.output(output_name)
]
for attr_name in op.attr_names:
......
......@@ -16,5 +16,7 @@ from __future__ import print_function
from . import quantization_pass
from .quantization_pass import *
from . import quantization_strategy
from .quantization_strategy import *
__all__ = quantization_pass.__all__
__all__ = quantization_pass.__all__ + quantization_strategy.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import sys
import numpy as np
from .... import Executor
from .... import io
from .... import core
from ....compiler import CompiledProgram
from ....compiler import BuildStrategy
from ....framework import IrGraph
from ..core.strategy import Strategy
from .quantization_pass import *
__all__ = ['QuantizationStrategy']
logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)
class QuantizationStrategy(Strategy):
"""
The strategy for Quantization.
"""
def __init__(self,
start_epoch=0,
end_epoch=0,
float_model_save_path=None,
mobile_model_save_path=None,
int8_model_save_path=None,
activation_bits=8,
weight_bits=8,
activation_quantize_type='abs_max',
save_in_nodes=None,
save_out_nodes=None):
"""
Args:
start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
float_model_save_path(str): The path to save model with float weights.
None means it doesn't save float model. default: None.
mobile_model_save_path(str): The path to save model for paddle-mobile execution.
None means it doesn't save mobile model. default: None.
int8_model_save_path(str): The path to save model with int8_t weight.
None means it doesn't save int8 model. default: None.
activation_bits(int): quantization bit number for activation. default: 8.
weight_bits(int): quantization bit number for weights. The bias is not quantized.
default: 8.
activation_quantize_type(str): quantization type for activation,
now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
If use 'abs_max' mode, the quantization scale will be calculated
dynamically each step in both training and testing period. If use
'range_abs_max', a static quantization scale will be calculated
during training and used in inference.
save_in_nodes(list<str>): A list of variable names used to prune graph
for saving inference model.
save_out_nodes(list<str>): A list of variable names used to prune graph
for saving inference model.
"""
super(QuantizationStrategy, self).__init__(start_epoch, end_epoch)
self.start_epoch = start_epoch
self.end_epoch = end_epoch
self.float_model_save_path = float_model_save_path
self.mobile_model_save_path = mobile_model_save_path
self.int8_model_save_path = int8_model_save_path
self.activation_bits = activation_bits
self.weight_bits = weight_bits
self.activation_quantize_type = activation_quantize_type
self.save_out_nodes = save_out_nodes
self.save_in_nodes = save_in_nodes
def on_epoch_begin(self, context):
"""
Insert fake_quantize_op and fake_dequantize_op before training and testing.
"""
super(QuantizationStrategy, self).on_compression_begin(context)
if self.start_epoch == context.epoch_id:
_logger.info('QuantizationStrategy::on_epoch_begin')
train_ir_graph = IrGraph(
core.Graph(context.optimize_graph.program.desc), for_test=False)
test_ir_graph = IrGraph(
core.Graph(context.eval_graph.program.desc), for_test=True)
transform_pass = QuantizationTransformPass(
scope=context.scope,
place=context.place,
weight_bits=self.weight_bits,
activation_bits=self.activation_bits,
activation_quantize_type=self.activation_quantize_type)
transform_pass.apply(train_ir_graph)
transform_pass.apply(test_ir_graph)
build_strategy = BuildStrategy()
build_strategy.enable_inplace = False
build_strategy.memory_optimize = False
# for quantization training
context.optimize_graph.compiled_graph = CompiledProgram(
train_ir_graph.graph).with_data_parallel(
loss_name=context.optimize_graph.out_nodes['loss'],
build_strategy=build_strategy)
# for evaluation. A program compiled from an IR graph must be compiled with data parallel.
context.eval_graph.compiled_graph = CompiledProgram(
test_ir_graph.graph).with_data_parallel(
build_strategy=build_strategy)
# for saving inference model after training
context.put('quantization_test_ir_graph_backup', test_ir_graph)
_logger.info('Finish QuantizationStrategy::on_epoch_begin')
def on_epoch_end(self, context):
"""
Free and save inference model.
"""
super(QuantizationStrategy, self).on_compression_end(context)
if context.epoch_id == self.end_epoch:
_logger.info('QuantizationStrategy::on_epoch_end')
test_ir_graph = context.get('quantization_test_ir_graph_backup')
# freeze the graph after training
freeze_pass = QuantizationFreezePass(
scope=context.scope,
place=context.place,
weight_bits=self.weight_bits,
activation_bits=self.activation_bits)
freeze_pass.apply(test_ir_graph)
# for other strategies
context.eval_graph.program = test_ir_graph.to_program()
if self.save_out_nodes == None:
out_vars = [
context.eval_graph.var(var_name)._var
for var_name in context.eval_graph.out_nodes.values()
]
else:
out_vars = [
context.eval_graph.var(var_name)._var
for var_name in self.save_out_nodes
]
if self.save_in_nodes == None:
in_vars = list(context.eval_graph.out_nodes.values())
else:
in_vars = self.save_in_nodes
# save float model
if self.float_model_save_path:
executor = Executor(context.place)
io.save_inference_model(
self.float_model_save_path,
in_vars,
out_vars,
executor,
main_program=test_ir_graph.to_program(),
model_filename='model',
params_filename='weights',
export_for_deployment=True)
# save int8 model
if self.int8_model_save_path:
convert_int8_pass = ConvertToInt8Pass(
scope=context.scope, place=context.place)
convert_int8_pass.apply(test_ir_graph)
executor = Executor(context.place)
io.save_inference_model(
self.int8_model_save_path,
in_vars,
out_vars,
executor,
main_program=test_ir_graph.to_program(),
model_filename='model',
params_filename='weights',
export_for_deployment=True)
# save mobile model
if self.mobile_model_save_path:
if not self.int8_model_save_path:
# convert the weights as int8_t type
convert_int8_pass = ConvertToInt8Pass(
scope=context.scope, place=context.place)
convert_int8_pass.apply(test_ir_graph)
# make some changes on the graph for the mobile inference
mobile_pass = TransformForMobilePass()
mobile_pass.apply(test_ir_graph)
executor = Executor(context.place)
io.save_inference_model(
self.mobile_model_save_path,
in_vars,
out_vars,
executor,
main_program=test_ir_graph.to_program(),
model_filename='model',
params_filename='weights',
export_for_deployment=True)
_logger.info('Finish QuantizationStrategy::on_epoch_end')
#start_epoch(int): The epoch at which to merge the student graph and the teacher graph for
# distillation training. default: 0
#
#end_epoch(int): The epoch at which to finish distillation training. default: 0
#
#student_feature_map(str): The name of feature map from student network.
#
#teacher_feature_map(str): The name of feature map from teacher network.
# Its shape should be the same as that of the student feature map.
#
#student_pairs(list<tuple>): Each tuple of two variable names in student_pairs indicates
# a section in the student network. The variables in a tuple should
# have the same feature map size.
#
#teacher_pairs(list<tuple>): Each tuple of two variable names in teacher_pairs indicates
# a section in the teacher network. The variables in a tuple should
# have the same feature map size. The variable named teacher_pairs[i][j]
# should have the same channel number as the variable named
# student_pairs[i][j].
#
#distillation_loss_weight(float): The weight of the loss.
version: 1.0
distillers:
fsp_distiller:
class: 'FSPDistiller'
# teacher_pairs: [['teacher_depthwise_conv2d_1.tmp_0', 'teacher_conv2d_3.tmp_0']]
# student_pairs: [['student_depthwise_conv2d_1.tmp_0', 'student_conv2d_3.tmp_0']]
teacher_pairs: [['teacher_conv2_1_dw.tmp_0', 'teacher_conv1.tmp_0']]
student_pairs: [['student_conv2_1_dw.tmp_0', 'student_conv1.tmp_0']]
distillation_loss_weight: 1
l2_distiller:
class: 'L2Distiller'
teacher_feature_map: 'teacher.tmp_2'
student_feature_map: 'student.tmp_2'
distillation_loss_weight: 1
strategies:
distillation_strategy:
class: 'DistillationStrategy'
distillers: ['fsp_distiller', 'l2_distiller']
start_epoch: 0
end_epoch: 1
compressor:
epoch: 1
checkpoint_path: './distillation_checkpoints/'
strategies:
- distillation_strategy
......@@ -29,6 +29,6 @@ strategies:
metric_name: 'acc_top1'
compressor:
epoch: 2
checkpoint_path: './checkpoints/'
checkpoint_path: './checkpoints_pruning/'
strategies:
- sensitive_pruning_strategy
......@@ -35,8 +35,9 @@ train_parameters = {
class MobileNet():
def __init__(self):
def __init__(self, name=""):
self.params = train_parameters
self.name = name
def net(self, input, class_dim=1000, scale=1.0):
# conv1: 112x112
......@@ -47,7 +48,7 @@ class MobileNet():
num_filters=int(32 * scale),
stride=2,
padding=1,
name="conv1")
name=self.name + "_conv1")
# 56x56
input = self.depthwise_separable(
......@@ -57,7 +58,7 @@ class MobileNet():
num_groups=32,
stride=1,
scale=scale,
name="conv2_1")
name=self.name + "_conv2_1")
input = self.depthwise_separable(
input,
......@@ -66,7 +67,7 @@ class MobileNet():
num_groups=64,
stride=2,
scale=scale,
name="conv2_2")
name=self.name + "_conv2_2")
# 28x28
input = self.depthwise_separable(
......@@ -76,7 +77,7 @@ class MobileNet():
num_groups=128,
stride=1,
scale=scale,
name="conv3_1")
name=self.name + "_conv3_1")
input = self.depthwise_separable(
input,
......@@ -85,7 +86,7 @@ class MobileNet():
num_groups=128,
stride=2,
scale=scale,
name="conv3_2")
name=self.name + "_conv3_2")
# 14x14
input = self.depthwise_separable(
......@@ -95,7 +96,7 @@ class MobileNet():
num_groups=256,
stride=1,
scale=scale,
name="conv4_1")
name=self.name + "_conv4_1")
input = self.depthwise_separable(
input,
......@@ -104,7 +105,7 @@ class MobileNet():
num_groups=256,
stride=2,
scale=scale,
name="conv4_2")
name=self.name + "_conv4_2")
# 14x14
for i in range(5):
......@@ -115,7 +116,7 @@ class MobileNet():
num_groups=512,
stride=1,
scale=scale,
name="conv5" + "_" + str(i + 1))
name=self.name + "_conv5" + "_" + str(i + 1))
# 7x7
input = self.depthwise_separable(
input,
......@@ -124,7 +125,7 @@ class MobileNet():
num_groups=512,
stride=2,
scale=scale,
name="conv5_6")
name=self.name + "_conv5_6")
input = self.depthwise_separable(
input,
......@@ -133,7 +134,7 @@ class MobileNet():
num_groups=1024,
stride=1,
scale=scale,
name="conv6")
name=self.name + "_conv6")
input = fluid.layers.pool2d(
input=input,
......@@ -142,12 +143,14 @@ class MobileNet():
pool_type='avg',
global_pooling=True)
output = fluid.layers.fc(input=input,
size=class_dim,
act='softmax',
param_attr=ParamAttr(
initializer=MSRA(), name="fc7_weights"),
bias_attr=ParamAttr(name="fc7_offset"))
output = fluid.layers.fc(
input=input,
size=class_dim,
act='softmax',
param_attr=ParamAttr(
initializer=MSRA(), name=self.name + "_fc7_weights"),
bias_attr=ParamAttr(name=self.name + "_fc7_offset"),
name=self.name)
return output
def conv_bn_layer(self,
......@@ -172,11 +175,13 @@ class MobileNet():
use_cudnn=use_cudnn,
param_attr=ParamAttr(
initializer=MSRA(), name=name + "_weights"),
name=name,
bias_attr=False)
bn_name = name + "_bn"
return fluid.layers.batch_norm(
input=conv,
act=act,
name=name,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
......
#start_epoch(int): The epoch to insert quantization operators. default: 0
#
#end_epoch(int): The epoch to save the inference model. default: 0
#
#float_model_save_path(str): The path to save model with float weights.
# None means it doesn't save float model. default: None.
#
#mobile_model_save_path(str): The path to save model for paddle-mobile execution.
# None means it doesn't save mobile model. default: None.
#
#int8_model_save_path(str): The path to save model with int8_t weight.
# None means it doesn't save int8 model. default: None.
#
#activation_bits(int): quantization bit number for activation. default: 8.
#
#weight_bits(int): quantization bit number for weights. The bias is not quantized.
# default: 8.
#
#activation_quantize_type(str): quantization type for activation,
# now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
# If use 'abs_max' mode, the quantization scale will be calculated
# dynamically each step in both training and testing period. If use
# 'range_abs_max', a static quantization scale will be calculated
# during training and used in inference.
#
#save_in_nodes(list<str>): A list of variable names used to prune graph
# for saving inference model.
#
#save_out_nodes(list<str>): A list of variable names used to prune graph
# for saving inference model.
version: 1.0
strategies:
quantization_strategy:
class: 'QuantizationStrategy'
start_epoch: 0
end_epoch: 0
float_model_save_path: './output/float'
weight_bits: 8
activation_bits: 8
weight_quantize_type: 'abs_max'
activation_quantize_type: 'abs_max'
save_in_nodes: ['image']
save_out_nodes: ['quan.tmp_2']
compressor:
epoch: 1
checkpoint_path: './checkpoints_quan/'
strategies:
- quantization_strategy
# copyright (c) 2019 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import paddle
import unittest
import paddle.fluid as fluid
from mobilenet import MobileNet
from paddle.fluid.contrib.slim.core import Compressor
from paddle.fluid.contrib.slim.graph import GraphWrapper
class TestDistillationStrategy(unittest.TestCase):
"""
Test API of distillation strategy.
"""
def test_compression(self):
if not fluid.core.is_compiled_with_cuda():
return
class_dim = 10
image_shape = [1, 28, 28]
image = fluid.layers.data(
name='image', shape=image_shape, dtype='float32')
image.stop_gradient = False
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = MobileNet(name="student").net(input=image, class_dim=class_dim)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
val_program = fluid.default_main_program().clone(for_test=False)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Momentum(
momentum=0.9,
learning_rate=0.01,
regularization=fluid.regularizer.L2Decay(4e-5))
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
val_feed_list = [('img', image.name), ('label', label.name)]
val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
acc_top5.name)]
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128)
train_feed_list = [('img', image.name), ('label', label.name)]
train_fetch_list = [('loss', avg_cost.name)]
# define teacher program
teacher_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(teacher_program, startup_program):
img = teacher_program.global_block()._clone_variable(
image, force_persistable=False)
predict = MobileNet(name="teacher").net(input=img,
class_dim=class_dim)
exe.run(startup_program)
com_pass = Compressor(
place,
fluid.global_scope(),
fluid.default_main_program(),
train_reader=train_reader,
train_feed_list=train_feed_list,
train_fetch_list=train_fetch_list,
eval_program=val_program,
eval_reader=val_reader,
eval_feed_list=val_feed_list,
eval_fetch_list=val_fetch_list,
teacher_programs=[teacher_program.clone(for_test=True)],
train_optimizer=optimizer,
distiller_optimizer=optimizer)
com_pass.config('./distillation/compress.yaml')
eval_graph = com_pass.run()
if __name__ == '__main__':
unittest.main()
......@@ -15,7 +15,7 @@
import paddle
import unittest
import paddle.fluid as fluid
from filter_pruning.mobilenet import MobileNet
from mobilenet import MobileNet
from paddle.fluid.contrib.slim.core import Compressor
from paddle.fluid.contrib.slim.graph import GraphWrapper
......
# copyright (c) 2019 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import paddle
import unittest
import paddle.fluid as fluid
from mobilenet import MobileNet
from paddle.fluid.contrib.slim.core import Compressor
from paddle.fluid.contrib.slim.graph import GraphWrapper
class TestQuantizationStrategy(unittest.TestCase):
"""
Test API of quantization strategy.
"""
def test_compression(self):
if not fluid.core.is_compiled_with_cuda():
return
class_dim = 10
image_shape = [1, 28, 28]
image = fluid.layers.data(
name='image', shape=image_shape, dtype='float32')
image.stop_gradient = False
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = MobileNet(name='quan').net(input=image, class_dim=class_dim)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
val_program = fluid.default_main_program().clone(for_test=False)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Momentum(
momentum=0.9,
learning_rate=0.01,
regularization=fluid.regularizer.L2Decay(4e-5))
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
val_feed_list = [('img', image.name), ('label', label.name)]
val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
acc_top5.name)]
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128)
train_feed_list = [('img', image.name), ('label', label.name)]
train_fetch_list = [('loss', avg_cost.name)]
com_pass = Compressor(
place,
fluid.global_scope(),
fluid.default_main_program(),
train_reader=train_reader,
train_feed_list=train_feed_list,
train_fetch_list=train_fetch_list,
eval_program=val_program,
eval_reader=val_reader,
eval_feed_list=val_feed_list,
eval_fetch_list=val_fetch_list,
train_optimizer=optimizer)
com_pass.config('./quantization/compress.yaml')
eval_graph = com_pass.run()
if __name__ == '__main__':
unittest.main()
......@@ -136,7 +136,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
"full_data", False)
else:
data_urls.append(
'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz'
)
data_md5s.append('1b6c1c434172cca1bf9ba1e4d7a3157d')
self.data_cache_folder = self.download_data(data_urls, data_md5s,
......@@ -189,7 +189,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
def download_model(self):
# resnet50 fp32 data
data_urls = [
'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
]
data_md5s = ['4a5194524823d9b76da6e738e1367881']
self.model_cache_folder = self.download_data(data_urls, data_md5s,
......@@ -307,7 +307,7 @@ class TestCalibrationForMobilenetv1(TestCalibrationForResnet50):
def download_model(self):
# mobilenetv1 fp32 data
data_urls = [
'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
]
data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
self.model_cache_folder = self.download_data(data_urls, data_md5s,
......
......@@ -26,6 +26,24 @@ from .framework import Variable, default_main_program
__all__ = ['DataFeeder']
def convert_dtype(dtype):
if dtype == core.VarDesc.VarType.FP32:
return 'float32'
elif dtype == core.VarDesc.VarType.INT64:
return 'int64'
elif dtype == core.VarDesc.VarType.FP64:
return 'float64'
elif dtype == core.VarDesc.VarType.FP16:
return 'float16'
elif dtype == core.VarDesc.VarType.INT32:
return 'int32'
elif dtype == core.VarDesc.VarType.UINT8:
return 'uint8'
else:
raise ValueError("dtype must be any of [int32, float32, int64, "
"float64, uint8]")
class DataToLoDTensorConverter(object):
def __init__(self, place, lod_level, shape, dtype):
self.place = place
......@@ -38,27 +56,12 @@ class DataToLoDTensorConverter(object):
if negtive_count > 1:
self.shape = None
break
if dtype == core.VarDesc.VarType.FP32:
self.dtype = 'float32'
elif dtype == core.VarDesc.VarType.INT64:
self.dtype = 'int64'
elif dtype == core.VarDesc.VarType.FP64:
self.dtype = 'float64'
elif dtype == core.VarDesc.VarType.FP16:
self.dtype = 'float16'
elif dtype == core.VarDesc.VarType.INT32:
self.dtype = 'int32'
elif dtype == core.VarDesc.VarType.UINT8:
self.dtype = 'uint8'
else:
raise ValueError("dtype must be any of [int32, float32, int64, "
"float64, uint8]")
self.dtype = convert_dtype(dtype)
self._reset()
def _reset(self):
self.data = []
self.lod = []
for i in six.moves.range(lod_level):
self.lod.append([])
self.lod = [[] for _ in six.moves.range(self.lod_level)]
def feed(self, data):
self._feed_impl_(data, self.lod, self.lod_level)
......@@ -88,15 +91,52 @@ class DataToLoDTensorConverter(object):
raise ValueError(
"Reshape error. What is defined in data layer is {}, but receive {}"
.format(self.shape, arr.shape))
#else:
# self._check_shape(arr.shape)
t = core.LoDTensor()
t.set(arr, self.place)
if self.lod_level > 0:
t.set_recursive_sequence_lengths(self.lod)
self._reset()
return t
class BatchedTensorProvider(object):
def __init__(self, feed_list, place, batch_size, generator, drop_last):
self.place = place
self.batch_size = batch_size
self.generator = generator
self.converters = []
self.drop_last = drop_last
for var in feed_list:
assert var.lod_level == 0, "lod_level must be 0"
self.converters.append(
DataToLoDTensorConverter(
place=self.place,
lod_level=0,
shape=var.shape,
dtype=var.dtype))
def _done(self):
return [c.done() for c in self.converters]
def __call__(self):
idx = 0
for each_sample in self.generator():
for each_slot, each_converter in six.moves.zip(each_sample,
self.converters):
each_converter.data.append(each_slot)
idx += 1
if idx == self.batch_size:
idx = 0
yield self._done()
if not self.drop_last and idx > 0:
yield self._done()
else:
[c._reset() for c in self.converters]
class DataFeeder(object):
"""
DataFeeder converts the data that returned by a reader into a data
......
......@@ -564,6 +564,10 @@ class Executor(object):
if feed is None:
feed = {}
elif isinstance(feed, (list, tuple)):
assert len(feed) == 1, "Not compiled with data parallel"
feed = feed[0]
if not isinstance(feed, dict):
raise TypeError(
"feed requires dict as its Parameter. But you passed in %s" %
......
......@@ -26,6 +26,7 @@ import six
import numpy as np
import subprocess
import multiprocessing
from .. import compat as cpt
from .proto import framework_pb2
......@@ -63,6 +64,9 @@ __all__ = [
'default_main_program',
'program_guard',
'name_scope',
'cuda_places',
'cpu_places',
'cuda_pinned_places',
]
EMPTY_VAR_NAME = core.kEmptyVarName()
......@@ -87,6 +91,87 @@ def _current_expected_place():
return _imperative_current_expected_place_
def _cpu_num():
return int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
def cuda_places(device_ids=None):
'''
Create a list of :code:`fluid.CUDAPlace` objects.
If :code:`device_ids` is None, environment variable of
:code:`FLAGS_selected_gpus` would be checked first. If
:code:`FLAGS_selected_gpus=0,1,2`, the returned list would
be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
If :code:`FLAGS_selected_gpus` is not set, all visible
gpu places would be returned.
If :code:`device_ids` is not None, it should be the device
ids of gpus. For example, if :code:`device_ids=[0,1,2]`,
the returned list would be
[fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
Args:
device_ids (None|list(int)|tuple(int)): gpu device id list.
Returns:
out (list(fluid.CUDAPlace)): gpu place list.
'''
assert core.is_compiled_with_cuda(), \
"Not compiled with CUDA"
if device_ids is None:
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
device_ids = [int(s) for s in gpus_env.split(",")]
else:
device_ids = six.moves.range(core.get_cuda_device_count())
elif not isinstance(device_ids, (list, tuple)):
device_ids = [device_ids]
return [core.CUDAPlace(dev_id) for dev_id in device_ids]
def cpu_places(device_count=None):
'''
Create a list of :code:`fluid.CPUPlace` objects.
If :code:`device_count` is None, the device count would
be determined by environment variable :code:`CPU_NUM`.
If :code:`CPU_NUM` is not set, the device count would
be determined by :code:`multiprocessing.cpu_count()`.
Args:
device_count (None|int): device number.
Returns:
out (list(fluid.CPUPlace)): cpu place list.
'''
if device_count is None:
device_count = _cpu_num()
return [core.CPUPlace()] * device_count
def cuda_pinned_places(device_count=None):
'''
Create a list of :code:`fluid.CUDAPinnedPlace` objects.
If :code:`device_count` is None, the device count would
be determined by environment variable :code:`CPU_NUM`.
If :code:`CPU_NUM` is not set, the device count would
be determined by :code:`multiprocessing.cpu_count()`.
Args:
device_count (None|int): device number.
Returns:
out (list(fluid.CUDAPinnedPlace)): cuda pinned place list.
'''
assert core.is_compiled_with_cuda(), \
"Not compiled with CUDA"
if device_count is None:
device_count = _cpu_num()
return [core.CUDAPinnedPlace()] * device_count
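# Illustration (a hedged sketch, not part of the original source): combining the
# helpers above to pick the execution places for a data-parallel run; they are
# also exported as fluid.cuda_places / fluid.cpu_places / fluid.cuda_pinned_places.
def _example_places():
    import paddle.fluid as fluid
    if fluid.core.is_compiled_with_cuda():
        # honors FLAGS_selected_gpus when device_ids is None
        return fluid.cuda_places()
    # honors CPU_NUM (or multiprocessing.cpu_count()) when device_count is None
    return fluid.cpu_places()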
class NameScope(object):
def __init__(self, name="", parent=None):
self._children = dict()
......@@ -318,8 +403,8 @@ class Variable(object):
self._ivar = core.VarBase(
name, dtype if dtype else core.VarDesc.VarType.FP32,
list(shape) if shape else [],
_current_expected_place(), True
if persistable else False, stop_gradient)
_current_expected_place(), stop_gradient, True
if persistable else False)
if persistable:
_imperative_tracer().trace_var(name, self)
else:
......@@ -1559,12 +1644,15 @@ class Block(object):
name=v.name)
self.vars[new_p.name] = new_p
def _clone_variable(self, var):
def _clone_variable(self, var, force_persistable=True):
"""
Clone a variable into current block.
Args:
var: the variable to be cloned.
force_persistable(bool): True means the cloned variable is forced to be persistable.
False means the cloned variable keeps the same persistable attribute as the input var.
default: True.
Returns:
Variable: the new variable cloned from 'var' in current block.
......@@ -1584,7 +1672,7 @@ class Block(object):
shape=var.shape,
dtype=var.dtype,
type=var.type,
persistable=True,
persistable=True if force_persistable else var.persistable,
is_data=var.is_data)
else:
ret_var = self.create_var(
......@@ -1593,7 +1681,7 @@ class Block(object):
dtype=var.dtype,
type=var.type,
lod_level=var.lod_level,
persistable=True,
persistable=True if force_persistable else var.persistable,
is_data=var.is_data)
return ret_var
......
......@@ -26,12 +26,14 @@ from paddle.fluid import layers
from paddle.fluid.executor import Executor
from paddle.fluid.evaluator import Evaluator
from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, program_guard
from . import reader
from .reader import *
from . import core
__all__ = [
'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
'load_persistables', 'save_inference_model', 'load_inference_model'
]
] + reader.__all__
def is_parameter(var):
......
......@@ -563,22 +563,26 @@ def _py_reader(capacity,
def start_provide_thread(func):
def __provider_thread__():
for tensors in func():
array = core.LoDTensorArray()
for item in tensors:
if not isinstance(item, core.LoDTensor):
tmp = core.LoDTensor()
tmp.set(item, core.CPUPlace())
item = tmp
array.append(item)
if reader.exited:
break
feed_queue.push(array)
if reader.exited:
break
feed_queue.close()
try:
for tensors in func():
array = core.LoDTensorArray()
for item in tensors:
if not isinstance(item, core.LoDTensor):
tmp = core.LoDTensor()
tmp.set(item, core.CPUPlace())
item = tmp
array.append(item)
if reader.exited:
break
feed_queue.push(array)
if reader.exited:
break
feed_queue.close()
except Exception as ex:
feed_queue.close()
raise ex
reader.thread = threading.Thread(target=__provider_thread__)
reader.thread.daemon = True
......@@ -628,6 +632,9 @@ def _py_reader(capacity,
reader.reset = __reset__
reader.decorate_tensor_provider = __set_tensor_provider__
reader.decorate_paddle_reader = __set_paddle_reader__
reader.decorate_batch_generator = __set_tensor_provider__
reader.decorate_sample_list_generator = __set_paddle_reader__
reader.start = __start__
return reader
......@@ -692,6 +699,11 @@ def py_reader(capacity,
>>> exe.run(fetch_list=[loss.name])
>>> except fluid.core.EOFException:
>>> reader.reset()
>>>
>>> ...
>>>
>>> fluid.io.save_inference_model(dirname='./model', feeded_var_names=[img.name, label.name],
>>> target_vars=[loss], executor=fluid.Executor(fluid.CUDAPlace(0)))
2. When training and testing are both performed, two different
:code:`py_reader` should be created with different names, e.g.:
......
......@@ -189,6 +189,7 @@ __all__ = [
'huber_loss',
'tree_conv',
'npair_loss',
'fsp_matrix',
]
kIgnoreIndex = -100
......@@ -10790,3 +10791,46 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002):
celoss = reduce_mean(cross_entropy)
return l2loss + celoss
def fsp_matrix(x, y):
"""
**FSP matrix op**
This op is used to calculate the flow of solution procedure (FSP) matrix of two feature maps.
Given feature map x with shape [x_channel, h, w] and feature map y with shape
[y_channel, h, w], we can get the fsp matrix of x and y in two steps:
1. reshape x into matrix with shape [x_channel, h * w] and reshape and
transpose y into matrix with shape [h * w, y_channel].
2. multiply x and y to get fsp matrix with shape [x_channel, y_channel].
The output is a batch of fsp matrices.
Args:
x (Variable): A feature map with shape [batch_size, x_channel, height, width].
y (Variable): A feature map with shape [batch_size, y_channel, height, width].
The y_channel can be different from the x_channel of Input(X)
while the other dimensions must be the same as Input(X)'s.
Returns:
fsp matrix (Variable): The output of FSP op with shape [batch_size, x_channel, y_channel].
The x_channel is the channel of x and the y_channel is the channel of y.
Examples:
.. code-block:: python
feature_map_0 = fluid.layers.conv2d(x)
feature_map_1 = fluid.layers.conv2d(feature_map_0)
loss = fluid.layers.fsp_matrix(feature_map_0, feature_map_1)
"""
helper = LayerHelper('fsp_matrix', **locals())
out = helper.create_variable_for_type_inference(dtype=helper.input_dtype(
input_param_name='x'))
helper.append_op(type='fsp', inputs={'X': x, 'Y': y}, outputs={'Out': out})
return out
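As a cross-check of the two-step description above, the computation can be written directly in NumPy. The sketch below is an editor's illustration (the function name fsp_matrix_numpy is made up); the division by h * w matches the reference implementation used in the fsp op unit test later in this diff:

.. code-block:: python

    import numpy as np

    def fsp_matrix_numpy(x, y):
        # x: [batch, x_channel, h, w], y: [batch, y_channel, h, w]
        batch, x_channel, h, w = x.shape
        y_channel = y.shape[1]
        # step 1: flatten x to [batch, x_channel, h * w] and move the channel
        # of y to the last axis, giving [batch, h * w, y_channel]
        x_mat = x.reshape([batch, x_channel, h * w])
        y_mat = y.transpose([0, 2, 3, 1]).reshape([batch, h * w, y_channel])
        # step 2: batched matrix multiply, averaged over the h * w positions
        return np.matmul(x_mat, y_mat) / (h * w)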
......@@ -99,7 +99,8 @@ class ParallelExecutor(object):
build_strategy.num_trainers = num_trainers
build_strategy.trainer_id = trainer_id
self._places = compiler.get_available_places(use_cuda)
self._places = framework.cuda_places(
) if use_cuda else framework.cpu_places()
self._scope = scope if scope is not None else executor.global_scope()
main_program = main_program if main_program is not None \
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import core
import six
import threading
from .framework import Program, Variable, program_guard, default_main_program, default_startup_program
from .executor import global_scope
from .data_feeder import DataFeeder, BatchedTensorProvider
from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer
from .unique_name import UniqueNameGenerator
__all__ = ['PyReader']
def _convert_places(places):
if not isinstance(places, (list, tuple)):
places = [places]
ret = []
for p in places:
if not isinstance(p, core.Place):
tmp = core.Place()
tmp.set_place(p)
p = tmp
ret.append(p)
return ret
class PyReader(object):
"""
Create a reader object for data feeding in Python.
Data would be prefetched by a Python thread and pushed
into a queue asynchronously. Data in the queue would be extracted
automatically when `Executor.run(...)` is called.
Args:
feed_list (list(Variable)|tuple(Variable)): feed variable list.
The variables should be created by :code:`fluid.layers.data()`.
capacity (int): capacity of the queue maintained in PyReader object.
use_double_buffer (bool): whether to use double_buffer_reader to
speed up data feeding.
iterable (bool): whether the created reader object is iterable.
Returns:
reader (Reader): the created reader object.
Examples:
1. If iterable = False, the created PyReader object is almost the
same as :code:`fluid.layers.py_reader()`. Operators would be
inserted into the program. The user should call :code:`start()`
before each epoch and catch :code:`fluid.core.EOFException`
thrown by :code:`Executor.run()` when the epoch ends. Once the
exception is caught, the user should call :code:`reset()` to reset
the reader manually.
.. code-block:: python
image = fluid.layers.data(
name='image', shape=[784], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
reader = fluid.io.PyReader(feed_list=[image, label],
capacity=4, iterable=False)
reader.decorate_sample_list_generator(user_defined_reader)
... # definition of network is omitted
executor.run(fluid.default_startup_program())
for _ in range(EPOCH_NUM):
reader.start()
while True:
try:
executor.run(feed=None, ...)
except fluid.core.EOFException:
reader.reset()
break
2. If iterable=True, the created PyReader object is decoupled from
the program. No operator would be inserted into the program.
In this case, the created reader is a Python generator, which
is iterable. The user should feed the data yielded from the PyReader
object into :code:`Executor.run(feed=...)`.
.. code-block:: python
image = fluid.layers.data(
name='image', shape=[784], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
reader = fluid.io.PyReader(feed_list=[image, label],
capacity=4, iterable=True)
reader.decorate_sample_list_generator(user_defined_reader,
places=fluid.cuda_places())
... # definition of network is omitted
executor.run(fluid.default_startup_program())
for _ in range(EPOCH_NUM):
for data in reader():
executor.run(feed=data, ...)
"""
unique_name_generator = UniqueNameGenerator()
def __init__(self,
feed_list,
capacity,
use_double_buffer=True,
iterable=False):
self._tensor_reader = None
self._thread = None
self._iterable = iterable
self._use_double_buffer = use_double_buffer
self._capacity = capacity
self._feed_list = feed_list
if not self._iterable:
self._init_non_iterable()
def _init_iterable(self, places):
self._var_names = [v.name for v in self._feed_list]
self._places = _convert_places(places)
self._queue = core.init_lod_tensor_blocking_queue(core.Variable(),
self._capacity)
self._reader = core.create_py_reader(
self.queue, self._var_names, self._places, self._use_double_buffer)
def _init_non_iterable(self):
lod_levels = []
dtypes = []
shape_concat = []
ranks = []
shapes = []
for feed_data in self._feed_list:
dtypes.append(feed_data.dtype)
shape_concat.extend(feed_data.shape)
ranks.append(len(feed_data.shape))
shapes.append(feed_data.shape)
lod_levels.append(feed_data.lod_level)
queue_name = PyReader.unique_name_generator('lod_tensor_blocking_queue')
reader_name = PyReader.unique_name_generator('create_py_reader')
double_buffer_name = PyReader.unique_name_generator('double_buffer')
var = global_scope().var(queue_name)
self._queue = core.init_lod_tensor_blocking_queue(var, self._capacity)
startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=reader_name)
startup_blk.append_op(
type='create_py_reader',
inputs={'blocking_queue': [queue_name]},
outputs={'Out': [startup_var]},
attrs={
'shape_concat': shape_concat,
'lod_levels': lod_levels,
'ranks': ranks
})
startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True
main_prog_var = _copy_reader_var_(
default_main_program().current_block(), startup_var)
main_prog_var.stop_gradient = True
main_prog_var.persistable = True
reader = monkey_patch_reader_methods(main_prog_var)
if self._use_double_buffer:
double_buffer_reader = double_buffer(
reader, name=double_buffer_name)
# we return a double buffer reader. However, the reset method comes from
# py_reader.
double_buffer_reader.reset = reader.reset
reader = double_buffer_reader
self._reader = reader
default_main_program().current_block().append_op(
type='read',
inputs={'Reader': [self._reader]},
outputs={'Out': self._feed_list})
@property
def queue(self):
return self._queue
@property
def iterable(self):
return self._iterable
def __call__(self):
assert self.iterable, "PyReader is not iterable"
assert self._tensor_reader is not None, \
"Data source of PyReader has not set yet"
class Iterator(object):
def __init__(self, reader):
self._reader = reader._reader
self._reset = reader._reset
def __iter__(self):
return self
def __next__(self):
return self.next()
def next(self):
ret = self._reader.read_next()
if ret:
return ret
else:
self._reset()
raise StopIteration
self._start()
return Iterator(self)
def _reset(self):
self._reader.reset()
self._thread.join()
def start(self):
'''
Start the data feeding thread.
Can only be called when the reader object is not iterable.
'''
assert not self._iterable, "start() cannot be called when PyReader is iterable"
self._start()
def reset(self):
'''
Reset the reader object when :code:`fluid.core.EOFException` is raised.
Can only be called when the reader object is not iterable.
'''
assert not self._iterable, "reset() cannot be called when PyReader is iterable"
self._reset()
def _start(self):
def __thread_main__():
try:
for tensors in self._tensor_reader():
array = core.LoDTensorArray()
for item in tensors:
if not isinstance(item, core.LoDTensor):
tmp = core.LoDTensor()
tmp.set(item, core.CPUPlace())
item = tmp
array.append(item)
if not self._queue.push(array):
break
self._queue.close()
except Exception as ex:
self._queue.close()
raise ex
self._thread = threading.Thread(target=__thread_main__)
self._thread.daemon = True
self._thread.start()
def decorate_sample_generator(self,
sample_generator,
batch_size,
drop_last=True,
places=None):
'''
Set the data source of the PyReader object.
The provided :code:`sample_generator` should be a Python generator,
which yields numpy.ndarray-typed data for each sample.
:code:`places` must be set when the PyReader object is iterable.
If none of the inputs has LoD, this method is faster than
:code:`decorate_sample_list_generator(paddle.batch(sample_generator, ...))` .
Args:
sample_generator (generator): Python generator that yields
numpy.ndarray-typed sample data.
batch_size (int): batch size. Must be larger than 0.
drop_last (bool): whether to drop the last batch when the number of
remaining samples is less than batch_size.
places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
be provided when PyReader is iterable.
'''
assert batch_size > 0, "batch_size must be larger than 0"
has_lod = False
for f in self._feed_list:
if f.lod_level != 0:
has_lod = True
break
if has_lod:
self.decorate_sample_list_generator(
paddle.batch(
sample_generator,
batch_size=batch_size,
drop_last=drop_last),
places=places)
else:
reader = BatchedTensorProvider(
feed_list=self._feed_list,
place=core.CPUPlace(),
batch_size=batch_size,
generator=sample_generator,
drop_last=drop_last)
self.decorate_batch_generator(reader, places=places)
def decorate_sample_list_generator(self, reader, places=None):
'''
Set the data source of the PyReader object.
The provided :code:`reader` should be a Python generator,
which yields list(numpy.ndarray) typed batched data.
:code:`places` must be set when the PyReader object is iterable.
Args:
reader (generator): Python generator that yields
list(numpy.ndarray)-typed batched data.
places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
be provided when PyReader is iterable.
'''
assert self._tensor_reader is None, \
"Cannot reset the data source of PyReader"
with program_guard(Program(), Program()):
feeder = DataFeeder(
feed_list=self._feed_list, place=core.CPUPlace())
paddle_reader = feeder.decorate_reader(reader, multi_devices=False)
def __tensor_reader_impl__():
for slots in paddle_reader():
yield [slots[var.name] for var in self._feed_list]
self.decorate_batch_generator(__tensor_reader_impl__, places)
def decorate_batch_generator(self, reader, places=None):
'''
Set the data source of the PyReader object.
The provided :code:`reader` should be a Python generator,
which yields numpy.ndarray-typed or LoDTensor-typed batched data.
:code:`places` must be set when the PyReader object is iterable.
Args:
reader (generator): Python generator that yields LoDTensor-typed
batched data.
places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
be provided when PyReader is iterable.
'''
assert self._tensor_reader is None, \
"Cannot reset the data source of PyReader"
self._tensor_reader = reader
if self._iterable:
assert places is not None, "Places cannot be None when py_reader is iterable"
self._init_iterable(places)
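Putting the pieces of this class together, a minimal end-to-end use of :code:`decorate_sample_generator` in iterable mode could look like the sketch below (the generator, sizes and fetch list are illustrative assumptions, not code from the original file; the pattern mirrors the sample-generator unit test later in this diff):

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    image = fluid.layers.data(name='image', shape=[784], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    reader = fluid.io.PyReader(feed_list=[image, label],
                               capacity=4, iterable=True)

    def sample_gen():
        # yields one (image, label) sample per iteration
        for _ in range(1000):
            yield np.random.random([784]).astype('float32'), \
                  np.random.randint(0, 10, [1]).astype('int64')

    reader.decorate_sample_generator(sample_gen, batch_size=32,
                                     drop_last=True,
                                     places=fluid.cpu_places())

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    for data in reader():  # one pass over the data source
        exe.run(feed=data, fetch_list=[image])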
......@@ -20,7 +20,7 @@ logging.basicConfig()
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz"
DATA_URL = "http://paddle-ctr-data.bj.bcebos.com/avazu_ctr_data.tgz"
DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
"""
avazu_ctr_data/train.txt
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import numpy as np
import time
import six
import unittest
EPOCH_NUM = 60
BATCH_SIZE = 32
CLASS_NUM = 10
def random_reader():
np.random.seed(1)
for i in range(BATCH_SIZE * 40):
image = np.random.random([784])
label = np.random.random_integers(low=0, high=CLASS_NUM - 1)
yield image, label
def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
startup_prog = fluid.Program()
main_prog = fluid.Program()
startup_prog.random_seed = 1
main_prog.random_seed = 1
with fluid.unique_name.guard():
with fluid.program_guard(main_prog, startup_prog):
image = fluid.layers.data(
name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
py_reader = fluid.io.PyReader(
feed_list=[image, label],
capacity=4,
iterable=not use_legacy_py_reader,
use_double_buffer=use_double_buffer)
hidden = image
for hidden_size in [10, 20, 30]:
hidden = fluid.layers.fc(
hidden,
size=hidden_size,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
predict_label = fluid.layers.fc(hidden,
size=CLASS_NUM,
act='softmax')
loss = fluid.layers.mean(
fluid.layers.cross_entropy(
input=predict_label, label=label))
optimizer = fluid.optimizer.Adam()
optimizer.minimize(loss)
return startup_prog, main_prog, py_reader, loss
class TestBase(unittest.TestCase):
def run_main(self, use_legacy_py_reader, with_data_parallel, places,
use_double_buffer):
scope = fluid.Scope()
with fluid.scope_guard(scope):
startup_prog, main_prog, py_reader, loss = simple_fc_net(
places, use_legacy_py_reader, use_double_buffer)
reader = paddle.batch(random_reader, batch_size=BATCH_SIZE)
ps = places if use_double_buffer else fluid.cpu_places(len(places))
py_reader.decorate_sample_list_generator(
reader, places=ps if py_reader.iterable else None)
exe = fluid.Executor(place=places[0])
exe.run(startup_prog)
prog = fluid.CompiledProgram(main_prog)
if with_data_parallel:
prog = prog.with_data_parallel(
loss_name=loss.name, places=places)
step = 0
step_list = []
loss_list = []
start_t = time.time()
if not py_reader.iterable:
for _ in six.moves.range(EPOCH_NUM):
step = 0
py_reader.start()
while True:
try:
L, = exe.run(program=prog,
fetch_list=[loss],
use_program_cache=True)
loss_list.append(np.mean(L))
step += 1
except fluid.core.EOFException:
py_reader.reset()
break
step_list.append(step)
else:
for _ in six.moves.range(EPOCH_NUM):
step = 0
for d in py_reader():
assert len(d) == len(places)
for i, item in enumerate(d):
image = item['image']
label = item['label']
assert image.shape() == [BATCH_SIZE, 784]
assert label.shape() == [BATCH_SIZE, 1]
assert image._place()._equals(ps[i])
assert label._place()._equals(ps[i])
L, = exe.run(program=prog,
feed=d,
fetch_list=[loss],
use_program_cache=True)
loss_list.append(np.mean(L))
step += 1
step_list.append(step)
end_t = time.time()
ret = {
"time": end_t - start_t,
"step": step_list,
"loss": np.array(loss_list)
}
return ret
def prepare_places(self, with_data_parallel, with_cpu=True, with_gpu=True):
places = []
if with_cpu:
places.append([fluid.CPUPlace()])
if with_data_parallel:
places.append([fluid.CPUPlace()] * 2)
if with_gpu and fluid.core.is_compiled_with_cuda():
tmp = fluid.cuda_places()
assert len(tmp) > 0, "no gpu detected"
if with_data_parallel:
places.append(tmp)
places.append([tmp[0]])
return places
def test_main(self):
for with_data_parallel in [True, False]:
for p in self.prepare_places(with_data_parallel):
for use_double_buffer in [False, True]:
results = []
for use_legacy_py_reader in [False, True]:
ret = self.run_main(
use_legacy_py_reader=use_legacy_py_reader,
with_data_parallel=with_data_parallel,
places=p,
use_double_buffer=use_double_buffer)
results.append(ret)
if not use_double_buffer:
diff = np.max(
np.abs(results[0]['loss'] - results[1]['loss']))
self.assertLess(diff, 1e-3)
if __name__ == '__main__':
unittest.main()
......@@ -21,7 +21,7 @@ from test_dist_base import TestDistBase
def download_files():
url_prefix = 'http://paddle-unittest-data.cdn.bcebos.com/dist_transformer/'
url_prefix = 'http://paddle-unittest-data.bj.bcebos.com/dist_transformer/'
vocab_url = url_prefix + 'vocab.bpe.32000'
vocab_md5 = 'a86d345ca6e27f6591d0dccb1b9be853'
paddle.dataset.common.download(vocab_url, 'test_dist_transformer',
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
def fsp_matrix(a, b):
batch = a.shape[0]
a_channel = a.shape[1]
b_channel = b.shape[1]
h = a.shape[2]
w = a.shape[3]
a_t = a.transpose([0, 2, 3, 1])
a_t = a_t.reshape([batch, h * w, a_channel])
b_t = b.transpose([0, 2, 3, 1]).reshape([batch, h * w, b_channel])
a_r = a_t.repeat(
b_channel, axis=1).reshape(
[batch, h * w, b_channel, a_channel]).transpose([0, 1, 3, 2])
b_r = b_t.repeat(
a_channel, axis=1).reshape([batch, h * w, a_channel, b_channel])
return np.mean(a_r * b_r, axis=1)
class TestFSPOp(OpTest):
def setUp(self):
self.op_type = "fsp"
self.initTestCase()
feature_map_0 = np.random.uniform(0, 10, self.a_shape).astype('float32')
feature_map_1 = np.random.uniform(0, 10, self.b_shape).astype('float32')
self.inputs = {'X': feature_map_0, 'Y': feature_map_1}
self.outputs = {'Out': fsp_matrix(feature_map_0, feature_map_1)}
def initTestCase(self):
self.a_shape = (2, 16, 32, 31)
self.b_shape = (2, 28, 32, 31)
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import random
import sys
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from test_imperative_base import new_program_scope
from paddle.fluid.imperative.base import to_variable
NUM_USERS = 100
NUM_ITEMS = 1000
BATCH_SIZE = 32
NUM_BATCHES = 2
class MLP(fluid.imperative.Layer):
def __init__(self, name_scope):
super(MLP, self).__init__(name_scope)
self._user_latent = fluid.imperative.FC(self.full_name(), 256)
self._item_latent = fluid.imperative.FC(self.full_name(), 256)
self._user_layers = []
self._item_layers = []
self._hid_sizes = [128, 64]
for i in range(len(self._hid_sizes)):
self._user_layers.append(
self.add_sublayer(
'user_layer_%d' % i,
fluid.imperative.FC(
self.full_name(), self._hid_sizes[i], act='relu')))
self._item_layers.append(
self.add_sublayer(
'item_layer_%d' % i,
fluid.imperative.FC(
self.full_name(), self._hid_sizes[i], act='relu')))
def forward(self, users, items):
users = self._user_latent(users)
items = self._item_latent(items)
for ul, il in zip(self._user_layers, self._item_layers):
users = ul(users)
items = il(items)
return fluid.layers.elementwise_mul(users, items)
class DMF(fluid.imperative.Layer):
def __init__(self, name_scope):
super(DMF, self).__init__(name_scope)
self._user_latent = fluid.imperative.FC(self.full_name(), 256)
self._item_latent = fluid.imperative.FC(self.full_name(), 256)
self._match_layers = []
self._hid_sizes = [128, 64]
for i in range(len(self._hid_sizes)):
self._match_layers.append(
self.add_sublayer(
'match_layer_%d' % i,
fluid.imperative.FC(
self.full_name(), self._hid_sizes[i], act='relu')))
self._mat
def forward(self, users, items):
users = self._user_latent(users)
items = self._item_latent(items)
match_vec = fluid.layers.concat(
[users, items], axis=len(users.shape) - 1)
for l in self._match_layers:
match_vec = l(match_vec)
return match_vec
class DeepCF(fluid.imperative.Layer):
def __init__(self, name_scope):
super(DeepCF, self).__init__(name_scope)
self._user_emb = fluid.imperative.Embedding(self.full_name(),
[NUM_USERS, 256])
self._item_emb = fluid.imperative.Embedding(self.full_name(),
[NUM_ITEMS, 256])
self._mlp = MLP(self.full_name())
self._dmf = DMF(self.full_name())
self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid')
def forward(self, users, items):
users_emb = self._user_emb(users)
items_emb = self._item_emb(items)
mlp_predictive = self._mlp(users_emb, items_emb)
dmf_predictive = self._dmf(users_emb, items_emb)
predictive = fluid.layers.concat(
[mlp_predictive, dmf_predictive],
axis=len(mlp_predictive.shape) - 1)
prediction = self._match_fc(predictive)
return prediction
def get_data():
user_ids = []
item_ids = []
labels = []
for uid in range(NUM_USERS):
for iid in range(NUM_ITEMS):
# 10% positive
label = float(random.randint(1, 10) == 1)
user_ids.append(uid)
item_ids.append(iid)
labels.append(label)
indices = np.arange(NUM_USERS * NUM_ITEMS)
np.random.shuffle(indices)
users_np = np.array(user_ids, dtype=np.int64)[indices]
items_np = np.array(item_ids, dtype=np.int64)[indices]
labels_np = np.array(labels, dtype=np.float32)[indices]
return np.expand_dims(users_np, -1), \
np.expand_dims(items_np, -1), \
np.expand_dims(labels_np, -1)
class TestImperativeDeepCF(unittest.TestCase):
def test_gan_float32(self):
seed = 90
users_np, items_np, labels_np = get_data()
startup = fluid.Program()
startup.random_seed = seed
main = fluid.Program()
main.random_seed = seed
scope = fluid.core.Scope()
with new_program_scope(main=main, startup=startup, scope=scope):
users = fluid.layers.data('users', [1], dtype='int64')
items = fluid.layers.data('items', [1], dtype='int64')
labels = fluid.layers.data('labels', [1], dtype='float32')
deepcf = DeepCF('deepcf')
prediction = deepcf(users, items)
loss = fluid.layers.reduce_sum(
fluid.layers.log_loss(prediction, labels))
adam = fluid.optimizer.AdamOptimizer(0.01)
adam.minimize(loss)
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
exe.run(startup)
for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
static_loss = exe.run(
main,
feed={
users.name: users_np[slice:slice + BATCH_SIZE],
items.name: items_np[slice:slice + BATCH_SIZE],
labels.name: labels_np[slice:slice + BATCH_SIZE]
},
fetch_list=[loss])[0]
sys.stderr.write('static loss %s\n' % static_loss)
with fluid.imperative.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
deepcf = DeepCF('deepcf')
for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
prediction = deepcf(
to_variable(users_np[slice:slice + BATCH_SIZE]),
to_variable(items_np[slice:slice + BATCH_SIZE]))
loss = fluid.layers.reduce_sum(
fluid.layers.log_loss(prediction,
to_variable(labels_np[slice:slice +
BATCH_SIZE])))
loss._backward()
adam = fluid.optimizer.AdamOptimizer(0.01)
adam.minimize(loss)
deepcf.clear_gradients()
dy_loss = loss._numpy()
self.assertEqual(static_loss, dy_loss)
if __name__ == '__main__':
unittest.main()
......@@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer):
return self._fc3(x)
class TestImperativeMnist(unittest.TestCase):
class TestImperativeGAN(unittest.TestCase):
def test_gan_float32(self):
seed = 90
......
......@@ -1269,6 +1269,15 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out)
print(str(program))
def test_fsp(self):
program = Program()
with program_guard(program):
x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
y = layers.data(name="Y", shape=[8, 4, 4], dtype="float32")
out = layers.fsp_matrix(x, y)
self.assertIsNotNone(out)
print(str(program))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import math
import unittest
import numpy as np
import os
os.environ['CPU_NUM'] = '1'
def random_reader(sample_num):
def __impl__():
for _ in range(sample_num):
yield np.random.random(
size=[784]).astype('float32'), np.random.random_integers(
low=0, high=9, size=[1]).astype('int64')
return paddle.reader.cache(__impl__)
class TestCaseBase(unittest.TestCase):
def setUp(self):
self.batch_size = 32
self.epoch_num = 2
self.sample_num = 165
def generate_all_data(self, reader):
ret = []
for d in reader():
slots = [[], []]
for item in d:
slots[0].append(item[0])
slots[1].append(item[1])
slots = [np.array(slot) for slot in slots]
ret.append(slots)
return ret
def run_main(self, reader, use_sample_generator, iterable, drop_last):
image = fluid.layers.data(name='image', dtype='float32', shape=[784])
label = fluid.layers.data(name='label', dtype='int64', shape=[1])
py_reader = fluid.io.PyReader(
feed_list=[image, label],
capacity=16,
iterable=iterable,
use_double_buffer=False)
batch_reader = paddle.batch(reader, self.batch_size, drop_last)
all_datas = self.generate_all_data(batch_reader)
if not use_sample_generator:
py_reader.decorate_sample_list_generator(
batch_reader, places=fluid.cpu_places())
else:
py_reader.decorate_sample_generator(
reader, self.batch_size, drop_last, places=fluid.cpu_places())
if drop_last:
batch_num = int(self.sample_num / self.batch_size)
else:
batch_num = math.ceil(float(self.sample_num) / self.batch_size)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
for _ in range(self.epoch_num):
if py_reader.iterable:
step = 0
for data in py_reader():
img, lbl = exe.run(feed=data, fetch_list=[image, label])
self.assertArrayEqual(img, all_datas[step][0])
self.assertArrayEqual(lbl, all_datas[step][1])
step += 1
self.assertEqual(step, len(all_datas))
else:
step = 0
try:
py_reader.start()
while True:
img, lbl = exe.run(fetch_list=[image, label])
self.assertArrayEqual(img, all_datas[step][0])
self.assertArrayEqual(lbl, all_datas[step][1])
step += 1
except fluid.core.EOFException:
py_reader.reset()
self.assertEqual(step, len(all_datas))
break
def assertArrayEqual(self, arr1, arr2):
self.assertEqual(arr1.shape, arr2.shape)
self.assertTrue((arr1 == arr2).all())
def test_main(self):
reader = random_reader(self.sample_num)
for use_sample_generator in [False, True]:
for iterable in [False, True]:
for drop_last in [False, True]:
with fluid.program_guard(fluid.Program(), fluid.Program()):
self.run_main(reader, use_sample_generator, iterable,
drop_last)
class TestCase1(TestCaseBase):
def setUp(self):
self.batch_size = 32
self.epoch_num = 10
self.sample_num = 160
class TestCase2(TestCaseBase):
def setUp(self):
self.batch_size = 32
self.epoch_num = 2
self.sample_num = 200
class TestCase3(TestCaseBase):
def setUp(self):
self.batch_size = 32
self.epoch_num = 2
self.sample_num = 159
if __name__ == '__main__':
unittest.main()
......@@ -13,7 +13,7 @@
# limitations under the License.
__all__ = [
'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
'cache', 'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader',
'multiprocess_reader', 'Fake'
]
......@@ -33,6 +33,30 @@ import zlib
import paddle.compat as cpt
def cache(reader):
"""
Cache the reader data into memory.
Be careful that this method may take a long time to process
and consume a lot of memory. :code:`reader()` would only be
called once.
Args:
reader (generator): a reader object which yields
data each time.
Returns:
generator: a decorated reader object which yields data from cached memory.
"""
all_data = tuple(reader())
def __impl__():
for item in all_data:
yield item
return __impl__
def map_readers(func, *readers):
"""
Creates a data reader that outputs return value of function using
......
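A short usage sketch of the new :code:`cache` decorator (editor's illustration; the generator below is made up): the wrapped reader is consumed exactly once when :code:`cache` is applied, and every later call replays the items from memory, which is what the sample-generator test earlier in this diff relies on.

.. code-block:: python

    import paddle

    def data_gen():
        for i in range(5):
            yield i

    cached = paddle.reader.cache(data_gen)  # data_gen() is consumed here
    print(list(cached()))  # [0, 1, 2, 3, 4]
    print(list(cached()))  # same items, replayed from the in-memory cache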
......@@ -117,6 +117,7 @@ packages=['paddle',
'paddle.fluid.contrib.slim.graph',
'paddle.fluid.contrib.slim.prune',
'paddle.fluid.contrib.slim.quantization',
'paddle.fluid.contrib.slim.distillation',
'paddle.fluid.contrib.utils',
'paddle.fluid.transpiler',
'paddle.fluid.transpiler.details']
......