Unverified commit c3c8d5dd authored by: J Jeff Rasley, committed by: GitHub
Parent: f0304bd1
name: Build
name: unit-tests
on:
push:
......@@ -14,7 +14,7 @@ on:
jobs:
# unit tests running on nvidia gpus
nv-torch12-p40:
runs-on: [self-hosted, nvidia, torch12]
runs-on: [self-hosted, nvidia, torch12, p40]
steps:
- uses: actions/checkout@v2
......@@ -102,6 +102,43 @@ jobs:
find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec pip install -r {} \;
TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed
# unit tests running on amd gpus
amd:
# The type of runner that the job will run on
runs-on: [self-hosted, amd]
# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v2
# Runs a single command using the runner's shell
- name: environment
run: |
rocm-smi --showhw
which python
python --version
which hipcc
hipcc --version
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
sudo apt-get update
sudo apt-get install -y libaio-dev
# Runs a set of commands using the runner's shell
- name: Install deepspeed
run: |
pip install .[dev,1bit,autotuning]
python -c "from deepspeed.env_report import cli_main; cli_main()"
#ds_report
# Runs a set of commands using the runner's shell
- name: Unit tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
#TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
nv-lightning-v100:
runs-on: [self-hosted, nvidia, torch18, v100]
......
......@@ -5,7 +5,9 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
int cublas_gemm_ex(cublasHandle_t handle,
......@@ -19,7 +21,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
......@@ -32,7 +38,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
......@@ -49,7 +59,11 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
......@@ -66,4 +80,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
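Editorial note: the declarations above give each wrapper a platform-appropriate default GEMM algorithm behind the same preprocessor guard. A minimal sketch of that pattern, assuming rocblas.h / cublas_v2.h are on the include path (the alias and function names below are illustrative, not from this patch):

#ifdef __HIP_PLATFORM_HCC__
#include <rocblas.h>
using gemm_algo_t = rocblas_gemm_algo;  // rocBLAS exposes one generic algo enum
constexpr gemm_algo_t kDefaultGemmAlgo = rocblas_gemm_algo_standard;
#else
#include <cublas_v2.h>
using gemm_algo_t = cublasGemmAlgo_t;   // cuBLAS distinguishes default vs. tensor-op algos
constexpr gemm_algo_t kDefaultGemmAlgo = CUBLAS_GEMM_DEFAULT;
#endif

// Callers can then accept the algorithm with a portable default:
int my_gemm(float* C, const float* A, const float* B, int m, int n, int k,
            gemm_algo_t algo = kDefaultGemmAlgo);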
......@@ -5,7 +5,15 @@
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
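// Editorial note (not part of the patch): kernels elsewhere in this change test this
// macro with `#ifdef HALF_PRECISION_AVAILABLE` instead of `#if __CUDA_ARCH__ >= 700`,
// so the half-precision paths stay enabled under HIP, where __CUDA_ARCH__ is not defined.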
#include <curand_kernel.h>
#include "context.h"
......
......@@ -43,7 +43,11 @@ public:
weights,
input_ptr,
out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[0]));
#else
cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
}
void Backward(int bsz,
const T* out_grad,
......@@ -68,7 +72,11 @@ public:
input_ptr,
out_grad,
weights_grad,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[1]));
#else
cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_N,
......@@ -81,7 +89,11 @@ public:
weights,
out_grad,
inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[2]));
#else
cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
}
......
......@@ -2,7 +2,9 @@
#pragma once
#include <cuda_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
......@@ -58,7 +60,11 @@ public:
B,
A,
C,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw1 = Run(loops, [=](int algo) {
......@@ -73,7 +79,11 @@ public:
A,
C,
B,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw2 = Run(loops, [=](int algo) {
......@@ -88,7 +98,11 @@ public:
B,
C,
A,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
......@@ -100,8 +114,12 @@ public:
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
......@@ -186,7 +204,11 @@ public:
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw1 = Run(loops, [=](int algo) {
......@@ -216,7 +238,11 @@ public:
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw2 = Run(loops, [=](int algo) {
......@@ -243,7 +269,11 @@ public:
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
......@@ -255,8 +285,12 @@ public:
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
......
......@@ -3,7 +3,11 @@
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>
#include "context.h"
......
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file hcc_detail/hip_cooperative_groups.h
*
* @brief Device side implementation of `Cooperative Group` feature.
*
* Defines new types and device API wrappers related to `Cooperative Group`
* feature, which the programmer can use directly in their kernel(s) in order to
* make use of this feature.
*/
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
//#if __cplusplus
#if __cplusplus && defined(__clang__) && defined(__HIP__)
#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
#if ROCM_VERSION_MAJOR < 5 and ROCM_VERSION_MINOR < 4
#include <hip/hcc_detail/device_functions.h>
#endif
namespace cooperative_groups {
/** \brief The base type of all cooperative group types
*
* \details Holds the key properties of a constructed cooperative group type
* object, like the group type, its size, etc
*/
/*
class thread_group {
protected:
uint32_t _type; // thread_group type
uint32_t _size; // total number of threads in the thread_group
uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
// LSB represents lane 0, and MSB represents lane 63
// Construct a thread group, and set thread group type and other essential
// thread group properties. This generic thread group is directly constructed
// only when the group is supposed to contain only the calling thread
// (through the API `this_thread()`), and in all other cases, this thread
// group object is a sub-object of some other derived thread group object
__CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
uint64_t mask = (uint64_t)0) {
_type = type;
_size = size;
_mask = mask;
}
public:
// Total number of threads in the thread group, and this serves the purpose
// for all derived cooperative group types since their `size` is directly
// saved during the construction
__CG_QUALIFIER__ uint32_t size() const {
return _size;
}
// Rank of the calling thread within [0, size())
__CG_QUALIFIER__ uint32_t thread_rank() const;
// Is this cooperative group type valid?
__CG_QUALIFIER__ bool is_valid() const;
// synchronize the threads in the thread group
__CG_QUALIFIER__ void sync() const;
};
*/
class thread_group {
protected:
bool _tiled_partition; // this_thread_block() constructor sets to false
uint32_t _size; // this_thread_block() constructor sets to size()
uint32_t local_rank; // this_thread_block() constructor sets to thread_rank()
uint32_t _mask;
uint32_t _type;
public:
__CG_QUALIFIER__ thread_group(internal::group_type type,
uint32_t group_size,
uint64_t mask = (uint64_t)0)
{
_type = type;
_size = group_size;
_mask = mask;
local_rank = internal::workgroup::thread_rank();
}
__CG_QUALIFIER__ void tiled_partition(const thread_group& parent, unsigned int tile_size)
{
if ((ceil(log2(tile_size)) == floor(log2(tile_size))) || tile_size == 0 || tile_size > 64 ||
parent.size() < tile_size)
_tiled_partition = false;
// xxx : abort
_tiled_partition = true;
_size = tile_size;
local_rank = parent.thread_rank() % tile_size;
}
__CG_QUALIFIER__ void sync() const;
__CG_QUALIFIER__ uint32_t size() const { return _size; }
__CG_QUALIFIER__ uint32_t thread_rank() const;
__CG_QUALIFIER__ float shfl_down(float var, unsigned int delta) const
{
return (__shfl_down(var, delta, _size));
}
__CG_QUALIFIER__ float shfl_xor(float var, int mask) const
{
return (__shfl_xor(var, mask, _size));
}
__CG_QUALIFIER__ float shfl(float var, unsigned int src_lane) const
{
return (__shfl(var, src_lane, _size));
}
__CG_QUALIFIER__ bool is_valid() const;
};
/** \brief The multi-grid cooperative group type
*
* \details Represents an inter-device cooperative group type where the
* participating threads within the group span across multiple
* devices, running the (same) kernel on these devices
*/
class multi_grid_group : public thread_group {
// Only these friend functions are allowed to construct an object of this class
// and access its resources
friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
protected:
// Construct a multi-grid thread group (through the API this_multi_grid())
explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
: thread_group(internal::cg_multi_grid, size)
{
}
public:
// Number of invocations participating in this multi-grid group. In other
// words, the number of GPUs
__CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
// Rank of this invocation. In other words, an ID number within the range
// [0, num_grids()) for the GPU this kernel is running on
__CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
__CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
__CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
};
/** \brief User exposed API interface to construct multi-grid cooperative
* group type object - `multi_grid_group`
*
* \details Users are not allowed to construct an object of type
* `multi_grid_group` directly. Instead, it should be constructed through this
* API function
*/
__CG_QUALIFIER__ multi_grid_group this_multi_grid()
{
return multi_grid_group(internal::multi_grid::size());
}
/** \brief The grid cooperative group type
*
* \details Represents an inter-workgroup cooperative group type where the
* participating threads within the group span across multiple
* workgroups running the (same) kernel on the same device
*/
class grid_group : public thread_group {
// Only these friend functions are allowed to construct an object of this class
// and access its resources
friend __CG_QUALIFIER__ grid_group this_grid();
protected:
// Construct grid thread group (through the API this_grid())
explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}
public:
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
__CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
__CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
};
/** \brief User exposed API interface to construct grid cooperative group type
* object - `grid_group`
*
* \details Users are not allowed to construct an object of type
* `grid_group` directly. Instead, it should be constructed through this
* API function
*/
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
* type
*
* \details Represents an intra-workgroup cooperative group type where the
* participating threads within the group are exactly the same threads
* that participate in the currently executing `workgroup`
*/
class thread_block : public thread_group {
// Only these friend functions are allowed to construct an object of this
// class and access its resources
friend __CG_QUALIFIER__ thread_block this_thread_block();
protected:
// Construct a workgroup thread group (through the API this_thread_block())
explicit __CG_QUALIFIER__ thread_block(uint32_t size)
: thread_group(internal::cg_workgroup, size)
{
}
public:
// 3-dimensional block index within the grid
__CG_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
// 3-dimensional thread index within the block
__CG_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::workgroup::thread_rank(); }
__CG_QUALIFIER__ bool is_valid() const { return internal::workgroup::is_valid(); }
__CG_QUALIFIER__ void sync() const { internal::workgroup::sync(); }
};
/** \brief User exposed API interface to construct workgroup cooperative
* group type object - `thread_block`
*
* \details Users are not allowed to construct an object of type
* `thread_block` directly. Instead, it should be constructed through this
* API function
*/
__CG_QUALIFIER__ thread_block this_thread_block()
{
return thread_block(internal::workgroup::size());
}
/**
* Implementation of all publicly exposed base class APIs
*/
__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const
{
switch (this->_type) {
case internal::cg_multi_grid: {
return (static_cast<const multi_grid_group*>(this)->thread_rank());
}
case internal::cg_grid: {
return (static_cast<const grid_group*>(this)->thread_rank());
}
case internal::cg_workgroup: {
return (static_cast<const thread_block*>(this)->thread_rank());
}
case internal::cg_coalesced_tile: {
return local_rank;
}
default: {
assert(false && "invalid cooperative group type");
return -1;
}
}
}
__CG_QUALIFIER__ bool thread_group::is_valid() const
{
switch (this->_type) {
case internal::cg_multi_grid: {
return (static_cast<const multi_grid_group*>(this)->is_valid());
}
case internal::cg_grid: {
return (static_cast<const grid_group*>(this)->is_valid());
}
case internal::cg_workgroup: {
return (static_cast<const thread_block*>(this)->is_valid());
}
case internal::cg_coalesced_tile: {
return _tiled_partition;
}
default: {
assert(false && "invalid cooperative group type");
return false;
}
}
}
__CG_QUALIFIER__ void thread_group::sync() const
{
switch (this->_type) {
case internal::cg_multi_grid: {
static_cast<const multi_grid_group*>(this)->sync();
break;
}
case internal::cg_grid: {
static_cast<const grid_group*>(this)->sync();
break;
}
case internal::cg_workgroup: {
static_cast<const thread_block*>(this)->sync();
break;
}
case internal::cg_coalesced_tile: {
if (!_tiled_partition) // If in a tiled partition, this is a no-op
__syncthreads();
break;
}
default: {
assert(false && "invalid cooperative group type");
}
}
}
/**
* Implementation of publicly exposed `wrapper` APIs on top of basic cooperative
* group type APIs
*/
template <class CGTy>
__CG_QUALIFIER__ uint32_t group_size(CGTy const& g)
{
return g.size();
}
template <class CGTy>
__CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g)
{
return g.thread_rank();
}
template <class CGTy>
__CG_QUALIFIER__ bool is_valid(CGTy const& g)
{
return g.is_valid();
}
template <class CGTy>
__CG_QUALIFIER__ void sync(CGTy const& g)
{
g.sync();
}
} // namespace cooperative_groups
#endif // __cplusplus
#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
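Editorial note: for context, a minimal usage sketch of the API this shim provides; it is not part of the patch, and the kernel name and block size are illustrative. The same code compiles against CUDA's <cooperative_groups.h> or this HIP header, since both expose this_thread_block(), thread_rank(), size(), and sync():

#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif

namespace cg = cooperative_groups;

// Each 256-thread block reverses its own tile of `data` through shared memory.
__global__ void reverse_block(float* data)
{
    __shared__ float tile[256];                 // assumes blockDim.x == 256
    cg::thread_block block = cg::this_thread_block();

    unsigned local = block.thread_rank();
    unsigned global = block.group_index().x * block.size() + local;

    tile[local] = data[global];
    block.sync();                               // the whole workgroup must finish loading
    data[global] = tile[block.size() - 1 - local];
}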
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file hcc_detail/hip_cooperative_groups_helper.h
*
* @brief Device side implementation of cooperative group feature.
*
* Defines helper constructs and APIs which aid the types and device API
* wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
*/
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
#if __cplusplus
#if ROCM_VERSION_MAJOR < 5 and ROCM_VERSION_MINOR < 4
#include <hip/hcc_detail/device_functions.h>
#include <hip/hcc_detail/hip_runtime_api.h>
#else
#include <hip/hcc_detail/amd_device_functions.h>
#endif
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif
#if !defined(__CG_QUALIFIER__)
#define __CG_QUALIFIER__ __device__ __forceinline__
#endif
#if !defined(__CG_STATIC_QUALIFIER__)
#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
#endif
#if !defined(WAVEFRONT_SIZE)
#define WAVEFRONT_SIZE 64
#endif
namespace cooperative_groups {
namespace internal {
/** \brief Enums representing different cooperative group types
*/
typedef enum { cg_invalid, cg_multi_grid, cg_grid, cg_workgroup, cg_coalesced_tile } group_type;
/**
* Functionalities related to multi-grid cooperative group type
*/
namespace multi_grid {
__CG_STATIC_QUALIFIER__ uint32_t num_grids() { return (uint32_t)__ockl_multi_grid_num_grids(); }
__CG_STATIC_QUALIFIER__ uint32_t grid_rank() { return (uint32_t)__ockl_multi_grid_grid_rank(); }
__CG_STATIC_QUALIFIER__ uint32_t size() { return (uint32_t)__ockl_multi_grid_size(); }
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return (uint32_t)__ockl_multi_grid_thread_rank(); }
__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_multi_grid_is_valid(); }
__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
} // namespace multi_grid
/**
* Functionalities related to grid cooperative group type
*/
namespace grid {
__CG_STATIC_QUALIFIER__ uint32_t size()
{
return (uint32_t)((hipBlockDim_z * hipGridDim_z) * (hipBlockDim_y * hipGridDim_y) *
(hipBlockDim_x * hipGridDim_x));
}
__CG_STATIC_QUALIFIER__ uint32_t thread_rank()
{
// Compute the global id of the workgroup to which the current thread belongs
uint32_t blkIdx = (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
(hipBlockIdx_y * hipGridDim_x) + (hipBlockIdx_x));
// Compute the total number of threads that precede the current workgroup
// within the grid
uint32_t num_threads_till_current_workgroup =
(uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
// Compute the thread's local rank within the current workgroup
uint32_t local_thread_rank = (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
(hipThreadIdx_y * hipBlockDim_x) + (hipThreadIdx_x));
return (num_threads_till_current_workgroup + local_thread_rank);
}
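// Illustrative example (editorial, not in the original header): with a block size of
// (64, 1, 1) and a grid of (4, 1, 1), the thread with hipThreadIdx_x == 5 in the block
// with hipBlockIdx_x == 2 has blkIdx == 2, so its grid-wide rank is 2 * 64 + 5 == 133.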
__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_grid_is_valid(); }
__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
} // namespace grid
/**
* Functionalities related to `workgroup` (thread_block in CUDA terminology)
* cooperative group type
*/
namespace workgroup {
__CG_STATIC_QUALIFIER__ dim3 group_index()
{
return (dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y, (uint32_t)hipBlockIdx_z));
}
__CG_STATIC_QUALIFIER__ dim3 thread_index()
{
return (dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y, (uint32_t)hipThreadIdx_z));
}
__CG_STATIC_QUALIFIER__ uint32_t size()
{
return ((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
}
__CG_STATIC_QUALIFIER__ uint32_t thread_rank()
{
return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
(hipThreadIdx_y * hipBlockDim_x) + (hipThreadIdx_x)));
}
__CG_STATIC_QUALIFIER__ bool is_valid()
{
// TODO(mahesha): does any functionality need to be added here? I believe not
return true;
}
__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
} // namespace workgroup
} // namespace internal
} // namespace cooperative_groups
#endif // __cplusplus
#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
......@@ -72,7 +72,11 @@ public:
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[0]));
#else
cublasGemmAlgo_t(_config.gemm_algos[0]));
#endif
}
void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle)
......@@ -96,7 +100,11 @@ public:
stride_b,
stride_c,
_config.batch_size,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[0]));
#else
cublasGemmAlgo_t(_config.gemm_algos[0]));
#endif
k_buf = _buffer_a;
q_buf = _buffer_b;
......@@ -136,7 +144,11 @@ public:
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[1]));
#else
cublasGemmAlgo_t(_config.gemm_algos[1]));
#endif
// A needs to be transposed.
cublasOperation_t op_a = (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
......@@ -161,7 +173,11 @@ public:
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[2]));
#else
cublasGemmAlgo_t(_config.gemm_algos[2]));
#endif
}
inline int GetN() const { return _config.k; }
......
......@@ -14,7 +14,11 @@
#include <iostream>
//#include <helper_functions.h>
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <cuda_runtime_api.h>
#include <stdio.h>
......@@ -78,7 +82,11 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
T a_sum = s_a[tid];
T b_sum = s_b[tid];
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
cta.sync();
#else
cg::sync(cta);
#endif
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256)) {
......@@ -86,21 +94,33 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
s_b[tid] = b_sum = b_sum + s_b[tid + 256];
}
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
cta.sync();
#else
cg::sync(cta);
#endif
if ((blockSize >= 256) && (tid < 128)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 128];
s_b[tid] = b_sum = b_sum + s_b[tid + 128];
}
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
cta.sync();
#else
cg::sync(cta);
#endif
if ((blockSize >= 128) && (tid < 64)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 64];
s_b[tid] = b_sum = b_sum + s_b[tid + 64];
}
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
cta.sync();
#else
cg::sync(cta);
#endif
#if (__CUDA_ARCH__ >= 300)
if (tid < 32) {
......@@ -124,42 +144,66 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
s_b[tid] = b_sum = b_sum + s_b[tid + 32];
}
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
cta.sync();
#else
cg::sync(cta);
#endif
if ((blockSize >= 32) && (tid < 16)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 16];
s_b[tid] = b_sum = b_sum + s_b[tid + 16];
}
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
cta.sync();
#else
cg::sync(cta);
#endif
if ((blockSize >= 16) && (tid < 8)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 8];
s_b[tid] = b_sum = b_sum + s_b[tid + 8];
}
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
cta.sync();
#else
cg::sync(cta);
#endif
if ((blockSize >= 8) && (tid < 4)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 4];
s_b[tid] = b_sum = b_sum + s_b[tid + 4];
}
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
cta.sync();
#else
cg::sync(cta);
#endif
if ((blockSize >= 4) && (tid < 2)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 2];
s_b[tid] = b_sum = b_sum + s_b[tid + 2];
}
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
cta.sync();
#else
cg::sync(cta);
#endif
if ((blockSize >= 2) && (tid < 1)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 1];
s_b[tid] = b_sum = b_sum + s_b[tid + 1];
}
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
cta.sync();
#else
cg::sync(cta);
#endif
#endif
......@@ -464,7 +508,7 @@ void fused_lamb_cuda(at::Tensor& p,
lamb_coeff.data<scalar_t>());
}));
}
THCudaCheck(cudaGetLastError());
C10_CUDA_CHECK(cudaGetLastError());
}
// template __device__ void reduce_two_vectors_in_register<float,512>(float a, float b, float* g_a,
......
#include "cublas_wrappers.h"
#ifdef __HIP_PLATFORM_HCC__
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
rocblas_gemm_algo algo)
#else
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
......@@ -12,7 +26,34 @@ int cublas_gemm_ex(cublasHandle_t handle,
const float* B,
float* C,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status = rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
rocblas_datatype_f32_r,
(transa == rocblas_operation_none) ? m : k,
(const void*)B,
rocblas_datatype_f32_r,
(transb == rocblas_operation_none) ? k : n,
(const void*)beta,
C,
rocblas_datatype_f32_r,
m,
C,
rocblas_datatype_f32_r,
m,
rocblas_datatype_f32_r,
algo,
0,
0);
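// Editorial note: the trailing 0, 0 are rocblas_gemm_ex's solution_index and flags
// arguments, and C is passed twice because rocBLAS separates the input matrix C from
// the output matrix D; here the two alias, so the call updates C in place, matching
// the cublasGemmEx path below.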
#else
cublasStatus_t status = cublasGemmEx(handle,
transa,
transb,
......@@ -32,8 +73,13 @@ int cublas_gemm_ex(cublasHandle_t handle,
m,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
......@@ -45,6 +91,20 @@ int cublas_gemm_ex(cublasHandle_t handle,
return 0;
}
#ifdef __HIP_PLATFORM_HCC__
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
rocblas_gemm_algo algo)
#else
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
......@@ -57,7 +117,34 @@ int cublas_gemm_ex(cublasHandle_t handle,
const __half* B,
__half* C,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status = rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
rocblas_datatype_f16_r,
(transa == rocblas_operation_none) ? m : k,
(const void*)B,
rocblas_datatype_f16_r,
(transb == rocblas_operation_none) ? k : n,
(const void*)beta,
(void*)C,
rocblas_datatype_f16_r,
m,
(void*)C,
rocblas_datatype_f16_r,
m,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
cublasStatus_t status = cublasGemmEx(handle,
transa,
transb,
......@@ -77,8 +164,13 @@ int cublas_gemm_ex(cublasHandle_t handle,
m,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
......@@ -90,6 +182,24 @@ int cublas_gemm_ex(cublasHandle_t handle,
return 0;
}
#ifdef __HIP_PLATFORM_HCC__
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
rocblas_gemm_algo algo)
#else
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
......@@ -106,7 +216,40 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_C,
int batch,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status =
rocblas_gemm_strided_batched_ex(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
rocblas_datatype_f32_r,
(op_A == rocblas_operation_none) ? m : k,
stride_A,
B,
rocblas_datatype_f32_r,
(op_B == rocblas_operation_none) ? k : n,
stride_B,
beta,
C,
rocblas_datatype_f32_r,
m,
stride_C,
C,
rocblas_datatype_f32_r,
m,
stride_C,
batch,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
op_A,
op_B,
......@@ -130,8 +273,13 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
batch,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n",
batch,
......@@ -144,6 +292,24 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
return 0;
}
#ifdef __HIP_PLATFORM_HCC__
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
rocblas_gemm_algo algo)
#else
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
......@@ -160,7 +326,40 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_C,
int batch,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status =
rocblas_gemm_strided_batched_ex(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
rocblas_datatype_f16_r,
(op_A == rocblas_operation_none) ? m : k,
stride_A,
B,
rocblas_datatype_f16_r,
(op_B == rocblas_operation_none) ? k : n,
stride_B,
beta,
C,
rocblas_datatype_f16_r,
m,
stride_C,
C,
rocblas_datatype_f16_r,
m,
stride_C,
batch,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
op_A,
op_B,
......@@ -184,8 +383,13 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
batch,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
......
......@@ -140,7 +140,9 @@ BertTransformerLayer<T>::~BertTransformerLayer()
template <typename T>
void BertTransformerLayer<T>::Initialize()
{
#ifndef __HIP_PLATFORM_HCC__
if (std::is_same<T, __half>::value) cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH);
#endif
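// Editorial note: the HIP build skips cublasSetMathMode above, presumably because
// rocBLAS exposes no equivalent math-mode switch; on ROCm the precision behavior is
// chosen per call via the compute type passed to rocblas_gemm_ex instead.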
}
template <typename T>
......
......@@ -60,7 +60,7 @@ __global__ void gelu_kernel(const float* input, float* vals, int row_stride, int
__global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
......@@ -131,7 +131,7 @@ __global__ void fused_bias_gelu(const __half* input,
int row_stride,
int iterations)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
......@@ -214,7 +214,7 @@ __global__ void d_gelu_func(__half* d_output,
int row_stride,
int iterations)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
int row = blockIdx.x;
int id = threadIdx.x;
int loop_stride = blockDim.x;
......
......@@ -11,8 +11,12 @@ __global__ void column_sum_reduce(const T* __restrict__ inp,
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
g.tiled_partition(b, TILE_DIM);
#else
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
#endif
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int y_stride = width * TILE_DIM;
......
......@@ -46,7 +46,7 @@ __global__ void dequantize_kernel(__half* output,
unsigned groups,
unsigned merge_count)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
unsigned merge_hidden = hidden_dim >> merge_count;
unsigned quantization_stride = (merge_hidden * output_size) / groups;
......
......@@ -42,7 +42,7 @@ __global__ void fused_bias_gelu(__half* input,
int total_count,
int intermediate_size)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
float2* input_cast = reinterpret_cast<float2*>(input);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
......@@ -120,7 +120,7 @@ __global__ void fused_bias_add(float* input, const float* bias, int total_count,
__global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
float2* input_cast = reinterpret_cast<float2*>(input);
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
......@@ -206,7 +206,7 @@ __global__ void fused_bias_residual(__half* input,
int intermediate_size,
bool add_bias)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
float2* input_cast = reinterpret_cast<float2*>(input);
const float2* residual_cast = reinterpret_cast<const float2*>(residual);
......
......@@ -85,7 +85,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output,
float epsilon,
int row_stride)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
......@@ -287,7 +287,7 @@ __global__ void fused_residual_layer_norm(__half* norm,
int row_stride,
bool preLN)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
cg::thread_block b = cg::this_thread_block();
......
......@@ -38,7 +38,7 @@ __global__ void attn_softmax_v2(__half* vals,
int iterations,
int reduceWidth)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
......
#pragma once
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
......
......@@ -28,8 +28,12 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
g.tiled_partition(b, WARP_SIZE);
#else
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
int gid = id / WARP_SIZE;
......@@ -125,12 +129,17 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
__half* means,
int row_stride)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
g.tiled_partition(b, 32);
#else
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......@@ -322,7 +331,12 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
g.tiled_partition(b, 32);
#else
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......@@ -416,13 +430,18 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
__half* vars,
int row_stride)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
g.tiled_partition(b, 32);
#else
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......@@ -634,7 +653,12 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
g.tiled_partition(b, TILE_DIM);
#else
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
#endif
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
......@@ -701,7 +725,12 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
g.tiled_partition(b, TILE_DIM);
#else
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
#endif
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
......@@ -766,7 +795,12 @@ __global__ void LayerNormBackward2(const float* out_grad,
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
g.tiled_partition(b, WARP_SIZE);
#else
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......@@ -866,7 +900,12 @@ __global__ void LayerNormBackward2(const __half* out_grad,
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
g.tiled_partition(b, WARP_SIZE);
#else
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......@@ -1081,7 +1120,12 @@ __global__ void LayerNormBackward2(const float* out_grad,
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
g.tiled_partition(b, WARP_SIZE);
#else
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......@@ -1176,7 +1220,12 @@ __global__ void LayerNormBackward2(const __half* out_grad,
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
g.tiled_partition(b, WARP_SIZE);
#else
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......@@ -1380,7 +1429,12 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
g.tiled_partition(b, TILE_DIM);
#else
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
#endif
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
......@@ -1442,7 +1496,12 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
g.tiled_partition(b, TILE_DIM);
#else
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
#endif
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int offset = threadIdx.y * width + idx;
......@@ -1501,7 +1560,12 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
g.tiled_partition(b, WARP_SIZE);
#else
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......@@ -1605,7 +1669,12 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
g.tiled_partition(b, WARP_SIZE);
#else
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......@@ -1823,7 +1892,12 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
g.tiled_partition(b, WARP_SIZE);
#else
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......@@ -1926,7 +2000,12 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
int iterations = row_stride / iteration_stride;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
g.tiled_partition(b, WARP_SIZE);
#else
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......
......@@ -34,7 +34,12 @@ __global__ void attn_softmax(float* vals,
int block_width = blockStride * seq_length;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
g.tiled_partition(b, tbSize);
#else
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
#endif
int batch = blockIdx.y;
int row = blockIdx.x;
......@@ -156,7 +161,7 @@ __global__ void attn_softmax(__half* vals,
int seq_length,
int iterations)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
__shared__ float partialSum[MAX_WARP_NUM];
int warp_num = blockDim.x >> WARP_SIZE_BITS;
......@@ -165,7 +170,12 @@ __global__ void attn_softmax(__half* vals,
int block_width = blockStride * seq_length;
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
g.tiled_partition(b, tbSize);
#else
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
#endif
int batch = blockIdx.y;
int row = blockIdx.x;
......@@ -449,7 +459,12 @@ __global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_
: MAX_THREAD_ITERATIONS);
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
g.tiled_partition(b, tbSize);
#else
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
#endif
int row = blockIdx.x;
int id = threadIdx.x;
......@@ -524,7 +539,12 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
}
cg::thread_block b = cg::this_thread_block();
#ifdef __HIP_PLATFORM_HCC__
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
g.tiled_partition(b, WARP_SIZE);
#else
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#endif
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
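// Editorial note: this xor-shuffle butterfly leaves the complete sum in every lane of
// the group, so no broadcast from lane 0 is needed afterwards; on HIP it goes through
// the shim's shfl_xor wrapper defined earlier rather than cg::thread_block_tile's member.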
......
......@@ -96,7 +96,7 @@ __global__ void transform_0213<__half>(__half* output,
int heads,
int head_ext)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
......@@ -219,7 +219,7 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
int heads,
int head_ext)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
......@@ -289,7 +289,7 @@ __global__ void bias_add_transform_0213_v2(__half* output,
int seq_length,
int heads)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
__shared__ float4 in_data[3072];
int d0_stride = hidden_dim * seq_length;
......@@ -451,7 +451,7 @@ __global__ void transform4d_0213<__half>(__half* out,
int hidden_dim,
int head_ext)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
int d0_stride = hidden_dim * (seq_length / head_ext);
int d1_stride = hidden_dim;
......@@ -487,7 +487,7 @@ __global__ void transform4d_0213_v2(__half* out,
int seq_length,
int hidden_dim)
{
#if __CUDA_ARCH__ >= 700
#ifdef HALF_PRECISION_AVAILABLE
__shared__ float4 in_data[3072];
int d0_stride = hidden_dim * seq_length;
......
......@@ -79,6 +79,11 @@ def nvcc_version():
def debug_report():
max_dots = 33
hip_version = 'unknown'
if hasattr(torch.version, 'hip'):
hip_version = torch.version.hip
report = [
("torch install path",
torch.__path__),
......@@ -86,6 +91,8 @@ def debug_report():
torch.__version__),
("torch cuda version",
torch.version.cuda),
("torch hip version",
hip_version),
("nvcc version",
nvcc_version()),
("deepspeed install path",
......@@ -94,7 +101,8 @@ def debug_report():
f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}"
),
("deepspeed wheel compiled w.",
f"torch {torch_info['version']}, cuda {torch_info['cuda_version']}"),
f"torch {torch_info['version']}, cuda {torch_info['cuda_version']}, hip {torch_info['hip_version']}"
),
]
print("DeepSpeed general environment info:")
for name, value in report:
......
......@@ -14,4 +14,4 @@ except ModuleNotFoundError:
from .ops.op_builder import ALL_OPS
installed_ops = dict.fromkeys(ALL_OPS.keys(), False)
compatible_ops = dict.fromkeys(ALL_OPS.keys(), False)
torch_info = {'version': "0.0", "cuda_version": "0.0"}
torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"}
from . import adam
from . import adagrad
from . import lamb
#from ..git_version_info_installed import installed_ops as __installed_ops__
#if __installed_ops__['sparse_attn']:
from . import sparse_attention
from . import transformer
......
......@@ -415,27 +415,6 @@ class DeepSpeedEngine(Module):
"""
return self._global_grad_norm
def set_train_batch_size(self, train_batch_size):
"""Adjust the global batch size by increasing or decreasing the number of
micro-batches (i.e., gradient accumulation steps). The size of each micro-batch
(i.e., ``train_micro_batch_size_per_gpu``) is not changed.
Args:
train_batch_size (int): The new global batch size for training.
Raises:
ValueError: if ``train_batch_size`` is not divisible by the
configured micro-batch size and data parallelism.
"""
if train_batch_size % (self.train_micro_batch_size_per_gpu() *
self.dp_world_size) != 0:
#print(f'{train_batch_size=} {self.train_micro_batch_size_per_gpu()=} {self.dp_world_size=}')
raise ValueError(
f'Train batch size must be divisible by micro-batch data parallelism')
new_gas = train_batch_size // (self.train_micro_batch_size_per_gpu() *
self.dp_world_size)
# overwrite config
self._config.train_batch_size = train_batch_size
self._config.gradient_accumulation_steps = new_gas
def get_global_grad_norm(self) -> float:
"""Return the 2-norm of all gradients. If there is model parallelism,
the norm will be global.
......@@ -2839,10 +2818,9 @@ class DeepSpeedEngine(Module):
self.optimizer.state_dict()
if self.optimizer and not self.zero_optimization() else None
}
torch.save(optimizer_state,
self._get_optimizer_ckpt_name(save_dir,
tag,
expp_rank))
with open(self._get_optimizer_ckpt_name(save_dir, tag, expp_rank), 'wb') as fd:
torch.save(optimizer_state, fd)
fd.flush()
# get non-moe parameters
model_state_dict = self._get_non_moe_state_dict(self.module_state_dict())
......@@ -2872,7 +2850,9 @@ class DeepSpeedEngine(Module):
}
state.update(client_state)
logger.info(f'Saving model checkpoint: {save_path}')
torch.save(state, save_path)
with open(save_path, 'wb') as fd:
torch.save(state, fd)
fd.flush()
self._curr_save_path = None
def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint):
......@@ -3006,7 +2986,9 @@ class DeepSpeedEngine(Module):
zero_sd = dict(optimizer_state_dict=self.optimizer.state_dict(),
ds_config=self.config,
ds_version=version)
torch.save(zero_sd, zero_checkpoint_name)
with open(zero_checkpoint_name, 'wb') as fd:
torch.save(zero_sd, fd)
fd.flush()
if self.global_rank == 0:
self._copy_recovery_script(save_path)
logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name))
......
FROM rocm/pytorch:latest
##############################################################################
# Temporary Installation Directory
##############################################################################
ENV STAGE_DIR=/tmp
RUN mkdir -p ${STAGE_DIR}
##############################################################################
# Installation/Basic Utilities
##############################################################################
RUN apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common build-essential autotools-dev \
nfs-common pdsh \
cmake g++ gcc \
curl wget vim tmux emacs less unzip \
htop iftop iotop ca-certificates openssh-client openssh-server \
rsync iputils-ping net-tools sudo \
llvm-9-dev
##############################################################################
# Installation Latest Git
##############################################################################
RUN add-apt-repository ppa:git-core/ppa -y && \
apt-get update && \
apt-get install -y git && \
git --version
##############################################################################
# Client Liveness & Uncomment Port 22 for SSH Daemon
##############################################################################
# Keep SSH client alive from server side
RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
##############################################################################
# Mellanox OFED
##############################################################################
#ENV MLNX_OFED_VERSION=4.6-1.0.1.1
#RUN apt-get install -y libnuma-dev
#RUN cd ${STAGE_DIR} && \
# wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \
# cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \
# ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
# cd ${STAGE_DIR} && \
# rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64*
##############################################################################
# OPENMPI
##############################################################################
#ENV OPENMPI_BASEVERSION=4.0
#ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1
#RUN cd ${STAGE_DIR} && \
# wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
# cd openmpi-${OPENMPI_VERSION} && \
# ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
# make -j"$(nproc)" install && \
# ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
# # Sanity check:
# test -f /usr/local/mpi/bin/mpic++ && \
# cd ${STAGE_DIR} && \
# rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
#ENV PATH=/usr/local/mpi/bin:${PATH} \
# LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
## Create a wrapper for OpenMPI to allow running as root by default
#RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
# echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
# echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
# chmod a+x /usr/local/mpi/bin/mpirun
##############################################################################
# Python
##############################################################################
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHON_VERSION=3.6
RUN apt-get install -y python3.6 python3.6-dev && \
rm -f /usr/bin/python && \
ln -s /usr/bin/python3.6 /usr/bin/python && \
curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py && \
pip install --upgrade pip && \
# Print python and pip version
python -V && pip -V
RUN pip install pyyaml
RUN pip install ipython
##############################################################################
# TensorFlow
##############################################################################
RUN pip install tensorflow-rocm
##############################################################################
# Some Packages
##############################################################################
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libsndfile-dev \
libjpeg-dev \
libpng-dev \
screen
RUN pip install psutil \
yappi \
cffi \
ipdb \
pandas \
matplotlib \
py3nvml \
pyarrow \
graphviz \
astor \
boto3 \
tqdm \
sentencepiece \
msgpack \
requests \
pandas \
sphinx \
sphinx_rtd_theme \
scipy \
numpy \
sklearn \
scikit-learn \
mpi4py \
h5py
##############################################################################
## SSH daemon port inside container cannot conflict with host OS port
###############################################################################
ENV SSH_PORT=2222
RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
sed "0,/^#Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
##############################################################################
# PyTorch
##############################################################################
#ENV PYTORCH_VERSION=1.2.0
#ENV TORCHVISION_VERSION=0.4.0
#ENV TENSORBOARDX_VERSION=1.8
#RUN pip install torch==${PYTORCH_VERSION}
#RUN pip install torchvision==${TORCHVISION_VERSION}
#RUN pip install tensorboardX==${TENSORBOARDX_VERSION}
##############################################################################
# PyYAML build issue
# https://stackoverflow.com/a/53926898
##############################################################################
RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
rm -rf /usr/lib/python3/dist-packages/PyYAML-*
##############################################################################
## CuPy installation
###############################################################################
RUN git clone https://github.com/ROCmSoftwarePlatform/cupy ${STAGE_DIR}/cupy
RUN cd ${STAGE_DIR}/cupy && \
git submodule update --init && \
CUPY_INSTALL_USE_HIP=1 ROCM_HOME=/opt/rocm pip install -e . --no-cache-dir -vvvv
RUN rm -rf ${STAGE_DIR}/cupy
##############################################################################
## Add deepspeed user
###############################################################################
# Add a deepspeed user with user id 8877
#RUN useradd --create-home --uid 8877 deepspeed
#RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed
#RUN usermod -aG sudo deepspeed
#RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
# # Change to non-root privilege
#USER deepspeed
##############################################################################
# DeepSpeed
##############################################################################
RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
RUN cd ${STAGE_DIR}/DeepSpeed && \
git checkout . && \
git checkout master && \
cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \
cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/amd_hip_cooperative_groups.h && \
cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \
DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
RUN rm -rf ${STAGE_DIR}/DeepSpeed
RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"
......@@ -156,7 +156,7 @@ python setup.py $VERBOSE bdist_wheel
if [ "$local_only" == "1" ]; then
echo "Installing deepspeed"
$PIP_SUDO pip uninstall -y deepspeed
# $PIP_SUDO pip uninstall -y deepspeed
$PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
ds_report
else
......
......@@ -10,9 +10,9 @@ from .transformer import TransformerBuilder
from .stochastic_transformer import StochasticTransformerBuilder
from .utils import UtilsBuilder
from .async_io import AsyncIOBuilder
from .builder import get_default_compute_capabilities
from .transformer_inference import InferenceBuilder
from .quantizer import QuantizerBuilder
from .builder import get_default_compute_capabilities, OpBuilder
# TODO: infer this list instead of hard coded
# List of all available ops
......
......@@ -31,6 +31,9 @@ except ImportError:
f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops."
)
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
def installed_cuda_version():
import torch.utils.cpp_extension
......@@ -100,23 +103,10 @@ def assert_no_cuda_mismatch():
"cuda/cpp extensions without a matching cuda version.")
def assert_torch_info(torch_info):
install_torch_version = torch_info['version']
install_cuda_version = torch_info['cuda_version']
current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
current_torch_version = ".".join(torch.__version__.split('.')[:2])
if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
raise RuntimeError(
"PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
"with a different version than what is being used at runtime. Please re-install "
f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:"
f"torch={current_torch_version}, cuda={current_cuda_version}")
class OpBuilder(ABC):
_rocm_version = None
_is_rocm_pytorch = None
def __init__(self, name):
self.name = name
self.jit_mode = False
......@@ -136,6 +126,67 @@ class OpBuilder(ABC):
'''
pass
@staticmethod
def assert_torch_info(torch_info):
install_torch_version = torch_info['version']
install_cuda_version = torch_info['cuda_version']
install_hip_version = torch_info['hip_version']
if not OpBuilder.is_rocm_pytorch():
current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
else:
current_hip_version = ".".join(torch.version.hip.split('.')[:2])
current_torch_version = ".".join(torch.__version__.split('.')[:2])
if not OpBuilder.is_rocm_pytorch():
if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
raise RuntimeError(
"PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
"with a different version than what is being used at runtime. Please re-install "
f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:"
f"torch={current_torch_version}, cuda={current_cuda_version}")
else:
if install_hip_version != current_hip_version or install_torch_version != current_torch_version:
raise RuntimeError(
"PyTorch and HIP version mismatch! DeepSpeed ops were compiled and installed "
"with a different version than what is being used at runtime. Please re-install "
f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
f"torch={install_torch_version}, hip={install_hip_version}, runtime versions:"
f"torch={current_torch_version}, hip={current_hip_version}")
@staticmethod
def is_rocm_pytorch():
if OpBuilder._is_rocm_pytorch is not None:
return OpBuilder._is_rocm_pytorch
_is_rocm_pytorch = False
if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5):
_is_rocm_pytorch = hasattr(torch.version,
'hip') and torch.version.hip is not None
if _is_rocm_pytorch:
from torch.utils.cpp_extension import ROCM_HOME
_is_rocm_pytorch = ROCM_HOME is not None
OpBuilder._is_rocm_pytorch = _is_rocm_pytorch
return OpBuilder._is_rocm_pytorch
@staticmethod
def installed_rocm_version():
if OpBuilder._rocm_version:
return OpBuilder._rocm_version
ROCM_MAJOR = '0'
ROCM_MINOR = '0'
if OpBuilder.is_rocm_pytorch():
from torch.utils.cpp_extension import ROCM_HOME
with open('/opt/rocm/.info/version-dev', 'r') as file:
ROCM_VERSION_DEV_RAW = file.read()
ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0]
ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1]
OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR))
return OpBuilder._rocm_version
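A short usage sketch of the two new helpers (hypothetical call site; assumes DeepSpeed is installed, using the same import path the unit tests use below):

from deepspeed.ops.op_builder import OpBuilder

if OpBuilder.is_rocm_pytorch():
    rocm_major, rocm_minor = OpBuilder.installed_rocm_version()
    print(f"ROCm PyTorch detected, ROCm {rocm_major}.{rocm_minor}")
else:
    print("CUDA PyTorch detected")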
def include_paths(self):
'''
Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
......@@ -396,7 +447,7 @@ class OpBuilder(ABC):
# Ensure the op we're about to load was compiled with the same
# torch/cuda versions we are currently using at runtime.
if isinstance(self, CUDAOpBuilder):
assert_torch_info(torch_info)
self.assert_torch_info(torch_info)
return importlib.import_module(self.absolute_name())
else:
......@@ -414,7 +465,7 @@ class OpBuilder(ABC):
f"Unable to JIT load the {self.name} op due to ninja not being installed."
)
if isinstance(self, CUDAOpBuilder):
if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
assert_no_cuda_mismatch()
self.jit_mode = True
......@@ -534,8 +585,10 @@ class CUDAOpBuilder(OpBuilder):
def builder(self):
from torch.utils.cpp_extension import CUDAExtension
if not self.is_rocm_pytorch():
assert_no_cuda_mismatch()
return CUDAExtension(name=self.absolute_name(),
cuda_ext = CUDAExtension(
name=self.absolute_name(),
sources=self.strip_empty_entries(self.sources()),
include_dirs=self.strip_empty_entries(self.include_paths()),
libraries=self.strip_empty_entries(self.libraries_args()),
......@@ -543,6 +596,15 @@ class CUDAOpBuilder(OpBuilder):
'cxx': self.strip_empty_entries(self.cxx_args()),
'nvcc': self.strip_empty_entries(self.nvcc_args())
})
if self.is_rocm_pytorch():
# hip converts paths to absolute, this converts back to relative
sources = cuda_ext.sources
curr_file = Path(__file__).parent.parent # ds root
for i in range(len(sources)):
src = Path(sources[i])
sources[i] = str(src.relative_to(curr_file))
cuda_ext.sources = sources
return cuda_ext
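The absolute-to-relative conversion above relies on pathlib's relative_to; a minimal standalone illustration with made-up paths:

from pathlib import Path

ds_root = Path("/workspace/DeepSpeed/deepspeed")  # hypothetical package root
src = Path("/workspace/DeepSpeed/deepspeed/ops/csrc/adam/multi_tensor_adam.cu")
print(str(src.relative_to(ds_root)))  # -> ops/csrc/adam/multi_tensor_adam.cu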
def cxx_args(self):
if sys.platform == "win32":
......@@ -551,9 +613,20 @@ class CUDAOpBuilder(OpBuilder):
return ['-O3', '-std=c++14', '-g', '-Wno-reorder']
def nvcc_args(self):
args = ['-O3']
if self.is_rocm_pytorch():
ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
args += [
'-std=c++14',
'-U__HIP_NO_HALF_OPERATORS__',
'-U__HIP_NO_HALF_CONVERSIONS__',
'-U__HIP_NO_HALF2_OPERATORS__',
'-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
'-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
]
else:
cuda_major, _ = installed_cuda_version()
args = [
'-O3',
args += [
'--use_fast_math',
'-std=c++17'
if sys.platform == "win32" and cuda_major > 10 else '-std=c++14',
......@@ -561,8 +634,8 @@ class CUDAOpBuilder(OpBuilder):
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
return args + self.compute_capability_args()
args += self.compute_capability_args()
return args
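For orientation, a hedged illustration of the two flag sets this branch can produce (a ROCm 4.2 install and a Linux CUDA build are assumed; exact values vary):

# Illustrative only; actual values depend on the installed toolkits and GPU.
rocm_example_args = ['-O3', '-std=c++14',
                     '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__',
                     '-U__HIP_NO_HALF2_OPERATORS__',
                     '-DROCM_VERSION_MAJOR=4', '-DROCM_VERSION_MINOR=2']
cuda_example_args = ['-O3', '--use_fast_math', '-std=c++14',
                     '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__',
                     '-U__CUDA_NO_HALF2_OPERATORS__']  # plus compute_capability_args()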
def libraries_args(self):
if sys.platform == "win32":
......@@ -573,11 +646,17 @@ class CUDAOpBuilder(OpBuilder):
class TorchCPUOpBuilder(CUDAOpBuilder):
def extra_ldflags(self):
if not self.is_rocm_pytorch():
return ['-lcurand']
else:
return []
def cxx_args(self):
import torch
if not self.is_rocm_pytorch():
CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
else:
CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib")
CPU_ARCH = self.cpu_arch()
SIMD_WIDTH = self.simd_width()
......
......@@ -2,8 +2,6 @@
Copyright 2020 The Microsoft DeepSpeed Team
"""
import os
import sys
import subprocess
from .builder import TorchCPUOpBuilder
......@@ -22,5 +20,17 @@ class CPUAdagradBuilder(TorchCPUOpBuilder):
def include_paths(self):
import torch
CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
return ['csrc/includes', CUDA_INCLUDE]
if not self.is_rocm_pytorch():
CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
else:
CUDA_INCLUDE = [
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
"include"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
"include",
"rocrand"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
"include",
"hiprand"),
]
return ['csrc/includes'] + CUDA_INCLUDE
......@@ -2,8 +2,6 @@
Copyright 2020 The Microsoft DeepSpeed Team
"""
import os
import sys
import subprocess
from .builder import TorchCPUOpBuilder
......@@ -22,5 +20,17 @@ class CPUAdamBuilder(TorchCPUOpBuilder):
def include_paths(self):
import torch
CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
return ['csrc/includes', CUDA_INCLUDE]
if not self.is_rocm_pytorch():
CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
else:
CUDA_INCLUDE = [
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
"include"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
"include",
"rocrand"),
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
"include",
"hiprand"),
]
return ['csrc/includes'] + CUDA_INCLUDE
"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
import torch
from .builder import CUDAOpBuilder
......@@ -18,14 +19,15 @@ class FusedAdamBuilder(CUDAOpBuilder):
return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']
def include_paths(self):
return ['csrc/includes']
return ['csrc/includes', 'csrc/adam']
def cxx_args(self):
args = super().cxx_args()
return args + self.version_dependent_macros()
def nvcc_args(self):
return ['-lineinfo',
'-O3',
'--use_fast_math'
] + self.version_dependent_macros() + self.compute_capability_args()
nvcc_flags = ['-O3'] + self.version_dependent_macros()
if not self.is_rocm_pytorch():
nvcc_flags.extend(['-lineinfo',
'--use_fast_math'] + self.compute_capability_args())
return nvcc_flags
"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
import torch
from .builder import CUDAOpBuilder
......@@ -25,7 +26,14 @@ class FusedLambBuilder(CUDAOpBuilder):
return args + self.version_dependent_macros()
def nvcc_args(self):
return ['-lineinfo',
'-O3',
'--use_fast_math'
] + self.version_dependent_macros() + self.compute_capability_args()
nvcc_flags = ['-O3'] + self.version_dependent_macros()
if self.is_rocm_pytorch():
ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
nvcc_flags += [
'-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
'-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
]
else:
nvcc_flags.extend(['-lineinfo',
'--use_fast_math'] + self.compute_capability_args())
return nvcc_flags
......@@ -32,6 +32,10 @@ class SparseAttnBuilder(OpBuilder):
#command_status = list(map(self.command_exists, required_commands))
#deps_compatible = all(command_status)
if self.is_rocm_pytorch():
self.warning(f'{self.NAME} is not compatible with ROCM')
return False
try:
import torch
except ImportError:
......
"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
import torch
from .builder import CUDAOpBuilder
......@@ -28,4 +29,11 @@ class TransformerBuilder(CUDAOpBuilder):
]
def include_paths(self):
return ['csrc/includes']
includes = ['csrc/includes']
if self.is_rocm_pytorch():
from torch.utils.cpp_extension import ROCM_HOME
includes += [
'{}/hiprand/include'.format(ROCM_HOME),
'{}/rocrand/include'.format(ROCM_HOME)
]
return includes
......@@ -32,7 +32,11 @@ except ImportError:
print('[WARNING] Unable to import torch, pre-compiling ops will be disabled. ' \
'Please visit https://pytorch.org/ to see how to properly install torch on your system.')
from op_builder import ALL_OPS, get_default_compute_capabilities
from op_builder import ALL_OPS, get_default_compute_capabilities, OpBuilder
# fetch rocm state
is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
rocm_version = OpBuilder.installed_rocm_version()
RED_START = '\033[31m'
RED_END = '\033[0m'
......@@ -51,8 +55,8 @@ def fetch_requirements(path):
install_requires = fetch_requirements('requirements/requirements.txt')
extras_require = {
'1bit_mpi' : fetch_requirements('requirements/requirements-1bit-mpi.txt'),
'1bit': [], # Will add proper cupy version below
'1bit': [], # add cupy based on cuda/rocm version
'1bit_mpi': fetch_requirements('requirements/requirements-1bit-mpi.txt'),
'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
'dev': fetch_requirements('requirements/requirements-dev.txt'),
'autotuning': fetch_requirements('requirements/requirements-autotuning.txt'),
......@@ -62,9 +66,17 @@ extras_require = {
# Add specific cupy version to both onebit extension variants
if torch_available and torch.cuda.is_available():
cupy = None
if is_rocm_pytorch:
rocm_major, rocm_minor = rocm_version
# XXX cupy support for rocm 5 is not available yet
if rocm_major <= 4:
cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}"
else:
cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
extras_require['1bit_mpi'].append(cupy)
if cupy:
extras_require['1bit'].append(cupy)
extras_require['1bit_mpi'].append(cupy)
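As a hedged illustration of the package names this selection yields (toolkit versions are examples):

# Illustrative only:
#   ROCm 4.2 build of torch  -> cupy == "cupy-rocm-4-2"
#   ROCm 5.x build of torch  -> cupy stays None, so no cupy extra is added
#   CUDA 11.1 build of torch -> cupy == "cupy-cuda111"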
# Make an [all] extra that installs all needed dependencies
all_extras = set()
......@@ -207,9 +219,17 @@ else:
torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
# Set cuda_version to 0.0 if cpu-only
cuda_version = "0.0"
# Set hip_version to 0.0 if cpu-only
hip_version = "0.0"
if torch_available and torch.version.cuda is not None:
cuda_version = ".".join(torch.version.cuda.split('.')[:2])
torch_info = {"version": torch_version, "cuda_version": cuda_version}
if torch_available and hasattr(torch.version, 'hip') and torch.version.hip is not None:
hip_version = ".".join(torch.version.hip.split('.')[:2])
torch_info = {
"version": torch_version,
"cuda_version": cuda_version,
"hip_version": hip_version
}
print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}")
with open('deepspeed/git_version_info_installed.py', 'w') as fd:
......
......@@ -8,6 +8,9 @@ from torch.multiprocessing import Process
import deepspeed
import pytest
from functools import wraps
import unittest
from pathlib import Path
from pathlib import Path
......@@ -39,6 +42,13 @@ def set_cuda_visibile():
if cuda_visible is None:
# CUDA_VISIBLE_DEVICES is not set, discover it from nvidia-smi instead
import subprocess
is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None
if is_rocm_pytorch:
rocm_smi = subprocess.check_output(['rocm-smi', '--showid'])
gpu_ids = filter(lambda s: 'GPU' in s,
rocm_smi.decode('utf-8').strip().split('\n'))
num_gpus = len(list(gpu_ids))
else:
nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus'])
num_gpus = len(nvidia_smi.decode('utf-8').strip().split('\n'))
cuda_visible = ",".join(map(str, range(num_gpus)))
......@@ -94,6 +104,7 @@ def distributed_test(world_size=2, backend='nccl'):
# make sure all ranks finish at the same time
torch.distributed.barrier()
# tear down after test completes
torch.distributed.destroy_process_group()
......@@ -154,6 +165,6 @@ def distributed_test(world_size=2, backend='nccl'):
return dist_wrap
def get_test_path(src):
def get_test_path(filename):
curr_path = Path(__file__).parent
return str(curr_path.joinpath(src))
return str(curr_path.joinpath(filename))
......@@ -186,6 +186,8 @@ def checkpoint_correctness_verification(args,
trained_model.save_checkpoint(save_folder, tag=save_tag)
dist.barrier()
loaded_model = create_deepspeed_model(args=args,
model=models[1],
base_optimizer=base_optimizers[1])
......
......@@ -123,7 +123,7 @@ class TestConfigurableMP:
load_lr_scheduler_states=False)
test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda())
assert torch.allclose(baseline, test, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}"
assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}"
inputs = self.get_inputs()
_run(inputs)
......
......@@ -15,9 +15,6 @@ import deepspeed
import sys
#if not deepspeed.ops.__installed_ops__['transformer']:
# pytest.skip("transformer kernels are not installed", allow_module_level=True)
def check_equal(first, second, atol=1e-2, verbose=False):
if verbose:
......@@ -266,10 +263,10 @@ def test_forward(batch_size,
@pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16',
[
#(8,3,1024,512,16,3,True,False),
#(8,7,1024,512,16,3,True,True),
#(8,3,1024,512,16,3,False,False),
#(8,7,1024,512,16,3,False,True),
(8,3,1024,512,16,3,True,False),
(8,7,1024,512,16,3,True,True),
(8,3,1024,512,16,3,False,False),
(8,7,1024,512,16,3,False,True),
]) # yapf: disable
def test_forward_with_small_bsz(batch_size,
small_bsz,
......
......@@ -12,6 +12,7 @@ import numpy as np
import time
from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology
from deepspeed.ops.op_builder import OpBuilder
PipeTopo = PipeDataParallelTopology
from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec
......@@ -25,6 +26,12 @@ if TORCH_MAJOR < 1 or TORCH_MINOR < 8:
pytest.skip("NCCL-based 1-bit compression requires torch 1.8 or higher",
allow_module_level=True)
rocm_version = OpBuilder.installed_rocm_version()
if rocm_version[0] > 4:
pytest.skip(
"NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5",
allow_module_level=True)
def test_onebitadam_fp16_basic(tmpdir):
config_dict = {
......
......@@ -252,7 +252,6 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo
def _skip_on_cuda_compatability():
return
if torch.cuda.get_device_capability()[0] < 7:
pytest.skip("needs higher compute capability than 7")
cuda_major = int(torch.version.cuda.split('.')[0]) * 10
......
......@@ -846,7 +846,7 @@ def test_zero3_param_partitioning_many_params(world_sz: int,
for _ in range(n_layers))
for layer_num, module in enumerate(self.modulelist):
if dist.get_rank() == 0:
with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
param: Parameter = module.weight
partition_sz = math.ceil(param.numel() / dist.get_world_size())
offset = 0
......