未验证 提交 adc15e1c 编写于 作者: G Gavin Goodship 提交者: GitHub

Update curriculum-learning.md (#3031)

Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
上级 1f85569e
......@@ -130,7 +130,7 @@ In our [paper](https://arxiv.org/abs/2108.06084) section 5.4 we demonstrate that
### 2.3 Token-based training termination
Because curriculum learning changes length of each sequence/sample during training, it is very hard/impossible to use number of steps/samples to terminate the training exactly at the desired number of tokens. Thus, we add a `--train-tokens` config for accurate token-based termination. We recommend increasing your original `--train-samples` or `--train-iters` to a large enough number (e.g., 3X of what you used for baseline), and set `--train-tokens` at the exact desired number of training tokens.
Because curriculum learning changes the length of each sequence/sample during training, it is very hard/impossible to use a number of steps/samples to terminate the training exactly at the desired number of tokens. Thus, we add a `--train-tokens` config for accurate token-based termination. We recommend increasing your original `--train-samples` or `--train-iters` to a large enough number (e.g., 3X of what you used for baseline), and set `--train-tokens` at the exact desired number of training tokens.
### 2.4 Token-based LR decay
......
......@@ -6,10 +6,11 @@
DeepSpeed library
To build wheel on Windows:
1. Install pytorch, such as pytorch 1.12 + cuda 11.6
2. Install visual cpp build tool
3. Include cuda toolkit
4. Launch cmd console with Administrator privilege for creating required symlink folders
1. Install pytorch, such as pytorch 1.12 + cuda 11.6.
2. Install visual cpp build tool.
3. Include cuda toolkit.
4. Launch cmd console with Administrator privilege for creating required symlink folders.
Create a new wheel via the following command:
build_win.bat
......@@ -36,7 +37,7 @@ from op_builder import get_default_compute_capabilities, OpBuilder
from op_builder.all_ops import ALL_OPS
from op_builder.builder import installed_cuda_version
# fetch rocm state
# Fetch rocm state.
is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
rocm_version = OpBuilder.installed_rocm_version()
......@@ -68,12 +69,12 @@ extras_require = {
'sd': fetch_requirements('requirements/requirements-sd.txt')
}
# Add specific cupy version to both onebit extension variants
# Add specific cupy version to both onebit extension variants.
if torch_available and torch.cuda.is_available():
cupy = None
if is_rocm_pytorch:
rocm_major, rocm_minor = rocm_version
# XXX cupy support for rocm 5 is not available yet
# XXX cupy support for rocm 5 is not available yet.
if rocm_major <= 4:
cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}"
else:
......@@ -82,7 +83,7 @@ if torch_available and torch.cuda.is_available():
extras_require['1bit'].append(cupy)
extras_require['1bit_mpi'].append(cupy)
# Make an [all] extra that installs all needed dependencies
# Make an [all] extra that installs all needed dependencies.
all_extras = set()
for extra in extras_require.items():
for req in extra[1]:
......@@ -91,7 +92,7 @@ extras_require['all'] = list(all_extras)
cmdclass = {}
# For any pre-installed ops force disable ninja
# For any pre-installed ops force disable ninja.
if torch_available:
from accelerator import get_accelerator
cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=False)
......@@ -104,7 +105,7 @@ else:
TORCH_MINOR = "0"
if torch_available and not torch.cuda.is_available():
# Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486
# Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486.
print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only "
"you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
"(compute capabilities 6.0, 6.1, 6.2)")
......@@ -148,18 +149,18 @@ for op_name, builder in ALL_OPS.items():
op_compatible = builder.is_compatible()
compatible_ops[op_name] = op_compatible
# If op is requested but not available, throw an error
# If op is requested but not available, throw an error.
if op_enabled(op_name) and not op_compatible:
env_var = op_envvar(op_name)
if env_var not in os.environ:
builder.warning(f"One can disable {op_name} with {env_var}=0")
abort(f"Unable to pre-compile {op_name}")
# if op is compatible but install is not enabled (JIT mode)
# If op is compatible but install is not enabled (JIT mode).
if is_rocm_pytorch and op_compatible and not op_enabled(op_name):
builder.hipify_extension()
# If op install enabled, add builder to extensions
# If op install enabled, add builder to extensions.
if op_enabled(op_name) and op_compatible:
assert torch_available, f"Unable to pre-compile {op_name}, please first install torch"
install_ops[op_name] = op_enabled(op_name)
......@@ -167,7 +168,7 @@ for op_name, builder in ALL_OPS.items():
print(f'Install Ops={install_ops}')
# Write out version/git info
# Write out version/git info.
git_hash_cmd = "git rev-parse --short HEAD"
git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
if command_exists('git') and 'DS_BUILD_STRING' not in os.environ:
......@@ -200,38 +201,38 @@ if sys.platform == "win32":
create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator')
egg_info.manifest_maker.template = 'MANIFEST_win.in'
# Parse the DeepSpeed version string from version.txt
# Parse the DeepSpeed version string from version.txt.
version_str = open('version.txt', 'r').read().strip()
# Build specifiers like .devX can be added at install time. Otherwise, add the git hash.
# example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel
# Example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel.
# Building wheel for distribution, update version file
# Building wheel for distribution, update version file.
if 'DS_BUILD_STRING' in os.environ:
# Build string env specified, probably building for distribution
# Build string env specified, probably building for distribution.
with open('build.txt', 'w') as fd:
fd.write(os.environ.get('DS_BUILD_STRING'))
version_str += os.environ.get('DS_BUILD_STRING')
elif os.path.isfile('build.txt'):
# build.txt exists, probably installing from distribution
# build.txt exists, probably installing from distribution.
with open('build.txt', 'r') as fd:
version_str += fd.read().strip()
else:
# None of the above, probably installing from source
# None of the above, probably installing from source.
version_str += f'+{git_hash}'
torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
bf16_support = False
# Set cuda_version to 0.0 if cpu-only
# Set cuda_version to 0.0 if cpu-only.
cuda_version = "0.0"
nccl_version = "0.0"
# Set hip_version to 0.0 if cpu-only
# Set hip_version to 0.0 if cpu-only.
hip_version = "0.0"
if torch_available and torch.version.cuda is not None:
cuda_version = ".".join(torch.version.cuda.split('.')[:2])
if sys.platform != "win32":
if isinstance(torch.cuda.nccl.version(), int):
# This will break if minor version > 9
# This will break if minor version > 9.
nccl_version = ".".join(str(torch.cuda.nccl.version())[:2])
else:
nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2]))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册