Unverified commit adc15e1c, authored by Gavin Goodship, committed by GitHub

Update curriculum-learning.md (#3031)

Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Parent 1f85569e
@@ -130,7 +130,7 @@ In our [paper](https://arxiv.org/abs/2108.06084) section 5.4 we demonstrate that
 ### 2.3 Token-based training termination
-Because curriculum learning changes length of each sequence/sample during training, it is very hard/impossible to use number of steps/samples to terminate the training exactly at the desired number of tokens. Thus, we add a `--train-tokens` config for accurate token-based termination. We recommend increasing your original `--train-samples` or `--train-iters` to a large enough number (e.g., 3X of what you used for baseline), and set `--train-tokens` at the exact desired number of training tokens.
+Because curriculum learning changes the length of each sequence/sample during training, it is very hard/impossible to use a number of steps/samples to terminate the training exactly at the desired number of tokens. Thus, we add a `--train-tokens` config for accurate token-based termination. We recommend increasing your original `--train-samples` or `--train-iters` to a large enough number (e.g., 3X of what you used for baseline), and set `--train-tokens` at the exact desired number of training tokens.
 ### 2.4 Token-based LR decay
...
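The `--train-tokens` recommendation in the hunk above is easy to sanity-check with a rough calculation. The sketch below uses made-up numbers (the token budget and average curriculum sequence length are illustrative, not values from the tutorial) to show why `--train-samples` needs a generous upper bound when termination is token-based.

```python
# Illustrative numbers only; nothing here comes from the tutorial itself.
train_tokens = 300_000_000_000   # desired --train-tokens budget (hypothetical)
full_seq_len = 2048              # baseline sequence length
avg_curriculum_seq_len = 900     # hypothetical average length under the curriculum schedule

baseline_samples = train_tokens // full_seq_len               # samples if every sample were full length
curriculum_samples = train_tokens // avg_curriculum_seq_len   # samples actually consumed under curriculum

print(f"baseline samples   ~ {baseline_samples:,}")
print(f"curriculum samples ~ {curriculum_samples:,}")
print(f"ratio              ~ {curriculum_samples / baseline_samples:.1f}x")
```

Because early samples are truncated, the same token budget spans noticeably more samples than the baseline, which is why the tutorial suggests raising `--train-samples`/`--train-iters` (e.g., to roughly 3X) and letting `--train-tokens` stop the run.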
@@ -6,10 +6,11 @@
 DeepSpeed library
 To build wheel on Windows:
-1. Install pytorch, such as pytorch 1.12 + cuda 11.6
-2. Install visual cpp build tool
-3. Include cuda toolkit
-4. Launch cmd console with Administrator privilege for creating required symlink folders
+1. Install pytorch, such as pytorch 1.12 + cuda 11.6.
+2. Install visual cpp build tool.
+3. Include cuda toolkit.
+4. Launch cmd console with Administrator privilege for creating required symlink folders.
 Create a new wheel via the following command:
 build_win.bat
@@ -36,7 +37,7 @@ from op_builder import get_default_compute_capabilities, OpBuilder
 from op_builder.all_ops import ALL_OPS
 from op_builder.builder import installed_cuda_version
-# fetch rocm state
+# Fetch rocm state.
 is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
 rocm_version = OpBuilder.installed_rocm_version()
@@ -68,12 +69,12 @@ extras_require = {
     'sd': fetch_requirements('requirements/requirements-sd.txt')
 }
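`fetch_requirements` is defined outside the hunks shown here; a plausible minimal version simply reads a pip requirements file and drops comments and blank lines. The helper below is a sketch of that assumption, not the actual implementation.

```python
# Sketch of a fetch_requirements-style helper (assumed behavior; the real
# function is elided from this diff).
def fetch_requirements(path):
    with open(path, 'r') as fd:
        return [line.strip() for line in fd
                if line.strip() and not line.strip().startswith('#')]

# Hypothetical usage mirroring the extras_require entry above:
# extras_require = {'sd': fetch_requirements('requirements/requirements-sd.txt')}
```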
-# Add specific cupy version to both onebit extension variants
+# Add specific cupy version to both onebit extension variants.
 if torch_available and torch.cuda.is_available():
     cupy = None
     if is_rocm_pytorch:
         rocm_major, rocm_minor = rocm_version
-        # XXX cupy support for rocm 5 is not available yet
+        # XXX cupy support for rocm 5 is not available yet.
         if rocm_major <= 4:
             cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}"
     else:
@@ -82,7 +83,7 @@ if torch_available and torch.cuda.is_available():
         extras_require['1bit'].append(cupy)
         extras_require['1bit_mpi'].append(cupy)
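For the ROCm branch above, here is a toy illustration of the wheel name that ends up appended to the `1bit` and `1bit_mpi` extras (the version pair is made up, and the CUDA branch is elided by the hunk):

```python
# Made-up (major, minor) pair standing in for OpBuilder.installed_rocm_version().
rocm_major, rocm_minor = 4, 2
cupy = None
if rocm_major <= 4:  # cupy wheels for ROCm 5 were not yet available
    cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}"
print(cupy)  # -> cupy-rocm-4-2
```

With that requirement appended, installing the `1bit` extra pulls in a matching cupy build alongside the other one-bit dependencies.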
-# Make an [all] extra that installs all needed dependencies
+# Make an [all] extra that installs all needed dependencies.
 all_extras = set()
 for extra in extras_require.items():
     for req in extra[1]:
@@ -91,7 +92,7 @@ extras_require['all'] = list(all_extras)
 cmdclass = {}
-# For any pre-installed ops force disable ninja
+# For any pre-installed ops force disable ninja.
 if torch_available:
     from accelerator import get_accelerator
     cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=False)
@@ -104,7 +105,7 @@ else:
     TORCH_MINOR = "0"
 if torch_available and not torch.cuda.is_available():
-    # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486
+    # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486.
     print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only "
           "you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
           "(compute capabilities 6.0, 6.1, 6.2)")
@@ -148,18 +149,18 @@ for op_name, builder in ALL_OPS.items():
     op_compatible = builder.is_compatible()
     compatible_ops[op_name] = op_compatible
-    # If op is requested but not available, throw an error
+    # If op is requested but not available, throw an error.
     if op_enabled(op_name) and not op_compatible:
         env_var = op_envvar(op_name)
         if env_var not in os.environ:
             builder.warning(f"One can disable {op_name} with {env_var}=0")
         abort(f"Unable to pre-compile {op_name}")
-    # if op is compatible but install is not enabled (JIT mode)
+    # If op is compatible but install is not enabled (JIT mode).
     if is_rocm_pytorch and op_compatible and not op_enabled(op_name):
         builder.hipify_extension()
-    # If op install enabled, add builder to extensions
+    # If op install enabled, add builder to extensions.
     if op_enabled(op_name) and op_compatible:
         assert torch_available, f"Unable to pre-compile {op_name}, please first install torch"
         install_ops[op_name] = op_enabled(op_name)
@@ -167,7 +168,7 @@ for op_name, builder in ALL_OPS.items():
 print(f'Install Ops={install_ops}')
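The pre-compilation loop above leans on `op_enabled` and `op_envvar`, both elided from the hunk. Below is a hedged sketch of the behavior they appear to implement, namely per-op `DS_BUILD_*` environment variables with a global default; the exact variable names and defaults are assumptions here.

```python
import os

# Assumed behavior of the elided helpers: each op is toggled by a
# DS_BUILD_<OP_NAME> variable, falling back to a global opt-in flag.
def op_envvar(op_name):
    return f"DS_BUILD_{op_name.upper()}"

def op_enabled(op_name):
    default = os.environ.get("DS_BUILD_OPS", "0")
    return int(os.environ.get(op_envvar(op_name), default)) == 1

# Hypothetical usage: DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 python setup.py bdist_wheel
# would pre-compile every compatible op except sparse_attn.
```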
-# Write out version/git info
+# Write out version/git info.
 git_hash_cmd = "git rev-parse --short HEAD"
 git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
 if command_exists('git') and 'DS_BUILD_STRING' not in os.environ:
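The two command strings above are presumably executed with `subprocess`; the fallback used when git is unavailable (guarded by `command_exists`) lies outside the shown lines. A minimal sketch of that assumption:

```python
import subprocess

git_hash_cmd = "git rev-parse --short HEAD"         # as defined above
git_branch_cmd = "git rev-parse --abbrev-ref HEAD"  # as defined above

def run_cmd(cmd):
    # Run a git command and return its trimmed stdout.
    return subprocess.check_output(cmd.split(), universal_newlines=True).strip()

git_hash = run_cmd(git_hash_cmd)       # e.g., "adc15e1"
git_branch = run_cmd(git_branch_cmd)   # e.g., "master"
```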
@@ -200,38 +201,38 @@ if sys.platform == "win32":
     create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator')
     egg_info.manifest_maker.template = 'MANIFEST_win.in'
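`create_dir_symlink` is defined outside this hunk. The sketch below shows the assumed shape of such a helper; creating directory symlinks on Windows normally requires an elevated console, which is why the module docstring asks for an Administrator cmd prompt.

```python
import os

# Assumed shape of the create_dir_symlink helper used above: point `dest` at
# `src` unless the link already exists (not the actual implementation).
def create_dir_symlink(src, dest):
    if not os.path.islink(dest):
        os.symlink(src, dest, target_is_directory=True)
```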
-# Parse the DeepSpeed version string from version.txt
+# Parse the DeepSpeed version string from version.txt.
 version_str = open('version.txt', 'r').read().strip()
 # Build specifiers like .devX can be added at install time. Otherwise, add the git hash.
-# example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel
-# Building wheel for distribution, update version file
+# Example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel.
+# Building wheel for distribution, update version file.
 if 'DS_BUILD_STRING' in os.environ:
-    # Build string env specified, probably building for distribution
+    # Build string env specified, probably building for distribution.
     with open('build.txt', 'w') as fd:
         fd.write(os.environ.get('DS_BUILD_STRING'))
     version_str += os.environ.get('DS_BUILD_STRING')
 elif os.path.isfile('build.txt'):
-    # build.txt exists, probably installing from distribution
+    # build.txt exists, probably installing from distribution.
     with open('build.txt', 'r') as fd:
         version_str += fd.read().strip()
 else:
-    # None of the above, probably installing from source
+    # None of the above, probably installing from source.
     version_str += f'+{git_hash}'
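To make the three branches above concrete, here is an illustrative walk-through with a made-up base version; the `.dev20201022` suffix is the example from the comment above, and the short hash reuses this commit's `adc15e1`.

```python
# Made-up values for illustration; the real ones come from version.txt and git.
base_version = "0.9.0"

# DS_BUILD_STRING set (building a distribution): the suffix is appended and also
# written to build.txt so installs from the sdist/wheel reproduce it.
print(base_version + ".dev20201022")   # -> 0.9.0.dev20201022

# build.txt present (installing from a distribution): same suffix, read back.
print(base_version + ".dev20201022")   # -> 0.9.0.dev20201022

# Neither present (installing from source): append the short git hash.
print(base_version + "+adc15e1")       # -> 0.9.0+adc15e1
```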
torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR]) torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
bf16_support = False bf16_support = False
# Set cuda_version to 0.0 if cpu-only # Set cuda_version to 0.0 if cpu-only.
cuda_version = "0.0" cuda_version = "0.0"
nccl_version = "0.0" nccl_version = "0.0"
# Set hip_version to 0.0 if cpu-only # Set hip_version to 0.0 if cpu-only.
hip_version = "0.0" hip_version = "0.0"
if torch_available and torch.version.cuda is not None: if torch_available and torch.version.cuda is not None:
cuda_version = ".".join(torch.version.cuda.split('.')[:2]) cuda_version = ".".join(torch.version.cuda.split('.')[:2])
if sys.platform != "win32": if sys.platform != "win32":
if isinstance(torch.cuda.nccl.version(), int): if isinstance(torch.cuda.nccl.version(), int):
# This will break if minor version > 9 # This will break if minor version > 9.
nccl_version = ".".join(str(torch.cuda.nccl.version())[:2]) nccl_version = ".".join(str(torch.cuda.nccl.version())[:2])
else: else:
nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2])) nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2]))
...