Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
xxadev
tensorflow
提交
98fb3b92
T
tensorflow
项目概览
xxadev
/
tensorflow
与 Fork 源项目一致
从无法访问的项目Fork
通知
3
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
tensorflow
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
98fb3b92
编写于
10月 29, 2018
作者:
Z
Zhenyu Tan
提交者:
TensorFlower Gardener
10月 29, 2018
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Implement apply gradients for keras optimizer v2.
PiperOrigin-RevId: 219189655
上级
d09c6878
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
884 addition
and
205 deletion
+884
-205
tensorflow/contrib/distribute/python/BUILD
tensorflow/contrib/distribute/python/BUILD
+22
-0
tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
...flow/contrib/distribute/python/keras_optimizer_v2_test.py
+237
-0
tensorflow/python/keras/optimizer_v2/BUILD
tensorflow/python/keras/optimizer_v2/BUILD
+1
-0
tensorflow/python/keras/optimizer_v2/adam.py
tensorflow/python/keras/optimizer_v2/adam.py
+124
-0
tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
...orflow/python/keras/optimizer_v2/gradient_descent_test.py
+14
-14
tensorflow/python/keras/optimizer_v2/optimizer_v2.py
tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+341
-40
tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
+145
-151
未找到文件。
tensorflow/contrib/distribute/python/BUILD
浏览文件 @
98fb3b92
...
...
@@ -471,6 +471,28 @@ cuda_py_test(
],
)
cuda_py_test
(
name
=
"keras_optimizer_v2_test"
,
srcs
=
[
"keras_optimizer_v2_test.py"
],
additional_deps
=
[
":combinations"
,
"@absl_py//absl/testing:parameterized"
,
"//third_party/py/numpy"
,
"//tensorflow/contrib/optimizer_v2:training"
,
"//tensorflow/python/data/ops:dataset_ops"
,
"//tensorflow/python/eager:test"
,
"//tensorflow/python/estimator:estimator_py"
,
"//tensorflow/python/feature_column"
,
"//tensorflow/python:framework_ops"
,
"//tensorflow/python:platform"
,
"//tensorflow/python:summary"
,
],
tags
=
[
"multi_and_single_gpu"
,
"no_pip"
,
],
)
cuda_py_test
(
name
=
"estimator_training_test"
,
size
=
"large"
,
...
...
tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
0 → 100644
浏览文件 @
98fb3b92
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests that show that DistributionStrategy works with canned Estimator."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
shutil
import
tempfile
from
absl.testing
import
parameterized
import
numpy
as
np
import
six
from
tensorflow.contrib.distribute.python
import
combinations
from
tensorflow.contrib.distribute.python
import
mirrored_strategy
from
tensorflow.core.protobuf
import
config_pb2
from
tensorflow.python.data.ops
import
dataset_ops
from
tensorflow.python.eager
import
context
from
tensorflow.python.estimator
import
run_config
from
tensorflow.python.estimator
import
training
from
tensorflow.python.estimator.canned
import
dnn_linear_combined
from
tensorflow.python.estimator.canned
import
prediction_keys
from
tensorflow.python.estimator.export
import
export
from
tensorflow.python.estimator.inputs
import
numpy_io
from
tensorflow.python.feature_column
import
feature_column
from
tensorflow.python.keras.optimizer_v2
import
adam
from
tensorflow.python.ops
import
variable_scope
from
tensorflow.python.ops
import
variables
from
tensorflow.python.platform
import
gfile
from
tensorflow.python.platform
import
test
from
tensorflow.python.summary.writer
import
writer_cache
class
KerasOptimizerV2IntegrationTest
(
test
.
TestCase
,
parameterized
.
TestCase
):
def
setUp
(
self
):
self
.
_model_dir
=
tempfile
.
mkdtemp
()
def
dataset_input_fn
(
self
,
x
,
y
,
batch_size
):
def
input_fn
():
dataset
=
dataset_ops
.
Dataset
.
from_tensor_slices
((
x
,
y
))
dataset
=
dataset
.
repeat
(
1
).
batch
(
batch_size
)
return
dataset
return
input_fn
@
combinations
.
generate
(
combinations
.
combine
(
mode
=
[
'graph'
],
distribution
=
[
combinations
.
one_device_strategy
,
combinations
.
mirrored_strategy_with_gpu_and_cpu
,
combinations
.
mirrored_strategy_with_two_gpus
],
use_train_and_evaluate
=
[
True
,
False
]))
def
test_complete_flow_with_mode
(
self
,
distribution
,
use_train_and_evaluate
):
label_dimension
=
2
input_dimension
=
label_dimension
batch_size
=
10
data
=
np
.
linspace
(
0.
,
2.
,
batch_size
*
label_dimension
,
dtype
=
np
.
float32
)
data
=
data
.
reshape
(
batch_size
,
label_dimension
)
train_input_fn
=
self
.
dataset_input_fn
(
x
=
{
'x'
:
data
},
y
=
data
,
batch_size
=
batch_size
//
len
(
distribution
.
worker_devices
))
eval_input_fn
=
self
.
dataset_input_fn
(
x
=
{
'x'
:
data
},
y
=
data
,
batch_size
=
batch_size
//
len
(
distribution
.
worker_devices
))
predict_input_fn
=
numpy_io
.
numpy_input_fn
(
x
=
{
'x'
:
data
},
batch_size
=
batch_size
,
shuffle
=
False
)
linear_feature_columns
=
[
feature_column
.
numeric_column
(
'x'
,
shape
=
(
input_dimension
,))
]
dnn_feature_columns
=
[
feature_column
.
numeric_column
(
'x'
,
shape
=
(
input_dimension
,))
]
feature_columns
=
linear_feature_columns
+
dnn_feature_columns
session_config
=
config_pb2
.
ConfigProto
(
log_device_placement
=
True
,
allow_soft_placement
=
True
)
estimator
=
dnn_linear_combined
.
DNNLinearCombinedRegressor
(
linear_feature_columns
=
linear_feature_columns
,
dnn_hidden_units
=
(
2
,
2
),
dnn_feature_columns
=
dnn_feature_columns
,
label_dimension
=
label_dimension
,
model_dir
=
self
.
_model_dir
,
dnn_optimizer
=
adam
.
Adam
(
0.001
),
linear_optimizer
=
adam
.
Adam
(
0.001
),
config
=
run_config
.
RunConfig
(
train_distribute
=
distribution
,
eval_distribute
=
distribution
,
session_config
=
session_config
))
num_steps
=
2
if
use_train_and_evaluate
:
scores
,
_
=
training
.
train_and_evaluate
(
estimator
,
training
.
TrainSpec
(
train_input_fn
,
max_steps
=
num_steps
),
training
.
EvalSpec
(
eval_input_fn
))
else
:
estimator
.
train
(
train_input_fn
,
steps
=
num_steps
)
scores
=
estimator
.
evaluate
(
eval_input_fn
)
self
.
assertIn
(
'loss'
,
six
.
iterkeys
(
scores
))
predictions
=
np
.
array
([
x
[
prediction_keys
.
PredictionKeys
.
PREDICTIONS
]
for
x
in
estimator
.
predict
(
predict_input_fn
)
])
self
.
assertAllEqual
((
batch_size
,
label_dimension
),
predictions
.
shape
)
feature_spec
=
feature_column
.
make_parse_example_spec
(
feature_columns
)
serving_input_receiver_fn
=
export
.
build_parsing_serving_input_receiver_fn
(
feature_spec
)
export_dir
=
estimator
.
export_savedmodel
(
tempfile
.
mkdtemp
(),
serving_input_receiver_fn
)
self
.
assertTrue
(
gfile
.
Exists
(
export_dir
))
def
tearDown
(
self
):
if
self
.
_model_dir
:
writer_cache
.
FileWriterCache
.
clear
()
shutil
.
rmtree
(
self
.
_model_dir
)
class
MirroredStrategyOptimizerV2Test
(
test
.
TestCase
):
def
testKerasOptimizerWithUnequalInput
(
self
):
if
context
.
num_gpus
()
<
1
:
self
.
skipTest
(
'Not enough GPUs.'
)
def
create_fn
(
device_id
):
var
=
variables
.
Variable
(
2.0
,
name
=
'var'
,
aggregation
=
variable_scope
.
VariableAggregation
.
SUM
)
# grad for cpu is 1, grad for gpu is 2, avg grad is 1.5.
loss
=
(
device_id
+
1
)
*
var
optimizer
=
adam
.
Adam
(
learning_rate
=
0.01
,
beta_1
=
0.2
,
beta_2
=
0.2
)
train_op
=
optimizer
.
minimize
(
loss
,
var_list
=
[
var
])
m
=
optimizer
.
get_slot
(
var
,
'm'
)
v
=
optimizer
.
get_slot
(
var
,
'v'
)
return
(
var
,
m
,
v
,
train_op
,
optimizer
.
iteration
)
devices
=
[
'/device:GPU:0'
,
'/device:CPU:0'
]
dist
=
mirrored_strategy
.
MirroredStrategy
(
devices
)
with
dist
.
scope
():
(
var
,
m
,
v
,
op
,
counter
)
=
dist
.
call_for_each_replica
(
create_fn
,
dist
.
worker_device_index
,
run_concurrently
=
False
)
self
.
evaluate
(
variables
.
global_variables_initializer
())
var_val
=
[
2.0
,
2.0
,
2.0
]
self
.
assertAllClose
(
var_val
,
self
.
evaluate
(
[
dist
.
read_var
(
var
),
var
.
get
(
devices
[
0
]),
var
.
get
(
devices
[
1
])]))
self
.
assertAllClose
([
0
,
0
,
0
],
self
.
evaluate
([
dist
.
read_var
(
counter
),
counter
.
get
(
devices
[
0
]),
counter
.
get
(
devices
[
1
])
]))
train_op
=
dist
.
unwrap
(
op
)
self
.
evaluate
(
train_op
)
# m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2
m_val
=
[
1.2
,
1.2
,
1.2
]
# assert slot variables in both replicas are the same.
self
.
assertAllClose
(
m_val
,
self
.
evaluate
(
[
dist
.
read_var
(
m
),
m
.
get
(
devices
[
0
]),
m
.
get
(
devices
[
1
])]))
# v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25
v_val
=
[
1.8
,
1.8
,
1.8
]
self
.
assertAllClose
(
v_val
,
self
.
evaluate
(
[
dist
.
read_var
(
v
),
v
.
get
(
devices
[
0
]),
v
.
get
(
devices
[
1
])]))
# var(1) = var(0) - lr * m(1) * sqrt(1 - beta2) / sqrt(v(1)) / (1 - beta1)
# = 2.0 - 0.01 * 1.2 * sqrt(0.8) / sqrt(1.8) / 0.8
var_val
=
[
1.99
,
1.99
,
1.99
]
self
.
assertAllClose
(
var_val
,
self
.
evaluate
(
[
dist
.
read_var
(
var
),
var
.
get
(
devices
[
0
]),
var
.
get
(
devices
[
1
])]))
self
.
assertAllClose
([
1
,
1
,
1
],
self
.
evaluate
([
dist
.
read_var
(
counter
),
counter
.
get
(
devices
[
0
]),
counter
.
get
(
devices
[
1
])
]))
self
.
evaluate
(
train_op
)
# m(2) = beta1 * m(1) + (1-beta1) * grad = 0.2 * 1.2 + 0.8 * 1.5
m_val
=
[
1.44
,
1.44
,
1.44
]
self
.
assertAllClose
(
m_val
,
self
.
evaluate
(
[
dist
.
read_var
(
m
),
m
.
get
(
devices
[
0
]),
m
.
get
(
devices
[
1
])]))
# v(2) = beta2 * v(1) + (1-beta2) * grad^2 = 0.2 * 1.8 + 0.8 * 2.25
v_val
=
[
2.16
,
2.16
,
2.16
]
self
.
assertAllClose
(
v_val
,
self
.
evaluate
(
[
dist
.
read_var
(
v
),
v
.
get
(
devices
[
0
]),
v
.
get
(
devices
[
1
])]))
self
.
assertAllClose
([
2
,
2
,
2
],
self
.
evaluate
([
dist
.
read_var
(
counter
),
counter
.
get
(
devices
[
0
]),
counter
.
get
(
devices
[
1
])
]))
if
__name__
==
'__main__'
:
test
.
main
()
tensorflow/python/keras/optimizer_v2/BUILD
浏览文件 @
98fb3b92
...
...
@@ -12,6 +12,7 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test")
py_library
(
name
=
"optimizer_v2"
,
srcs
=
[
"adam.py"
,
"gradient_descent.py"
,
"optimizer_v2.py"
,
],
...
...
tensorflow/python/keras/optimizer_v2/adam.py
0 → 100644
浏览文件 @
98fb3b92
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam for TensorFlow."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
tensorflow.python.keras.optimizer_v2
import
optimizer_v2
from
tensorflow.python.ops
import
math_ops
from
tensorflow.python.training
import
training_ops
class
Adam
(
optimizer_v2
.
OptimizerV2
):
"""Optimizer that implements the Adam algorithm.
Adam optimization is a stochastic gradient descent method that is based on
adaptive estimation of first-order and second-order moments. According to the
reference, the method is 'computationally efficient, has little memory
requirement, invariant to diagonal rescaling of gradients, and is well suited
for problems that are large in terms of data/parameters'.
# References
See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
"""
def
__init__
(
self
,
learning_rate
=
0.001
,
beta_1
=
0.9
,
beta_2
=
0.999
,
epsilon
=
1e-8
,
name
=
'Adam'
):
r
"""Construct a new Adam optimizer.
Initialization:
$$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
$$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
$$t := 0 \text{(Initialize timestep)}$$
The update rule for `variable` with gradient `g` uses an optimization
described at the end of section2 of the paper:
$$t := t + 1$$
$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
The default value of 1e-8 for epsilon might not be a good default in
general. For example, when training an Inception network on ImageNet a
current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
formulation just before Section 2.1 of the Kingma and Ba paper rather than
the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
hat" in the paper.
The sparse implementation of this algorithm (used when the gradient is an
IndexedSlices object, typically because of `tf.gather` or an embedding
lookup in the forward pass) does apply momentum to variable slices even if
they were not used in the forward pass (meaning they have a gradient equal
to zero). Momentum decay (beta1) is also applied to the entire momentum
accumulator. This means that the sparse behavior is equivalent to the dense
behavior (in contrast to some momentum implementations which ignore momentum
unless a variable slice was actually used).
Args:
learning_rate: A Tensor or a floating point value. The learning rate.
beta_1: A float value or a constant float tensor. The exponential decay
rate for the 1st moment estimates.
beta_2: A float value or a constant float tensor. The exponential decay
rate for the 2nd moment estimates.
epsilon: A small constant for numerical stability. This epsilon is
"epsilon hat" in the Kingma and Ba paper (in the formula just before
Section 2.1), not the epsilon in Algorithm 1 of the paper.
name: Optional name for the operations created when applying gradients.
Defaults to "Adam". @compatibility(eager) When eager execution is
enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
a callable that takes no arguments and returns the actual value to use.
This can be useful for changing these values across different
invocations of optimizer functions. @end_compatibility
"""
super
(
Adam
,
self
).
__init__
(
name
)
self
.
_lr
=
learning_rate
self
.
_beta_1
=
beta_1
self
.
_beta_2
=
beta_2
self
.
_epsilon
=
epsilon
def
_create_slots
(
self
,
var_list
):
# Create slots for the first and second moments.
for
var
in
var_list
:
self
.
add_slot
(
var
,
'm'
)
self
.
add_slot
(
var
,
'v'
)
def
_resource_apply_dense
(
self
,
grad
,
var
):
m
=
self
.
get_slot
(
var
,
'm'
)
v
=
self
.
get_slot
(
var
,
'v'
)
# TODO(tanzheny): let optimizer have its own step counter, and let
# beta1_power and beta2_power depend on it.
return
training_ops
.
resource_apply_adam
(
var
.
handle
,
m
.
handle
,
v
.
handle
,
math_ops
.
cast
(
self
.
_beta_1
,
grad
.
dtype
.
base_dtype
),
math_ops
.
cast
(
self
.
_beta_2
,
grad
.
dtype
.
base_dtype
),
math_ops
.
cast
(
self
.
_lr
,
grad
.
dtype
.
base_dtype
),
math_ops
.
cast
(
self
.
_beta_1
,
grad
.
dtype
.
base_dtype
),
math_ops
.
cast
(
self
.
_beta_2
,
grad
.
dtype
.
base_dtype
),
math_ops
.
cast
(
self
.
_epsilon
,
grad
.
dtype
.
base_dtype
),
grad
,
use_locking
=
self
.
_use_locking
)
tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
浏览文件 @
98fb3b92
...
...
@@ -64,13 +64,13 @@ class GradientDescentOptimizerTest(test.TestCase):
var1
=
resource_variable_ops
.
ResourceVariable
([
3.0
,
4.0
],
dtype
=
dtype
)
grads0
=
constant_op
.
constant
([
0.1
,
0.1
],
dtype
=
dtype
)
grads1
=
constant_op
.
constant
([
0.01
,
0.01
],
dtype
=
dtype
)
sgd
_op
=
gradient_descent
.
SGD
(
3.0
).
apply_gradients
(
zip
([
grads0
,
grads1
],
[
var0
,
var1
]))
sgd
=
gradient_descent
.
SGD
(
3.0
)
sgd_op
=
sgd
.
apply_gradients
(
zip
([
grads0
,
grads1
],
[
var0
,
var1
]))
# TODO(apassos) calling initialize_resources on all resources here
# doesn't work because the sessions and graph are reused across unit
# tests and this would mean trying to reinitialize variables. Figure out
# a long-term solution for this.
resources
.
initialize_resources
([
var0
,
var1
]).
run
()
resources
.
initialize_resources
([
var0
,
var1
,
sgd
.
iteration
]).
run
()
# Fetch params to validate initial values
self
.
assertAllCloseAccordingToType
([
1.0
,
2.0
],
var0
.
eval
())
self
.
assertAllCloseAccordingToType
([
3.0
,
4.0
],
var1
.
eval
())
...
...
@@ -90,13 +90,13 @@ class GradientDescentOptimizerTest(test.TestCase):
grads0
=
constant_op
.
constant
([
0.1
,
0.1
],
dtype
=
dtype
)
grads1
=
constant_op
.
constant
([
0.01
,
0.01
],
dtype
=
dtype
)
lr
=
lambda
:
3.0
sgd
_op
=
gradient_descent
.
SGD
(
lr
).
apply_gradients
(
zip
([
grads0
,
grads1
],
[
var0
,
var1
]))
sgd
=
gradient_descent
.
SGD
(
lr
)
sgd_op
=
sgd
.
apply_gradients
(
zip
([
grads0
,
grads1
],
[
var0
,
var1
]))
# TODO(apassos) calling initialize_resources on all resources here
# doesn't work because the sessions and graph are reused across unit
# tests and this would mean trying to reinitialize variables. Figure out
# a long-term solution for this.
resources
.
initialize_resources
([
var0
,
var1
]).
run
()
resources
.
initialize_resources
([
var0
,
var1
,
sgd
.
iteration
]).
run
()
# Fetch params to validate initial values
self
.
assertAllCloseAccordingToType
([
1.0
,
2.0
],
var0
.
eval
())
self
.
assertAllCloseAccordingToType
([
3.0
,
4.0
],
var1
.
eval
())
...
...
@@ -116,12 +116,13 @@ class GradientDescentOptimizerTest(test.TestCase):
x
=
constant_op
.
constant
([[
4.0
],
[
5.0
]],
dtype
=
dtype
)
pred
=
math_ops
.
matmul
(
var0
,
x
)
+
var1
loss
=
pred
*
pred
sgd_op
=
gradient_descent
.
SGD
(
1.0
).
minimize
(
loss
)
sgd
=
gradient_descent
.
SGD
(
1.0
)
sgd_op
=
sgd
.
minimize
(
loss
,
[
var0
,
var1
])
# TODO(apassos) calling initialize_resources on all resources here
# doesn't work because the sessions and graph are reused across unit
# tests and this would mean trying to reinitialize variables. Figure out
# a long-term solution for this.
resources
.
initialize_resources
([
var0
,
var1
]).
run
()
resources
.
initialize_resources
([
var0
,
var1
,
sgd
.
iteration
]).
run
()
# Fetch params to validate initial values
self
.
assertAllCloseAccordingToType
([[
1.0
,
2.0
]],
var0
.
eval
())
self
.
assertAllCloseAccordingToType
([
3.0
],
var1
.
eval
())
...
...
@@ -143,7 +144,7 @@ class GradientDescentOptimizerTest(test.TestCase):
pred
=
math_ops
.
matmul
(
embedding_ops
.
embedding_lookup
([
var0
],
[
0
]),
x
)
pred
+=
var1
loss
=
pred
*
pred
sgd_op
=
gradient_descent
.
SGD
(
1.0
).
minimize
(
loss
)
sgd_op
=
gradient_descent
.
SGD
(
1.0
).
minimize
(
loss
,
[
var0
,
var1
]
)
variables
.
global_variables_initializer
().
run
()
# Fetch params to validate initial values
self
.
assertAllCloseAccordingToType
([[
1.0
,
2.0
]],
var0
.
eval
())
...
...
@@ -193,25 +194,24 @@ class GradientDescentOptimizerTest(test.TestCase):
def
testWithGlobalStep
(
self
):
for
dtype
in
[
dtypes
.
half
,
dtypes
.
float32
,
dtypes
.
float64
]:
with
self
.
cached_session
():
global_step
=
variables
.
Variable
(
0
,
trainable
=
False
)
var0
=
variables
.
Variable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
variables
.
Variable
([
3.0
,
4.0
],
dtype
=
dtype
)
grads0
=
constant_op
.
constant
([
0.1
,
0.1
],
dtype
=
dtype
)
grads1
=
constant_op
.
constant
([
0.01
,
0.01
],
dtype
=
dtype
)
sgd
_op
=
gradient_descent
.
SGD
(
3.0
).
apply_gradients
(
zip
([
grads0
,
grads1
],
[
var0
,
var1
]),
global_step
=
global_step
)
sgd
=
gradient_descent
.
SGD
(
3.0
)
sgd_op
=
sgd
.
apply_gradients
(
zip
([
grads0
,
grads1
],
[
var0
,
var1
])
)
variables
.
global_variables_initializer
().
run
()
# Fetch params to validate initial values
self
.
assertAllCloseAccordingToType
([
1.0
,
2.0
],
var0
.
eval
())
self
.
assertAllCloseAccordingToType
([
3.0
,
4.0
],
var1
.
eval
())
# Run 1 step of sgd
sgd_op
.
run
()
# Validate updated params and
global_step
# Validate updated params and
optimizer iterations.
self
.
assertAllCloseAccordingToType
([
1.0
-
3.0
*
0.1
,
2.0
-
3.0
*
0.1
],
var0
.
eval
())
self
.
assertAllCloseAccordingToType
([
3.0
-
3.0
*
0.01
,
4.0
-
3.0
*
0.01
],
var1
.
eval
())
self
.
assertAllCloseAccordingToType
(
1
,
global_step
.
eval
())
self
.
assertAllCloseAccordingToType
(
1
,
sgd
.
iteration
.
eval
())
def
testSparseBasic
(
self
):
for
dtype
in
[
dtypes
.
half
,
dtypes
.
float32
,
dtypes
.
float64
]:
...
...
tensorflow/python/keras/optimizer_v2/optimizer_v2.py
浏览文件 @
98fb3b92
# Copyright 201
5
The TensorFlow Authors. All Rights Reserved.
# Copyright 201
8
The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
@@ -22,7 +22,20 @@ from __future__ import print_function
import
abc
from
tensorflow.python.eager
import
backprop
from
tensorflow.python.eager
import
context
from
tensorflow.python.framework
import
dtypes
from
tensorflow.python.framework
import
ops
from
tensorflow.python.keras
import
initializers
from
tensorflow.python.keras.engine
import
base_layer
from
tensorflow.python.ops
import
control_flow_ops
from
tensorflow.python.ops
import
gradients
from
tensorflow.python.ops
import
variable_scope
from
tensorflow.python.ops
import
variables
from
tensorflow.python.platform
import
tf_logging
as
logging
from
tensorflow.python.training
import
distribution_strategy_context
from
tensorflow.python.training
import
optimizer
as
optimizer_v1
from
tensorflow.python.util
import
nest
class
OptimizerV2
(
optimizer_v1
.
Optimizer
):
...
...
@@ -77,29 +90,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
opt.apply_gradients(capped_grads_and_vars)
```
### Gating Gradients
Both `minimize()` and `compute_gradients()` accept a `gate_gradients`
argument that controls the degree of parallelism during the application of
the gradients.
The possible values are: `GATE_NONE`, `GATE_OP`, and `GATE_GRAPH`.
<b>`GATE_NONE`</b>: Compute and apply gradients in parallel. This provides
the maximum parallelism in execution, at the cost of some non-reproducibility
in the results. For example the two gradients of `matmul` depend on the input
values: With `GATE_NONE` one of the gradients could be applied to one of the
inputs _before_ the other gradient is computed resulting in non-reproducible
results.
<b>`GATE_OP`</b>: For each Op, make sure all gradients are computed before
they are used. This prevents race conditions for Ops that generate gradients
for multiple inputs where the gradients depend on the inputs.
<b>`GATE_GRAPH`</b>: Make sure all gradients for all variables are computed
before any one of them is used. This provides the least parallelism but can
be useful if you want to process all gradients before applying any of them.
### Slots
Some optimizer subclasses, such as `MomentumOptimizer` and `AdagradOptimizer`
...
...
@@ -111,11 +101,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
This can be useful if you want to log debug a training algorithm, report stats
about the slots, etc.
### Non-slot variables
Some optimizer subclasses, such as `AdamOptimizer` have variables that
are not associated with the variables to train, just the step itself.
### Hyper parameters
These are arguments passed to the optimizer subclass constructor
...
...
@@ -124,18 +109,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
callables. If they are callable, the callable will be called during
`apply_gradients()` to get the value for the hyper parameter.
### State
Internal methods are passed a `state` argument with the correct
values to use for the slot and non-slot variables, and the hyper
parameters.
"""
# Values for gate_gradients.
GATE_NONE
=
0
GATE_OP
=
1
GATE_GRAPH
=
2
def
__init__
(
self
,
name
):
"""Create a new Optimizer.
...
...
@@ -145,6 +120,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
you should be able to use the _set_hyper()/state.get_hyper()
facility instead.
This class in stateful and thread-compatible.
Args:
name: A non-empty string. The name to use for accumulators created
for the optimizer.
...
...
@@ -157,6 +134,192 @@ class OptimizerV2(optimizer_v1.Optimizer):
self
.
_use_locking
=
True
super
(
OptimizerV2
,
self
).
__init__
(
self
.
_use_locking
,
name
)
self
.
_hyper
=
{}
self
.
_slots
=
{}
self
.
_prepared
=
False
def
minimize
(
self
,
loss
,
var_list
,
aggregation_method
=
None
,
colocate_gradients_with_ops
=
False
,
name
=
None
,
grad_loss
=
None
):
"""Add operations to minimize `loss` by updating `var_list`.
This method simply combines calls `compute_gradients()` and
`apply_gradients()`. If you want to process the gradient before applying
them call `compute_gradients()` and `apply_gradients()` explicitly instead
of using this function.
Args:
loss: A `Tensor` containing the value to minimize.
var_list: list or tuple of `Variable` objects to update to minimize
`loss`.
aggregation_method: Specifies the method used to combine gradient terms.
Valid values are defined in the class `AggregationMethod`.
colocate_gradients_with_ops: If True, try colocating gradients with the
corresponding op.
name: Optional name for the returned operation.
grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
Returns:
An Operation that updates the variables in `var_list`. If `global_step`
was not `None`, that operation also increments `global_step`.
Raises:
ValueError: If some of the variables are not `Variable` objects.
@compatibility(eager)
When eager execution is enabled, `loss` should be a Python function that
takes no arguments and computes the value to be minimized. Minimization (and
gradient computation) is done with respect to the elements of `var_list` if
not None, else with respect to any trainable variables created during the
execution of the `loss` function. `gate_gradients`, `aggregation_method`,
`colocate_gradients_with_ops` and `grad_loss` are ignored when eager
execution is enabled.
@end_compatibility
"""
grads_and_vars
=
self
.
compute_gradients
(
loss
,
var_list
=
var_list
,
aggregation_method
=
aggregation_method
,
colocate_gradients_with_ops
=
colocate_gradients_with_ops
,
grad_loss
=
grad_loss
)
return
self
.
apply_gradients
(
grads_and_vars
,
name
=
name
)
def
compute_gradients
(
self
,
loss
,
var_list
,
aggregation_method
=
None
,
colocate_gradients_with_ops
=
False
,
grad_loss
=
None
,
stop_gradients
=
None
):
"""Compute gradients of `loss` for the variables in `var_list`.
This is the first part of `minimize()`. It returns a list
of (gradient, variable) pairs where "gradient" is the gradient
for "variable". Note that "gradient" can be a `Tensor`, an
`IndexedSlices`, or `None` if there is no gradient for the
given variable.
Args:
loss: A Tensor containing the value to minimize or a callable taking no
arguments which returns the value to minimize. When eager execution is
enabled it must be a callable.
var_list: Optional list or tuple of `tf.Variable` to update to minimize
`loss`. Defaults to the list of variables collected in the graph under
the key `GraphKeys.TRAINABLE_VARIABLES`.
aggregation_method: Specifies the method used to combine gradient terms.
Valid values are defined in the class `AggregationMethod`.
colocate_gradients_with_ops: If True, try colocating gradients with the
corresponding op.
grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
stop_gradients: Optional. A Tensor or list of tensors not to differentiate
through.
Returns:
A list of (gradient, variable) pairs. Variable is always present, but
gradient can be `None`.
Raises:
TypeError: If `var_list` contains anything else than `Variable` objects.
ValueError: If some arguments are invalid, or var_list is None.
RuntimeError: If called with eager execution enabled and `loss` is
not callable.
@compatibility(eager)
When eager execution is enabled, `aggregation_method`, and
`colocate_gradients_with_ops` are ignored.
@end_compatibility
"""
var_list
=
nest
.
flatten
(
var_list
)
# TODO(josh11b): Test that we handle weight decay in a reasonable way.
if
callable
(
loss
):
with
backprop
.
GradientTape
()
as
tape
:
tape
.
watch
(
var_list
)
loss_value
=
loss
()
grads
=
tape
.
gradient
(
loss_value
,
var_list
,
grad_loss
)
else
:
if
context
.
executing_eagerly
():
raise
RuntimeError
(
"`loss` passed to Optimizer.compute_gradients "
"should be a function when eager execution is "
"enabled."
)
self
.
_assert_valid_dtypes
([
loss
])
if
grad_loss
is
not
None
:
self
.
_assert_valid_dtypes
([
grad_loss
])
grads
=
gradients
.
gradients
(
loss
,
var_list
,
grad_ys
=
grad_loss
,
aggregation_method
=
aggregation_method
,
colocate_gradients_with_ops
=
colocate_gradients_with_ops
,
stop_gradients
=
stop_gradients
)
grads_and_vars
=
list
(
zip
(
grads
,
var_list
))
self
.
_assert_valid_dtypes
([
v
for
g
,
v
in
grads_and_vars
if
g
is
not
None
and
v
.
dtype
!=
dtypes
.
resource
])
return
grads_and_vars
def
apply_gradients
(
self
,
grads_and_vars
,
name
=
None
):
"""Apply gradients to variables.
This is the second part of `minimize()`. It returns an `Operation` that
applies gradients.
Args:
grads_and_vars: List of (gradient, variable) pairs as returned by
`compute_gradients()`.
name: Optional name for the returned operation. Default to the name
passed to the `Optimizer` constructor.
Returns:
An `Operation` that applies the specified gradients. If `global_step`
was not None, that operation also increments `global_step`.
Raises:
TypeError: If `grads_and_vars` is malformed.
ValueError: If none of the variables have gradients.
"""
grads_and_vars
=
_filter_grads
(
grads_and_vars
)
var_list
=
[
v
for
(
_
,
v
)
in
grads_and_vars
]
if
distribution_strategy_context
.
has_distribution_strategy
():
reduced_grads
=
merge_grads
(
grads_and_vars
)
grads_and_vars
=
zip
(
reduced_grads
,
var_list
)
with
ops
.
init_scope
():
self
.
_create_slots
(
var_list
)
update_ops
=
[]
def
update_grad_to_var
(
grad
,
var
):
"""Apply gradient to variable."""
if
isinstance
(
var
,
ops
.
Tensor
):
raise
NotImplementedError
(
"Trying to update a Tensor "
,
var
)
if
isinstance
(
grad
,
ops
.
IndexedSlices
):
if
var
.
constraint
is
not
None
:
raise
RuntimeError
(
"Cannot use a constraint function on a sparse variable."
)
return
self
.
_resource_apply_sparse_duplicate_indices
(
grad
.
values
,
var
,
grad
.
indices
)
update_op
=
self
.
_resource_apply_dense
(
grad
,
var
)
if
var
.
constraint
is
not
None
:
with
ops
.
control_dependencies
([
update_op
]):
return
var
.
assign
(
var
.
constraint
(
var
))
else
:
return
update_op
with
ops
.
name_scope
(
name
,
self
.
_name
)
as
name
:
self
.
_prepare
()
for
grad
,
var
in
grads_and_vars
:
scope_name
=
""
if
in_eager_execution
()
else
"_"
+
var
.
op
.
name
with
ops
.
name_scope
(
"update"
+
scope_name
),
ops
.
colocate_with
(
var
):
update_ops
.
append
(
update_grad_to_var
(
grad
,
var
))
with
ops
.
colocate_with
(
self
.
_iterations
):
update_ops
.
append
(
self
.
_iterations
.
assign_add
(
1
))
return
control_flow_ops
.
group
(
*
update_ops
)
def
_set_hyper
(
self
,
name
,
value
):
self
.
_hyper
[
name
]
=
value
...
...
@@ -166,8 +329,33 @@ class OptimizerV2(optimizer_v1.Optimizer):
value
=
self
.
_hyper
[
name
]
return
self
.
_call_if_callable
(
value
)
def
add_slot
(
self
,
var
,
slot_name
):
slot_key
=
_get_slot_key_from_var
(
var
,
slot_name
)
if
slot_key
not
in
self
.
_slots
:
self
.
_slots
[
slot_key
]
=
self
.
add_weight
(
name
=
slot_key
,
shape
=
var
.
shape
,
dtype
=
var
.
dtype
)
def
get_slot
(
self
,
var
,
slot_name
):
slot_key
=
_get_slot_key_from_var
(
var
,
slot_name
)
return
self
.
_slots
[
slot_key
]
def
_prepare
(
self
):
pass
if
self
.
_prepared
:
return
# This is where all hyper variables will be created.
with
ops
.
device
(
"cpu:0"
):
self
.
_iterations
=
self
.
add_weight
(
self
.
_name
+
"/iter"
,
shape
=
[],
trainable
=
False
,
aggregation
=
variables
.
VariableAggregation
.
ONLY_FIRST_REPLICA
)
self
.
_prepared
=
True
@
property
def
iteration
(
self
):
if
not
self
.
_prepared
:
self
.
_prepare
()
return
self
.
_iterations
@
abc
.
abstractmethod
def
get_config
(
self
):
...
...
@@ -205,3 +393,116 @@ class OptimizerV2(optimizer_v1.Optimizer):
def
_serialize_hyperparameter
(
self
,
hyperparameter_name
):
"""Serialize a hyperparameter that can be a float, callable, or Tensor."""
return
self
.
_hyper
[
hyperparameter_name
]
def
add_weight
(
self
,
name
,
shape
,
dtype
=
None
,
initializer
=
"zeros"
,
trainable
=
None
,
synchronization
=
variables
.
VariableSynchronization
.
AUTO
,
aggregation
=
variables
.
VariableAggregation
.
NONE
):
if
dtype
is
None
:
dtype
=
dtypes
.
float32
initializer
=
initializers
.
get
(
initializer
)
if
synchronization
==
variables
.
VariableSynchronization
.
ON_READ
:
if
trainable
:
raise
ValueError
(
"Synchronization value can be set to "
"VariableSynchronization.ON_READ only for non-trainable variables. "
"You have specified trainable=True and "
"synchronization=VariableSynchronization.ON_READ."
)
else
:
# Set trainable to be false when variable is to be synced on read.
trainable
=
False
elif
trainable
is
None
:
trainable
=
True
variable
=
self
.
_add_variable_with_custom_getter
(
name
=
name
,
shape
=
shape
,
getter
=
base_layer
.
make_variable
,
overwrite
=
True
,
initializer
=
initializers
.
get
(
initializer
),
dtype
=
dtype
,
trainable
=
trainable
,
use_resource
=
True
,
synchronization
=
synchronization
,
aggregation
=
aggregation
)
return
variable
def
_filter_grads
(
grads_and_vars
):
"""Filter out iterable with grad equal to None."""
grads_and_vars
=
tuple
(
grads_and_vars
)
if
not
grads_and_vars
:
raise
ValueError
(
"No variables provided."
)
filtered
=
[]
vars_with_empty_grads
=
[]
for
grad
,
var
in
grads_and_vars
:
if
grad
is
None
:
vars_with_empty_grads
.
append
(
var
)
else
:
filtered
.
append
((
grad
,
var
))
filtered
=
tuple
(
filtered
)
if
not
filtered
:
raise
ValueError
(
"No gradients provided for any variable: %s."
%
([
v
.
name
for
_
,
v
in
filtered
],))
if
vars_with_empty_grads
:
logging
.
warning
(
(
"Gradients does not exist for variables %s when minimizing the loss."
),
([
v
.
name
for
v
in
vars_with_empty_grads
]))
return
filtered
def
merge_grads
(
grads_and_vars
):
"""Merge gradients from different replicas."""
def
merge_grad_fn
(
strategy
,
grads_and_vars
):
reduced_grads
=
strategy
.
batch_reduce
(
variable_scope
.
VariableAggregation
.
MEAN
,
grads_and_vars
)
return
reduced_grads
return
distribution_strategy_context
.
get_tower_context
().
merge_call
(
merge_grad_fn
,
grads_and_vars
)
def
in_eager_execution
():
with
ops
.
init_scope
():
return
context
.
executing_eagerly
()
def
_get_slot_key_from_var
(
var
,
slot_name
):
"""Get the slot key for the variable.
Scope the slot name in the namespace of the primary variable.
Set "primary.op.name + '/' + slot_name" as default name.
In graph mode the name is derived from the op.
In eager mode the name is derived from the var.
If distribution strategy exists, then the name is derived from the primary
variable instead of replica variable, i.e., /dense/kernel instead of
/dense/kernel/replica_1. If the slot name is 'm', then the slot variables
being created are /dense/kernel/m and /dense/kernel/m/replica_1, instead of
/dense/kernel/replica_1/m/replica_1.
Args:
var: the variable.
slot_name: the name of the slot.
Returns:
the name of the variable.
"""
# pylint: disable=protected-access
if
distribution_strategy_context
.
has_distribution_strategy
()
and
hasattr
(
var
,
"_primary_var"
):
var
=
var
.
_primary_var
if
context
.
executing_eagerly
():
name
=
var
.
_shared_name
else
:
name
=
var
.
op
.
name
return
name
+
"/"
+
slot_name
tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
浏览文件 @
98fb3b92
# Copyright 201
5
The TensorFlow Authors. All Rights Reserved.
# Copyright 201
8
The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
@@ -18,6 +18,7 @@ from __future__ import absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
tensorflow.python.eager
import
context
from
tensorflow.python.framework
import
constant_op
from
tensorflow.python.framework
import
dtypes
from
tensorflow.python.framework
import
ops
...
...
@@ -36,194 +37,179 @@ class OptimizerTest(test.TestCase):
@
test_util
.
run_in_graph_and_eager_modes
def
testBasic
(
self
):
for
i
,
dtype
in
enumerate
([
dtypes
.
half
,
dtypes
.
float32
,
dtypes
.
float64
]):
var0
=
resource_variable_ops
.
ResourceVariable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
resource_variable_ops
.
ResourceVariable
([
3.0
,
4.0
],
dtype
=
dtype
)
def
loss
():
return
5
*
var0
+
3
*
var1
# pylint: disable=cell-var-from-loop
# Note that for eager execution, minimize expects a function instead of a
# Tensor.
global_step
=
resource_variable_ops
.
ResourceVariable
(
array_ops
.
zeros
([],
dtypes
.
int64
),
name
=
'global_step_%d'
%
i
)
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
self
.
evaluate
(
variables
.
global_variables_initializer
())
# Fetch params to validate initial values
self
.
assertAllClose
([
1.0
,
2.0
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
3.0
,
4.0
],
self
.
evaluate
(
var1
))
# Run 1 step of sgd through optimizer
opt_op
=
sgd_op
.
minimize
(
loss
,
global_step
,
[
var0
,
var1
])
self
.
evaluate
(
opt_op
)
# Validate updated params
self
.
assertAllClose
([
-
14.
,
-
13.
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
-
6.
,
-
5.
],
self
.
evaluate
(
var1
))
for
_
,
dtype
in
enumerate
([
dtypes
.
half
,
dtypes
.
float32
,
dtypes
.
float64
]):
with
self
.
cached_session
():
var0
=
resource_variable_ops
.
ResourceVariable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
resource_variable_ops
.
ResourceVariable
([
3.0
,
4.0
],
dtype
=
dtype
)
loss
=
lambda
:
5
*
var0
+
3
*
var1
# pylint: disable=cell-var-from-loop
if
not
context
.
executing_eagerly
():
loss
=
loss
()
sgd
=
gradient_descent
.
SGD
(
3.0
)
self
.
evaluate
(
variables
.
global_variables_initializer
())
# Fetch params to validate initial values
self
.
assertAllClose
([
1.0
,
2.0
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
3.0
,
4.0
],
self
.
evaluate
(
var1
))
# Run 1 step of sgd through optimizer
opt_op
=
sgd
.
minimize
(
loss
,
var_list
=
[
var0
,
var1
])
self
.
evaluate
(
sgd
.
iteration
.
initializer
)
self
.
evaluate
(
opt_op
)
# Validate updated params
self
.
assertAllClose
([
-
14.
,
-
13.
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
-
6.
,
-
5.
],
self
.
evaluate
(
var1
))
@
test_util
.
run_in_graph_and_eager_modes
def
testAggregationMethod
(
self
):
for
dtype
in
[
dtypes
.
half
,
dtypes
.
float32
,
dtypes
.
float64
]:
with
self
.
cached_session
():
var0
=
variables
.
Variable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
variables
.
Variable
([
3.0
,
4.0
],
dtype
=
dtype
)
cost
=
5
*
var0
+
3
*
var1
global_step
=
variables
.
Variable
(
array_ops
.
zeros
([],
dtypes
.
int64
),
name
=
'global_step'
)
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
opt_op
=
sgd_op
.
minimize
(
cost
,
global_step
,
[
var0
,
var1
],
aggregation_method
=
gradients_impl
.
AggregationMethod
.
EXPERIMENTAL_ACCUMULATE_N
)
loss
=
lambda
:
5
*
var0
+
3
*
var1
# pylint: disable=cell-var-from-loop
if
not
context
.
executing_eagerly
():
loss
=
loss
()
sgd
=
gradient_descent
.
SGD
(
3.0
)
variables
.
global_variables_initializer
().
run
(
)
self
.
evaluate
(
variables
.
global_variables_initializer
()
)
# Fetch params to validate initial values
self
.
assertAllClose
([
1.0
,
2.0
],
var0
.
eval
(
))
self
.
assertAllClose
([
3.0
,
4.0
],
var1
.
eval
(
))
self
.
assertAllClose
([
1.0
,
2.0
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
3.0
,
4.0
],
self
.
evaluate
(
var1
))
# Run 1 step of sgd through optimizer
opt_op
.
run
()
opt_op
=
sgd
.
minimize
(
loss
,
var_list
=
[
var0
,
var1
],
aggregation_method
=
gradients_impl
.
AggregationMethod
.
EXPERIMENTAL_ACCUMULATE_N
)
self
.
evaluate
(
sgd
.
iteration
.
initializer
)
self
.
evaluate
(
opt_op
)
# Validate updated params
self
.
assertAllClose
([
-
14.
,
-
13.
],
var0
.
eval
(
))
self
.
assertAllClose
([
-
6.
,
-
5.
],
var1
.
eval
(
))
self
.
assertAllClose
([
-
14.
,
-
13.
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
-
6.
,
-
5.
],
self
.
evaluate
(
var1
))
@
test_util
.
run_in_graph_and_eager_modes
def
testPrecomputedGradient
(
self
):
for
dtype
in
[
dtypes
.
half
,
dtypes
.
float32
,
dtypes
.
float64
]:
with
self
.
cached_session
():
var0
=
variables
.
Variable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
variables
.
Variable
([
3.0
,
4.0
],
dtype
=
dtype
)
cost
=
5
*
var0
+
3
*
var1
loss
=
lambda
:
5
*
var0
+
3
*
var1
# pylint: disable=cell-var-from-loop
if
not
context
.
executing_eagerly
():
loss
=
loss
()
grad_loss
=
constant_op
.
constant
([
42
,
-
42
],
dtype
=
dtype
)
global_step
=
variables
.
Variable
(
array_ops
.
zeros
([],
dtypes
.
int64
),
name
=
'global_step'
)
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
opt_op
=
sgd_op
.
minimize
(
cost
,
global_step
,
[
var0
,
var1
],
grad_loss
=
grad_loss
)
sgd
=
gradient_descent
.
SGD
(
3.0
)
variables
.
global_variables_initializer
().
run
(
)
self
.
evaluate
(
variables
.
global_variables_initializer
()
)
# Fetch params to validate initial values
self
.
assertAllClose
([
1.0
,
2.0
],
var0
.
eval
(
))
self
.
assertAllClose
([
3.0
,
4.0
],
var1
.
eval
(
))
self
.
assertAllClose
([
1.0
,
2.0
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
3.0
,
4.0
],
self
.
evaluate
(
var1
))
# Run 1 step of sgd through optimizer
opt_op
.
run
()
opt_op
=
sgd
.
minimize
(
loss
,
var_list
=
[
var0
,
var1
],
grad_loss
=
grad_loss
)
self
.
evaluate
(
sgd
.
iteration
.
initializer
)
self
.
evaluate
(
opt_op
)
# Validate updated params
self
.
assertAllClose
([
1.0
-
3
*
5
*
42.0
,
2.0
-
3
*
5
*
(
-
42.0
)],
var0
.
eval
(
))
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
3.0
-
3
*
3
*
42.0
,
4.0
-
3
*
3
*
(
-
42.0
)],
var1
.
eval
())
@
test_util
.
run_in_graph_and_eager_modes
def
testNoVariables
(
self
):
for
dtype
in
[
dtypes
.
half
,
dtypes
.
float32
,
dtypes
.
float64
]:
# pylint: disable=cell-var-from-loop
def
loss
():
var0
=
resource_variable_ops
.
ResourceVariable
(
[
1.0
,
2.0
],
dtype
=
dtype
,
trainable
=
False
,
name
=
'a'
)
var1
=
resource_variable_ops
.
ResourceVariable
(
[
3.0
,
4.0
],
dtype
=
dtype
,
trainable
=
False
,
name
=
'b'
)
return
5
*
var0
+
var1
# pylint: enable=cell-var-from-loop
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
with
self
.
assertRaisesRegexp
(
ValueError
,
'No.*variables'
):
sgd_op
.
minimize
(
loss
)
self
.
evaluate
(
var1
))
@
test_util
.
run_in_graph_and_eager_modes
def
testNoGradients
(
self
):
for
_
,
dtype
in
enumerate
([
dtypes
.
half
,
dtypes
.
float32
,
dtypes
.
float64
]):
var0
=
resource_variable_ops
.
ResourceVariable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
resource_variable_ops
.
ResourceVariable
([
3.0
,
4
.0
],
dtype
=
dtype
)
# pylint: disable=cell-var-from-loop
def
loss
():
return
5
*
var0
# pylint: enable=cell-var-from-loop
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
with
self
.
assertRaisesRegexp
(
ValueError
,
'No gradients'
):
# var1 has no gradient
sgd_op
.
minimize
(
loss
,
var_list
=
[
var1
])
with
self
.
cached_session
():
var0
=
resource_variable_ops
.
ResourceVariable
([
1.0
,
2
.0
],
dtype
=
dtype
)
var1
=
resource_variable_ops
.
ResourceVariable
([
3.0
,
4.0
],
dtype
=
dtype
)
loss
=
lambda
:
5
*
var0
# pylint: disable=cell-var-from-loop
if
not
context
.
executing_eagerly
():
loss
=
loss
()
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
with
self
.
assertRaisesRegexp
(
ValueError
,
'No gradients'
):
# var1 has no gradient
sgd_op
.
minimize
(
loss
,
var_list
=
[
var1
])
@
test_util
.
run_in_graph_and_eager_modes
def
testNoGradientsForAnyVariables_Minimize
(
self
):
for
_
,
dtype
in
enumerate
([
dtypes
.
half
,
dtypes
.
float32
,
dtypes
.
float64
]):
var0
=
resource_variable_ops
.
ResourceVariable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
resource_variable_ops
.
ResourceVariable
([
3.0
,
4.0
],
dtype
=
dtype
)
def
loss
():
return
constant_op
.
constant
(
5.0
)
with
self
.
cached_session
():
var0
=
resource_variable_ops
.
ResourceVariable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
resource_variable_ops
.
ResourceVariable
([
3.0
,
4.0
],
dtype
=
dtype
)
loss
=
lambda
:
constant_op
.
constant
(
5.0
)
if
not
context
.
executing_eagerly
():
loss
=
loss
()
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
with
self
.
assertRaisesRegexp
(
ValueError
,
'No gradients provided for any variable'
):
sgd_op
.
minimize
(
loss
,
var_list
=
[
var0
,
var1
])
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
with
self
.
assertRaisesRegexp
(
ValueError
,
'No gradients provided for any variable'
):
sgd_op
.
minimize
(
loss
,
var_list
=
[
var0
,
var1
])
@
test_util
.
run_in_graph_and_eager_modes
def
testNoGradientsForAnyVariables_ApplyGradients
(
self
):
for
_
,
dtype
in
enumerate
([
dtypes
.
half
,
dtypes
.
float32
,
dtypes
.
float64
]):
var0
=
resource_variable_ops
.
ResourceVariable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
resource_variable_ops
.
ResourceVariable
([
3.0
,
4.0
],
dtype
=
dtype
)
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
with
self
.
assertRaisesRegexp
(
ValueError
,
'No gradients provided for any variable'
):
sgd_op
.
apply_gradients
([(
None
,
var0
),
(
None
,
var1
)])
with
self
.
cached_session
():
var0
=
resource_variable_ops
.
ResourceVariable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
resource_variable_ops
.
ResourceVariable
([
3.0
,
4.0
],
dtype
=
dtype
)
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
with
self
.
assertRaisesRegexp
(
ValueError
,
'No gradients provided for any variable'
):
sgd_op
.
apply_gradients
([(
None
,
var0
),
(
None
,
var1
)])
@
test_util
.
run_in_graph_and_eager_modes
def
testGradientsAsVariables
(
self
):
for
i
,
dtype
in
enumerate
([
dtypes
.
half
,
dtypes
.
float32
,
dtypes
.
float64
]):
var0
=
resource_variable_ops
.
ResourceVariable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
resource_variable_ops
.
ResourceVariable
([
3.0
,
4.0
],
dtype
=
dtype
)
def
loss
():
return
5
*
var0
+
3
*
var1
# pylint: disable=cell-var-from-loop
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
grads_and_vars
=
sgd_op
.
compute_gradients
(
loss
,
[
var0
,
var1
])
# Convert gradients to tf.Variables
converted_grads
=
[
resource_variable_ops
.
ResourceVariable
(
array_ops
.
zeros
([
2
],
dtype
),
name
=
'c_%d_%d'
%
(
i
,
j
))
for
j
,
gv
in
enumerate
(
grads_and_vars
)
]
convert_ops
=
[
state_ops
.
assign
(
converted_grads
[
j
],
gv
[
0
])
for
j
,
gv
in
enumerate
(
grads_and_vars
)
]
self
.
evaluate
(
variables
.
global_variables_initializer
())
# Run convert_ops to achieve the gradietns converting
self
.
evaluate
(
convert_ops
)
# Fetch params to validate initial values
self
.
assertAllClose
([
1.0
,
2.0
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
3.0
,
4.0
],
self
.
evaluate
(
var1
))
with
self
.
cached_session
():
var0
=
resource_variable_ops
.
ResourceVariable
([
1.0
,
2.0
],
dtype
=
dtype
)
var1
=
resource_variable_ops
.
ResourceVariable
([
3.0
,
4.0
],
dtype
=
dtype
)
loss
=
lambda
:
5
*
var0
+
3
*
var1
# pylint: disable=cell-var-from-loop
if
not
context
.
executing_eagerly
():
loss
=
loss
()
sgd
=
gradient_descent
.
SGD
(
3.0
)
grads_and_vars
=
sgd
.
compute_gradients
(
loss
,
[
var0
,
var1
])
# Convert gradients to tf.Variables
converted_grads
=
[
resource_variable_ops
.
ResourceVariable
(
array_ops
.
zeros
([
2
],
dtype
),
name
=
'c_%d_%d'
%
(
i
,
j
))
for
j
,
gv
in
enumerate
(
grads_and_vars
)
]
convert_ops
=
[
state_ops
.
assign
(
converted_grads
[
j
],
gv
[
0
])
for
j
,
gv
in
enumerate
(
grads_and_vars
)
]
# Run convert_ops to achieve the gradients converting
self
.
evaluate
(
variables
.
global_variables_initializer
())
self
.
evaluate
(
convert_ops
)
# Fetch params to validate initial values
self
.
assertAllClose
([
1.0
,
2.0
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
3.0
,
4.0
],
self
.
evaluate
(
var1
))
# Run 1 step of sgd through optimizer
converted_grads_and_vars
=
list
(
zip
(
converted_grads
,
[
var0
,
var1
]))
opt_op
=
sgd_op
.
apply_gradients
(
converted_grads_and_vars
)
self
.
evaluate
(
opt_op
)
# Run 1 step of sgd through optimizer
converted_grads_and_vars
=
list
(
zip
(
converted_grads
,
[
var0
,
var1
]))
opt_op
=
sgd
.
apply_gradients
(
converted_grads_and_vars
)
self
.
evaluate
(
sgd
.
iteration
.
initializer
)
self
.
evaluate
(
opt_op
)
# Validate updated params
self
.
assertAllClose
([
-
14.
,
-
13.
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
-
6.
,
-
5.
],
self
.
evaluate
(
var1
))
# Validate updated params
self
.
assertAllClose
([
-
14.
,
-
13.
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
-
6.
,
-
5.
],
self
.
evaluate
(
var1
))
@
test_util
.
run_in_graph_and_eager_modes
def
testComputeGradientsWithTensors
(
self
):
x
=
ops
.
convert_to_tensor
(
1.0
)
def
f
():
return
x
*
x
with
self
.
cached_session
():
x
=
ops
.
convert_to_tensor
(
1.0
)
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
grads_and_vars
=
sgd_op
.
compute_gradients
(
f
,
[
x
])
self
.
assertEqual
(
1
,
len
(
grads_and_vars
))
grad
,
x_as_var
=
grads_and_vars
[
0
]
self
.
assertIs
(
x
,
x_as_var
)
self
.
assertEqual
(
2.0
,
self
.
evaluate
(
grad
))
def
f
():
return
x
*
x
with
self
.
assertRaises
(
NotImplementedError
):
sgd_op
.
apply_gradients
(
grads_and_vars
)
sgd
=
gradient_descent
.
SGD
(
3.0
)
grads_and_vars
=
sgd
.
compute_gradients
(
f
,
[
x
])
self
.
assertEqual
(
1
,
len
(
grads_and_vars
))
grad
,
x_as_var
=
grads_and_vars
[
0
]
self
.
assertIs
(
x
,
x_as_var
)
self
.
assertEqual
(
2.0
,
self
.
evaluate
(
grad
))
def
testTrainOp
(
self
):
with
self
.
cached_session
():
var0
=
variables
.
Variable
([
1.0
,
2.0
])
var1
=
variables
.
Variable
([
3.0
,
4.0
])
cost
=
5
*
var0
+
3
*
var1
global_step
=
variables
.
Variable
(
array_ops
.
zeros
([],
dtypes
.
int64
),
name
=
'global_step'
)
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
opt_op
=
sgd_op
.
minimize
(
cost
,
global_step
,
[
var0
,
var1
])
self
.
assertTrue
(
opt_op
in
ops
.
get_collection
(
ops
.
GraphKeys
.
TRAIN_OP
))
with
self
.
assertRaises
(
NotImplementedError
):
sgd
.
apply_gradients
(
grads_and_vars
)
@
test_util
.
run_in_graph_and_eager_modes
def
testConstraint
(
self
):
constraint_01
=
lambda
x
:
clip_ops
.
clip_by_value
(
x
,
-
0.1
,
0.
)
constraint_0
=
lambda
x
:
clip_ops
.
clip_by_value
(
x
,
0.
,
1.
)
...
...
@@ -232,21 +218,29 @@ class OptimizerTest(test.TestCase):
constraint
=
constraint_01
)
var1
=
variables
.
Variable
([
3.0
,
4.0
],
constraint
=
constraint_0
)
cost
=
5
*
var0
+
3
*
var1
global_step
=
variables
.
Variable
(
array_ops
.
zeros
([],
dtypes
.
int64
),
name
=
'global_step'
)
sgd_op
=
gradient_descent
.
SGD
(
3.0
)
opt_op
=
sgd_op
.
minimize
(
cost
,
global_step
,
[
var0
,
var1
])
loss
=
lambda
:
5
*
var0
+
3
*
var1
if
not
context
.
executing_eagerly
():
# pylint: disable=cell-var-from-loop
loss
=
loss
()
sgd
=
gradient_descent
.
SGD
(
3.0
)
variables
.
global_variables_initializer
().
run
(
)
self
.
evaluate
(
variables
.
global_variables_initializer
()
)
# Fetch params to validate initial values
self
.
assertAllClose
([
1.0
,
2.0
],
var0
.
eval
(
))
self
.
assertAllClose
([
3.0
,
4.0
],
var1
.
eval
(
))
self
.
assertAllClose
([
1.0
,
2.0
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
3.0
,
4.0
],
self
.
evaluate
(
var1
))
# Run 1 step of sgd through optimizer
opt_op
.
run
()
opt_op
=
sgd
.
minimize
(
loss
,
var_list
=
[
var0
,
var1
])
self
.
evaluate
(
sgd
.
iteration
.
initializer
)
self
.
evaluate
(
opt_op
)
# Validate updated params
self
.
assertAllClose
([
-
0.1
,
-
0.1
],
var0
.
eval
())
self
.
assertAllClose
([
0.
,
0.
],
var1
.
eval
())
self
.
assertAllClose
([
-
0.1
,
-
0.1
],
self
.
evaluate
(
var0
))
self
.
assertAllClose
([
0.
,
0.
],
self
.
evaluate
(
var1
))
@
test_util
.
run_in_graph_and_eager_modes
def
testIterationWithoutMinimize
(
self
):
with
self
.
cached_session
():
sgd
=
gradient_descent
.
SGD
(
3.0
)
self
.
evaluate
(
sgd
.
iteration
.
initializer
)
self
.
assertEqual
(
0
,
self
.
evaluate
(
sgd
.
iteration
))
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录