Commit 39e03270 authored by wanghaoshuang

Fix arch of DARTS

Parent f454dd9d
@@ -49,25 +49,23 @@ class Architect(object):
                 self.network_weight_decay),
             parameter_list=self.unrolled_model_params)
 
-    def step(self, input_train, target_train, input_valid, target_valid):
+    def step(self, train_data, valid_data):
         if self.unrolled:
-            params_grads = self._backward_step_unrolled(
-                input_train, target_train, input_valid, target_valid)
+            params_grads = self._backward_step_unrolled(train_data, valid_data)
             self.optimizer.apply_gradients(params_grads)
         else:
-            loss = self._backward_step(input_valid, target_valid)
+            loss = self._backward_step(valid_data)
             self.optimizer.minimize(loss)
         self.optimizer.clear_gradients()
 
-    def _backward_step(self, input_valid, target_valid):
-        loss = self.model._loss(input_valid, target_valid)
+    def _backward_step(self, valid_data):
+        loss = self.model.loss(valid_data)
         loss.backward()
         return loss
 
-    def _backward_step_unrolled(self, input_train, target_train, input_valid,
-                                target_valid):
-        self._compute_unrolled_model(input_train, target_train)
-        unrolled_loss = self.unrolled_model._loss(input_valid, target_valid)
+    def _backward_step_unrolled(self, train_data, valid_data):
+        self._compute_unrolled_model(train_data)
+        unrolled_loss = self.unrolled_model.loss(valid_data)
         unrolled_loss.backward()
         vector = [
@@ -81,23 +79,22 @@ class Architect(object):
         ]
         self.unrolled_model.clear_gradients()
-        implicit_grads = self._hessian_vector_product(vector, input_train,
-                                                      target_train)
+        implicit_grads = self._hessian_vector_product(vector, train_data)
         for (p, g), ig in zip(arch_params_grads, implicit_grads):
             new_g = g - (ig * self.unrolled_optimizer.current_step_lr())
             g.value().get_tensor().set(new_g.numpy(), self.place)
         return arch_params_grads
 
-    def _compute_unrolled_model(self, input, target):
+    def _compute_unrolled_model(self, data):
         for x, y in zip(self.unrolled_model.parameters(),
                         self.model.parameters()):
             x.value().get_tensor().set(y.numpy(), self.place)
-        loss = self.unrolled_model._loss(input, target)
+        loss = self.unrolled_model._loss(data)
         loss.backward()
         self.unrolled_optimizer.minimize(loss)
         self.unrolled_model.clear_gradients()
 
-    def _hessian_vector_product(self, vector, input, target, r=1e-2):
+    def _hessian_vector_product(self, vector, data, r=1e-2):
         R = r * fluid.layers.rsqrt(
             fluid.layers.sum([
                 fluid.layers.reduce_sum(fluid.layers.square(v)) for v in vector
@@ -111,7 +108,7 @@ class Architect(object):
         for param, grad in zip(model_params, vector):
             param_p = param + grad * R
             param.value().get_tensor().set(param_p.numpy(), self.place)
-        loss = self.model._loss(input, target)
+        loss = self.model.loss(data)
         loss.backward()
         grads_p = [
             to_variable(param._grad_ivar().numpy())
@@ -123,7 +120,7 @@ class Architect(object):
             param.value().get_tensor().set(param_n.numpy(), self.place)
         self.model.clear_gradients()
-        loss = self.model._loss(input, target)
+        loss = self.model.loss(data)
         loss.backward()
         grads_n = [
             to_variable(param._grad_ivar().numpy())
...
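For context on the hunk above: the unrolled branch of `step` implements the second-order DARTS update, in which the architecture gradient of the validation loss is corrected by a Hessian-vector product. `_hessian_vector_product` estimates that product with central finite differences: the weights are perturbed to w ± ε·v, where ε = r/‖v‖₂ (the `R = r * rsqrt(sum(square(v)))` expression), and in the standard DARTS formulation the two α-gradients are combined as (grads_p − grads_n)/(2ε). Below is a minimal NumPy sketch of that estimator; it is framework-independent and every name in it is illustrative, not taken from this repository.

```python
import numpy as np

def hessian_vector_product(grad_alpha, w, alpha, v, r=1e-2):
    """Central-difference estimate of (d2L/dalpha dw) @ v.

    grad_alpha(w, alpha) is a hypothetical callable returning dL/dalpha
    at (w, alpha); it stands in for the backward passes over self.model.
    """
    eps = r / (np.linalg.norm(v) + 1e-12)     # mirrors R = r * rsqrt(sum(v**2))
    g_plus = grad_alpha(w + eps * v, alpha)   # grads_p: dL/dalpha at w + eps*v
    g_minus = grad_alpha(w - eps * v, alpha)  # grads_n: dL/dalpha at w - eps*v
    return (g_plus - g_minus) / (2.0 * eps)   # the implicit gradient term
```

The `new_g = g - (ig * self.unrolled_optimizer.current_step_lr())` line then applies the DARTS correction g ← g − η·ig, with η the inner (weight) optimizer's current learning rate.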
@@ -101,6 +101,7 @@ class AdaBERTClassifier(Layer):
         t_probs = fluid.layers.softmax(t_logits)
         s_probs = fluid.layers.softmax(s_logits)
+        t_probs.stop_gradient = False
         kd_loss = t_probs * fluid.layers.log(s_probs / T)
         kd_loss = fluid.layers.reduce_sum(kd_loss, dim=1)
         kd_loss = fluid.layers.reduce_mean(kd_loss, dim=0)
@@ -113,5 +114,5 @@ class AdaBERTClassifier(Layer):
         ce_loss = fluid.layers.mean(x=ce_loss)
         e_loss = 1  # to be done
-        loss = (1 - gamma) * ce_loss + gamma * kd_loss + beta * e_loss
+        loss = (1 - gamma) * ce_loss - gamma * kd_loss + beta * e_loss
         return loss
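On the sign flip in the last hunk: kd_loss as computed above is the batch mean of Σⱼ tⱼ·log(sⱼ/T), which is the *negative* teacher-student cross-entropy up to a constant −log T, so it must enter the total objective with a minus sign for minimization to pull the student distribution toward the teacher's; the added `t_probs.stop_gradient = False` additionally lets gradients flow through the teacher probabilities. A short NumPy sketch of the corrected objective follows (all names are illustrative, and it deliberately keeps the hunk's unusual choice of dividing probabilities rather than logits by the temperature T):

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def distillation_loss(t_logits, s_logits, ce_loss, e_loss, T, gamma, beta):
    """Sketch of the corrected objective from the hunk above."""
    t_probs = softmax(t_logits)  # teacher distribution
    s_probs = softmax(s_logits)  # student distribution
    # kd = mean_i sum_j t_ij * log(s_ij / T): larger when student matches teacher
    kd = (t_probs * np.log(s_probs / T)).sum(axis=1).mean()
    # Subtracting gamma * kd means minimizing the loss maximizes kd, i.e.
    # minimizes the cross-entropy -sum_j t_j * log(s_j); with the old plus
    # sign the optimizer would have pushed the student away from the teacher.
    return (1 - gamma) * ce_loss - gamma * kd + beta * e_loss
```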