|
|
|
|
|
|
|
import torch |
|
import torch.nn as nn |
|
|
|
|
|
class AutoClip:
    """Clip each parameter's gradient norm to a percentile of its own history.

    One fixed-length buffer of past gradient L2 norms is kept per parameter.
    Calling the instance records the current norms and then clips every
    parameter's gradient to the configured percentile of its buffer.
    """

    def __init__(self, parameters, initial_clipping=0.1, percentile=50, history_len=1000):
        """Set up the per-parameter norm histories.

        Args:
            parameters: Iterable of parameters; it is materialized into a list.
            initial_clipping: Value every history slot is pre-filled with.
            percentile: Percentile of the history used as clipping threshold.
            history_len: Number of slots in each history buffer.

        Raises:
            ValueError: If ``history_len`` is smaller than 1.
        """
        if history_len < 1:
            # A zero-length buffer would otherwise only fail later with a
            # modulo-by-zero when the write index is advanced.
            raise ValueError(f"history_len must be >= 1, got {history_len}")

        self.parameters = list(parameters)
        self.grad_history = [
            torch.full([history_len], initial_clipping) for _ in self.parameters
        ]

        # Slot of the history buffers that the next call writes to.
        self.index = 0
        self.history_len = history_len
        self.percentile = percentile

    @staticmethod
    def _has_usable_grad(parameter):
        """Return True if ``parameter`` has a gradient that is not all zeros."""
        grad = parameter.grad
        return grad is not None and bool(grad.abs().sum().is_nonzero())

    @torch.no_grad()
    def __call__(self):
        """Record current gradient norms, then clip each gradient in place.

        Parameters whose gradient is missing or entirely zero are skipped.

        Returns:
            The mean of the values returned by ``clip_grad_norm_`` over the
            parameters that were clipped, or ``0.0`` if none were clipped.
        """
        self._add_to_history(self.parameters)

        grad_norms = []
        for parameter, history in zip(self.parameters, self.grad_history):
            if not self._has_usable_grad(parameter):
                continue

            clip_value = self._get_percentile(history, self.percentile)
            grad_norms.append(nn.utils.clip_grad_norm_(parameter, clip_value).item())

        if not grad_norms:
            # Nothing was clipped (e.g. called before any backward pass);
            # avoid dividing by zero.
            return 0.0
        return sum(grad_norms) / len(grad_norms)

    def _add_to_history(self, parameters):
        """Write each usable gradient's L2 norm into the current history slot.

        The write index advances by one (wrapping at ``history_len``) on every
        call, including calls in which no parameter had a usable gradient.
        """
        for i, param in enumerate(parameters):
            if not self._has_usable_grad(param):
                continue

            self.grad_history[i][self.index] = param.grad.data.norm(2)

        self.index = (self.index + 1) % self.history_len

    def _get_percentile(self, tensor, percentile):
        """Return the ``percentile``-th percentile of ``tensor`` as a float.

        The percentile is mapped to a 1-based rank for ``kthvalue``; the rank
        is clamped to the valid range so out-of-range percentiles select the
        minimum or maximum instead of raising.
        """
        count = tensor.numel()
        k = 1 + round(0.01 * percentile * (count - 1))
        k = max(1, min(count, k))
        return tensor.kthvalue(k).values.item()
|
|