The following 50 code examples, extracted from open-source Python projects, illustrate how to use torch.optim.
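Before the project examples, here is a minimal sketch of the typical torch.optim workflow: construct an optimizer over a model's parameters, then repeatedly zero the gradients, backpropagate a loss, and step. The tiny linear model and random data below are made-up placeholders, not taken from any of the projects that follow.

import torch
import torch.nn as nn
import torch.optim as optim

# Made-up model and data, only to demonstrate the optimizer loop.
model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

inputs = torch.randn(32, 10)
targets = torch.randn(32, 1)

for step in range(100):
    optimizer.zero_grad()                      # clear gradients from the previous step
    loss = criterion(model(inputs), targets)   # forward pass and loss
    loss.backward()                            # compute gradients
    optimizer.step()                           # update the parameters

Every optimizer in torch.optim (SGD, Adam, RMSprop, Adadelta, ...) follows this same construct/zero_grad/backward/step pattern, which is what the examples below rely on.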
def test_sgd(self):
    self._test_rosenbrock(
        lambda params: optim.SGD(params, lr=1e-3),
        wrap_old_fn(old_optim.sgd, learningRate=1e-3)
    )
    self._test_rosenbrock(
        lambda params: optim.SGD(params, lr=1e-3, momentum=0.9, dampening=0),
        wrap_old_fn(old_optim.sgd, learningRate=1e-3, momentum=0.9, dampening=0)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.SGD([weight, bias], lr=1e-3)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.SGD(
            self._build_params_dict(weight, bias, lr=1e-2),
            lr=1e-3)
    )
def test_adam(self):
    self._test_rosenbrock(
        lambda params: optim.Adam(params, lr=1e-2),
        wrap_old_fn(old_optim.adam, learningRate=1e-2)
    )
    self._test_rosenbrock(
        lambda params: optim.Adam(params, lr=1e-2, weight_decay=1e-2),
        wrap_old_fn(old_optim.adam, learningRate=1e-2, weightDecay=1e-2)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Adam([weight, bias], lr=1e-3)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Adam(
            self._build_params_dict(weight, bias, lr=1e-2),
            lr=1e-3)
    )
def test_adadelta(self):
    self._test_rosenbrock(
        lambda params: optim.Adadelta(params),
        wrap_old_fn(old_optim.adadelta)
    )
    self._test_rosenbrock(
        lambda params: optim.Adadelta(params, rho=0.95),
        wrap_old_fn(old_optim.adadelta, rho=0.95)
    )
    self._test_rosenbrock(
        lambda params: optim.Adadelta(params, weight_decay=1e-2),
        wrap_old_fn(old_optim.adadelta, weightDecay=1e-2)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Adadelta([weight, bias])
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Adadelta(
            self._build_params_dict(weight, bias, rho=0.95))
    )
def test_adagrad(self):
    self._test_rosenbrock(
        lambda params: optim.Adagrad(params, lr=1e-1),
        wrap_old_fn(old_optim.adagrad, learningRate=1e-1)
    )
    self._test_rosenbrock(
        lambda params: optim.Adagrad(params, lr=1e-1, lr_decay=1e-3),
        wrap_old_fn(old_optim.adagrad, learningRate=1e-1, learningRateDecay=1e-3)
    )
    self._test_rosenbrock(
        lambda params: optim.Adagrad(params, lr=1e-1, weight_decay=1e-2),
        wrap_old_fn(old_optim.adagrad, learningRate=1e-1, weightDecay=1e-2)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-1)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Adagrad(
            self._build_params_dict(weight, bias, lr=1e-2),
            lr=1e-1)
    )
def test_adamax(self):
    self._test_rosenbrock(
        lambda params: optim.Adamax(params, lr=1e-1),
        wrap_old_fn(old_optim.adamax, learningRate=1e-1)
    )
    self._test_rosenbrock(
        lambda params: optim.Adamax(params, lr=1e-1, weight_decay=1e-2),
        wrap_old_fn(old_optim.adamax, learningRate=1e-1, weightDecay=1e-2)
    )
    self._test_rosenbrock(
        lambda params: optim.Adamax(params, lr=1e-1, betas=(0.95, 0.998)),
        wrap_old_fn(old_optim.adamax, learningRate=1e-1, beta1=0.95, beta2=0.998)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-1)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Adagrad(
            self._build_params_dict(weight, bias, lr=1e-2),
            lr=1e-1)
    )
def test_asgd(self):
    self._test_rosenbrock(
        lambda params: optim.ASGD(params, lr=1e-3),
        wrap_old_fn(old_optim.asgd, eta0=1e-3)
    )
    self._test_rosenbrock(
        lambda params: optim.ASGD(params, lr=1e-3, alpha=0.8),
        wrap_old_fn(old_optim.asgd, eta0=1e-3, alpha=0.8)
    )
    self._test_rosenbrock(
        lambda params: optim.ASGD(params, lr=1e-3, t0=1e3),
        wrap_old_fn(old_optim.asgd, eta0=1e-3, t0=1e3)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.ASGD([weight, bias], lr=1e-3, t0=100)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.ASGD(
            self._build_params_dict(weight, bias, lr=1e-2),
            lr=1e-3, t0=100)
    )
def test_rprop(self):
    self._test_rosenbrock(
        lambda params: optim.Rprop(params, lr=1e-3),
        wrap_old_fn(old_optim.rprop, stepsize=1e-3)
    )
    self._test_rosenbrock(
        lambda params: optim.Rprop(params, lr=1e-3, etas=(0.6, 1.1)),
        wrap_old_fn(old_optim.rprop, stepsize=1e-3, etaminus=0.6, etaplus=1.1)
    )
    self._test_rosenbrock(
        lambda params: optim.Rprop(params, lr=1e-3, step_sizes=(1e-4, 3)),
        wrap_old_fn(old_optim.rprop, stepsize=1e-3, stepsizemin=1e-4, stepsizemax=3)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Rprop([weight, bias], lr=1e-3)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Rprop(
            self._build_params_dict(weight, bias, lr=1e-2),
            lr=1e-3)
    )
def __init__(self):
    # initialize networks for CycleGAN
    self.netG_A = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size)
    self.netG_B = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size)
    self.netD_A = Discriminator(input_size=d_input_size, hidden_size=d_hidden_size, output_size=d_output_size)
    self.netD_B = Discriminator(input_size=d_input_size, hidden_size=d_hidden_size, output_size=d_output_size)
    print('---------- Networks initialized -------------')

    # initialize loss functions
    self.criterionGAN = GANLoss()
    self.criterionCycle = torch.nn.L1Loss()

    # initialize optimizers
    self.optimizer_G = torch.optim.Adam(
        itertools.chain(self.netG_A.parameters(), self.netG_B.parameters()),
        lr=d_learning_rate, betas=optim_betas)
    self.optimizer_D_A = torch.optim.Adam(self.netD_A.parameters(), lr=d_learning_rate, betas=optim_betas)
    self.optimizer_D_B = torch.optim.Adam(self.netD_B.parameters(), lr=d_learning_rate, betas=optim_betas)
def __init__(self):
    # initialize networks for CycleGAN
    self.netG_A = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size)
    # self.netG_A = torch.nn.DataParallel(self.netG_A)
    self.netG_B = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size)
    # self.netG_B = torch.nn.DataParallel(self.netG_B)
    self.netD_A = Discriminator(input_size=d_input_size, hidden_size=d_hidden_size, output_size=d_output_size)
    # self.netD_A = torch.nn.DataParallel(self.netD_A)
    self.netD_B = Discriminator(input_size=d_input_size, hidden_size=d_hidden_size, output_size=d_output_size)
    # self.netD_B = torch.nn.DataParallel(self.netD_B)
    print('---------- Networks initialized -------------')

    # initialize loss functions
    self.criterionGAN = GANLoss()
    self.criterionCycle = torch.nn.L1Loss()

    # initialize optimizers
    self.optimizer_G = torch.optim.Adam(
        itertools.chain(self.netG_A.parameters(), self.netG_B.parameters()),
        lr=d_learning_rate, betas=optim_betas)
    self.optimizer_D_A = torch.optim.Adam(self.netD_A.parameters(), lr=d_learning_rate, betas=optim_betas, weight_decay=l2)
    self.optimizer_D_B = torch.optim.Adam(self.netD_B.parameters(), lr=d_learning_rate, betas=optim_betas, weight_decay=l2)
def __init__(self, args, attr_size, node_size):
    super(TreeLM, self).__init__()
    self.batch_size = args.batch_size
    self.seq_length = args.seq_length
    self.attr_size = attr_size
    self.node_size = node_size
    self.embedding_dim = args.embedding_dim
    self.layer_num = args.layer_num
    self.dropout_prob = args.dropout_prob
    self.lr = args.lr

    self.attr_embedding = nn.Embedding(self.attr_size, self.embedding_dim)
    self.dropout = nn.Dropout(self.dropout_prob)
    self.lstm = nn.LSTM(input_size=self.embedding_dim,
                        hidden_size=self.embedding_dim,
                        num_layers=self.layer_num,
                        dropout=self.dropout_prob)
    self.fc = nn.Linear(self.embedding_dim, self.node_size)
    self.optimizer = optim.Adam(self.parameters(), lr=self.lr)
    # self.node_mapping = node_mapping
def init_optimizer(self, state_dict=None):
    """Initialize an optimizer for the free parameters of the network.

    Args:
        state_dict: network parameters
    """
    if self.args.fix_embeddings:
        for p in self.network.embedding.parameters():
            p.requires_grad = False
    parameters = [p for p in self.network.parameters() if p.requires_grad]
    if self.args.optimizer == 'sgd':
        self.optimizer = optim.SGD(parameters, self.args.learning_rate,
                                   momentum=self.args.momentum,
                                   weight_decay=self.args.weight_decay)
    elif self.args.optimizer == 'adamax':
        self.optimizer = optim.Adamax(parameters,
                                      weight_decay=self.args.weight_decay)
    else:
        raise RuntimeError('Unsupported optimizer: %s' % self.args.optimizer)

# --------------------------------------------------------------------------
# Learning
# --------------------------------------------------------------------------
def update_lr(self):
    # Loop over all modules
    for m in self.modules():
        # If a module is active:
        if hasattr(m, 'active') and m.active:
            # If we've passed this layer's freezing point, deactivate it.
            if self.j > m.max_j:
                m.active = False
                # Also make sure we remove this layer's group from the optimizer.
                for i, group in enumerate(self.optim.param_groups):
                    if group['layer_index'] == m.layer_index:
                        self.optim.param_groups.remove(group)
            # If not, update the LR following a cosine schedule.
            else:
                for i, group in enumerate(self.optim.param_groups):
                    if group['layer_index'] == m.layer_index:
                        self.optim.param_groups[i]['lr'] = (0.05 / m.lr_ratio) * (1 + np.cos(np.pi * self.j / m.max_j)) \
                            if self.scale_lr else 0.05 * (1 + np.cos(np.pi * self.j / m.max_j))
    self.j += 1
def run_rmse_net(model, variables, X_train, Y_train):
    opt = optim.Adam(model.parameters(), lr=1e-3)

    for i in range(1000):
        opt.zero_grad()
        model.train()
        train_loss = nn.MSELoss()(
            model(variables['X_train_'])[0], variables['Y_train_'])
        train_loss.backward()
        opt.step()

        model.eval()
        test_loss = nn.MSELoss()(
            model(variables['X_test_'])[0], variables['Y_test_'])

        print(i, train_loss.data[0], test_loss.data[0])

    model.eval()
    model.set_sig(variables['X_train_'], variables['Y_train_'])

    return model

# TODO: minibatching
def get_optimizer(model, exp_name):
    '''Create an optimizer based on parameters loaded from config.'''
    cfg = config.load_config_file(exp_name)
    optimizer_name = cfg['optimizer']
    optimizer_method = getattr(torch.optim, optimizer_name)
    optimizer = optimizer_method(
        model.parameters(),
        lr=cfg['learning_rate'],
        momentum=cfg['momentum'],
        weight_decay=cfg['weight_decay']
    )
    return optimizer
def get_optimizer(args, params):
    if args.dataset == 'mnist':
        if args.model == 'optnet-eq':
            params = list(params)
            A_param = params.pop(0)
            assert(A_param.size() == (args.neq, args.nHidden))
            optimizer = optim.Adam([
                {'params': params, 'lr': 1e-3},
                {'params': [A_param], 'lr': 1e-1}
            ])
        else:
            optimizer = optim.Adam(params)
    elif args.dataset in ('cifar-10', 'cifar-100'):
        if args.opt == 'sgd':
            optimizer = optim.SGD(params, lr=1e-1, momentum=0.9, weight_decay=args.weightDecay)
        elif args.opt == 'adam':
            optimizer = optim.Adam(params, weight_decay=args.weightDecay)
    else:
        assert(False)
    return optimizer
def test_sgd(self):
    self._test_rosenbrock(
        lambda params: optim.SGD(params, lr=1e-3),
        wrap_old_fn(old_optim.sgd, learningRate=1e-3)
    )
    self._test_rosenbrock(
        lambda params: optim.SGD(params, lr=1e-3, momentum=0.9,
                                 dampening=0, weight_decay=1e-4),
        wrap_old_fn(old_optim.sgd, learningRate=1e-3, momentum=0.9,
                    dampening=0, weightDecay=1e-4)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.SGD([weight, bias], lr=1e-3)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.SGD(
            self._build_params_dict(weight, bias, lr=1e-2),
            lr=1e-3)
    )
def test_rmsprop(self):
    self._test_rosenbrock(
        lambda params: optim.RMSprop(params, lr=1e-2),
        wrap_old_fn(old_optim.rmsprop, learningRate=1e-2)
    )
    self._test_rosenbrock(
        lambda params: optim.RMSprop(params, lr=1e-2, weight_decay=1e-2),
        wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, weightDecay=1e-2)
    )
    self._test_rosenbrock(
        lambda params: optim.RMSprop(params, lr=1e-2, alpha=0.95),
        wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, alpha=0.95)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-2)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Adagrad(
            self._build_params_dict(weight, bias, lr=1e-3),
            lr=1e-2)
    )
def train(rank, args, model):
    torch.manual_seed(args.seed + rank)

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, num_workers=1)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, num_workers=1)

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    for epoch in range(1, args.epochs + 1):
        train_epoch(epoch, args, model, train_loader, optimizer)
        test_epoch(model, test_loader)
def initialize(is_gpu, dir_data, di_set_transform, ext_img, n_img_per_batch, n_worker):
    trainloader, testloader, li_class = make_dataloader_custom_file(
        dir_data, di_set_transform, ext_img, n_img_per_batch, n_worker)
    # net = Net().cuda()
    net = Net_gap()
    # t1 = net.cuda()
    criterion = nn.CrossEntropyLoss()
    if is_gpu:
        net.cuda()
        criterion.cuda()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    # set up scheduler
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=1, patience=8,
                                  epsilon=0.00001, min_lr=0.000001)
    return trainloader, testloader, net, criterion, optimizer, scheduler, li_class
def __init__(self,
             action_space,
             observation_space,
             batch_size=128,
             learning_rate=1e-3,
             discount=1.0,
             epsilon=0.05):
    if not isinstance(action_space, spaces.Discrete):
        raise TypeError("Action space type should be Discrete.")

    self._action_space = action_space
    self._batch_size = batch_size
    self._discount = discount
    self._epsilon = epsilon

    self._q_network = ConvNet(
        num_channel_input=observation_space.shape[0],
        num_output=action_space.n)
    self._optimizer = optim.RMSprop(
        self._q_network.parameters(), lr=learning_rate)
    self._memory = ReplayMemory(100000)
def __init__(self,
             action_space,
             observation_space,
             batch_size=128,
             learning_rate=1e-3,
             discount=1.0,
             epsilon=0.05):
    if not isinstance(action_space, spaces.Discrete):
        raise TypeError("Action space type should be Discrete.")

    self._action_space = action_space
    self._batch_size = batch_size
    self._discount = discount
    self._epsilon = epsilon

    self._q_network = FCNet(
        input_size=reduce(lambda x, y: x * y, observation_space.shape),
        output_size=action_space.n)
    self._optimizer = optim.RMSprop(
        self._q_network.parameters(), lr=learning_rate)
    self._memory = ReplayMemory(100000)
def train_epoch(self, X, y, show_bar=True):
    optimizer = optim.Adam(self.parameters())
    if show_bar:
        bar = Progbar(len(X))
    for ix, (elem, tags) in enumerate(zip(X, y)):
        self.zero_grad()
        sentence, feature_vector, sentence_markers = self.get_sentence_feature_vector(elem)
        if self.GPU:
            targets = torch.LongTensor(tags).cuda()
        else:
            targets = torch.LongTensor(tags)
        neg_log_likelihood = self.neg_log_likelihood(sentence, feature_vector, targets)
        neg_log_likelihood.backward()
        optimizer.step()
        if show_bar:
            bar.update(ix + 1)
    if show_bar:
        print ''
        sys.stdout.flush()
def test_sgd(self):
    self._test_rosenbrock(
        lambda params: optim.SGD(params, lr=1e-3),
        wrap_old_fn(old_optim.sgd, learningRate=1e-3)
    )
    self._test_rosenbrock(
        lambda params: optim.SGD(params, lr=1e-3, momentum=0.9,
                                 dampening=0, weight_decay=1e-4),
        wrap_old_fn(old_optim.sgd, learningRate=1e-3, momentum=0.9,
                    dampening=0, weightDecay=1e-4)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.SGD([weight, bias], lr=1e-3)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.SGD(
            self._build_params_dict(weight, bias, lr=1e-2),
            lr=1e-3)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.SGD(
            self._build_params_dict_single(weight, bias, lr=1e-2),
            lr=1e-3)
    )
def fit(self, observations, labels):
    def closure():
        predicted = self.predict(observations)
        loss = self.loss_fn(predicted, labels)
        self.optimizer.zero_grad()
        loss.backward()
        return loss

    old_params = parameters_to_vector(self.model.parameters())
    for lr in self.lr * .5**np.arange(10):
        self.optimizer = optim.LBFGS(self.model.parameters(), lr=lr)
        self.optimizer.step(closure)
        current_params = parameters_to_vector(self.model.parameters())
        if any(np.isnan(current_params.data.cpu().numpy())):
            print("LBFGS optimization diverged. Rolling back update...")
            vector_to_parameters(old_params, self.model.parameters())
        else:
            return
def __init__(self, model, **kwargs):
    super(Seq2SeqTrainerPyTorch, self).__init__()
    self.steps = 0
    self.gpu = bool(kwargs.get('gpu', True))
    optim = kwargs.get('optim', 'adam')
    eta = float(kwargs.get('eta', 0.01))
    mom = float(kwargs.get('mom', 0.9))
    self.clip = float(kwargs.get('clip', 5))
    if optim == 'adadelta':
        self.optimizer = torch.optim.Adadelta(model.parameters(), lr=eta)
    elif optim == 'adam':
        self.optimizer = torch.optim.Adam(model.parameters(), lr=eta)
    elif optim == 'rmsprop':
        self.optimizer = torch.optim.RMSprop(model.parameters(), lr=eta)
    else:
        self.optimizer = torch.optim.SGD(model.parameters(), lr=eta, momentum=mom)
    self.model = model
    self._input = model.make_input
    self.crit = model.create_loss()
    if self.gpu:
        self.model = torch.nn.DataParallel(model).cuda()
        self.crit.cuda()
def update_lr(self, max_j):
    for param_group in self.optim.param_groups:
        param_group['lr'] = (0.5 * self.lr) * (1 + np.cos(np.pi * self.j / max_j))

    # Optionally anneal the width settings throughout training.
    # self.min_width = 0.25 + 0.25 * min(self.j / (max_j * 0.5), 1.0)
    # self.max_width = 0.50 + 0.50 * min(self.j / (max_j * 0.5), 1.0)
    # self.max_paths = [min(float(self.j) / (max_j * 0.5), 1.0)] * 3
    # self.min_budget = 0.25 + 0.25 * min(self.j / (max_j * 0.5), 1.0)
    self.max_budget = 0.50 + 0.50 * min(self.j / (max_j * 0.5), 1.0)

    # Anneal kernel sizes towards the max kernel size.
    self.max_kernel = 3 + int(((self.final_max_kernel - 3) // 2) * min(self.j / (max_j * 0.5), 1.0) * 2)

    self.j += 1
def __init__(self):
    super(Generator, self).__init__()
    self.main = nn.Sequential(
        nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
        nn.BatchNorm2d(ngf * 8),
        nn.ReLU(True),
        nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
        nn.BatchNorm2d(ngf * 4),
        nn.ReLU(True),
        nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
        nn.BatchNorm2d(ngf * 2),
        nn.ReLU(True),
        nn.ConvTranspose2d(ngf * 2, ngf * 1, 4, 2, 1, bias=False),
        nn.BatchNorm2d(ngf * 1),
        nn.ReLU(True),
        nn.ConvTranspose2d(ngf * 1, nc, 4, 2, 1, bias=False),
        nn.Tanh()
    )
    self.apply(weights_init)
    self.optimizer = optim.Adam(self.parameters(), lr=learning_rate, betas=(beta_1, beta_2))
    # self.optimizer = optim.RMSprop(self.parameters(), lr=learning_rate, alpha=beta_2)
def __init__(self):
    super(Discriminator, self).__init__()
    self.main = nn.Sequential(
        nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
        nn.BatchNorm2d(ndf * 2),
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
        nn.BatchNorm2d(ndf * 4),
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
        nn.BatchNorm2d(ndf * 8),
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
        nn.Sigmoid()
    )
    self.apply(weights_init)
    self.optimizer = optim.Adam(self.parameters(), lr=learning_rate, betas=(beta_1, beta_2))
    # self.optimizer = optim.RMSprop(self.parameters(), lr=learning_rate, alpha=beta_2)