package nn

import (
	"fmt"
	"log"
	"math"
)

type SchedulerOptions struct {
	Loss      float64 // Metric value monitored by the scheduler; typically the loss.
	LastEpoch int
}

type SchedulerOption func(*SchedulerOptions)

func defaultSchedulerOptions() *SchedulerOptions {
	return &SchedulerOptions{
		Loss:      math.Inf(1),
		LastEpoch: -1,
	}
}

func DefaultSchedulerOptions() *SchedulerOptions {
	return defaultSchedulerOptions()
}

func WithLastEpoch(epoch int) SchedulerOption {
	return func(o *SchedulerOptions) {
		o.LastEpoch = epoch
	}
}

func WithLoss(loss float64) SchedulerOption {
	return func(o *SchedulerOptions) {
		o.Loss = loss
	}
}

type scheduler interface {
	SetLRs(opts ...SchedulerOption)
	Build() *LRScheduler
}
// LRScheduler is a scheduler to update optimizer learning rates.
type LRScheduler struct {
	scheduler scheduler
}

func NewLRScheduler(s scheduler) *LRScheduler {
	return &LRScheduler{s}
}

// Step updates the optimizer learning rates.
func (s *LRScheduler) Step(opts ...SchedulerOption) {
	s.scheduler.SetLRs(opts...)
}

// LambdaFn computes a multiplicative factor from an input (usually the epoch index).
type LambdaFn func(in interface{}) float64
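// A minimal usage sketch for the scheduler pattern (assumes an *Optimizer
// built elsewhere in this package, e.g. by an SGD or Adam constructor;
// `trainOneEpoch` and `model` are hypothetical placeholders):
//
//	scheduler := NewStepLR(opt, 30, 0.1).Build()
//	for epoch := 0; epoch < nepochs; epoch++ {
//	    trainOneEpoch(model) // hypothetical
//	    scheduler.Step()
//	}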
// LambdaLR calculates a new learning rate for each optimizer param group by
// applying the corresponding lambda function to the INITIAL learning rate.
type LambdaLR struct {
	opt        *Optimizer
	lrLambdas  []LambdaFn // length should be 1 or equal to length of optimizer param groups.
	initialLRs []float64
	stepCount  int
	lastEpoch  int
}

// NewLambdaLR creates a new LambdaLR.
func NewLambdaLR(opt *Optimizer, ldFns []LambdaFn) *LambdaLR {
	ngroup := opt.ParamGroupNum()
	initialLRs := opt.GetLRs()
	var funcs []LambdaFn = make([]LambdaFn, ngroup)
	switch len(ldFns) {
	case 1:
		// Apply the single lambda function to all param groups.
		for i := 0; i < ngroup; i++ {
			funcs[i] = ldFns[0]
		}
	case ngroup:
		funcs = ldFns
	default:
		log.Fatalf("Number of lambda functions (%d) is not equal to number of optimizer groups (%d)", len(ldFns), ngroup)
	}

	return &LambdaLR{
		opt:        opt,
		lrLambdas:  funcs, // use the expanded slice so a single lambda covers all groups
		initialLRs: initialLRs,
		stepCount:  0,
		lastEpoch:  -1,
	}
}

// Build implements scheduler interface.
func (l *LambdaLR) Build() *LRScheduler {
	s := &LRScheduler{l}
	s.Step()
	return s
}

// SetLRs implements scheduler interface.
func (l *LambdaLR) SetLRs(opts ...SchedulerOption) {
	options := defaultSchedulerOptions()
	for _, o := range opts {
		o(options)
	}
	switch options.LastEpoch {
	case -1:
		l.lastEpoch += 1
	default:
		l.lastEpoch = options.LastEpoch
	}

	var newLRs []float64
	switch l.lastEpoch {
	case 0:
		newLRs = l.initialLRs
	default:
		for i, lr := range l.initialLRs {
			lambda := l.lrLambdas[i](l.lastEpoch)
			newLR := lr * lambda
			newLRs = append(newLRs, newLR)
		}
	}

	l.opt.SetLRs(newLRs)
	l.stepCount += 1
}
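// An example lambda (a sketch): decay the initial LR by a factor that shrinks
// with the epoch index. The type assertion matches how SetLRs invokes the
// lambda with an int epoch.
//
//	decay := func(in interface{}) float64 {
//	    epoch := in.(int)
//	    return 1.0 / float64(1+epoch)
//	}
//	sched := NewLambdaLR(opt, []LambdaFn{decay}).Build()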
// MultiplicativeLR calculates new learning rates for each optimizer param group
// by applying the corresponding lambda function to the CURRENT learning rate.
type MultiplicativeLR struct {
	opt        *Optimizer
	lrLambdas  []LambdaFn // length should be 1 or equal to length of optimizer param groups.
	initialLRs []float64
	stepCount  int
	lastEpoch  int
}

// NewMultiplicativeLR creates a new MultiplicativeLR.
func NewMultiplicativeLR(opt *Optimizer, ldFns []LambdaFn) *MultiplicativeLR {
	ngroup := opt.ParamGroupNum()
	initialLRs := opt.GetLRs()

	var funcs []LambdaFn = make([]LambdaFn, ngroup)
	switch len(ldFns) {
	case 1:
		// Apply the single lambda function to all param groups.
		for i := 0; i < ngroup; i++ {
			funcs[i] = ldFns[0]
		}
	case ngroup:
		funcs = ldFns
	default:
		log.Fatalf("Number of lambda functions (%d) is not equal to number of optimizer groups (%d)", len(ldFns), ngroup)
	}
	return &MultiplicativeLR{
		opt:        opt,
		lrLambdas:  funcs, // use the expanded slice so a single lambda covers all groups
		initialLRs: initialLRs,
		stepCount:  0,
		lastEpoch:  -1,
	}
}

// Build implements scheduler interface.
func (m *MultiplicativeLR) Build() *LRScheduler {
	s := &LRScheduler{m}
	s.Step()
	return s
}

// SetLRs implements scheduler interface.
func (m *MultiplicativeLR) SetLRs(opts ...SchedulerOption) {
	options := defaultSchedulerOptions()
	for _, o := range opts {
		o(options)
	}
	switch options.LastEpoch {
	case -1:
		m.lastEpoch += 1
	default:
		m.lastEpoch = options.LastEpoch
	}

	var newLRs []float64
	lrs, err := m.opt.opt.GetLearningRates()
	if err != nil {
		log.Fatal(err)
	}

	switch m.lastEpoch {
	case 0:
		newLRs = m.initialLRs
	default:
		for i, lr := range lrs {
			lambda := m.lrLambdas[i](m.lastEpoch)
			newLR := lr * lambda
			newLRs = append(newLRs, newLR)
		}
	}

	m.opt.SetLRs(newLRs)
}
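// A sketch of a constant multiplicative decay: because the lambda is applied
// to the CURRENT rate, returning 0.95 every epoch compounds to
// lr_t = lr_0 * 0.95^t.
//
//	sched := NewMultiplicativeLR(opt, []LambdaFn{func(in interface{}) float64 {
//	    return 0.95
//	}}).Build()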
// StepLR decays the learning rates of each optimizer param group by gamma every
// stepSize epochs.
//
// NOTE. Such decay can happen simultaneously with other changes to the learning rate
// from outside this scheduler.
type StepLR struct {
	opt        *Optimizer
	stepSize   int
	gamma      float64
	initialLRs []float64
	stepCount  int
	lastEpoch  int
}

// NewStepLR creates a new StepLR.
func NewStepLR(opt *Optimizer, stepSize int, gamma float64) *StepLR {
	initialLRs := opt.GetLRs()
	return &StepLR{
		opt:        opt,
		stepSize:   stepSize,
		gamma:      gamma,
		initialLRs: initialLRs,
		stepCount:  0,
		lastEpoch:  -1,
	}
}

// Build implements scheduler interface.
func (s *StepLR) Build() *LRScheduler {
	sc := &LRScheduler{s}
	sc.Step()
	return sc
}

// SetLRs implements scheduler interface.
func (s *StepLR) SetLRs(opts ...SchedulerOption) {
	options := defaultSchedulerOptions()
	for _, o := range opts {
		o(options)
	}
	switch options.LastEpoch {
	case -1:
		s.lastEpoch += 1
	default:
		s.lastEpoch = options.LastEpoch
	}

	var newLRs []float64
	lrs, err := s.opt.opt.GetLearningRates()
	if err != nil {
		log.Fatal(err)
	}

	switch {
	case s.lastEpoch == 0, s.lastEpoch%s.stepSize != 0:
		newLRs = lrs
	default:
		for _, lr := range lrs {
			newLR := floatRound(lr*s.gamma, 10)
			newLRs = append(newLRs, newLR)
		}
	}

	s.opt.SetLRs(newLRs)
}
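// For illustration (assuming a base LR of 0.05), NewStepLR(opt, 30, 0.1)
// yields the usual staircase schedule:
//
//	lr = 0.05    for epoch < 30
//	lr = 0.005   for 30 <= epoch < 60
//	lr = 0.0005  for 60 <= epoch < 90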
// floatRound rounds a float64 value to a specified decimal precision.
// Modified from: https://stackoverflow.com/questions/18390266
func floatRound(input float64, precision int) float64 {
	roundFactor := math.Pow(10, float64(precision))
	up := input * roundFactor
	round := int(up + math.Copysign(0.5, up))

	return float64(round) / roundFactor
}
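// Rounding here keeps repeated decays from accumulating binary floating-point
// noise; e.g. 0.1*0.1 evaluates to 0.010000000000000002 in float64, while
// floatRound(0.1*0.1, 10) returns 0.01.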
// MultiStepLR decays the learning rates of each optimizer param group by gamma once
// the number of epochs reaches one of the milestones.
//
// NOTE. Such decay can happen simultaneously with other changes to the learning rate
// from outside this scheduler.
type MultiStepLR struct {
	opt        *Optimizer
	milestones []int
	gamma      float64
	initialLRs []float64
	stepCount  int
	lastEpoch  int
}

// NewMultiStepLR creates a new MultiStepLR.
func NewMultiStepLR(opt *Optimizer, milestones []int, gamma float64) *MultiStepLR {
	initialLRs := opt.GetLRs()
	return &MultiStepLR{
		opt:        opt,
		milestones: milestones,
		gamma:      gamma,
		initialLRs: initialLRs,
		stepCount:  0,
		lastEpoch:  -1,
	}
}

// Build implements scheduler interface.
func (ms *MultiStepLR) Build() *LRScheduler {
	s := &LRScheduler{ms}
	s.Step()
	return s
}

// SetLRs implements scheduler interface.
func (ms *MultiStepLR) SetLRs(opts ...SchedulerOption) {
	options := defaultSchedulerOptions()
	for _, o := range opts {
		o(options)
	}
	switch options.LastEpoch {
	case -1:
		ms.lastEpoch += 1
	default:
		ms.lastEpoch = options.LastEpoch
	}

	var newLRs []float64
	lrs, err := ms.opt.opt.GetLearningRates()
	if err != nil {
		log.Fatal(err)
	}

	switch {
	case !contain(ms.lastEpoch, ms.milestones):
		newLRs = lrs
	default:
		for _, lr := range lrs {
			newLR := floatRound(lr*ms.gamma, 10)
			newLRs = append(newLRs, newLR)
		}
	}

	ms.opt.SetLRs(newLRs)
}
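// A usage sketch (assuming a base LR of 0.05): the rate is multiplied by gamma
// exactly at the milestone epochs, i.e. 0.05 -> 0.005 at epoch 30 -> 0.0005 at 80.
//
//	sched := NewMultiStepLR(opt, []int{30, 80}, 0.1).Build()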
func contain(item int, list []int) bool {
	for _, i := range list {
		if i == item {
			return true
		}
	}

	return false
}
// ExponentialLR decays the learning rates of each optimizer param group by gamma
// every epoch.
type ExponentialLR struct {
	opt        *Optimizer
	gamma      float64
	initialLRs []float64
	stepCount  int
	lastEpoch  int
}

// NewExponentialLR creates a new ExponentialLR.
func NewExponentialLR(opt *Optimizer, gamma float64) *ExponentialLR {
	initialLRs := opt.GetLRs()
	return &ExponentialLR{
		opt:        opt,
		gamma:      gamma,
		initialLRs: initialLRs,
		stepCount:  0,
		lastEpoch:  -1,
	}
}

// Build implements scheduler interface.
func (e *ExponentialLR) Build() *LRScheduler {
	s := &LRScheduler{e}
	s.Step()
	return s
}

// SetLRs implements scheduler interface.
func (e *ExponentialLR) SetLRs(opts ...SchedulerOption) {
	options := defaultSchedulerOptions()
	for _, o := range opts {
		o(options)
	}
	switch options.LastEpoch {
	case -1:
		e.lastEpoch += 1
	default:
		e.lastEpoch = options.LastEpoch
	}

	var newLRs []float64
	lrs, err := e.opt.opt.GetLearningRates()
	if err != nil {
		log.Fatal(err)
	}

	switch {
	case e.lastEpoch == 0:
		newLRs = lrs
	default:
		for _, lr := range lrs {
			newLR := floatRound(lr*e.gamma, 10)
			newLRs = append(newLRs, newLR)
		}
	}

	e.opt.SetLRs(newLRs)
}
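// Because the current rate is multiplied by gamma once per step, the schedule
// follows the closed form lr_t = lr_0 * gamma^t (up to the rounding above).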
// CosineAnnealingLR sets the learning rates of each optimizer param group by using
// a cosine annealing schedule where eta max is set to the initial learning rate and Tcur
// is the number of epochs since the last restart in SGDR (Stochastic Gradient Descent with Warm Restarts).
//
// NOTE. This implements only the cosine annealing part of SGDR, not the restarts.
// Ref.
// - https://pytorch.org/docs/stable/optim.html#torch.optim.lr_scheduler.CosineAnnealingLR
// - https://arxiv.org/abs/1608.03983
type CosineAnnealingLR struct {
	opt        *Optimizer
	tmax       int     // Maximum number of iterations
	etaMin     float64 // Minimum learning rate. Default = 0
	initialLRs []float64
	stepCount  int
	lastEpoch  int
}

// NewCosineAnnealingLR creates a new CosineAnnealingLR.
func NewCosineAnnealingLR(opt *Optimizer, tmax int, etaMin float64) *CosineAnnealingLR {
	opt.ResetStepCount()
	initialLRs := opt.GetLRs()
	return &CosineAnnealingLR{
		opt:        opt,
		tmax:       tmax,
		etaMin:     etaMin,
		initialLRs: initialLRs,
		stepCount:  0,
		lastEpoch:  -1,
	}
}

// Build implements scheduler interface.
func (ca *CosineAnnealingLR) Build() *LRScheduler {
	s := &LRScheduler{ca}
	s.Step()
	return s
}

// SetLRs implements scheduler interface.
func (ca *CosineAnnealingLR) SetLRs(opts ...SchedulerOption) {
	options := defaultSchedulerOptions()
	for _, o := range opts {
		o(options)
	}
	switch options.LastEpoch {
	case -1:
		ca.lastEpoch += 1
	default:
		ca.lastEpoch = options.LastEpoch
	}

	var newLRs []float64
	lrs, err := ca.opt.opt.GetLearningRates()
	if err != nil {
		log.Fatal(err)
	}

	switch {
	case ca.lastEpoch == 0:
		newLRs = ca.initialLRs
	case (ca.lastEpoch-1-ca.tmax)%(2*ca.tmax) == 0:
		for i, lr := range lrs {
			// group['lr'] + (base_lr - self.eta_min) * (1 - math.cos(math.pi / self.T_max)) / 2
			newLR := lr + (ca.initialLRs[i]-ca.etaMin)*(1-math.Cos(math.Pi/float64(ca.tmax)))/2
			newLRs = append(newLRs, newLR)
		}
	default:
		for _, lr := range lrs {
			// (1 + math.cos(math.pi * self.last_epoch / self.T_max))
			dividend := 1 + math.Cos(math.Pi*float64(ca.lastEpoch)/float64(ca.tmax))

			// (1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max))
			divisor := (1 + math.Cos(math.Pi*(float64(ca.lastEpoch-1)/float64(ca.tmax))))
			// new_lr = (dividend / divisor) * (group['lr'] - self.eta_min) + self.eta_min
			newLR := (dividend/divisor)*(lr-ca.etaMin) + ca.etaMin
			newLRs = append(newLRs, newLR)
		}
	}

	ca.opt.SetLRs(newLRs)
	ca.stepCount += 1
}
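// For reference, the recursion above reproduces the closed-form SGDR schedule
//
//	eta_t = etaMin + (eta_max - etaMin) * (1 + cos(pi * t / tmax)) / 2
//
// in its chainable form, so it composes with outside changes to the rate.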
// ReduceLROnPlateau reduces learning rates when a metric has stopped improving.
// Models often benefit from reducing the learning rate by a factor
// of 2-10 once learning stagnates. This scheduler reads a metric
// quantity and, if no improvement is seen for a 'patience' number
// of epochs, the learning rate is reduced.
type ReduceLROnPlateau struct {
	opt *Optimizer

	// One of `min` or `max`. In `min` mode, lr will be reduced
	// when the quantity monitored has stopped DECREASING. In `max`
	// mode, it will be reduced when the quantity monitored has stopped
	// INCREASING. Default = "min"
	mode string

	// Factor by which the learning rate will be reduced (new LR = lr * factor).
	// Default = 0.1
	factor float64

	// Number of epochs with no improvement after which learning rate
	// will be reduced. E.g., if patience = 2, then we will ignore the first
	// 2 epochs with no improvement, and will only decrease the LR after the 3rd epoch
	// if the loss still hasn't improved then.
	// Default: 10
	patience int

	// If "true", prints a message to stdout for each update.
	// Default = false
	verbose bool

	// Threshold for measuring the new optimum, to only focus on
	// significant changes.
	// Default = 1e-4
	threshold float64

	// One of `rel`, `abs`
	// - `rel`: dynamicThreshold = best * (1 + threshold) in `max` mode
	//   or best * (1 - threshold) in `min` mode
	// - `abs`: dynamicThreshold = best + threshold in `max` mode or
	//   best - threshold in `min` mode.
	// Default = `rel`
	thresholdMode string

	// Number of epochs to wait before resuming normal operation after
	// LR has been reduced.
	// Default = 0
	cooldown int

	// Default = 0
	cooldownCounter int

	// A lower bound on the learning rate of all optimizer param groups.
	// If length = 1, it applies to all param groups; otherwise, it should
	// have the same length as optimizer param groups.
	// Default = []float64{0}
	minLRs []float64

	// Minimal decay applied to LR. If the difference between new and old LR
	// is smaller than eps, the update is ignored.
	// Default = 1e-8
	eps float64

	// Default = modeWorse (either inf or -inf)
	best float64

	// Default = 0
	numBadEpochs int

	// The worst value for the chosen mode.
	// Default = inf if mode="min" or -inf if mode="max"
	modeWorse float64

	// Default = 0
	lastEpoch int
}
type ReduceLROnPlateauOptions struct {
	Mode          string
	Factor        float64
	Patience      int
	Verbose       bool
	Threshold     float64
	ThresholdMode string
	MinLRs        []float64
	Cooldown      int
	Eps           float64
}

type ReduceLROnPlateauOption func(*ReduceLROnPlateauOptions)

func defaultReduceLROnPlateauOptions() *ReduceLROnPlateauOptions {
	return &ReduceLROnPlateauOptions{
		Mode:          "min",
		Factor:        0.1,
		Patience:      10,
		Verbose:       false,
		Threshold:     1e-4,
		ThresholdMode: "rel",
		Cooldown:      0,
		MinLRs:        []float64{0.0},
		Eps:           1e-8,
	}
}

func WithReduceOnPlateauMode(mode string) ReduceLROnPlateauOption {
	return func(o *ReduceLROnPlateauOptions) {
		o.Mode = mode
	}
}

func WithReduceOnPlateauFactor(factor float64) ReduceLROnPlateauOption {
	return func(o *ReduceLROnPlateauOptions) {
		o.Factor = factor
	}
}

func WithReduceOnPlateauPatience(patience int) ReduceLROnPlateauOption {
	return func(o *ReduceLROnPlateauOptions) {
		o.Patience = patience
	}
}

func WithReduceOnPlateauVerbose(verbose bool) ReduceLROnPlateauOption {
	return func(o *ReduceLROnPlateauOptions) {
		o.Verbose = verbose
	}
}

func WithReduceOnPlateauThreshold(threshold float64) ReduceLROnPlateauOption {
	return func(o *ReduceLROnPlateauOptions) {
		o.Threshold = threshold
	}
}

func WithReduceOnPlateauThresholdMode(thresholdMode string) ReduceLROnPlateauOption {
	return func(o *ReduceLROnPlateauOptions) {
		o.ThresholdMode = thresholdMode
	}
}

func WithReduceOnPlateauEps(eps float64) ReduceLROnPlateauOption {
	return func(o *ReduceLROnPlateauOptions) {
		o.Eps = eps
	}
}

func WithReduceOnPlateauMinLRs(minLRs []float64) ReduceLROnPlateauOption {
	return func(o *ReduceLROnPlateauOptions) {
		o.MinLRs = minLRs
	}
}

func WithReduceOnPlateauCooldown(cooldown int) ReduceLROnPlateauOption {
	return func(o *ReduceLROnPlateauOptions) {
		o.Cooldown = cooldown
	}
}
func NewReduceLROnPlateau(opt *Optimizer, opts ...ReduceLROnPlateauOption) *ReduceLROnPlateau {
	options := defaultReduceLROnPlateauOptions()
	for _, o := range opts {
		o(options)
	}

	// Validate input parameters
	if options.Mode != "min" && options.Mode != "max" {
		log.Fatalf("Invalid 'mode'. Mode should be either 'min' or 'max', got %v\n", options.Mode)
	}
	if options.Factor >= 1.0 {
		log.Fatalf("Factor should be < 1.0. Got %v\n", options.Factor)
	}

	if options.ThresholdMode != "rel" && options.ThresholdMode != "abs" {
		log.Fatalf("Invalid threshold mode. Should be 'rel' or 'abs'. Got %v\n", options.ThresholdMode)
	}

	var modeWorse float64
	switch options.Mode {
	case "min":
		modeWorse = math.Inf(1) // +inf
	case "max":
		modeWorse = math.Inf(-1) // -inf
	}

	ngroup := opt.ParamGroupNum()
	var minLRs []float64 = make([]float64, ngroup)
	switch len(options.MinLRs) {
	case 1:
		for i := 0; i < ngroup; i++ {
			minLRs[i] = options.MinLRs[0]
		}
	case ngroup:
		minLRs = options.MinLRs
	default:
		log.Fatalf("MinLRs should have length of 1 or the same length as optimizer param groups. Got %v\n", len(options.MinLRs))
	}

	return &ReduceLROnPlateau{
		opt:             opt,
		mode:            options.Mode,
		factor:          options.Factor,
		patience:        options.Patience,
		verbose:         options.Verbose,
		threshold:       options.Threshold,
		thresholdMode:   options.ThresholdMode,
		cooldown:        options.Cooldown,
		cooldownCounter: 0,
		minLRs:          minLRs,
		eps:             options.Eps,

		best:         modeWorse,
		numBadEpochs: 0,
		modeWorse:    modeWorse,
		lastEpoch:    0,
	}
}
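// A minimal usage sketch: feed the validation loss to Step via WithLoss so
// the scheduler can track improvement (`validate` and `model` are
// hypothetical placeholders).
//
//	sched := NewReduceLROnPlateau(opt, WithReduceOnPlateauPatience(5)).Build()
//	for epoch := 0; epoch < nepochs; epoch++ {
//	    loss := validate(model) // hypothetical
//	    sched.Step(WithLoss(loss))
//	}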
// reset resets the number-of-bad-epochs counter and the cooldown counter.
func (s *ReduceLROnPlateau) reset() {
	s.best = s.modeWorse
	s.cooldownCounter = 0
	s.numBadEpochs = 0
}

func (s *ReduceLROnPlateau) inCooldown() bool {
	return s.cooldownCounter > 0
}

// isBetter evaluates whether the metric (loss) is better than the current (best) value.
func (s *ReduceLROnPlateau) isBetter(a, best float64) bool {
	switch {
	case s.mode == "min" && s.thresholdMode == "rel":
		relEpsilon := 1.0 - s.threshold
		return a < best*relEpsilon

	case s.mode == "min" && s.thresholdMode == "abs":
		return a < best-s.threshold

	case s.mode == "max" && s.thresholdMode == "rel":
		relEpsilon := s.threshold + 1
		return a > best*relEpsilon
	default: // mode == "max" && thresholdMode == "abs"
		return a > best+s.threshold
	}
}

func (s *ReduceLROnPlateau) reduceLRs(epoch int) {
	oldLRs := s.opt.GetLRs()

	// Copy so the comparison below sees the unmodified old values.
	var newLRs []float64 = make([]float64, len(oldLRs))
	copy(newLRs, oldLRs)
	for i, oldLR := range oldLRs {
		newLR := floatMax(oldLR*s.factor, s.minLRs[i])
		if oldLR-newLR > s.eps {
			newLRs[i] = newLR
			if s.verbose {
				fmt.Printf("Epoch %06d: Reducing learning rate of param group %d to %0.4e\n", epoch, i, newLR)
			}
		}
	}

	s.opt.SetLRs(newLRs)
}
// SetLRs implements scheduler interface.
func (s *ReduceLROnPlateau) SetLRs(opts ...SchedulerOption) {
	options := defaultSchedulerOptions()
	for _, o := range opts {
		o(options)
	}

	switch options.LastEpoch {
	case -1:
		s.lastEpoch += 1
	default:
		s.lastEpoch = options.LastEpoch
	}

	switch s.isBetter(options.Loss, s.best) {
	case true:
		s.best = options.Loss
		s.numBadEpochs = 0
	case false:
		s.numBadEpochs += 1
	}

	if s.inCooldown() {
		s.cooldownCounter -= 1
		s.numBadEpochs = 0 // ignore any bad epochs in cooldown
	}

	if s.numBadEpochs > s.patience {
		s.reduceLRs(s.lastEpoch)
		s.cooldownCounter = s.cooldown
		s.numBadEpochs = 0
	}
}

// Build implements scheduler interface.
func (s *ReduceLROnPlateau) Build() *LRScheduler {
	return &LRScheduler{s}
}

func floatMax(v1, v2 float64) float64 {
	if v1 >= v2 {
		return v1
	}
	return v2
}
// CyclicLR sets the learning rate of each parameter group according to a
// cyclical learning rate policy (CLR). The policy cycles the learning
// rate between two boundaries with a constant frequency, as detailed in
// the paper `Cyclical Learning Rates for Training Neural Networks`.
// The distance between the two boundaries can be scaled on a per-iteration
// or per-cycle basis.
//
// Cyclical learning rate policy changes the learning rate after every batch.
// `Step()` should be called after a batch has been used for training.
// This type has three built-in policies, as put forth in the paper:
// - "triangular": A basic triangular cycle without amplitude scaling.
// - "triangular2": A basic triangular cycle that scales initial amplitude by half each cycle.
// - "exp_range": A cycle that scales initial amplitude by gamma^(cycle iterations)
//   at each cycle iteration.
//
// Source:
// - Cyclical Learning Rates for Training Neural Networks: https://arxiv.org/abs/1506.01186
// - bckenstler/CLR: https://github.com/bckenstler/CLR
type CyclicLR struct {
	// Wrapped optimizer.
	opt *Optimizer

	// Initial learning rates, which are the
	// lower boundaries in the cycle for each parameter group.
	initialLRs []float64

	// Upper learning rate boundaries in the cycle
	// for each parameter group. Functionally,
	// it defines the cycle amplitude (maxLR - baseLR).
	// The lr at any cycle is the sum of baseLR
	// and some scaling of the amplitude; therefore
	// maxLR may not actually be reached depending on the
	// scaling function.
	maxLRs []float64

	// Number of training iterations in the
	// increasing half of a cycle.
	// Default: 2000
	stepSizeUp int

	// Number of training iterations in the
	// decreasing half of a cycle. If stepSizeDown is -1,
	// it is set to stepSizeUp.
	// Default: -1
	stepSizeDown int

	// One of {triangular, triangular2, exp_range}.
	// Values correspond to the policies detailed above.
	// If scaleFn is not nil, this argument is ignored.
	// Default: 'triangular'
	mode string

	// Constant in the 'exp_range' scaling function:
	// gamma**(cycle iterations)
	// Default: 1.0
	gamma float64

	// Custom scaling policy defined by a single-argument
	// lambda function, where
	// 0 <= scaleFn(x) <= 1 for all x >= 0.
	// If specified, then 'mode' is ignored.
	// Default: nil
	scaleFn func(x float64) float64

	// One of {'cycle', 'iterations'}.
	// Defines whether scaleFn is evaluated on
	// cycle number or cycle iterations (training
	// iterations since start of cycle).
	// Default: 'cycle'
	scaleMode string

	// If `true`, momentum is cycled inversely
	// to learning rate between 'baseMomentum' and 'maxMomentum'.
	// Default: true
	cycleMomentum bool

	// Lower momentum boundaries in the cycle
	// for each parameter group. Note that momentum is cycled inversely
	// to learning rate; at the peak of a cycle, momentum is
	// 'baseMomentum' and learning rate is 'maxLR'.
	// Default: 0.8
	baseMomentums []float64

	// Upper momentum boundaries in the cycle
	// for each parameter group. Functionally,
	// it defines the cycle amplitude (maxMomentum - baseMomentum).
	// The momentum at any cycle is the difference of maxMomentum
	// and some scaling of the amplitude; therefore
	// baseMomentum may not actually be reached depending on the
	// scaling function. Note that momentum is cycled inversely
	// to learning rate; at the start of a cycle, momentum is 'maxMomentum'
	// and learning rate is 'baseLR'.
	// Default: 0.9
	maxMomentums []float64

	// The index of the last batch. This parameter is used when
	// resuming a training job. Since `Step()` should be invoked after each
	// batch instead of after each epoch, this number represents the total
	// number of *batches* computed, not the total number of epochs computed.
	// When lastEpoch=-1, the schedule is started from the beginning.
	// Default: -1
	lastEpoch int

	totalSize int
	stepRatio float64
}
type CyclicOptions struct {
	StepSizeUp    int                     // 2000
	StepSizeDown  int                     // -1
	Mode          string                  // "triangular"
	Gamma         float64                 // 1.0
	ScaleFn       func(x float64) float64 // nil
	ScaleMode     string                  // "cycle"
	CycleMomentum bool                    // true
	BaseMomentum  float64                 // 0.8
	MaxMomentum   float64                 // 0.9
	LastEpoch     int                     // -1
}

type CyclicOption func(*CyclicOptions)

func defaultCyclicOptions() *CyclicOptions {
	return &CyclicOptions{
		StepSizeUp:    2000,
		StepSizeDown:  -1,
		Mode:          "triangular",
		Gamma:         1.0,
		ScaleFn:       nil,
		ScaleMode:     "cycle",
		CycleMomentum: true,
		BaseMomentum:  0.8,
		MaxMomentum:   0.9,
		LastEpoch:     -1,
	}
}

func WithCyclicStepSizeUp(v int) CyclicOption {
	return func(o *CyclicOptions) {
		o.StepSizeUp = v
	}
}

func WithCyclicStepSizeDown(v int) CyclicOption {
	return func(o *CyclicOptions) {
		o.StepSizeDown = v
	}
}

func WithCyclicMode(v string) CyclicOption {
	return func(o *CyclicOptions) {
		o.Mode = v
	}
}

func WithCyclicGamma(v float64) CyclicOption {
	return func(o *CyclicOptions) {
		o.Gamma = v
	}
}

func WithCyclicScaleFn(v func(x float64) float64) CyclicOption {
	return func(o *CyclicOptions) {
		o.ScaleFn = v
	}
}

func WithCyclicScaleMode(v string) CyclicOption {
	return func(o *CyclicOptions) {
		o.ScaleMode = v
	}
}

func WithCyclicCycleMomentum(v bool) CyclicOption {
	return func(o *CyclicOptions) {
		o.CycleMomentum = v
	}
}

func WithCyclicBaseMomentum(v float64) CyclicOption {
	return func(o *CyclicOptions) {
		o.BaseMomentum = v
	}
}

func WithCyclicMaxMomentum(v float64) CyclicOption {
	return func(o *CyclicOptions) {
		o.MaxMomentum = v
	}
}

func WithCyclicLastEpoch(v int) CyclicOption {
	return func(o *CyclicOptions) {
		o.LastEpoch = v
	}
}
func NewCyclicLR(opt *Optimizer, baseLRs, maxLRs []float64, opts ...CyclicOption) *CyclicLR {
	options := defaultCyclicOptions()
	for _, o := range opts {
		o(options)
	}

	var cyc *CyclicLR = new(CyclicLR)

	initialLRs := formatParam(opt, baseLRs, "baseLRs")
	if options.LastEpoch == -1 {
		opt.SetLRs(initialLRs)
	}
	cyc.initialLRs = initialLRs

	cyc.opt = opt
	cyc.maxLRs = formatParam(opt, maxLRs, "maxLRs")

	var stepSizeDown int
	switch options.StepSizeDown {
	case -1:
		stepSizeDown = options.StepSizeUp
	default:
		stepSizeDown = options.StepSizeDown
	}
	cyc.stepSizeUp = options.StepSizeUp
	cyc.stepSizeDown = stepSizeDown

	totalSize := stepSizeDown + options.StepSizeUp
	stepRatio := float64(options.StepSizeUp) / float64(totalSize)
	cyc.totalSize = totalSize
	cyc.stepRatio = stepRatio

	if !strContain([]string{"triangular", "triangular2", "exp_range"}, options.Mode) && options.ScaleFn == nil {
		log.Fatalf("Invalid 'mode': %v and scale function is nil\n", options.Mode)
	}
	cyc.mode = options.Mode
	cyc.gamma = options.Gamma

	switch options.ScaleFn {
	case nil:
		switch cyc.mode {
		case "triangular":
			cyc.scaleFn = func(x float64) float64 {
				return 1.0
			}
			cyc.scaleMode = "cycle"

		case "triangular2":
			cyc.scaleFn = func(x float64) float64 {
				return 1 / (math.Pow(2.0, (x - 1.0)))
			}
			cyc.scaleMode = "cycle"
		case "exp_range":
			cyc.scaleFn = func(x float64) float64 {
				return math.Pow(cyc.gamma, x)
			}
			cyc.scaleMode = "iterations"
		}

	default:
		cyc.scaleFn = options.ScaleFn
		cyc.scaleMode = options.ScaleMode
	}

	cyc.cycleMomentum = options.CycleMomentum
	if cyc.cycleMomentum {
		// If the optimizer doesn't support momentum, this should throw an error.
		// TODO. type-cast optimizer.config and check.
		cyc.baseMomentums = formatParam(opt, []float64{options.BaseMomentum}, "baseMomentum")
		if options.LastEpoch == -1 {
			opt.SetMomentum(options.BaseMomentum)
		}
		cyc.maxMomentums = formatParam(opt, []float64{options.MaxMomentum}, "maxMomentum")
	}

	return cyc
}
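// A minimal usage sketch (per-batch stepping; `trainOneBatch` and `batches`
// are hypothetical placeholders):
//
//	sched := NewCyclicLR(opt, []float64{0.001}, []float64{0.01},
//	    WithCyclicStepSizeUp(2000), WithCyclicMode("triangular")).Build()
//	for _, batch := range batches {
//	    trainOneBatch(batch) // hypothetical
//	    sched.Step()
//	}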
func strContain(items []string, item string) bool {
	for _, i := range items {
		if i == item {
			return true
		}
	}

	return false
}

// formatParam expands a per-group parameter: a single value is broadcast to
// all param groups; otherwise its length must match the number of groups.
func formatParam(opt *Optimizer, param []float64, paramName string) []float64 {
	ngroup := opt.ParamGroupNum()
	var paramOut []float64 = make([]float64, ngroup)
	switch len(param) {
	case 1:
		for i := 0; i < ngroup; i++ {
			paramOut[i] = param[0]
		}
	case ngroup:
		paramOut = param
	default:
		log.Fatalf("Length of %s should be either 1 or equal to number of param groups. Got %v\n", paramName, len(param))
	}

	return paramOut
}
// SetLRs implements scheduler interface.
//
// It calculates the learning rate at the batch index. This function treats
// `lastEpoch` as the last batch index.
//
// NOTE. If `cycleMomentum` is true, this function has a side effect of
// updating the optimizer's momentum.
func (cyc *CyclicLR) SetLRs(opts ...SchedulerOption) {
	options := defaultSchedulerOptions()
	for _, o := range opts {
		o(options)
	}
	switch options.LastEpoch {
	case -1:
		cyc.lastEpoch += 1
	default:
		cyc.lastEpoch = options.LastEpoch
	}

	cycle := math.Floor(1.0 + float64(cyc.lastEpoch)/float64(cyc.totalSize))
	x := 1.0 + float64(cyc.lastEpoch)/float64(cyc.totalSize) - cycle

	var scaleFactor float64
	switch {
	case x <= cyc.stepRatio:
		scaleFactor = x / cyc.stepRatio
	default:
		scaleFactor = (x - 1.0) / (cyc.stepRatio - 1.0)
	}

	ngroup := cyc.opt.ParamGroupNum()
	var newLRs []float64 = make([]float64, ngroup)
	for i := 0; i < ngroup; i++ {
		baseLR := cyc.initialLRs[i]
		maxLR := cyc.maxLRs[i]
		baseHeight := (maxLR - baseLR) * scaleFactor
		var newLR float64
		switch cyc.scaleMode {
		case "cycle":
			newLR = baseLR + baseHeight*cyc.scaleFn(cycle)
		default:
			newLR = baseLR + baseHeight*cyc.scaleFn(float64(cyc.lastEpoch))
		}

		newLRs[i] = newLR
	}

	// Update optimizer learning rates.
	cyc.opt.SetLRs(newLRs)

	// Update optimizer momentum.
	// NOTE. For now, we assume there is one param group and update momentum for that group.
	if cyc.cycleMomentum {
		var momentum float64
		baseMomentum, maxMomentum := cyc.baseMomentums[0], cyc.maxMomentums[0]
		baseHeight := (maxMomentum - baseMomentum) * scaleFactor
		switch cyc.scaleMode {
		case "cycle":
			momentum = maxMomentum - baseHeight*cyc.scaleFn(cycle)
		default:
			momentum = maxMomentum - baseHeight*cyc.scaleFn(float64(cyc.lastEpoch))
		}
		cyc.opt.SetMomentum(momentum)
	}
}

// Build implements scheduler interface.
func (cyc *CyclicLR) Build() *LRScheduler {
	return &LRScheduler{cyc}
}
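// For reference, `cycle` above is the 1-based cycle index and `x` the position
// within the cycle in [0, 1); the triangle rises while x <= stepRatio and
// falls afterwards, so before any amplitude scaling the rate is
//
//	lr = baseLR + (maxLR - baseLR) * scaleFactor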
// CosineAnnealingWarmRestarts sets the learning rate of each parameter group
// using a cosine annealing schedule with warm restarts.
//
// Source:
// Stochastic Gradient Descent with Warm Restarts: https://arxiv.org/abs/1608.03983
type CosineAnnealingWarmRestarts struct {
	opt *Optimizer

	// Number of iterations for the first restart.
	t0 int

	// Number of epochs between 2 warm restarts.
	ti int

	// Factor by which Ti increases after a restart.
	tMult int // Default = 1

	// Minimum learning rate.
	etaMin float64 // Default = 0.0

	// Number of epochs since the last restart.
	tcur int

	// The index of the last epoch. Default: -1.
	lastEpoch  int // Default = -1
	initialLRs []float64
	stepCount  int
}
type CosineAnnealingWarmRestartsOptions struct {
	TMult     int
	EtaMin    float64
	LastEpoch int
}

type CosineAnnealingWarmRestartsOption func(*CosineAnnealingWarmRestartsOptions)

func defaultCosineAnnealingWarmRestartsOptions() *CosineAnnealingWarmRestartsOptions {
	return &CosineAnnealingWarmRestartsOptions{
		TMult:     1,
		EtaMin:    0.0,
		LastEpoch: -1,
	}
}

func WithTMult(v int) CosineAnnealingWarmRestartsOption {
	return func(o *CosineAnnealingWarmRestartsOptions) {
		o.TMult = v
	}
}

func WithEtaMin(v float64) CosineAnnealingWarmRestartsOption {
	return func(o *CosineAnnealingWarmRestartsOptions) {
		o.EtaMin = v
	}
}

func WithCosineAnnealingLastEpoch(v int) CosineAnnealingWarmRestartsOption {
	return func(o *CosineAnnealingWarmRestartsOptions) {
		o.LastEpoch = v
	}
}

func NewCosineAnnealingWarmRestarts(opt *Optimizer, t0 int, opts ...CosineAnnealingWarmRestartsOption) *CosineAnnealingWarmRestarts {
	options := defaultCosineAnnealingWarmRestartsOptions()
	for _, o := range opts {
		o(options)
	}

	if t0 <= 0 {
		log.Fatalf("T0 expected to be positive. Got %v\n", t0)
	}

	if options.TMult < 1 {
		log.Fatalf("Expected TMult >= 1. Got %v\n", options.TMult)
	}

	initialLRs := opt.GetLRs()

	return &CosineAnnealingWarmRestarts{
		opt:        opt,
		t0:         t0,
		ti:         t0,
		tMult:      options.TMult,
		etaMin:     options.EtaMin,
		tcur:       options.LastEpoch,
		lastEpoch:  options.LastEpoch,
		stepCount:  0,
		initialLRs: initialLRs,
	}
}
// SetLRs implements scheduler interface.
//
// NOTE. scheduler.Step(epoch) could be called after every batch update.
func (s *CosineAnnealingWarmRestarts) SetLRs(opts ...SchedulerOption) {
	options := defaultSchedulerOptions()
	for _, o := range opts {
		o(options)
	}

	var epoch int = options.LastEpoch
	if options.LastEpoch == -1 && s.lastEpoch < 0 {
		epoch = 0
	}

	switch {
	case epoch == -1:
		epoch = s.lastEpoch + 1
		s.tcur = s.tcur + 1
		if s.tcur >= s.ti {
			s.tcur = s.tcur - s.ti
			s.ti = s.ti * s.tMult
		}

	case epoch < 0:
		log.Fatalf("Expected non-negative epoch, got %v\n", epoch)

	case epoch >= s.t0:
		switch s.tMult {
		case 1:
			s.tcur = epoch % s.t0
		default:
			// n = int(math.log((epoch / self.T_0 * (self.T_mult - 1) + 1), self.T_mult))
			n := int(math.Log(float64((epoch/s.t0)*(s.tMult-1)+1)) / math.Log(float64(s.tMult)))

			// self.T_cur = epoch - self.T_0 * (self.T_mult ** n - 1) / (self.T_mult - 1)
			s.tcur = epoch - s.t0*(int(math.Pow(float64(s.tMult), float64(n)))-1)/(s.tMult-1)

			// self.T_i = self.T_0 * self.T_mult ** (n)
			s.ti = s.t0 * int(math.Pow(float64(s.tMult), float64(n)))
		}

	default:
		s.ti = s.t0
		s.tcur = epoch
	}

	s.lastEpoch = epoch

	var newLRs []float64
	// [self.eta_min + (base_lr - self.eta_min) * (1 + math.cos(math.pi * self.T_cur / self.T_i)) / 2
	//  for base_lr in self.base_lrs]
	for _, baseLR := range s.initialLRs {
		newLR := s.etaMin + (baseLR-s.etaMin)*(1+math.Cos(math.Pi*float64(s.tcur)/float64(s.ti)))/2
		newLRs = append(newLRs, newLR)
	}

	s.opt.SetLRs(newLRs)
}

// Build implements scheduler interface.
func (s *CosineAnnealingWarmRestarts) Build() *LRScheduler {
	scheduler := &LRScheduler{s}
	scheduler.Step()
	return scheduler
}
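// A minimal usage sketch: with t0 = 10 and TMult = 2, the rate anneals from
// its initial value toward EtaMin over 10 epochs, restarts, then anneals over
// 20 epochs, then 40, and so on (`trainOneEpoch` and `model` are hypothetical
// placeholders).
//
//	sched := NewCosineAnnealingWarmRestarts(opt, 10, WithTMult(2)).Build()
//	for epoch := 0; epoch < nepochs; epoch++ {
//	    trainOneEpoch(model) // hypothetical
//	    sched.Step()
//	}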
// OneCycleLR sets the learning rate of each parameter group according to the
// 1cycle learning rate policy. The 1cycle policy anneals the learning
// rate from an initial learning rate to some maximum learning rate and then
// from that maximum learning rate to some minimum learning rate much lower
// than the initial learning rate.
//
// This policy was initially described in the paper `Super-Convergence:
// Very Fast Training of Neural Networks Using Large Learning Rates`.
// The 1cycle learning rate policy changes the learning rate after every batch.
// `Step` should be called after a batch has been used for training.
// This scheduler is not chainable.
//
// Note also that the total number of steps in the cycle can be determined in one
// of two ways (listed in order of precedence):
// - A value for totalSteps is explicitly provided.
// - A number of epochs (epochs) and a number of steps per epoch
//   (stepsPerEpoch) are provided.
//   In this case, the number of total steps is inferred by
//   totalSteps = epochs * stepsPerEpoch
// You must either provide a value for totalSteps or provide a value for both
// epochs and stepsPerEpoch.
//
// Source:
// Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates
// https://arxiv.org/abs/1708.07120
type OneCycleLR struct {
	opt *Optimizer

	// Upper learning rate boundaries in the cycle
	// for each parameter group.
	maxLRs []float64
	minLRs []float64

	// The total number of steps in the cycle. Note that
	// if a value is not provided here, it is inferred from
	// epochs and stepsPerEpoch.
	// Default: -1
	totalSteps int

	// The number of epochs to train for. This is used along
	// with stepsPerEpoch in order to infer the total number of steps in the cycle
	// if a value for totalSteps is not provided.
	// Default: -1
	// epochs int

	// The number of steps per epoch to train for. This is
	// used along with epochs in order to infer the total number of steps in the
	// cycle if a value for totalSteps is not provided.
	// Default: -1
	// stepsPerEpoch int

	// The percentage of the cycle (in number of steps) spent
	// increasing the learning rate.
	// Default: 0.3
	// pctStart float64

	// Specifies the annealing strategy: one of ["cos", "linear"].
	// - "cos" for cosine annealing,
	// - "linear" for linear annealing.
	// Default: 'cos'
	// annealStrategy string

	// If "true", momentum is cycled inversely
	// to learning rate between "baseMomentum" and "maxMomentum".
	// Default: true
	cycleMomentum bool

	// Lower momentum boundaries in the cycle
	// for each parameter group. Note that momentum is cycled inversely
	// to learning rate; at the peak of a cycle, momentum is
	// 'baseMomentum' and learning rate is 'maxLR'.
	// Default: 0.85
	baseMomentums []float64

	// Upper momentum boundaries in the cycle for each parameter group. Functionally,
	// it defines the cycle amplitude (maxMomentum - baseMomentum).
	// Note that momentum is cycled inversely to learning rate; at the start of a cycle,
	// momentum is "maxMomentum" and learning rate is "baseLR".
	// Default: 0.95
	maxMomentums []float64

	// Determines the initial learning rate via initialLR = maxLR/divFactor.
	// Default: 25
	// divFactor float64

	// Determines the minimum learning rate via
	// minLR = initialLR/finalDivFactor.
	// Default: 1e4
	// finalDivFactor float64

	// The index of the last batch. This parameter is used when
	// resuming a training job. Since `Step()` should be invoked after each
	// batch instead of after each epoch, this number represents the total
	// number of *batches* computed, not the total number of epochs computed.
	// When lastEpoch=-1, the schedule is started from the beginning.
	// Default: -1
	lastEpoch int

	initialLRs []float64

	// Number of training iterations in the
	// increasing half of the cycle.
	stepSizeUp int

	// Number of training iterations in the
	// decreasing half of the cycle.
	stepSizeDown int

	annealFn func(start, end, pct float64) float64
}
type OneCycleOptions struct {
	TotalSteps     int
	Epochs         int
	StepsPerEpoch  int
	PctStart       float64
	AnnealStrategy string
	CycleMomentum  bool
	BaseMomentum   float64
	MaxMomentum    float64
	DivFactor      float64
	FinalDivFactor float64
	LastEpoch      int
}

type OneCycleOption func(*OneCycleOptions)

func defaultOneCycleOptions() *OneCycleOptions {
	return &OneCycleOptions{
		TotalSteps:     -1,
		Epochs:         -1,
		StepsPerEpoch:  -1,
		PctStart:       0.3,
		AnnealStrategy: "cos",
		CycleMomentum:  true,
		BaseMomentum:   0.85,
		MaxMomentum:    0.95,
		DivFactor:      25.0,
		FinalDivFactor: 1e4,
		LastEpoch:      -1,
	}
}

func WithOneCycleTotalSteps(v int) OneCycleOption {
	return func(o *OneCycleOptions) {
		o.TotalSteps = v
	}
}

func WithOneCycleEpochs(v int) OneCycleOption {
	return func(o *OneCycleOptions) {
		o.Epochs = v
	}
}

func WithOneCycleStepsPerEpoch(v int) OneCycleOption {
	return func(o *OneCycleOptions) {
		o.StepsPerEpoch = v
	}
}

func WithOneCyclePctStart(v float64) OneCycleOption {
	return func(o *OneCycleOptions) {
		o.PctStart = v
	}
}

func WithOneCycleAnnealStrategy(v string) OneCycleOption {
	return func(o *OneCycleOptions) {
		o.AnnealStrategy = v
	}
}

func WithOneCycleCycleMomentum(v bool) OneCycleOption {
	return func(o *OneCycleOptions) {
		o.CycleMomentum = v
	}
}

func WithOneCycleBaseMomentum(v float64) OneCycleOption {
	return func(o *OneCycleOptions) {
		o.BaseMomentum = v
	}
}

func WithOneCycleMaxMomentum(v float64) OneCycleOption {
	return func(o *OneCycleOptions) {
		o.MaxMomentum = v
	}
}

func WithOneCycleDivFactor(v float64) OneCycleOption {
	return func(o *OneCycleOptions) {
		o.DivFactor = v
	}
}

func WithOneCycleFinalDivFactor(v float64) OneCycleOption {
	return func(o *OneCycleOptions) {
		o.FinalDivFactor = v
	}
}

func WithOneCycleLastEpoch(v int) OneCycleOption {
	return func(o *OneCycleOptions) {
		o.LastEpoch = v
	}
}
func NewOneCycleLR(opt *Optimizer, maxLR float64, opts ...OneCycleOption) *OneCycleLR {
	options := defaultOneCycleOptions()
	for _, o := range opts {
		o(options)
	}

	oc := new(OneCycleLR)
	oc.opt = opt
	oc.lastEpoch = options.LastEpoch

	// Validate pctStart
	if options.PctStart < 0 || options.PctStart > 1 {
		log.Fatalf("Expected pctStart between 0 and 1, but got %v\n", options.PctStart)
	}

	// Validate totalSteps
	switch {
	case options.TotalSteps == -1 && options.Epochs == -1 && options.StepsPerEpoch == -1:
		log.Fatal("You must define either totalSteps OR (epochs AND stepsPerEpoch)")
	case options.TotalSteps != -1:
		if options.TotalSteps <= 0 {
			log.Fatalf("Expected positive integer totalSteps, but got %v", options.TotalSteps)
		}

		oc.totalSteps = options.TotalSteps
	default:
		switch {
		case options.Epochs <= 0:
			log.Fatalf("Expected positive integer epochs, but got %v\n", options.Epochs)
		case options.StepsPerEpoch <= 0:
			log.Fatalf("Expected positive integer stepsPerEpoch, but got %v\n", options.StepsPerEpoch)
		default:
			oc.totalSteps = options.Epochs * options.StepsPerEpoch
		}
	}

	oc.stepSizeUp = int(options.PctStart*float64(oc.totalSteps)) - 1
	oc.stepSizeDown = oc.totalSteps - oc.stepSizeUp - 1

	// Validate annealStrategy
	switch {
	case !strContain([]string{"cos", "linear"}, options.AnnealStrategy):
		log.Fatalf("annealStrategy must be one of 'cos' or 'linear', instead got %v\n", options.AnnealStrategy)
	case options.AnnealStrategy == "cos":
		oc.annealFn = func(start, end, pct float64) float64 {
			// Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0.
			cosOut := math.Cos(math.Pi*pct) + 1
			return end + (start-end)/2.0*cosOut
		}
	case options.AnnealStrategy == "linear":
		oc.annealFn = func(start, end, pct float64) float64 {
			// Linearly anneal from `start` to `end` as pct goes from 0.0 to 1.0.
			return (end-start)*pct + start
		}
	}

	// Initialize learning rate variables
	maxLRs := formatParam(opt, []float64{maxLR}, "maxLR")
	ngroup := opt.ParamGroupNum()
	var initialLRs []float64 = make([]float64, ngroup)
	var minLRs []float64 = make([]float64, ngroup)
	if options.LastEpoch == -1 {
		for i := 0; i < ngroup; i++ {
			initialLR := maxLRs[i] / options.DivFactor
			initialLRs[i] = initialLR
			minLRs[i] = initialLR / options.FinalDivFactor
		}

		// Set initial learning rates for the optimizer.
		opt.SetLRs(initialLRs)

		// Keep maxLRs and minLRs in the scheduler, as the optimizer does not
		// carry these fields the way the Python param groups do.
		oc.maxLRs = maxLRs
		oc.minLRs = minLRs
	}
	oc.initialLRs = initialLRs

	// Initialize momentum
	oc.cycleMomentum = options.CycleMomentum
	if oc.cycleMomentum {
		// NOTE.
		// The optimizer must support momentum when the `cycleMomentum` option is enabled.
		// Assuming we have "momentum" and "betas" in the optimizer.
		// In Python, the implementation is as follows:
		/*
			self.use_beta1 = 'betas' in self.optimizer.defaults
			max_momentums = self._format_param('max_momentum', optimizer, max_momentum)
			base_momentums = self._format_param('base_momentum', optimizer, base_momentum)
			if last_epoch == -1:
			    for m_momentum, b_momentum, group in zip(max_momentums, base_momentums, optimizer.param_groups):
			        if self.use_beta1:
			            _, beta2 = group['betas']
			            group['betas'] = (m_momentum, beta2)
			        else:
			            group['momentum'] = m_momentum
			        group['max_momentum'] = m_momentum
			        group['base_momentum'] = b_momentum
		*/

		// TODO. work on Optimizer to fully implement this.
		oc.maxMomentums = formatParam(opt, []float64{options.MaxMomentum}, "maxMomentum")
		oc.baseMomentums = formatParam(opt, []float64{options.BaseMomentum}, "baseMomentum")
		if options.LastEpoch == -1 {
			opt.SetMomentum(options.MaxMomentum)
		}
	}

	return oc
}
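// A minimal usage sketch (per-batch stepping; `trainOneBatch` and `batches`
// are hypothetical placeholders):
//
//	sched := NewOneCycleLR(opt, 0.01,
//	    WithOneCycleEpochs(10), WithOneCycleStepsPerEpoch(len(batches))).Build()
//	for _, batch := range batches {
//	    trainOneBatch(batch) // hypothetical
//	    sched.Step()
//	}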
func (oc *OneCycleLR) SetLRs(opts ...SchedulerOption) {
	options := defaultSchedulerOptions()
	for _, o := range opts {
		o(options)
	}
	switch options.LastEpoch {
	case -1:
		oc.lastEpoch += 1
	default:
		oc.lastEpoch = options.LastEpoch
	}

	var newLRs []float64
	var newMomentums []float64
	stepNum := oc.lastEpoch
	if stepNum > oc.totalSteps {
		log.Fatalf("Tried to step %v times. The specified number of total steps is %v", stepNum, oc.totalSteps)
	}
	ngroup := oc.opt.ParamGroupNum()
	for i := 0; i < ngroup; i++ {
		var computedLR float64
		var computedMomentum float64
		initialLR := oc.initialLRs[i]
		maxLR := oc.maxLRs[i]
		minLR := oc.minLRs[i]
		maxMomentum := oc.maxMomentums[i]
		baseMomentum := oc.baseMomentums[i]
		switch {
		case stepNum <= oc.stepSizeUp:
			computedLR = oc.annealFn(initialLR, maxLR, float64(stepNum)/float64(oc.stepSizeUp))
			if oc.cycleMomentum {
				computedMomentum = oc.annealFn(maxMomentum, baseMomentum, float64(stepNum)/float64(oc.stepSizeUp))
			}

		default:
			downStepNum := stepNum - oc.stepSizeUp
			computedLR = oc.annealFn(maxLR, minLR, float64(downStepNum)/float64(oc.stepSizeDown))
			if oc.cycleMomentum {
				computedMomentum = oc.annealFn(baseMomentum, maxMomentum, float64(downStepNum)/float64(oc.stepSizeDown))
			}
		}

		newLRs = append(newLRs, computedLR)
		newMomentums = append(newMomentums, computedMomentum)
	}

	oc.opt.SetLRs(newLRs)
	// For now, just use the first momentum.
	oc.opt.SetMomentum(newMomentums[0])
}

func (oc *OneCycleLR) Build() *LRScheduler {
	s := &LRScheduler{oc}
	s.Step()
	return s
}