diff --git a/dtype.go b/dtype.go index bdf83fc..0716410 100644 --- a/dtype.go +++ b/dtype.go @@ -3,6 +3,7 @@ package gotch import ( "fmt" "log" + // "log" "reflect" ) @@ -31,6 +32,7 @@ var ( Int DType = DType{reflect.TypeOf(int32(1))} // 3 Int64 DType = DType{reflect.TypeOf(int64(1))} // 4 // Half DType = DType{reflect.TypeOf(GoFloat16(1))} // 5 + Half DType = DType{reflect.TypeOf(float32(1))} // 5 Float DType = DType{reflect.TypeOf(float32(1))} // 6 Double DType = DType{reflect.TypeOf(float64(1))} // 7 // ComplexHalf DType = DType{reflect.TypeOf(GoComplexHalf(1))} // 8 @@ -45,6 +47,7 @@ var dtypeGoType = map[DType]reflect.Type{ Int16: reflect.TypeOf(int16(1)), Int: reflect.TypeOf(int32(1)), Int64: reflect.TypeOf(int64(1)), + Half: reflect.TypeOf(float32(1)), Float: reflect.TypeOf(float32(1)), Double: reflect.TypeOf(float64(1)), Bool: reflect.TypeOf(true), @@ -87,6 +90,7 @@ var dtypeCInt = map[DType]CInt{ Int16: 2, Int: 3, Int64: 4, + Half: 5, Float: 6, Double: 7, Bool: 11, @@ -137,6 +141,7 @@ var dtypeSize = map[DType]uint{ Int16: 2, Int: 4, Int64: 8, + Half: 4, // NOTE(review): IEEE half precision is 2 bytes; 4 only matches the float32 stand-in used for Half above - confirm 
Float: 4, Double: 8, Bool: 1, diff --git a/example/char-rnn/main.go b/example/char-rnn/main.go index 199271c..ecf6825 100644 --- a/example/char-rnn/main.go +++ b/example/char-rnn/main.go @@ -42,7 +42,7 @@ func sample(data *ts.TextData, lstm *nn.LSTM, linear *nn.Linear, device gotch.De input.MustDrop() inputView.MustDrop() - forwardTs := linear.Forward(state.(*nn.LSTMState).H()).MustSqueeze1(0, true).MustSoftmax(-1, gotch.Float, true) + forwardTs := linear.Forward(state.(*nn.LSTMState).H()).MustSqueezeDim(0, true).MustSoftmax(-1, gotch.Float, true) sampledY := forwardTs.MustMultinomial(1, false, true) lastLabel = sampledY.Int64Values()[0] sampledY.MustDrop() diff --git a/example/cifar/main.go b/example/cifar/main.go index 94d9267..29b03d1 100644 --- a/example/cifar/main.go +++ b/example/cifar/main.go @@ -73,7 +73,7 @@ func fastResnet(p *nn.Path) *nn.SequentialT { seq.Add(nn.NewLinear(p.Sub("linear"), 512, 10, nn.DefaultLinearConfig())) seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor { - return xs.MustMul1(ts.FloatScalar(0.125), false) + return xs.MustMulScalar(ts.FloatScalar(0.125), false) })) return seq diff --git a/example/mnist/cnn.go b/example/mnist/cnn.go index b8020c8..32398c8 100644 --- a/example/mnist/cnn.go +++ b/example/mnist/cnn.go @@ -71,8 +71,13 @@ func runCNN1() { var ds *vision.Dataset ds = vision.LoadMNISTDir(MnistDirNN) - testImages := ds.TestImages - testLabels := ds.TestLabels + // ds.TrainImages [60000, 784] + // ds.TrainLabels [60000, 784] + testImages := ds.TestImages // [10000, 784] + testLabels := ds.TestLabels // [10000, 784] + + fmt.Printf("testImages: %v\n", testImages.MustSize()) + fmt.Printf("testLabels: %v\n", testLabels.MustSize()) device := gotch.CudaIfAvailable() vs := nn.NewVarStore(device) @@ -87,16 +92,17 @@ func runCNN1() { startTime := time.Now() for epoch := 0; epoch < epochsCNN; epoch++ { - totalSize := ds.TrainImages.MustSize()[0] samples := int(totalSize) + // Shuffling index := ts.MustRandperm(int64(totalSize), 
gotch.Int64, gotch.CPU) imagesTs := ds.TrainImages.MustIndexSelect(0, index, false) labelsTs := ds.TrainLabels.MustIndexSelect(0, index, false) + index.MustDrop() batches := samples / batchSize batchIndex := 0 - var epocLoss *ts.Tensor + var epocLoss float64 for i := 0; i < batches; i++ { start := batchIndex * batchSize size := batchSize @@ -106,37 +112,33 @@ func runCNN1() { batchIndex += 1 // Indexing - narrowIndex := ts.NewNarrow(int64(start), int64(start+size)) - bImages := imagesTs.Idx(narrowIndex) - bLabels := labelsTs.Idx(narrowIndex) + bImages := imagesTs.MustNarrow(0, int64(start), int64(size), false) + bLabels := labelsTs.MustNarrow(0, int64(start), int64(size), false) bImages = bImages.MustTo(vs.Device(), true) bLabels = bLabels.MustTo(vs.Device(), true) logits := net.ForwardT(bImages, true) + bImages.MustDrop() loss := logits.CrossEntropyForLogits(bLabels) + logits.MustDrop() + bLabels.MustDrop() - // loss = loss.MustSetRequiresGrad(true, false) + loss = loss.MustSetRequiresGrad(true, true) opt.BackwardStep(loss) - epocLoss = loss.MustShallowClone() - epocLoss.Detach_() - - // fmt.Printf("completed \t %v batches\t %.2f\n", i, loss.Float64Values()[0]) - - bImages.MustDrop() - bLabels.MustDrop() + epocLoss = loss.Float64Values()[0] + loss.MustDrop() } - // vs.Freeze() - testAccuracy := nn.BatchAccuracyForLogits(vs, net, testImages, testLabels, vs.Device(), 1024) - // vs.Unfreeze() - fmt.Printf("Epoch: %v\t Loss: %.2f \t Test accuracy: %.2f%%\n", epoch, epocLoss.Float64Values()[0], testAccuracy*100.0) - if testAccuracy > bestAccuracy { - bestAccuracy = testAccuracy - } + ts.NoGrad(func() { + testAccuracy := nn.BatchAccuracyForLogits(vs, net, testImages, testLabels, vs.Device(), 1024) + fmt.Printf("Epoch: %v\t Loss: %.2f \t Test accuracy: %.2f%%\n", epoch, epocLoss, testAccuracy*100.0) + if testAccuracy > bestAccuracy { + bestAccuracy = testAccuracy + } + }) - epocLoss.MustDrop() imagesTs.MustDrop() labelsTs.MustDrop() } diff --git 
a/example/neural-style-transfer/main.go b/example/neural-style-transfer/main.go index 6855630..6dc050f 100644 --- a/example/neural-style-transfer/main.go +++ b/example/neural-style-transfer/main.go @@ -49,7 +49,7 @@ func gramMatrix(m *ts.Tensor) *ts.Tensor { gram := mview.MustMatmul(mviewT, true) mviewT.MustDrop() - return gram.MustDiv1(ts.IntScalar(a*b*c*d), true) + return gram.MustDivScalar(ts.IntScalar(a*b*c*d), true) } func styleLoss(m1 *ts.Tensor, m2 *ts.Tensor) *ts.Tensor { @@ -138,7 +138,7 @@ func main() { vs := nn.NewVarStore(device) path := vs.Root() - inputVar := path.VarCopy("img", contentImg) + inputVar := path.MustVarCopy("img", contentImg) opt, err := nn.DefaultAdamConfig().Build(vs, LearningRate) if err != nil { log.Fatal(err) @@ -168,7 +168,7 @@ func main() { t.MustDrop() } - lossMul := sLoss.MustMul1(styleWeight, true) + lossMul := sLoss.MustMulScalar(styleWeight, true) loss := lossMul.MustAdd(cLoss, true) opt.BackwardStep(loss) diff --git a/example/yolo/darknet.go b/example/yolo/darknet.go index 0035c61..ab6f327 100644 --- a/example/yolo/darknet.go +++ b/example/yolo/darknet.go @@ -248,8 +248,8 @@ func conv(vs *nn.Path, index uint, p int64, b *Block) (retVal1 int64, retVal2 in var res *ts.Tensor if leaky { - tmp2Mul := tmp2.MustMul1(ts.FloatScalar(0.1), false) - res = tmp2.MustMax1(tmp2Mul, true) + tmp2Mul := tmp2.MustMulScalar(ts.FloatScalar(0.1), false) + res = tmp2.MustMaximum(tmp2Mul, true) tmp2Mul.MustDrop() } else { res = tmp2 @@ -434,7 +434,7 @@ func detect(xs *ts.Tensor, imageHeight int64, classes int64, anchors []Anchor) * }) sliceApplyAndSet(xsTs, 0, 4, func(xs *ts.Tensor) *ts.Tensor { - return xs.MustMul1(ts.IntScalar(stride), false) + return xs.MustMulScalar(ts.IntScalar(stride), false) }) // TODO: delete all middle tensors. 
diff --git a/example/yolo/main.go b/example/yolo/main.go index 93afd3c..f33adff 100644 --- a/example/yolo/main.go +++ b/example/yolo/main.go @@ -3,14 +3,15 @@ package main import ( "flag" "fmt" - "github.com/sugarme/gotch" - "github.com/sugarme/gotch/nn" - ts "github.com/sugarme/gotch/tensor" - "github.com/sugarme/gotch/vision" "log" "math" "path/filepath" "sort" + + "github.com/sugarme/gotch" + "github.com/sugarme/gotch/nn" + ts "github.com/sugarme/gotch/tensor" + "github.com/sugarme/gotch/vision" ) const ( @@ -273,7 +274,7 @@ func main() { imgTmp1 := imageTs.MustUnsqueeze(0, true) imgTmp2 := imgTmp1.MustTotype(gotch.Float, true) - img := imgTmp2.MustDiv1(ts.FloatScalar(255.0), true) + img := imgTmp2.MustDivScalar(ts.FloatScalar(255.0), true) predictTmp := model.ForwardT(img, false) predictions := predictTmp.MustSqueeze(true) diff --git a/nn/batch-norm.go b/nn/batch-norm.go index f3eb01a..a6b8c6c 100644 --- a/nn/batch-norm.go +++ b/nn/batch-norm.go @@ -41,10 +41,10 @@ type BatchNorm struct { func NewBatchNorm(vs *Path, nd uint, outDim int64, config *BatchNormConfig) *BatchNorm { return &BatchNorm{ config: config, - RunningMean: vs.ZerosNoTrain("running_mean", []int64{outDim}), - RunningVar: vs.OnesNoTrain("running_var", []int64{outDim}), - Ws: vs.NewVar("weight", []int64{outDim}, config.WsInit), - Bs: vs.NewVar("bias", []int64{outDim}, config.BsInit), + RunningMean: vs.MustZerosNoTrain("running_mean", []int64{outDim}), + RunningVar: vs.MustOnesNoTrain("running_var", []int64{outDim}), + Ws: vs.MustNewVar("weight", []int64{outDim}, config.WsInit), + Bs: vs.MustNewVar("bias", []int64{outDim}, config.BsInit), } } diff --git a/nn/conv-transpose.go b/nn/conv-transpose.go index 8ba56c8..3ac2035 100644 --- a/nn/conv-transpose.go +++ b/nn/conv-transpose.go @@ -73,10 +73,10 @@ func NewConvTranspose1D(vs *Path, inDim, outDim int64, ksizes []int64, cfg *Conv weightSize := []int64{outDim, int64(inDim / cfg.Groups)} weightSize = append(weightSize, ksizes...) 
- ws = vs.NewVar("weight", weightSize, cfg.WsInit) + ws = vs.MustNewVar("weight", weightSize, cfg.WsInit) if cfg.Bias { - bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit) + bs = vs.MustNewVar("bias", []int64{outDim}, cfg.BsInit) } return &ConvTranspose1D{ @@ -104,11 +104,11 @@ func NewConvTranspose2D(vs *Path, inDim, outDim int64, ksizes []int64, cfg *Conv ) if cfg.Bias { - bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit) + bs = vs.MustNewVar("bias", []int64{outDim}, cfg.BsInit) } weightSize := []int64{outDim, int64(inDim / cfg.Groups)} weightSize = append(weightSize, ksizes...) - ws = vs.NewVar("weight", weightSize, cfg.WsInit) + ws = vs.MustNewVar("weight", weightSize, cfg.WsInit) return &ConvTranspose2D{ Ws: ws, @@ -134,11 +134,11 @@ func NewConvTranspose3D(vs *Path, inDim, outDim int64, ksizes []int64, cfg *Conv ) if cfg.Bias { - bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit) + bs = vs.MustNewVar("bias", []int64{outDim}, cfg.BsInit) } weightSize := []int64{outDim, int64(inDim / cfg.Groups)} weightSize = append(weightSize, ksizes...) 
- ws = vs.NewVar("weight", weightSize, cfg.WsInit) + ws = vs.MustNewVar("weight", weightSize, cfg.WsInit) return &ConvTranspose3D{ Ws: ws, diff --git a/nn/conv.go b/nn/conv.go index a8d74ed..f724a58 100644 --- a/nn/conv.go +++ b/nn/conv.go @@ -289,11 +289,11 @@ func NewConv1D(vs *Path, inDim, outDim, k int64, cfg *Conv1DConfig) *Conv1D { bs *ts.Tensor = ts.NewTensor() ) if cfg.Bias { - bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit) + bs = vs.MustNewVar("bias", []int64{outDim}, cfg.BsInit) } weightSize := []int64{outDim, int64(inDim / cfg.Groups)} weightSize = append(weightSize, k) - ws = vs.NewVar("weight", weightSize, cfg.WsInit) + ws = vs.MustNewVar("weight", weightSize, cfg.WsInit) return &Conv1D{ Ws: ws, @@ -316,11 +316,11 @@ func NewConv2D(vs *Path, inDim, outDim int64, k int64, cfg *Conv2DConfig) *Conv2 bs *ts.Tensor = ts.NewTensor() ) if cfg.Bias { - bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit) + bs = vs.MustNewVar("bias", []int64{outDim}, cfg.BsInit) } weightSize := []int64{outDim, int64(inDim / cfg.Groups)} weightSize = append(weightSize, k, k) - ws = vs.NewVar("weight", weightSize, cfg.WsInit) + ws = vs.MustNewVar("weight", weightSize, cfg.WsInit) return &Conv2D{ Ws: ws, @@ -343,11 +343,11 @@ func NewConv3D(vs *Path, inDim, outDim, k int64, cfg *Conv3DConfig) *Conv3D { bs *ts.Tensor = ts.NewTensor() ) if cfg.Bias { - bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit) + bs = vs.MustNewVar("bias", []int64{outDim}, cfg.BsInit) } weightSize := []int64{outDim, int64(inDim / cfg.Groups)} weightSize = append(weightSize, k, k, k) - ws = vs.NewVar("weight", weightSize, cfg.WsInit) + ws = vs.MustNewVar("weight", weightSize, cfg.WsInit) return &Conv3D{ Ws: ws, @@ -418,11 +418,11 @@ func NewConv(vs *Path, inDim, outDim int64, ksizes []int64, config interface{}) case len(ksizes) == 1 && configT.String() == "*nn.Conv1DConfig": cfg := config.(*Conv1DConfig) if cfg.Bias { - bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit) + bs = vs.MustNewVar("bias", 
[]int64{outDim}, cfg.BsInit) } weightSize := []int64{outDim, int64(inDim / cfg.Groups)} weightSize = append(weightSize, ksizes...) - ws = vs.NewVar("weight", weightSize, cfg.WsInit) + ws = vs.MustNewVar("weight", weightSize, cfg.WsInit) return &Conv1D{ Ws: ws, Bs: bs, @@ -431,11 +431,11 @@ func NewConv(vs *Path, inDim, outDim int64, ksizes []int64, config interface{}) case len(ksizes) == 2 && configT.String() == "*nn.Conv2DConfig": cfg := config.(*Conv2DConfig) if cfg.Bias { - bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit) + bs = vs.MustNewVar("bias", []int64{outDim}, cfg.BsInit) } weightSize := []int64{outDim, int64(inDim / cfg.Groups)} weightSize = append(weightSize, ksizes...) - ws = vs.NewVar("weight", weightSize, cfg.WsInit) + ws = vs.MustNewVar("weight", weightSize, cfg.WsInit) return &Conv2D{ Ws: ws, Bs: bs, @@ -444,11 +444,11 @@ func NewConv(vs *Path, inDim, outDim int64, ksizes []int64, config interface{}) case len(ksizes) == 3 && configT.String() == "*nn.Conv3DConfig": cfg := config.(*Conv3DConfig) if cfg.Bias { - bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit) + bs = vs.MustNewVar("bias", []int64{outDim}, cfg.BsInit) } weightSize := []int64{outDim, int64(inDim / cfg.Groups)} weightSize = append(weightSize, ksizes...) - ws = vs.NewVar("weight", weightSize, cfg.WsInit) + ws = vs.MustNewVar("weight", weightSize, cfg.WsInit) return &Conv3D{ Ws: ws, Bs: bs, diff --git a/nn/jit.go b/nn/jit.go index e4b6e77..8c378fd 100644 --- a/nn/jit.go +++ b/nn/jit.go @@ -36,7 +36,7 @@ func TrainableCModuleLoad(p *Path, file string) (*TrainableCModule, error) { // NOTE: return is a newly created and added tensor in varstore. // This tensor is different from input named tensor. // If not using, just ignore it. Drop it, will drop tensor at varstore. - _ = p.Add(name, namedTensor.Tensor, requiresGrad) + _ = p.MustAdd(name, namedTensor.Tensor, requiresGrad) // Clean-up named tensors. 
namedTensor.Tensor.MustDrop() @@ -62,7 +62,7 @@ func TrainableCModuleLoadData(p *Path, stream io.Reader) (*TrainableCModule, err // NOTE: return is a newly created and added tensor in varstore. // This tensor is different from input named tensor. // If not using, just ignore it. Drop it, will drop tensor at varstore. - _ = p.Add(name, namedTensor.Tensor, requiresGrad) + _ = p.MustAdd(name, namedTensor.Tensor, requiresGrad) // Clean-up named tensors. namedTensor.Tensor.MustDrop() diff --git a/nn/layer-norm.go b/nn/layer-norm.go index 538cd33..1af393a 100644 --- a/nn/layer-norm.go +++ b/nn/layer-norm.go @@ -39,8 +39,8 @@ func NewLayerNorm(vs *Path, normalizedShape []int64, config *LayerNormConfig) *L bs *ts.Tensor ) if config.ElementwiseAffine { - ws = vs.NewVar("weight", normalizedShape, config.WsInit) - bs = vs.NewVar("bias", normalizedShape, config.BsInit) + ws = vs.MustNewVar("weight", normalizedShape, config.WsInit) + bs = vs.MustNewVar("bias", normalizedShape, config.BsInit) } return &LayerNorm{config, ws, bs, normalizedShape} diff --git a/nn/linear.go b/nn/linear.go index dfce112..7d03ada 100644 --- a/nn/linear.go +++ b/nn/linear.go @@ -49,14 +49,14 @@ func NewLinear(vs *Path, inDim, outDim int64, c *LinearConfig) *Linear { case c.BsInit == nil: bound := 1.0 / math.Sqrt(float64(inDim)) bsInit := NewUniformInit(-bound, bound) - bs = vs.NewVar("bias", []int64{outDim}, bsInit) + bs = vs.MustNewVar("bias", []int64{outDim}, bsInit) case c.BsInit != nil: - bs = vs.NewVar("bias", []int64{outDim}, c.BsInit) + bs = vs.MustNewVar("bias", []int64{outDim}, c.BsInit) } } return &Linear{ - Ws: vs.NewVar("weight", []int64{outDim, inDim}, c.WsInit).MustT(false), + Ws: vs.MustNewVar("weight", []int64{outDim, inDim}, c.WsInit).MustT(false), Bs: bs, } } diff --git a/nn/optimizer.go b/nn/optimizer.go index c1211d9..a9a8c86 100644 --- a/nn/optimizer.go +++ b/nn/optimizer.go @@ -5,14 +5,18 @@ package nn import ( "fmt" "log" + "math" + "github.com/sugarme/gotch" ts 
"github.com/sugarme/gotch/tensor" ) // Optimizer is a struct object to run gradient descent. type Optimizer struct { - opt *ts.COptimizer - variablesInOptimizer uint8 + varstore *VarStore + opt *ts.COptimizer + // variablesInOptimizer uint8 + variablesInOptimizer map[string]struct{} config interface{} stepCount int } @@ -34,25 +38,27 @@ type OptimizerConfig interface { } // defaultBuild is `default` Build method for OptimizerConfig interface -func defaultBuild(config OptimizerConfig, vs *VarStore, lr float64) (retVal *Optimizer, err error) { +func defaultBuild(config OptimizerConfig, vs *VarStore, lr float64) (*Optimizer, error) { opt, err := config.buildCOpt(lr) if err != nil { - return retVal, err + return nil, err } - if len(vs.Vars.TrainableVariables) > 0 { - for _, v := range vs.Vars.TrainableVariables { + names := make(map[string]struct{}) + for name, v := range vs.vars { + if v.Trainable { if err = opt.AddParameter(v.Tensor, v.Group); err != nil { err = fmt.Errorf("Optimizer defaultBuild - AddParameter failed: %w\n", err) return nil, err } } + names[name] = struct{}{} } return &Optimizer{ - opt: opt, - // variables: vs.Vars, - variablesInOptimizer: uint8(len(vs.Vars.TrainableVariables)), + varstore: vs, + opt: opt, + variablesInOptimizer: names, config: config, stepCount: 0, }, nil @@ -215,51 +221,79 @@ func (c *RMSPropConfig) Build(vs *VarStore, lr float64) (*Optimizer, error) { // Optimizer methods: // ================== + func (opt *Optimizer) addMissingVariables() { - - // missingVariables := len(opt.variables.TrainableVariables) - int(opt.variablesInOptimizer) - // - // if missingVariables > 0 { - // var tensors []ts.Tensor - // for _, t := range opt.variables.TrainableVariables[opt.variablesInOptimizer:] { - // tensor := t.MustShallowClone() - // tensor.Detach_() - // tensors = append(tensors, tensor) - // } - // - // opt.opt.AddParameters(tensors) - // opt.variablesInOptimizer = uint8(len(opt.variables.TrainableVariables)) - // } - + type param struct 
{ + tensor *ts.Tensor + group uint + } + trainables := make(map[string]param) + for name, v := range opt.varstore.vars { + if v.Trainable { + trainables[name] = param{tensor: v.Tensor, group: v.Group} + } + } + missingVariables := len(trainables) - len(opt.variablesInOptimizer) + if missingVariables > 0 { + log.Println("INFO: Optimizer.addMissingVariables()...") + for name, x := range trainables { + if _, ok := opt.variablesInOptimizer[name]; !ok { + opt.opt.AddParameter(x.tensor, x.group) + opt.variablesInOptimizer[name] = struct{}{} + } + } + } } // ZeroGrad zeroes the gradient for the tensors tracked by this optimizer. -func (opt *Optimizer) ZeroGrad() { - opt.addMissingVariables() +func (opt *Optimizer) ZeroGrad() error { if err := opt.opt.ZeroGrad(); err != nil { - log.Fatalf("Optimizer - ZeroGrad method call error: %v\n", err) + err = fmt.Errorf("Optimizer.ZeroGrad() failed: %w\n", err) + return err + } + return nil +} + +// MustZeroGrad zeroes the gradient for the tensors tracked by this optimizer. +func (opt *Optimizer) MustZeroGrad() { + err := opt.ZeroGrad() + if err != nil { + log.Fatal(err) } } // Clips gradient value at some specified maximum value. func (opt *Optimizer) ClipGradValue(max float64) { + opt.varstore.Lock() + defer opt.varstore.Unlock() - // opt.variables.mutex.Lock() - // defer opt.variables.mutex.Unlock() - - // for _, tensor := range opt.variables.TrainableVariables { - // tensor.MustGrad().Clamp_(ts.FloatScalar(-max), ts.FloatScalar(max)) - // } + for _, v := range opt.varstore.vars { + if v.Trainable { + // v.Tensor.MustGrad().Clamp_(ts.FloatScalar(-max), ts.FloatScalar(max)) + gradTs := v.Tensor.MustGrad(false) + gradTs.Clamp_(ts.FloatScalar(-max), ts.FloatScalar(max)) + } + } } // Step performs an optimization step, updating the tracked tensors based on their gradients. 
-func (opt *Optimizer) Step() { - opt.addMissingVariables() +func (opt *Optimizer) Step() error { err := opt.opt.Step() if err != nil { - log.Fatalf("Optimizer - Step method call error: %v\n", err) + err = fmt.Errorf("Optimizer.Step() failed: %w\n", err) + return err } opt.stepCount += 1 + + return nil +} + +// MustStep performs an optimization step, updating the tracked tensors based on their gradients. +func (opt *Optimizer) MustStep() { + err := opt.Step() + if err != nil { + log.Fatal(err) + } } // ResetStepCount set step count to zero. @@ -273,51 +307,208 @@ func (opt *Optimizer) StepCount() int { } // BackwardStep applies a backward step pass, update the gradients, and performs an optimization step. -func (opt *Optimizer) BackwardStep(loss *ts.Tensor) { - opt.addMissingVariables() +func (opt *Optimizer) BackwardStep(loss *ts.Tensor) error { err := opt.opt.ZeroGrad() if err != nil { - log.Fatalf("Optimizer - BackwardStep method call - ZeroGrad error: %v\n", err) + err = fmt.Errorf("Optimizer.BackwardStep() failed: %w\n", err) + return err } + loss.MustBackward() err = opt.opt.Step() if err != nil { - log.Fatalf("Optimizer - BackwardStep method call - Step() error: %v\n", err) + err = fmt.Errorf("Optimizer.BackwardStep() failed: %w\n", err) + return err + } + + return nil +} + +// MustBackwardStep applies a backward step pass, update the gradients, and performs an optimization step. +func (opt *Optimizer) MustBackwardStep(loss *ts.Tensor) { + err := opt.BackwardStep(loss) + if err != nil { + log.Fatal(err) } } // BackwardStepClip applies a backward step pass, update the gradients, and performs an optimization step. // // The gradients are clipped based on `max` before being applied. 
-func (opt *Optimizer) BackwardStepClip(loss *ts.Tensor, max float64) { - opt.addMissingVariables() +func (opt *Optimizer) BackwardStepClip(loss *ts.Tensor, max float64) error { err := opt.opt.ZeroGrad() if err != nil { - log.Fatalf("Optimizer - BackwardStepClip method call - ZeroGrad error: %v\n", err) + err = fmt.Errorf("Optimizer.BackwardStepClip() failed: %w\n", err) + return err } loss.MustBackward() opt.ClipGradValue(max) err = opt.opt.Step() if err != nil { - log.Fatalf("Optimizer - BackwardStepClip method call - Step() error: %v\n", err) + err = fmt.Errorf("Optimizer.BackwardStepClip() failed: %w\n", err) + return err + } + return nil +} + +// MustBackwardStepClip applies a backward step pass, update the gradients, and performs an optimization step. +// +// The gradients are clipped based on `max` before being applied. +func (opt *Optimizer) MustBackwardStepClip(loss *ts.Tensor, max float64) { + err := opt.BackwardStepClip(loss, max) + if err != nil { + log.Fatal(err) } } -/// TODO. Clips gradient L2 norm over all trainable parameters. +type ClipOpts struct { + NormType float64 + ErrorIfNonFinite bool +} + +type ClipOpt func(*ClipOpts) + +func defaultClipOpts() *ClipOpts { + return &ClipOpts{ + NormType: 2.0, + ErrorIfNonFinite: false, // will switch to "true" in the future. + } +} + +func WithNormType(v float64) ClipOpt { + return func(o *ClipOpts) { + o.NormType = v + } +} + +func WithErrorIfNonFinite(v bool) ClipOpt { + return func(o *ClipOpts) { + o.ErrorIfNonFinite = v + } +} + +/// Clips gradient L2 norm over all trainable parameters. // // The norm is computed over all gradients together, as if they were // concatenated into a single vector. -func (opt *Optimizer) ClipGradNorm(max float64) { - // TODO. - log.Fatalf("Not implemented yet!") +// +/// Args: +// - max: max norm of the gradient +// - o.NormType. Type of the used p-norm, can be "inf" for infinity norm. Default= 2.0 +// - o.ErrorIfNonFinite bool. 
If true, return an error if the total norm of the gradients from parameters is "nan", "inf" or "-inf". Default=false +// Returns: an error if the total norm is non-finite and o.ErrorIfNonFinite is true, otherwise nil +// ref. https://github.com/pytorch/pytorch/blob/cb4aeff7d8e4c70bb638cf159878c5204d0cc2da/torch/nn/utils/clip_grad.py#L59 +func (opt *Optimizer) ClipGradNorm(max float64, opts ...ClipOpt) error { + o := defaultClipOpts() + for _, option := range opts { + option(o) + } + + opt.varstore.Lock() + defer opt.varstore.Unlock() + parameters := opt.varstore.TrainableVariables() + if len(parameters) == 0 { + // return ts.MustOfSlice([]float64{0.0}), nil + return nil + } + + var ( + norms []ts.Tensor + totalNorm *ts.Tensor + ) + + device := opt.varstore.device + if o.NormType == math.Inf(1) { + for _, v := range opt.varstore.vars { + n := v.Tensor.MustGrad(false).MustDetach(true).MustAbs(true).MustMax(true).MustTo(device, true) + norms = append(norms, *n) + } + // total_norm = norms[0] if len(norms) == 1 else torch.max(torch.stack(norms)) + totalNorm = ts.MustStack(norms, 0).MustMax(true) + } else { + for _, v := range opt.varstore.vars { + // x := v.Tensor.MustGrad(false).MustNorm(true) + + // NOTE. tensor.Norm() is going to be deprecated. So use linalg_norm + // Ref. 
https://pytorch.org/docs/stable/generated/torch.linalg.norm.html#torch.linalg.norm + x := v.Tensor.MustGrad(false).MustDetach(true).MustLinalgNorm(ts.FloatScalar(o.NormType), nil, false, gotch.Float, true) + norms = append(norms, *x) + } + } + + // totalNorm = ts.MustStack(norms, 0).MustNorm(true).MustAddScalar(ts.FloatScalar(1e-6), true) + // total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) + totalNorm = ts.MustStack(norms, 0).MustLinalgNorm(ts.FloatScalar(o.NormType), nil, false, gotch.Float, true) + for _, x := range norms { + x.MustDrop() + } + + totalNormVal := totalNorm.Float64Values(true)[0] + // if error_if_nonfinite and torch.logical_or(total_norm.isnan(), total_norm.isinf()): + if o.ErrorIfNonFinite && (math.IsNaN(totalNormVal) || math.IsInf(totalNormVal, 1)) { + err := fmt.Errorf("The total norm of order (%v) for gradients from 'parameters' is non-finite, so it cannot be clipped. To disable this error and scale the gradients by the non-finite norm anyway, set option.ErrorIfNonFinite= false", o.NormType) + return err + } + + // clip_coef = max_norm / (total_norm + 1e-6) + // clipCoefTs := ts.TensorFrom([]float64{max}).MustDiv(totalNorm, true) + clipCoef := max / (totalNormVal + 1e-6) + // NOTE: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so + // avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization + // when the gradients do not reside in CPU memory. + // clip_coef_clamped = torch.clamp(clip_coef, max=1.0) + if clipCoef > 1.0 { + clipCoef = 1.0 + } + for _, v := range opt.varstore.vars { + if v.Trainable { + // p.grad.detach().mul_(clip_coef_clamped.to(p.grad.device)) + // v.Tensor.MustGrad(false).MustDetach(true).MustMulScalar_(ts.FloatScalar(clipCoef)) + v.Tensor.MustGrad(false).MustMulScalar_(ts.FloatScalar(clipCoef)) + } + } + + return nil } -// TODO. 
Applies a backward step pass, update the gradients, and performs an optimization step. +// BackwardStepClipNorm applies a backward step pass, update the gradients, and performs an optimization step. // // The gradients L2 norm is clipped based on `max`. -func (opt *Optimizer) BackwardStepClipNorm(loss *ts.Tensor, max float64) { - // TODO. - log.Fatalf("Not implemented yet!") +func (opt *Optimizer) BackwardStepClipNorm(loss *ts.Tensor, max float64, opts ...ClipOpt) error { + err := opt.opt.ZeroGrad() + if err != nil { + err := fmt.Errorf("Optimizer.BackwardStepClipNorm() failed: %w\n", err) + return err + } + err = loss.Backward() + if err != nil { + err := fmt.Errorf("Optimizer.BackwardStepClipNorm() failed: %w\n", err) + return err + } + + err = opt.ClipGradNorm(max, opts...) + if err != nil { + err := fmt.Errorf("Optimizer.BackwardStepClipNorm() failed: %w\n", err) + return err + } + + err = opt.Step() + if err != nil { + err := fmt.Errorf("Optimizer.BackwardStepClipNorm() failed: %w\n", err) + return err + } + + return nil +} + +// MustBackwardStepClipNorm applies a backward step pass, update the gradients, and performs an optimization step. +// +// The gradients L2 norm is clipped based on `max`. +func (opt *Optimizer) MustBackwardStepClipNorm(loss *ts.Tensor, max float64, opts ...ClipOpt) { + err := opt.BackwardStepClipNorm(loss, max, opts...) + if err != nil { + log.Fatal(err) + } } // SetLR sets the optimizer learning rate. 
diff --git a/nn/optimizer_test.go b/nn/optimizer_test.go index 275489f..e70b5d2 100644 --- a/nn/optimizer_test.go +++ b/nn/optimizer_test.go @@ -1,69 +1,73 @@ package nn_test -/* - * import ( - * // "reflect" - * "fmt" - * "log" - * "testing" - * - * "github.com/sugarme/gotch" - * "github.com/sugarme/gotch/nn" - * ts "github.com/sugarme/gotch/tensor" - * ) - * - * func TestOptimizer(t *testing.T) { - * - * var data []float32 - * for i := 0; i < 15; i++ { - * data = append(data, float32(i)) - * } - * xs, err := ts.NewTensorFromData(data, []int64{int64(len(data)), 1}) - * if err != nil { - * log.Fatal(err) - * } - * - * ys := xs.MustMul1(ts.FloatScalar(0.42), false).MustAdd1(ts.FloatScalar(1.337), false) - * - * vs := nn.NewVarStore(gotch.CPU) - * - * optCfg := nn.DefaultSGDConfig() - * opt, err := optCfg.Build(vs, 1e-2) - * if err != nil { - * t.Errorf("Failed building SGD optimizer") - * } - * - * cfg := nn.LinearConfig{ - * WsInit: nn.NewConstInit(0.0), - * BsInit: nn.NewConstInit(0.0), - * Bias: true, - * } - * - * linear := nn.NewLinear(vs.Root(), 1, 1, cfg) - * - * logits := xs.Apply(linear) - * loss := logits.MustMseLoss(ys, ts.ReductionMean.ToInt(), true) - * - * initialLoss := loss.MustView([]int64{-1}, false).MustFloat64Value([]int64{0}) - * - * wantLoss := float64(1.0) - * - * if initialLoss < wantLoss { - * t.Errorf("Expect initial loss > %v, got %v", wantLoss, initialLoss) - * } - * - * for i := 0; i < 50; i++ { - * loss = xs.Apply(linear).MustMseLoss(ys, ts.ReductionMean.ToInt(), true) - * - * opt.BackwardStep(loss) - * fmt.Printf("Loss: %.3f\n", loss.MustView([]int64{-1}, false).MustFloat64Value([]int64{0})) - * } - * - * loss = xs.Apply(linear).MustMseLoss(ys, ts.ReductionMean.ToInt(), true) - * finalLoss := loss.Values()[0] - * fmt.Printf("Final loss: %v\n", finalLoss) - * - * if finalLoss > 0.25 { - * t.Errorf("Expect initial loss < 0.25, got %v", finalLoss) - * } - * } */ +import ( + "fmt" + "testing" + + "github.com/sugarme/gotch" + 
"github.com/sugarme/gotch/nn" + ts "github.com/sugarme/gotch/tensor" +) + +func TestOptimizer(t *testing.T) { + x := ts.MustArangeStart(ts.IntScalar(1), ts.IntScalar(15), gotch.Float, gotch.CPU).MustView([]int64{-1, 1}, true) + // y = x * 0.42 + 1.337 + y := x.MustMulScalar(ts.FloatScalar(0.42), false).MustAddScalar(ts.FloatScalar(1.337), false) + + vs := nn.NewVarStore(gotch.CPU) + path := vs.Root() + + cfg := &nn.LinearConfig{ + WsInit: nn.NewConstInit(0.0), + BsInit: nn.NewConstInit(0.0), + Bias: true, + } + model := nn.NewLinear(path, 1, 1, cfg) + + lr := 1e-2 + opt, err := nn.DefaultSGDConfig().Build(vs, lr) + if err != nil { + t.Errorf("Failed building SGD optimizer") + } + + initialLoss := x.ApplyT(model, true).MustMseLoss(y, 1, true).Float64Values(true)[0] + wantLoss := float64(1.0) + if initialLoss < wantLoss { + t.Errorf("Expect initial loss > %v, got %v", wantLoss, initialLoss) + } + + // Optimization loop + for i := 0; i < 50; i++ { + logits := model.ForwardT(x, true) + loss := logits.MustMseLoss(y, 1, true) + if i%10 == 0 { + fmt.Printf("Loss: %.3f\n", loss.MustView([]int64{-1}, false).MustFloat64Value([]int64{0})) + } + opt.BackwardStep(loss) + } + + loss := x.Apply(model).MustMseLoss(y, 1, true) + opt.BackwardStep(loss) + + loss = x.Apply(model).MustMseLoss(y, 1, true) + finalLoss := loss.Float64Values()[0] + fmt.Printf("Final loss: %v\n", finalLoss) + + if finalLoss > 0.25 { + t.Errorf("Expect initial loss < 0.25, got %v", finalLoss) + } +} + +// see https://github.com/pytorch/pytorch/blob/9b203f667ac096db9f5f5679ac3e3d7931c34d36/test/test_nn.py#L2308 +func TestClipGradNorm(t *testing.T) { + // TODO. 
+ // vs := nn.NewVarStore(gotch.CPU) + // path := vs.Root() + // l := nn.NewLinear(path, 10, 10, nn.DefaultLinearConfig()) + // maxNorm := 2.0 +} + +// see https://github.com/pytorch/pytorch/blob/9b203f667ac096db9f5f5679ac3e3d7931c34d36/test/test_nn.py#L2364 +func TestClipGradValue(t *testing.T) { + // TODO +} diff --git a/nn/other.go b/nn/other.go index dccbedb..e6686d4 100644 --- a/nn/other.go +++ b/nn/other.go @@ -24,6 +24,9 @@ func (d *Dropout) ForwardT(input *ts.Tensor, train bool) (retVal *ts.Tensor) { return ts.MustDropout(input, d.dropoutProb, train) } +// Parameter: +// ========== + // NewParameter creates a kind of tensor that is considered as a module parameter. // Ref. https://pytorch.org/docs/stable/generated/torch.nn.parameter.Parameter.html func NewParameter(path *Path, name string, x *ts.Tensor, requireGradOpt ...bool) *ts.Tensor { @@ -32,11 +35,34 @@ func NewParameter(path *Path, name string, x *ts.Tensor, requireGradOpt ...bool) requiredGrad = requireGradOpt[0] } - param := path.Add(name, x, requiredGrad) + param := path.MustAdd(name, x, requiredGrad) return param } +// Buffer: +// ======= + +// NewBuffer creates new buffer. +// +// Buffer is different from Parameter as its requiredGrad always false. +// - `o.Persistent` param. Default=true. If `true` buffer variable will be saved when `nn.VarStore.Save()` is called. +// +// Ref. +// - https://github.com/pytorch/pytorch/blob/f71eede85a69caed637008e331f5ac5f5b7717ae/torch/nn/modules/module.py#L275 +// - https://discuss.pytorch.org/t/what-is-the-difference-between-register-buffer-and-register-parameter-of-nn-module/32723/2 +func NewBuffer(path *Path, name string, x *ts.Tensor, persistentOpt ...bool) *ts.Tensor { + persistent := true + if len(persistentOpt) > 0 { + persistent = persistentOpt[0] + } + opts := []AddOpt{ + WithPersistent(persistent), + WithVarType("buffer"), + } + return path.MustAdd(name, x, false, opts...) // requiredGrad always false. Different from parameter. 
+} + // Identity: // ========= diff --git a/nn/rnn.go b/nn/rnn.go index 61591ed..74a8673 100644 --- a/nn/rnn.go +++ b/nn/rnn.go @@ -97,26 +97,26 @@ func NewLSTM(vs *Path, inDim, hiddenDim int64, cfg *RNNConfig) *LSTM { } switch numDirections { case 1: - wIh := vs.KaimingUniform(fmt.Sprintf("weight_ih_l%d", i), []int64{gateDim, inDim}) - wHh := vs.KaimingUniform(fmt.Sprintf("weight_hh_l%d", i), []int64{gateDim, hiddenDim}) - bIh := vs.Zeros(fmt.Sprintf("bias_ih_l%d", i), []int64{gateDim}) - bHh := vs.Zeros(fmt.Sprintf("bias_hh_l%d", i), []int64{gateDim}) + wIh := vs.MustKaimingUniform(fmt.Sprintf("weight_ih_l%d", i), []int64{gateDim, inDim}) + wHh := vs.MustKaimingUniform(fmt.Sprintf("weight_hh_l%d", i), []int64{gateDim, hiddenDim}) + bIh := vs.MustZeros(fmt.Sprintf("bias_ih_l%d", i), []int64{gateDim}) + bHh := vs.MustZeros(fmt.Sprintf("bias_hh_l%d", i), []int64{gateDim}) flatWeights = append(flatWeights, *wIh, *wHh, *bIh, *bHh) case 2: // bi-directional // forward - wIh := vs.KaimingUniform(fmt.Sprintf("weight_ih_l%d", i), []int64{gateDim, inDim}) - wHh := vs.KaimingUniform(fmt.Sprintf("weight_hh_l%d", i), []int64{gateDim, hiddenDim}) - bIh := vs.Zeros(fmt.Sprintf("bias_ih_l%d", i), []int64{gateDim}) - bHh := vs.Zeros(fmt.Sprintf("bias_hh_l%d", i), []int64{gateDim}) + wIh := vs.MustKaimingUniform(fmt.Sprintf("weight_ih_l%d", i), []int64{gateDim, inDim}) + wHh := vs.MustKaimingUniform(fmt.Sprintf("weight_hh_l%d", i), []int64{gateDim, hiddenDim}) + bIh := vs.MustZeros(fmt.Sprintf("bias_ih_l%d", i), []int64{gateDim}) + bHh := vs.MustZeros(fmt.Sprintf("bias_hh_l%d", i), []int64{gateDim}) flatWeights = append(flatWeights, *wIh, *wHh, *bIh, *bHh) // reverse - wIhR := vs.KaimingUniform(fmt.Sprintf("weight_ih_l%d_reverse", i), []int64{gateDim, inDim}) - wHhR := vs.KaimingUniform(fmt.Sprintf("weight_hh_l%d_reverse", i), []int64{gateDim, hiddenDim}) - bIhR := vs.Zeros(fmt.Sprintf("bias_ih_l%d_reverse", i), []int64{gateDim}) - bHhR := 
vs.Zeros(fmt.Sprintf("bias_hh_l%d_reverse", i), []int64{gateDim}) + wIhR := vs.MustKaimingUniform(fmt.Sprintf("weight_ih_l%d_reverse", i), []int64{gateDim, inDim}) + wHhR := vs.MustKaimingUniform(fmt.Sprintf("weight_hh_l%d_reverse", i), []int64{gateDim, hiddenDim}) + bIhR := vs.MustZeros(fmt.Sprintf("bias_ih_l%d_reverse", i), []int64{gateDim}) + bHhR := vs.MustZeros(fmt.Sprintf("bias_hh_l%d_reverse", i), []int64{gateDim}) flatWeights = append(flatWeights, *wIhR, *wHhR, *bIhR, *bHhR) } } @@ -234,10 +234,10 @@ func NewGRU(vs *Path, inDim, hiddenDim int64, cfg *RNNConfig) (retVal *GRU) { inputDim = hiddenDim * numDirections } - wIh := vs.KaimingUniform("w_ih", []int64{gateDim, inputDim}) - wHh := vs.KaimingUniform("w_hh", []int64{gateDim, hiddenDim}) - bIh := vs.Zeros("b_ih", []int64{gateDim}) - bHh := vs.Zeros("b_hh", []int64{gateDim}) + wIh := vs.MustKaimingUniform("w_ih", []int64{gateDim, inputDim}) + wHh := vs.MustKaimingUniform("w_hh", []int64{gateDim, hiddenDim}) + bIh := vs.MustZeros("b_ih", []int64{gateDim}) + bHh := vs.MustZeros("b_hh", []int64{gateDim}) flatWeights = append(flatWeights, *wIh, *wHh, *bIh, *bHh) } diff --git a/nn/sequential.go b/nn/sequential.go index 38d949c..6ac46ce 100644 --- a/nn/sequential.go +++ b/nn/sequential.go @@ -251,6 +251,7 @@ func BatchAccuracyForLogits(vs *VarStore, m ts.ModuleT, xs, ys *ts.Tensor, d got logits := m.ForwardT(bImages, false) acc := logits.AccuracyForLogits(bLabels) + logits.MustDrop() sumAccuracy += acc.Float64Values()[0] * size sampleCount += size diff --git a/nn/sparse.go b/nn/sparse.go index f545b4e..b6b20a5 100644 --- a/nn/sparse.go +++ b/nn/sparse.go @@ -35,7 +35,7 @@ type Embedding struct { // NewEmbedding creates a new Embedding func NewEmbedding(vs *Path, numEmbeddings int64, embeddingDim int64, config *EmbeddingConfig) *Embedding { return &Embedding{ - Ws: vs.NewVar("weight", []int64{numEmbeddings, embeddingDim}, config.WsInit), + Ws: vs.MustNewVar("weight", []int64{numEmbeddings, embeddingDim}, 
config.WsInit), config: config, } } diff --git a/nn/varstore.go b/nn/varstore.go index 3f127ce..d6212ae 100644 --- a/nn/varstore.go +++ b/nn/varstore.go @@ -187,6 +187,46 @@ func (vs *VarStore) Load(filepath string) error { v.Tensor.Copy_(currTs) }) } + + for _, x := range namedTensors { + x.Tensor.MustDrop() + } + + return nil +} + +// LoadWeights loads pretrained weights to VarStore. +func (vs *VarStore) LoadWeights(namedTensors []ts.NamedTensor) error { + var namedTensorsMap map[string]*ts.Tensor = make(map[string]*ts.Tensor, 0) + for _, namedTensor := range namedTensors { + namedTensorsMap[namedTensor.Name] = namedTensor.Tensor + } + + // Match and in-place copy value (update) from newly loaded tensors + // to existing named tensors if name is matched. Throw error otherwise. + vs.Lock() + defer vs.Unlock() + + for name, v := range vs.vars { + // missing variable + currTs, ok := namedTensorsMap[name] + if !ok { + err := fmt.Errorf("VarStore.LoadWeights() failed: there's a tensor with name %q in VarStore, but not found in the loaded weights.\n", name) + return err + } + + // mismatched shape + sourceShape := currTs.MustSize() + destShape := v.Tensor.MustSize() + if !reflect.DeepEqual(destShape, sourceShape) { + err := fmt.Errorf("VarStore.LoadWeights() failed. Mismatched shape error for variable name: %v - At store: %v - At source %v\n", name, destShape, sourceShape) + return err + } + + ts.NoGrad(func() { + v.Tensor.Copy_(currTs) + }) + } return nil } @@ -242,6 +282,60 @@ func (vs *VarStore) LoadPartial(filepath string) ([]string, error) { }) } + for _, x := range namedTensors { + x.Tensor.MustDrop() + } + + return missingVariables, nil +} + +// LoadWeightsPartial loads the VarStore variable values from a file if it exists. +// +// Weight values for the tensors currently stored in the var-store and the given file get +// loaded from the given file. If a variable in the var store is not present in the given file, +// it is skipped and its values are not updated. 
This method should be used if pre-trained +// weight for only parts of the model are available. +// Note that the set of variables stored in the var-store is not changed, only the values +// for these tensors are modified. +// +// Returns a slice containing the names of the missing (or shape-mismatched) variables. +func (vs *VarStore) LoadWeightsPartial(namedTensors []ts.NamedTensor) ([]string, error) { + var namedTensorsMap map[string]*ts.Tensor = make(map[string]*ts.Tensor, 0) + for _, namedTensor := range namedTensors { + namedTensorsMap[namedTensor.Name] = namedTensor.Tensor + } + + var missingVariables []string + + // Match and in-place copy value (update) from newly loaded tensors + // to existing named tensors if name is matched. Otherwise record the name as missing and skip it. + vs.Lock() + defer vs.Unlock() + + for name, v := range vs.vars { + var currTs *ts.Tensor + var ok bool + + // missing variable + if currTs, ok = namedTensorsMap[name]; !ok { + missingVariables = append(missingVariables, name) + continue + } + + // mismatched shape + sourceShape := currTs.MustSize() + destShape := v.Tensor.MustSize() + if !reflect.DeepEqual(destShape, sourceShape) { + fmt.Printf("WARNING: Mismatched shape error for variable name: %v - At store: %v - At source %v. Skip loading this weight...\n", name, destShape, sourceShape) + missingVariables = append(missingVariables, name) + continue + } + + ts.NoGrad(func() { + v.Tensor.Copy_(currTs) + }) + } + return missingVariables, nil } @@ -284,7 +378,7 @@ func (vs *VarStore) Unfreeze() error { // // All the variables in this var store have to exist with the same // name in the source var store, otherwise an error is returned. -func (vs *VarStore) Copy(src VarStore) error { +func (vs *VarStore) Copy(src *VarStore) error { vs.Lock() defer vs.Unlock() src.Lock() @@ -343,6 +437,34 @@ func (vs *VarStore) Summary() { fmt.Printf("Num of layers: %v\n", len(vars)) } +// ToDType casts all variables in VarStore to specified DType. +// +// NOTE. 
only float-like types (Half, Float, Double) can ensure convertible. +func (vs *VarStore) ToDType(dtype gotch.DType) { + vs.Root().ToDType(dtype) +} + +// ToHalf casts all float-like variables in VarStore to `Half` dtype. +// +// NOTE. float-like includes `Half`, `Float` and `Double` dtype. +func (vs *VarStore) ToHalf() { + vs.Root().ToHalf() +} + +// ToFloat casts all float-like variables in VarStore to `Float` dtype. +// +// NOTE. float-like includes `Half`, `Float` and `Double` dtype. +func (vs *VarStore) ToFloat() { + vs.Root().ToFloat() +} + +// ToDouble casts all float-like variables in VarStore to `Double` dtype. +// +// NOTE. float-like includes `Half`, `Float` and `Double` dtype. +func (vs *VarStore) ToDouble() { + vs.Root().ToDouble() +} + // Path methods: // ============= @@ -467,6 +589,23 @@ func (p *Path) Add(name string, x *ts.Tensor, trainable bool, opts ...AddOpt) (* return p.add(name, x, trainable, o.VarType, o.Persistent) } +// MustAdd adds a tensor to a given path. +// +// Args +// - name: intention name of variable in VarStore (if duplicated, it will be added a suffix number) +// - x: tensor holding values to keep in VarStore +// - trainable: marked whether tensor is trainable. +// - o.VarType: variable type, i.e., either "parameter" or "buffer" +// - o.Persistent: whether to save this variables when `VarStore.Save()` is called. Only applied to `buffer` type. +// Returns a reference to a tensor stored in VarStore. +func (p *Path) MustAdd(name string, x *ts.Tensor, trainable bool, opts ...AddOpt) *ts.Tensor { + x, err := p.Add(name, x, trainable, opts...) + if err != nil { + log.Fatal(err) + } + return x +} + func (p *Path) getOrAddWithLock(name string, tensor *ts.Tensor, trainable bool, opts ...AddOpt) (*ts.Tensor, error) { path := p.getpath(name) @@ -480,9 +619,73 @@ func (p *Path) getOrAddWithLock(name string, tensor *ts.Tensor, trainable bool, } func (p *Path) SetGroup(g uint) { + p.varstore.Lock() + defer p.varstore.Unlock() + + // TODO. 
+ set group for individual variables. + // TBD. variables of current path only or all sub paths as well? + // For now, just set group for all variable at the path + path := strings.Join(p.path, SEP) + for name, v := range p.varstore.vars { + vpaths := strings.Split(name, SEP) + vpath := strings.Join(vpaths[:len(vpaths)-1], SEP) + if vpath == path { + v.Group = g + p.varstore.vars[name] = v + } + } p.group = g } +// ToDType casts all variables in this path and its sub-paths to the specified dtype. +// +// NOTE. this method should be used for floating-point conversion, i.e., +// "gotch.Float", "gotch.Half", "gotch.Float16", "gotch.Double". +func (p *Path) ToDType(dtype gotch.DType) { + p.varstore.Lock() + defer p.varstore.Unlock() + path := strings.Join(p.path, SEP) + for name, v := range p.varstore.vars { + if strings.Contains(name, path) { + newVar := v + newVar.Tensor = v.Tensor.MustTotype(dtype, true) + p.varstore.vars[name] = newVar + } + } +} + +// toFloat casts all float-like (Half, Float, Double) variables in this current path and sub-paths to the specified dtype; other variables are left untouched. +func (p *Path) toFloat(dtype gotch.DType) { + p.varstore.Lock() + defer p.varstore.Unlock() + path := strings.Join(p.path, SEP) + for name, v := range p.varstore.vars { + if strings.Contains(name, path) { + curr := v.Tensor.DType() // current dtype; named so it does not shadow the target `dtype` + if curr == gotch.Half || curr == gotch.Float || curr == gotch.Double { + newVar := v + newVar.Tensor = v.Tensor.MustTotype(dtype, true) + p.varstore.vars[name] = newVar + } + } + } +} + +// ToHalf casts all variables in current path and subpaths to `Half` precision. +func (p *Path) ToHalf() { + p.toFloat(gotch.Half) +} + +// ToFloat casts all variables in current path and subpaths to `Float` precision. +func (p *Path) ToFloat() { + p.toFloat(gotch.Float) +} + +// ToDouble casts all variables in current path and subpaths to `Double` precision. +func (p *Path) ToDouble() { + p.toFloat(gotch.Double) +} + // ZerosNoTrain creates a new variable initialized with zeros. 
// // The new variable is named according to the name parameter and @@ -506,6 +709,20 @@ func (p *Path) ZerosNoTrain(name string, dims []int64, opts ...AddOpt) (*ts.Tens return out, nil } +// MustZerosNoTrain creates a new variable initialized with zeros. +// +// The new variable is named according to the name parameter and +// has the specified shape. The variable will not be trainable so +// gradients will not be tracked. +// The variable uses a float tensor initialized with zeros. +func (p *Path) MustZerosNoTrain(name string, dims []int64, opts ...AddOpt) *ts.Tensor { + x, err := p.ZerosNoTrain(name, dims, opts...) + if err != nil { + log.Fatal(err) + } + return x +} + // OnesNoTrain creates a new variable initialized with ones. // // The new variable is named according to the name parameter and @@ -529,6 +746,20 @@ func (p *Path) OnesNoTrain(name string, dims []int64, opts ...AddOpt) (*ts.Tenso return out, nil } +// MustOnesNoTrain creates a new variable initialized with ones. +// +// The new variable is named according to the name parameter and +// has the specified shape. The variable will not be trainable so +// gradients will not be tracked. +// The variable uses a float tensor initialized with ones. +func (p *Path) MustOnesNoTrain(name string, dims []int64, opts ...AddOpt) *ts.Tensor { + x, err := p.OnesNoTrain(name, dims, opts...) + if err != nil { + log.Fatal(err) + } + return x +} + // NewVar creates a new variable. 
// // The new variable is named according to the name parameter and diff --git a/pickle/serialization.go b/pickle/serialization.go index bd4677c..1c5457c 100644 --- a/pickle/serialization.go +++ b/pickle/serialization.go @@ -439,26 +439,19 @@ func LoadAll(vs *nn.VarStore, modelFile string) error { return err } - // for tsName, _ := range vs.Vars.NamedVariables { - for tsName := range vs.Vars.NamedVariables { - // missing variable - currTs, ok := weights[tsName] - if !ok { - err = fmt.Errorf("LoadAll() failed: Cannot find tensor with name: %v in variable store. \n", tsName) - return err + var namedTensors []ts.NamedTensor + for n, x := range weights { + namedTs := ts.NamedTensor{ + Name: n, + Tensor: x, } - // mismatched shape - sourceShape := currTs.MustSize() - destShape := vs.Vars.NamedVariables[tsName].MustSize() - if !reflect.DeepEqual(destShape, sourceShape) { - err = fmt.Errorf("LoadAll() failed: Mismatched shape error for variable name: %v - At store: %v - At source %v\n", tsName, destShape, sourceShape) - return err - } + namedTensors = append(namedTensors, namedTs) + } - ts.NoGrad(func() { - vs.Vars.NamedVariables[tsName].Copy_(currTs) - }) + err = vs.LoadWeights(namedTensors) + if err != nil { + return err } for _, x := range weights { @@ -477,32 +470,21 @@ func LoadPartial(vs *nn.VarStore, modelFile string) ([]string, error) { return nil, err } + var namedTensors []ts.NamedTensor + for n, x := range weights { + namedTs := ts.NamedTensor{ + Name: n, + Tensor: x, + } + + namedTensors = append(namedTensors, namedTs) + } + var missingVariables []string - // Match and in-place copy value (update) from newly loaded tensors - // to existing named tensors if name is matched. Throw error otherwise. 
- for tsName := range vs.Vars.NamedVariables { - var currTs *ts.Tensor - var ok bool - - // missing variable - if currTs, ok = weights[tsName]; !ok { - missingVariables = append(missingVariables, tsName) - continue - } - - // mismatched shape - destShape := currTs.MustSize() - sourceShape := vs.Vars.NamedVariables[tsName].MustSize() - if !reflect.DeepEqual(destShape, sourceShape) { - fmt.Printf("WARNING: Mismatched shape error for variable name: %v - At store: %v - At source %v. Skip loading this weight...\n", tsName, destShape, sourceShape) - missingVariables = append(missingVariables, tsName) - continue - } - - ts.NoGrad(func() { - vs.Vars.NamedVariables[tsName].Copy_(currTs) - }) + missingVariables, err = vs.LoadWeightsPartial(namedTensors) + if err != nil { + return nil, err } for _, x := range weights { diff --git a/tensor/tensor.go b/tensor/tensor.go index 37317dc..99ff077 100644 --- a/tensor/tensor.go +++ b/tensor/tensor.go @@ -1101,38 +1101,39 @@ func (ngg *NoGradGuard) Enable() { _ = MustGradSetEnabled(ngg.enabled) } -// Reduction type is an enum-like type -type Reduction int - const ( // Do not reduce - ReductionNone Reduction = iota + ReductionNone int64 = 0 // Mean of losses - ReductionMean + ReductionMean int64 = 1 // Sum of losses - ReductionSum + ReductionSum int64 = 2 // Escape hatch in case new options become available - ReductionOther + ReductionOther int64 = 3 ) -func (r Reduction) ToInt() int { - switch r { - case ReductionNone: - return 0 - case ReductionMean: - return 1 - case ReductionSum: - return 2 - case ReductionOther: - return 3 - } - - // NOTE. should it be panic here instead of returning -1? - return -1 -} +// func (r Reduction) ToInt() int { +// switch r { +// case ReductionNone: +// return 0 +// case ReductionMean: +// return 1 +// case ReductionSum: +// return 2 +// case ReductionOther: +// return 3 +// } +// +// // NOTE. should it be panic here instead of returning -1? 
+// return -1 +// } // Float64Values returns values of tensor in a slice of float64. -func (ts *Tensor) Float64Values() []float64 { +func (ts *Tensor) Float64Values(delOpt ...bool) []float64 { + del := false + if len(delOpt) > 0 { + del = delOpt[0] + } numel := ts.Numel() vec := make([]float64, numel) @@ -1141,11 +1142,19 @@ func (ts *Tensor) Float64Values() []float64 { float64Ts.MustCopyData(vec, numel) float64Ts.MustDrop() + if del { + ts.MustDrop() + } + return vec } // Int64Values returns values of tensor in a slice of int64. -func (ts *Tensor) Int64Values() []int64 { +func (ts *Tensor) Int64Values(delOpt ...bool) []int64 { + del := false + if len(delOpt) > 0 { + del = delOpt[0] + } numel := ts.Numel() vec := make([]int64, numel) @@ -1154,6 +1163,10 @@ func (ts *Tensor) Int64Values() []int64 { int64Ts.MustCopyData(vec, numel) int64Ts.MustDrop() + if del { + ts.MustDrop() + } + return vec }