Merge pull request #12 from sugarme/pointer

Convert to using `pointer receiver`
2020-11-01 13:40:35 +11:00 · 2020-11-01 13:40:35 +11:00 · 11f5bcff73
commit 11f5bcff73
parent 59ea5f0e1b c0960dd764
78 changed files with 31813 additions and 27374 deletions
--- a/.gitignore
+++ b/.gitignore
@ -13,7 +13,6 @@
 *.json
 *.pt
 *.ot
-*.jpg

 target/
 _build/
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -33,3 +33,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 [#10]: https://github.com/sugarme/gotch/issues/10

+## [0.2.0]
+
+### Changed
+- Converted all APIs to use **pointer receivers**
+
+### Added
+- Added drawing of image labels to the `example/yolo` example
+- Added some example images and README files for `example/yolo` and `example/neural-style-transfer`
--- a/README.md
+++ b/README.md
@ -5,6 +5,7 @@

 - **GoTch** is a C++ Libtorch Go binding for developing and implementing deep learning projects in Go.
 - This package is to create a thin wrapper of Libtorch to make use of its tensor APIs and CUDA support while implementing as much idiomatic Go as possible. 
+- There are about **1129** auto-generated tensor APIs.

 ## Dependencies

--- a/example/basic/main.go
+++ b/example/basic/main.go
@ -1,7 +1,7 @@
 package main

 import (
-	"fmt"
+	// "fmt"

 	"github.com/sugarme/gotch"
 	ts "github.com/sugarme/gotch/tensor"
@ -11,12 +11,10 @@ func main() {

 	// Create a tensor [2,3,4]
 	tensor := ts.MustArange(ts.IntScalar(2*3*4), gotch.Int64, gotch.CPU).MustView([]int64{2, 3, 4}, true)
-
 	tensor.Print()

-	fmt.Printf("tensor is nil: %v\n", tensor.IsNil())
+	mul := ts.MustOnes([]int64{4, 5}, gotch.Int64, gotch.CPU)
+	res := tensor.MustMatmul(mul, false)

-	tensor.MustDrop()
-
-	fmt.Printf("tensor is nil: %v\n", tensor.IsNil())
+	res.Print()
 }
--- a/example/char-rnn/main.go
+++ b/example/char-rnn/main.go
@ -18,7 +18,7 @@ const (
 	SamplingLen  int64   = 1024
 )

-func sample(data ts.TextData, lstm nn.LSTM, linear nn.Linear, device gotch.Device) (retVal string) {
+func sample(data *ts.TextData, lstm *nn.LSTM, linear *nn.Linear, device gotch.Device) string {

 	labels := data.Labels()
 	inState := lstm.ZeroState(1)
@ -34,15 +34,15 @@ func sample(data ts.TextData, lstm nn.LSTM, linear nn.Linear, device gotch.Devic
 		state := lstm.Step(input, inState)

 		// 1. Delete inState tensors (from C land memory)
-		inState.(nn.LSTMState).Tensor1.MustDrop()
-		inState.(nn.LSTMState).Tensor2.MustDrop()
+		inState.(*nn.LSTMState).Tensor1.MustDrop()
+		inState.(*nn.LSTMState).Tensor2.MustDrop()
 		// 2. Then update with current state
 		inState = state
 		// 3. Delete intermediate tensors
 		input.MustDrop()
 		inputView.MustDrop()

-		forwardTs := linear.Forward(state.(nn.LSTMState).H()).MustSqueeze1(0, true).MustSoftmax(-1, gotch.Float, true)
+		forwardTs := linear.Forward(state.(*nn.LSTMState).H()).MustSqueeze1(0, true).MustSoftmax(-1, gotch.Float, true)
 		sampledY := forwardTs.MustMultinomial(1, false, true)
 		lastLabel = sampledY.Int64Values()[0]
 		sampledY.MustDrop()
@ -52,8 +52,8 @@ func sample(data ts.TextData, lstm nn.LSTM, linear nn.Linear, device gotch.Devic
 	}

 	// Delete the last state
-	inState.(nn.LSTMState).Tensor1.MustDrop()
-	inState.(nn.LSTMState).Tensor2.MustDrop()
+	inState.(*nn.LSTMState).Tensor1.MustDrop()
+	inState.(*nn.LSTMState).Tensor2.MustDrop()

 	return string(runes)
 }
@ -104,8 +104,8 @@ func main() {
 			lstmOut, outState := lstm.Seq(xsOnehot)
 			// NOTE. Although outState will not be used. There a hidden memory usage
 			// on C land memory that is needed to free up. Don't use `_`
-			outState.(nn.LSTMState).Tensor1.MustDrop()
-			outState.(nn.LSTMState).Tensor2.MustDrop()
+			outState.(*nn.LSTMState).Tensor1.MustDrop()
+			outState.(*nn.LSTMState).Tensor2.MustDrop()

 			logits := linear.Forward(lstmOut)
 			lossView := logits.MustView([]int64{BatchSize * SeqLen, labels}, true)
--- a/example/cifar/main.go
+++ b/example/cifar/main.go
@ -18,7 +18,7 @@ import (
 	"github.com/sugarme/gotch/vision"
 )

-func convBn(p nn.Path, cIn, cOut int64) (retVal nn.SequentialT) {
+func convBn(p *nn.Path, cIn, cOut int64) *nn.SequentialT {
 	config := nn.DefaultConv2DConfig()
 	config.Padding = []int64{1, 1}
 	config.Bias = false
@ -27,19 +27,19 @@ func convBn(p nn.Path, cIn, cOut int64) (retVal nn.SequentialT) {

 	seq.Add(nn.NewConv2D(p, cIn, cOut, 3, config))
 	seq.Add(nn.BatchNorm2D(p, cOut, nn.DefaultBatchNormConfig()))
-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustRelu(false)
 	}))

 	return seq
 }

-func layer(p nn.Path, cIn, cOut int64) (retVal nn.FuncT) {
+func layer(p *nn.Path, cIn, cOut int64) nn.FuncT {
 	pre := convBn(p.Sub("pre"), cIn, cOut)
 	block1 := convBn(p.Sub("b1"), cOut, cOut)
 	block2 := convBn(p.Sub("b2"), cOut, cOut)

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		tmp1 := xs.ApplyT(pre, train)
 		preTs := tmp1.MaxPool2DDefault(2, true)
 		tmp2 := preTs.ApplyT(block1, train)
@ -53,17 +53,17 @@ func layer(p nn.Path, cIn, cOut int64) (retVal nn.FuncT) {
 	})
 }

-func fastResnet(p nn.Path) (retVal nn.SequentialT) {
+func fastResnet(p *nn.Path) *nn.SequentialT {
 	seq := nn.SeqT()

 	seq.Add(convBn(p.Sub("pre"), 3, 64))
 	seq.Add(layer(p.Sub("layer1"), 64, 128))
 	seq.Add(convBn(p.Sub("inter"), 128, 256))
-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MaxPool2DDefault(2, false)
 	}))
 	seq.Add(layer(p.Sub("layer2"), 256, 512))
-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		tmp := xs.MaxPool2DDefault(4, false)
 		res := tmp.FlatView()
 		tmp.MustDrop()
@ -72,7 +72,7 @@ func fastResnet(p nn.Path) (retVal nn.SequentialT) {
 	}))

 	seq.Add(nn.NewLinear(p.Sub("linear"), 512, 10, nn.DefaultLinearConfig()))
-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustMul1(ts.FloatScalar(0.125), false)
 	}))

@ -89,8 +89,8 @@ func main() {
 	fmt.Printf("TestLabel shape: %v\n", ds.TestLabels.MustSize())
 	fmt.Printf("Number of labels: %v\n", ds.Labels)

-	cuda := gotch.CudaBuilder(0)
-	device := cuda.CudaIfAvailable()
+	// device := gotch.CPU
+	device := gotch.NewCuda().CudaIfAvailable()

 	vs := nn.NewVarStore(device)

@ -104,7 +104,7 @@ func main() {
 	for epoch := 0; epoch < 150; epoch++ {
 		optConfig := nn.NewSGDConfig(0.9, 0.0, 5e-4, true)
 		var (
-			opt nn.Optimizer
+			opt *nn.Optimizer
 			err error
 		)
 		switch {
--- a/example/jit/emu.jpg
+++ b/example/jit/emu.jpg
--- a/example/jit/image.jpg
+++ b/example/jit/image.jpg
--- a/example/jit/kangaroo.jpg
+++ b/example/jit/kangaroo.jpg
--- a/example/jit/koala.jpg
+++ b/example/jit/koala.jpg
--- a/example/jit/pig.jpg
+++ b/example/jit/pig.jpg
--- a/example/jit/wombat.jpg
+++ b/example/jit/wombat.jpg
--- a/example/mnist/cnn.go
+++ b/example/mnist/cnn.go
@ -22,26 +22,26 @@ const (
 )

 type Net struct {
-	conv1 nn.Conv2D
-	conv2 nn.Conv2D
-	fc1   nn.Linear
-	fc2   nn.Linear
+	conv1 *nn.Conv2D
+	conv2 *nn.Conv2D
+	fc1   *nn.Linear
+	fc2   *nn.Linear
 }

-func newNet(vs nn.Path) Net {
+func newNet(vs *nn.Path) *Net {
 	conv1 := nn.NewConv2D(vs, 1, 32, 5, nn.DefaultConv2DConfig())
 	conv2 := nn.NewConv2D(vs, 32, 64, 5, nn.DefaultConv2DConfig())
 	fc1 := nn.NewLinear(vs, 1024, 1024, nn.DefaultLinearConfig())
 	fc2 := nn.NewLinear(vs, 1024, 10, nn.DefaultLinearConfig())

-	return Net{
+	return &Net{
 		conv1,
 		conv2,
 		fc1,
 		fc2}
 }

-func (n Net) ForwardT(xs ts.Tensor, train bool) (retVal ts.Tensor) {
+func (n *Net) ForwardT(xs *ts.Tensor, train bool) *ts.Tensor {
 	outView1 := xs.MustView([]int64{-1, 1, 28, 28}, false)
 	defer outView1.MustDrop()

@ -57,20 +57,19 @@ func (n Net) ForwardT(xs ts.Tensor, train bool) (retVal ts.Tensor) {
 	outView2 := outMP2.MustView([]int64{-1, 1024}, true)
 	defer outView2.MustDrop()

-	outFC1 := outView2.Apply(&n.fc1)
+	outFC1 := outView2.Apply(n.fc1)

 	outRelu := outFC1.MustRelu(true)
 	defer outRelu.MustDrop()
 	outDropout := ts.MustDropout(outRelu, 0.5, train)
 	defer outDropout.MustDrop()

-	return outDropout.Apply(&n.fc2)
-
+	return outDropout.Apply(n.fc2)
 }

 func runCNN1() {

-	var ds vision.Dataset
+	var ds *vision.Dataset
 	ds = vision.LoadMNISTDir(MnistDirNN)
 	testImages := ds.TestImages
 	testLabels := ds.TestLabels
@ -98,7 +97,7 @@ func runCNN1() {

 		batches := samples / batchSize
 		batchIndex := 0
-		var epocLoss ts.Tensor
+		var epocLoss *ts.Tensor
 		for i := 0; i < batches; i++ {
 			start := batchIndex * batchSize
 			size := batchSize
--- a/example/mnist/linear.go
+++ b/example/mnist/linear.go
@ -17,7 +17,7 @@ const (
 )

 func runLinear() {
-	var ds vision.Dataset
+	var ds *vision.Dataset
 	ds = vision.LoadMNISTDir(MnistDir)

 	device := gotch.CPU
--- a/example/mnist/nn.go
+++ b/example/mnist/nn.go
@ -23,21 +23,21 @@ const (

 var l nn.Linear

-func netInit(vs nn.Path) ts.Module {
+func netInit(vs *nn.Path) ts.Module {
 	n := nn.Seq()

 	n.Add(nn.NewLinear(vs, ImageDimNN, HiddenNodesNN, nn.DefaultLinearConfig()))

-	n.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	n.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustRelu(false)
 	}))

 	n.Add(nn.NewLinear(vs, HiddenNodesNN, LabelNN, nn.DefaultLinearConfig()))

-	return &n
+	return n
 }

-func train(trainX, trainY, testX, testY ts.Tensor, m ts.Module, opt nn.Optimizer, epoch int) {
+func train(trainX, trainY, testX, testY *ts.Tensor, m ts.Module, opt *nn.Optimizer, epoch int) {

 	logits := m.Forward(trainX)
 	loss := logits.CrossEntropyForLogits(trainY)
@ -56,7 +56,7 @@ func train(trainX, trainY, testX, testY ts.Tensor, m ts.Module, opt nn.Optimizer

 func runNN() {

-	var ds vision.Dataset
+	var ds *vision.Dataset
 	ds = vision.LoadMNISTDir(MnistDirNN)
 	vs := nn.NewVarStore(gotch.CPU)
 	net := netInit(vs.Root())
--- a/example/neural-style-transfer/README.md
+++ b/example/neural-style-transfer/README.md
@ -0,0 +1,25 @@
+# Neural-style Transfer
+
+This example demonstrates how to implement "Neural Style Transfer" with the VGG-16 model.
+
+The model weights can be [downloaded here](https://drive.google.com/file/d/1gO2vzafLoM_scJIaej7F392uEvKR3pV8/view?usp=sharing)
+
+## Content Image
+
+![Content Image](content.jpg)
+
+## Style Image
+
+![Style Image](style.jpg)
+
+## Output image (after 10 steps)
+
+![Output Image](out10.jpg)
+
+
+## Image Resources
+
+1. [Traditional Australian aboriginal art](https://www.pxfuel.com/en/free-photo-omwvj)
+2. [Sydney Harbour Bridge - Circular Quay](https://www.wallpaperflare.com/sydney-australia-circular-quay-cloudy-winter-cold-moody-wallpaper-ejzdw)
+
+
--- a/example/neural-style-transfer/content.jpg
+++ b/example/neural-style-transfer/content.jpg
--- a/example/neural-style-transfer/main.go
+++ b/example/neural-style-transfer/main.go
@ -31,7 +31,7 @@ var (
 	style   string
 )

-func gramMatrix(m ts.Tensor) (retVal ts.Tensor) {
+func gramMatrix(m *ts.Tensor) *ts.Tensor {
 	sizes, err := m.Size4()
 	if err != nil {
 		log.Fatal(err)
@ -52,7 +52,7 @@ func gramMatrix(m ts.Tensor) (retVal ts.Tensor) {
 	return gram.MustDiv1(ts.IntScalar(a*b*c*d), true)
 }

-func styleLoss(m1 ts.Tensor, m2 ts.Tensor) (retVal ts.Tensor) {
+func styleLoss(m1 *ts.Tensor, m2 *ts.Tensor) *ts.Tensor {
 	gram1 := gramMatrix(m1)
 	// m1.MustDrop()
 	gram2 := gramMatrix(m2)
@ -87,9 +87,9 @@ func main() {
 		log.Fatal(err)
 	}

-	cuda := gotch.CudaBuilder(0)
-	device := cuda.CudaIfAvailable()
-	// device := gotch.CPU
+	// cuda := gotch.CudaBuilder(0)
+	// device := cuda.CudaIfAvailable()
+	device := gotch.CPU

 	netVS := nn.NewVarStore(device)
 	in := vision.NewImageNet()
@ -153,13 +153,13 @@ func main() {
 		sLoss := ts.MustZeros([]int64{1}, gotch.Float, device)
 		cLoss := ts.MustZeros([]int64{1}, gotch.Float, device)
 		for _, idx := range StyleIndexes {
-			l := styleLoss(inputLayers[idx], styleLayers[idx])
+			l := styleLoss(&inputLayers[idx], &styleLayers[idx])
 			sLoss = sLoss.MustAdd(l, true)
 			l.MustDrop()
 		}
 		for _, idx := range ContentIndexes {
 			// NOTE: set `del` = true called panic at GPU train (tested on Colab)
-			l := inputLayers[idx].MustMseLoss(contentLayers[idx], int64(ts.ReductionMean), false)
+			l := inputLayers[idx].MustMseLoss(&contentLayers[idx], int64(ts.ReductionMean), false)
 			cLoss = cLoss.MustAdd(l, true)
 			l.MustDrop()
 		}
--- a/example/neural-style-transfer/out10.jpg
+++ b/example/neural-style-transfer/out10.jpg
--- a/example/neural-style-transfer/style.jpg
+++ b/example/neural-style-transfer/style.jpg
--- a/example/transfer-learning/main.go
+++ b/example/transfer-learning/main.go
@ -62,22 +62,24 @@ func main() {

 	trainImages := ts.NoGrad1(func() (retVal interface{}) {
 		return dataset.TrainImages.ApplyT(net, true)
-	}).(ts.Tensor)
+	}).(*ts.Tensor)

 	testImages := ts.NoGrad1(func() (retVal interface{}) {
 		return dataset.TestImages.ApplyT(net, true)
-	}).(ts.Tensor)
+	}).(*ts.Tensor)

 	fmt.Println("start training...")

 	for epoch := 1; epoch <= 1000; epoch++ {

-		predicted := trainImages.Apply(linear)
+		predicted := trainImages.ApplyT(linear, true)
 		loss := predicted.CrossEntropyForLogits(dataset.TrainLabels)
 		sgd.BackwardStep(loss)
 		loss.MustDrop()

-		testAccuracy := testImages.Apply(linear).AccuracyForLogits(dataset.TestLabels)
-		fmt.Printf("Epoch %v\t Accuracy: %5.2f%%\n", epoch, testAccuracy.Float64Values()[0]*100)
+		ts.NoGrad(func() {
+			testAccuracy := testImages.Apply(linear).AccuracyForLogits(dataset.TestLabels)
+			fmt.Printf("Epoch %v\t Accuracy: %5.2f%%\n", epoch, testAccuracy.Float64Values()[0]*100)
+		})
 	}
 }
--- a/example/translation/main.go
+++ b/example/translation/main.go
@ -34,35 +34,35 @@ type Encoder struct {
 	gru       nn.GRU
 }

-func newEncoder(vs nn.Path, inDim, hiddenDim int64) (retVal Encoder) {
+func newEncoder(vs *nn.Path, inDim, hiddenDim int64) *Encoder {

 	gru := nn.NewGRU(vs, hiddenDim, hiddenDim, nn.DefaultRNNConfig())

 	embedding := nn.NewEmbedding(vs, inDim, hiddenDim, nn.DefaultEmbeddingConfig())

-	return Encoder{embedding, gru}
+	return &Encoder{*embedding, *gru}
 }

-func (e Encoder) forward(xs ts.Tensor, state nn.GRUState) (retTs ts.Tensor, retState nn.GRUState) {
+func (e *Encoder) forward(xs *ts.Tensor, state *nn.GRUState) (*ts.Tensor, *nn.GRUState) {

-	retTs = e.embedding.Forward(xs).MustView([]int64{1, -1}, true)
-	retState = e.gru.Step(retTs, state).(nn.GRUState)
+	retTs := e.embedding.Forward(xs).MustView([]int64{1, -1}, true)
+	retState := e.gru.Step(retTs, state).(*nn.GRUState)

 	return retTs, retState
 }

 type Decoder struct {
 	device      gotch.Device
-	embedding   nn.Embedding
-	gru         nn.GRU
-	attn        nn.Linear
-	attnCombine nn.Linear
-	linear      nn.Linear
+	embedding   *nn.Embedding
+	gru         *nn.GRU
+	attn        *nn.Linear
+	attnCombine *nn.Linear
+	linear      *nn.Linear
 }

-func newDecoder(vs nn.Path, hiddenDim, outDim int64) (retVal Decoder) {
+func newDecoder(vs *nn.Path, hiddenDim, outDim int64) *Decoder {

-	return Decoder{
+	return &Decoder{
 		device:      vs.Device(),
 		embedding:   nn.NewEmbedding(vs, outDim, hiddenDim, nn.DefaultEmbeddingConfig()),
 		gru:         nn.NewGRU(vs, hiddenDim, hiddenDim, nn.DefaultRNNConfig()),
@ -72,7 +72,7 @@ func newDecoder(vs nn.Path, hiddenDim, outDim int64) (retVal Decoder) {
 	}
 }

-func (d Decoder) forward(xs ts.Tensor, state nn.GRUState, encOutputs ts.Tensor, isTraining bool) (retTs ts.Tensor, retState nn.GRUState) {
+func (d *Decoder) forward(xs *ts.Tensor, state *nn.GRUState, encOutputs *ts.Tensor, isTraining bool) (*ts.Tensor, *nn.GRUState) {

 	forwardTsTmp := d.embedding.Forward(xs)
 	forwardTsTmp.MustDropout_(0.1, isTraining)
@ -81,7 +81,7 @@ func (d Decoder) forward(xs ts.Tensor, state nn.GRUState, encOutputs ts.Tensor,
 	// NOTE. forwardTs shape: [1, 256] state [1, 1, 256]
 	// hence, just get state[0] of 3D tensor state
 	stateTs := state.Value().MustShallowClone().MustView([]int64{1, -1}, true)
-	catTs := ts.MustCat([]ts.Tensor{forwardTs, stateTs}, 1)
+	catTs := ts.MustCat([]ts.Tensor{*forwardTs, *stateTs}, 1)
 	stateTs.MustDrop()

 	// NOTE. d.attn Ws shape : [512, 10]
@ -97,44 +97,44 @@ func (d Decoder) forward(xs ts.Tensor, state nn.GRUState, encOutputs ts.Tensor,
 	sz2 := size3[1]
 	sz3 := size3[2]

-	var encOutputsTs ts.Tensor
+	var encOutputsTs *ts.Tensor
 	if sz2 == MaxLength {
 		encOutputsTs = encOutputs.MustShallowClone()
 	} else {
 		shape := []int64{sz1, MaxLength - sz2, sz3}
 		zerosTs := ts.MustZeros(shape, gotch.Float, d.device)
-		encOutputsTs = ts.MustCat([]ts.Tensor{encOutputs, zerosTs}, 1)
+		encOutputsTs = ts.MustCat([]ts.Tensor{*encOutputs, *zerosTs}, 1)
 		zerosTs.MustDrop()
 	}

 	attnApplied := attnWeights.MustBmm(encOutputsTs, true).MustSqueeze1(1, true)
 	encOutputsTs.MustDrop()

-	cTs := ts.MustCat([]ts.Tensor{forwardTs, attnApplied}, 1)
+	cTs := ts.MustCat([]ts.Tensor{*forwardTs, *attnApplied}, 1)
 	forwardTs.MustDrop()
 	attnApplied.MustDrop()
 	aTs := cTs.Apply(d.attnCombine)
 	cTs.MustDrop()
 	xsTs := aTs.MustRelu(true)

-	retState = d.gru.Step(xsTs, state).(nn.GRUState)
+	retState := d.gru.Step(xsTs, state).(*nn.GRUState)
 	xsTs.MustDrop()

-	retTs = d.linear.Forward(retState.Value()).MustLogSoftmax(-1, gotch.Float, true)
+	retTs := d.linear.Forward(retState.Value()).MustLogSoftmax(-1, gotch.Float, true)

 	return retTs, retState
 }

 type Model struct {
-	encoder      Encoder
-	decoder      Decoder
-	decoderStart ts.Tensor
+	encoder      *Encoder
+	decoder      *Decoder
+	decoderStart *ts.Tensor
 	decoderEos   int64
 	device       gotch.Device
 }

-func newModel(vs nn.Path, ilang Lang, olang Lang, hiddenDim int64) (retVal Model) {
-	return Model{
+func newModel(vs *nn.Path, ilang Lang, olang Lang, hiddenDim int64) *Model {
+	return &Model{
 		encoder:      newEncoder(vs.Sub("enc"), int64(ilang.Len()), hiddenDim),
 		decoder:      newDecoder(vs.Sub("dec"), hiddenDim, int64(olang.Len())),
 		decoderStart: ts.MustOfSlice([]int64{int64(olang.SosToken())}).MustTo(vs.Device(), true),
@ -143,16 +143,16 @@ func newModel(vs nn.Path, ilang Lang, olang Lang, hiddenDim int64) (retVal Model
 	}
 }

-func (m *Model) trainLoss(input []int, target []int) (retVal ts.Tensor) {
+func (m *Model) trainLoss(input []int, target []int) *ts.Tensor {
 	state := m.encoder.gru.ZeroState(1)
 	var encOutputs []ts.Tensor

 	for _, v := range input {
 		s := ts.MustOfSlice([]int64{int64(v)}).MustTo(m.device, true)
-		outTs, outState := m.encoder.forward(s, state.(nn.GRUState))
+		outTs, outState := m.encoder.forward(s, state.(*nn.GRUState))
 		s.MustDrop()
-		encOutputs = append(encOutputs, outTs)
-		state.(nn.GRUState).Tensor.MustDrop()
+		encOutputs = append(encOutputs, *outTs)
+		state.(*nn.GRUState).Tensor.MustDrop()
 		state = outState
 	}

@ -167,8 +167,8 @@ func (m *Model) trainLoss(input []int, target []int) (retVal ts.Tensor) {

 	for _, s := range target {
 		// TODO: fix memory leak at decoder.forward
-		outTs, outState := m.decoder.forward(prev, state.(nn.GRUState), stackTs, true)
-		state.(nn.GRUState).Tensor.MustDrop()
+		outTs, outState := m.decoder.forward(prev, state.(*nn.GRUState), stackTs, true)
+		state.(*nn.GRUState).Tensor.MustDrop()
 		state = outState

 		targetTs := ts.MustOfSlice([]int64{int64(s)}).MustTo(m.device, true)
@ -195,7 +195,7 @@ func (m *Model) trainLoss(input []int, target []int) (retVal ts.Tensor) {
 		outTs.MustDrop()
 	}

-	state.(nn.GRUState).Tensor.MustDrop()
+	state.(*nn.GRUState).Tensor.MustDrop()
 	stackTs.MustDrop()
 	prev.MustDrop()

@ -203,16 +203,16 @@ func (m *Model) trainLoss(input []int, target []int) (retVal ts.Tensor) {

 }

-func (m *Model) predict(input []int) (retVal []int) {
+func (m *Model) predict(input []int) []int {
 	state := m.encoder.gru.ZeroState(1)
 	var encOutputs []ts.Tensor

 	for _, v := range input {
 		s := ts.MustOfSlice([]int64{int64(v)}).MustTo(m.device, true)
-		outTs, outState := m.encoder.forward(s, state.(nn.GRUState))
+		outTs, outState := m.encoder.forward(s, state.(*nn.GRUState))

-		encOutputs = append(encOutputs, outTs)
-		state.(nn.GRUState).Tensor.MustDrop()
+		encOutputs = append(encOutputs, *outTs)
+		state.(*nn.GRUState).Tensor.MustDrop()
 		state = outState
 	}

@ -225,7 +225,7 @@ func (m *Model) predict(input []int) (retVal []int) {
 	var outputSeq []int

 	for i := 0; i < int(MaxLength); i++ {
-		outTs, outState := m.decoder.forward(prev, state.(nn.GRUState), stackTs, true)
+		outTs, outState := m.decoder.forward(prev, state.(*nn.GRUState), stackTs, true)
 		_, output := outTs.MustTopK(1, -1, true, true)
 		outputVal := output.Int64Values()[0]
 		outputSeq = append(outputSeq, int(outputVal))
@ -234,7 +234,7 @@ func (m *Model) predict(input []int) (retVal []int) {
 			break
 		}

-		state.(nn.GRUState).Tensor.MustDrop()
+		state.(*nn.GRUState).Tensor.MustDrop()
 		state = outState
 		prev.MustDrop()
 		prev = output
@ -249,8 +249,8 @@ type LossStats struct {
 	samples   int
 }

-func newLossStats() (retVal LossStats) {
-	return LossStats{
+func newLossStats() *LossStats {
+	return &LossStats{
 		totalLoss: 0.0,
 		samples:   0,
 	}
@ -261,7 +261,7 @@ func (ls *LossStats) update(loss float64) {
 	ls.samples += 1
 }

-func (ls *LossStats) avgAndReset() (retVal float64) {
+func (ls *LossStats) avgAndReset() float64 {
 	avg := ls.totalLoss / float64(ls.samples)
 	ls.totalLoss = 0.0
 	ls.samples = 0
--- a/example/yolo/README.md
+++ b/example/yolo/README.md
@ -0,0 +1,16 @@
+# YOLO model
+
+This is an example of implementing the YOLO v3 model.
+
+The model weights can be [downloaded here](https://drive.google.com/file/d/16eO9o4rclD929LHweCPW_-7HjKfNKVnA/view?usp=sharing).
+
+Here is an example of image inference using the YOLO v3 model.
+
+## Original Image
+
+![Bondi Beach - Original](bondi.jpg "Bondi Beach")
+
+## Yolo v3 inference
+
+![Bondi Beach - Yolo inference](yolo_bondi.jpg "Bondi Beach - YOLO v3")
+
--- a/example/yolo/bondi.jpg
+++ b/example/yolo/bondi.jpg
--- a/example/yolo/darknet.go
+++ b/example/yolo/darknet.go
@ -19,7 +19,7 @@ type Block struct {
 	Parameters map[string]string
 }

-func (b *Block) get(key string) (retVal string) {
+func (b *Block) get(key string) string {
 	val, ok := b.Parameters[key]
 	if !ok {
 		log.Fatalf("Cannot find %v in Block parameters.\n", key)
@ -33,7 +33,7 @@ type Darknet struct {
 	Parameters map[string]string
 }

-func (d Darknet) get(key string) (retVal string) {
+func (d *Darknet) get(key string) string {
 	val, ok := d.Parameters[key]
 	if !ok {
 		log.Fatalf("Cannot find %v in Darknet parameters.\n", key)
@ -44,16 +44,16 @@ func (d Darknet) get(key string) (retVal string) {

 type Accumulator struct {
 	Parameters map[string]string
-	Net        Darknet
+	Net        *Darknet
 	BlockType  *string // optional
 }

-func newAccumulator() (retVal Accumulator) {
+func newAccumulator() *Accumulator {

-	return Accumulator{
+	return &Accumulator{
 		BlockType:  nil,
 		Parameters: make(map[string]string, 0),
-		Net: Darknet{
+		Net: &Darknet{
 			Blocks:     make([]Block, 0),
 			Parameters: make(map[string]string, 0),
 		},
@ -79,7 +79,7 @@ func (acc *Accumulator) finishBlock() {
 	acc.BlockType = nil
 }

-func ParseConfig(path string) (retVal Darknet) {
+func ParseConfig(path string) *Darknet {

 	acc := newAccumulator()

@ -166,7 +166,7 @@ type (
 	}
 )

-func conv(vs nn.Path, index uint, p int64, b Block) (retVal1 int64, retVal2 interface{}) {
+func conv(vs *nn.Path, index uint, p int64, b *Block) (retVal1 int64, retVal2 interface{}) {

 	activation := b.get("activation")

@ -209,7 +209,7 @@ func conv(vs nn.Path, index uint, p int64, b Block) (retVal1 int64, retVal2 inte
 		if p != 0 {
 			sub := vs.Sub(fmt.Sprintf("batch_norm_%v", index))
 			bnVal := nn.BatchNorm2D(sub, filters, nn.DefaultBatchNormConfig())
-			bn = &bnVal
+			bn = bnVal
 			bias = false
 		}
 	} else {
@ -234,18 +234,19 @@ func conv(vs nn.Path, index uint, p int64, b Block) (retVal1 int64, retVal2 inte
 		log.Fatalf("Unsupported activation(%v)\n", activation)
 	}

-	fn := nn.NewFuncT(func(xs ts.Tensor, train bool) (res ts.Tensor) {
+	fn := nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		tmp1 := xs.Apply(conv)

-		var tmp2 ts.Tensor
+		var tmp2 *ts.Tensor

 		if bn != nil {
-			tmp2 = tmp1.ApplyT(*bn, train)
+			tmp2 = tmp1.ApplyT(bn, train)
 			tmp1.MustDrop()
 		} else {
 			tmp2 = tmp1
 		}

+		var res *ts.Tensor
 		if leaky {
 			tmp2Mul := tmp2.MustMul1(ts.FloatScalar(0.1), false)
 			res = tmp2.MustMax1(tmp2Mul, true)
@ -261,7 +262,7 @@ func conv(vs nn.Path, index uint, p int64, b Block) (retVal1 int64, retVal2 inte
 }

 func upsample(prevChannels int64) (retVal1 int64, retVal2 interface{}) {
-	layer := nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	layer := nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		// []int64{n, c, h, w}
 		res, err := xs.Size4()
 		if err != nil {
@ -276,7 +277,8 @@ func upsample(prevChannels int64) (retVal1 int64, retVal2 interface{}) {
 	return prevChannels, Layer{Val: layer}
 }

-func intListOfString(s string) (retVal []int64) {
+func intListOfString(s string) []int64 {
+	var retVal []int64
 	strs := strings.Split(s, ",")
 	for _, str := range strs {
 		str = strings.TrimSpace(str)
@ -290,7 +292,7 @@ func intListOfString(s string) (retVal []int64) {
 	return retVal
 }

-func uintOfIndex(index uint, i int64) (retVal uint) {
+func uintOfIndex(index uint, i int64) uint {
 	if i >= 0 {
 		return uint(i)
 	} else {
@ -298,7 +300,7 @@ func uintOfIndex(index uint, i int64) (retVal uint) {
 	}
 }

-func route(index uint, p []ChannelsBl, blk Block) (retVal1 int64, retVal2 interface{}) {
+func route(index uint, p []ChannelsBl, blk *Block) (retVal1 int64, retVal2 interface{}) {
 	intLayers := intListOfString(blk.get("layers"))

 	var layers []uint
@ -314,7 +316,7 @@ func route(index uint, p []ChannelsBl, blk Block) (retVal1 int64, retVal2 interf
 	return channels, Route{TsIdxs: layers}
 }

-func shortcut(index uint, p int64, blk Block) (retVal1 int64, retVal2 interface{}) {
+func shortcut(index uint, p int64, blk *Block) (retVal1 int64, retVal2 interface{}) {
 	fromStr := blk.get("from")

 	from, err := strconv.ParseInt(fromStr, 10, 64)
@ -325,7 +327,7 @@ func shortcut(index uint, p int64, blk Block) (retVal1 int64, retVal2 interface{
 	return p, Shortcut{TsIdx: uintOfIndex(index, from)}
 }

-func yolo(p int64, blk Block) (retVal1 int64, retVal2 interface{}) {
+func yolo(p int64, blk *Block) (retVal1 int64, retVal2 interface{}) {
 	classesStr := blk.get("classes")
 	classes, err := strconv.ParseInt(classesStr, 10, 64)
 	if err != nil {
@ -356,7 +358,7 @@ func yolo(p int64, blk Block) (retVal1 int64, retVal2 interface{}) {
 }

 // Apply f to a slice of tensor xs and replace xs values with f output.
-func sliceApplyAndSet(xs ts.Tensor, start int64, len int64, f func(ts.Tensor) ts.Tensor) {
+func sliceApplyAndSet(xs *ts.Tensor, start int64, len int64, f func(*ts.Tensor) *ts.Tensor) {
 	slice := xs.MustNarrow(2, start, len, false)
 	src := f(slice)

@ -365,7 +367,7 @@ func sliceApplyAndSet(xs ts.Tensor, start int64, len int64, f func(ts.Tensor) ts
 	slice.MustDrop()
 }

-func detect(xs ts.Tensor, imageHeight int64, classes int64, anchors []Anchor) (retVal ts.Tensor) {
+func detect(xs *ts.Tensor, imageHeight int64, classes int64, anchors []Anchor) *ts.Tensor {

 	device, err := xs.Device()

@ -396,7 +398,7 @@ func detect(xs ts.Tensor, imageHeight int64, classes int64, anchors []Anchor) (r

 	xOffset := a.MustView([]int64{-1, 1}, true)
 	yOffset := b.MustView([]int64{-1, 1}, true)
-	xyOffsetTmp1 := ts.MustCat([]ts.Tensor{xOffset, yOffset}, 1)
+	xyOffsetTmp1 := ts.MustCat([]ts.Tensor{*xOffset, *yOffset}, 1)
 	xyOffsetTmp2 := xyOffsetTmp1.MustRepeat([]int64{1, nanchors}, true)
 	xyOffsetTmp3 := xyOffsetTmp2.MustView([]int64{-1, 2}, true)
 	xyOffset := xyOffsetTmp3.MustUnsqueeze(0, true)
@ -417,23 +419,21 @@ func detect(xs ts.Tensor, imageHeight int64, classes int64, anchors []Anchor) (r
 	anchorsTmp3 := anchorsTmp2.MustRepeat([]int64{gridSize * gridSize, 1}, true)
 	anchorsTs := anchorsTmp3.MustUnsqueeze(0, true).MustTo(device, true)

-	sliceApplyAndSet(xsTs, 0, 2, func(xs ts.Tensor) (res ts.Tensor) {
+	sliceApplyAndSet(xsTs, 0, 2, func(xs *ts.Tensor) *ts.Tensor {
 		tmp := xs.MustSigmoid(false)
-		res = tmp.MustAdd(xyOffset, true)
-		return res
+		return tmp.MustAdd(xyOffset, true)
 	})

-	sliceApplyAndSet(xsTs, 4, classes+1, func(xs ts.Tensor) (res ts.Tensor) {
+	sliceApplyAndSet(xsTs, 4, classes+1, func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustSigmoid(false)
 	})

-	sliceApplyAndSet(xsTs, 2, 2, func(xs ts.Tensor) (res ts.Tensor) {
+	sliceApplyAndSet(xsTs, 2, 2, func(xs *ts.Tensor) *ts.Tensor {
 		tmp := xs.MustExp(false)
-		res = tmp.MustMul(anchorsTs, true)
-		return res
+		return tmp.MustMul(anchorsTs, true)
 	})

-	sliceApplyAndSet(xsTs, 0, 4, func(xs ts.Tensor) (res ts.Tensor) {
+	sliceApplyAndSet(xsTs, 0, 4, func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustMul1(ts.IntScalar(stride), false)
 	})

@ -441,7 +441,7 @@ func detect(xs ts.Tensor, imageHeight int64, classes int64, anchors []Anchor) (r
 	return xsTs
 }

-func (dn *Darknet) Height() (retVal int64) {
+func (dn *Darknet) Height() int64 {
 	imageHeightStr := dn.get("height")
 	retVal, err := strconv.ParseInt(imageHeightStr, 10, 64)
 	if err != nil {
@ -451,7 +451,7 @@ func (dn *Darknet) Height() (retVal int64) {
 	return retVal
 }

-func (dn *Darknet) Width() (retVal int64) {
+func (dn *Darknet) Width() int64 {
 	imageWidthStr := dn.get("width")
 	retVal, err := strconv.ParseInt(imageWidthStr, 10, 64)
 	if err != nil {
@ -461,7 +461,7 @@ func (dn *Darknet) Width() (retVal int64) {
 	return retVal
 }

-func (dn *Darknet) BuildModel(vs nn.Path) (retVal nn.FuncT) {
+func (dn *Darknet) BuildModel(vs *nn.Path) nn.FuncT {
 	var blocks []ChannelsBl // Param is a struct{int64, interface{}}
 	var prevChannels int64 = 3

@ -471,15 +471,15 @@ func (dn *Darknet) BuildModel(vs nn.Path) (retVal nn.FuncT) {

 		switch *blk.BlockType {
 		case "convolutional":
-			channels, bl = conv(vs.Sub(fmt.Sprintf("%v", index)), uint(index), prevChannels, blk)
+			channels, bl = conv(vs.Sub(fmt.Sprintf("%v", index)), uint(index), prevChannels, &blk)
 		case "upsample":
 			channels, bl = upsample(prevChannels)
 		case "shortcut":
-			channels, bl = shortcut(uint(index), prevChannels, blk)
+			channels, bl = shortcut(uint(index), prevChannels, &blk)
 		case "route":
-			channels, bl = route(uint(index), blocks, blk)
+			channels, bl = route(uint(index), blocks, &blk)
 		case "yolo":
-			channels, bl = yolo(prevChannels, blk)
+			channels, bl = yolo(prevChannels, &blk)
 		default:
 			log.Fatalf("Unsupported block type: %v\n", *blk.BlockType)
 		}
@ -489,7 +489,7 @@ func (dn *Darknet) BuildModel(vs nn.Path) (retVal nn.FuncT) {

 	imageHeight := dn.Height()

-	retVal = nn.NewFuncT(func(xs ts.Tensor, train bool) (res ts.Tensor) {
+	retVal := nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {

 		var prevYs []ts.Tensor = make([]ts.Tensor, 0)
 		var detections []ts.Tensor = make([]ts.Tensor, 0)
@ -497,13 +497,13 @@ func (dn *Darknet) BuildModel(vs nn.Path) (retVal nn.FuncT) {
 		// NOTE: we will delete all tensors in prevYs after looping
 		for _, b := range blocks {
 			blkTyp := reflect.TypeOf(b.Bl)
-			var ysTs ts.Tensor
+			var ysTs *ts.Tensor
 			switch blkTyp.Name() {
 			case "Layer":
 				layer := b.Bl.(Layer)
 				xsTs := xs
 				if len(prevYs) > 0 {
-					xsTs = prevYs[len(prevYs)-1] // last prevYs element
+					xsTs = &prevYs[len(prevYs)-1] // last prevYs element
 				}
 				ysTs = layer.Val.ForwardT(xsTs, train)
 			case "Route":
@ -516,7 +516,7 @@ func (dn *Darknet) BuildModel(vs nn.Path) (retVal nn.FuncT) {

 			case "Shortcut":
 				from := b.Bl.(Shortcut).TsIdx
-				addTs := prevYs[int(from)]
+				addTs := &prevYs[int(from)]
 				last := prevYs[len(prevYs)-1]
 				ysTs = last.MustAdd(addTs, false)
 			case "Yolo":
@ -524,12 +524,12 @@ func (dn *Darknet) BuildModel(vs nn.Path) (retVal nn.FuncT) {
 				anchors := b.Bl.(Yolo).Anchors
 				xsTs := xs
 				if len(prevYs) > 0 {
-					xsTs = prevYs[len(prevYs)-1]
+					xsTs = &prevYs[len(prevYs)-1]
 				}

 				dt := detect(xsTs, imageHeight, classes, anchors)

-				detections = append(detections, dt)
+				detections = append(detections, *dt)

 				ysTs = ts.NewTensor()

@ -537,10 +537,10 @@ func (dn *Darknet) BuildModel(vs nn.Path) (retVal nn.FuncT) {
 				// log.Fatalf("BuildModel - FuncT - Unsupported block type: %v\n", blkTyp.Name())
 			} // end of Switch

-			prevYs = append(prevYs, ysTs)
+			prevYs = append(prevYs, *ysTs)
 		} // end of For loop

-		res = ts.MustCat(detections, 1)
+		res := ts.MustCat(detections, 1)

 		// Now, free-up memory held up by prevYs
 		for _, t := range prevYs {
--- a/example/yolo/draw.go
+++ b/example/yolo/draw.go
@ -0,0 +1,147 @@
+package main
+
+import (
+	"image"
+	"image/color"
+	// "image/jpeg"
+	"io/ioutil"
+
+	"flag"
+	"log"
+	"os"
+	"path/filepath"
+
+	"golang.org/x/image/draw"
+	"golang.org/x/image/font"
+
+	"github.com/sugarme/gotch/example/yolo/freetype"
+	ts "github.com/sugarme/gotch/tensor"
+)
+
+// Command-line flags read by textToImageTs when rendering label text.
+// Each one is registered on the default flag set; textToImageTs calls
+// flag.Parse itself before reading them.
+var (
+	dpi      = flag.Float64("dpi", 72, "screen resolution in Dots Per Inch")
+	fontfile = flag.String("fontfile", "luxisr.ttf", "filename of the ttf font")
+	hinting  = flag.String("hinting", "none", "none | full")
+	size     = flag.Float64("size", 12, "font size in points")
+	spacing  = flag.Float64("spacing", 1.2, "line spacing (e.g. 2 means double spaced)")
+	wonb     = flag.Bool("whiteonblack", false, "white text on a black background")
+	bound    = flag.Bool("bound", true, "generates image with minimum size for the text")
+)
+
+// loadImage opens the image stored at file (resolved to an absolute path) and
+// decodes it with image.Decode.
+//
+// It returns the decoded image, or a nil image and the error if the path
+// cannot be resolved, the file cannot be opened, or decoding fails.
+// image.Decode only recognises formats whose decoders have been registered
+// (normally through a side-effect import such as `_ "image/jpeg"`).
+func loadImage(file string) (retVal image.Image, err error) {
+	imagePath, err := filepath.Abs(file)
+	if err != nil {
+		return nil, err
+	}
+	f, err := os.Open(imagePath)
+	if err != nil {
+		return nil, err
+	}
+	// The file is only needed while decoding; close it so the descriptor is
+	// not leaked (the previous version never closed it).
+	defer f.Close()
+
+	img, _, err := image.Decode(f)
+	if err != nil {
+		return nil, err
+	}
+	return img, nil
+}
+
+// textToImageTs renders the given lines of text into an RGBA image using the
+// TrueType font named by the -fontfile flag, then returns the pixels as a
+// tensor viewed as {3, h, w}: all red values first, then green, then blue,
+// each divided by 255.
+//
+// On any failure (reading or parsing the font, measuring or drawing a line)
+// the error is logged and nil is returned, so callers must check for nil.
+func textToImageTs(text []string) *ts.Tensor {
+	offset := 0
+
+	// NOTE(review): flags are parsed on every call; presumably main parses
+	// them as well - confirm whether this call is still needed.
+	flag.Parse()
+
+	// Read font data
+	fontBytes, err := ioutil.ReadFile(*fontfile)
+	if err != nil {
+		log.Println(err)
+		return nil
+	}
+
+	f, err := freetype.ParseFont(fontBytes)
+	if err != nil {
+		log.Println(err)
+		return nil
+	}
+
+	var width, height int
+	// Initialize the context.
+	c := freetype.NewContext()
+	c.SetDPI(*dpi)
+	c.SetFont(f)
+	c.SetFontSize(*size)
+
+	// Any -hinting value other than "full" falls back to no hinting.
+	switch *hinting {
+	default:
+		c.SetHinting(font.HintingNone)
+	case "full":
+		c.SetHinting(font.HintingFull)
+	}
+
+	// Measure the text to calculate the minimum size of the image
+	if *bound {
+		pt := freetype.Pt(offset, offset+int(c.PointToFixed(*size)>>6))
+		for _, s := range text {
+			ptr, err := c.MeasureString(s, pt)
+			if err != nil {
+				log.Println(err)
+				return nil
+			}
+			// Advance the pen by one line (font size times line spacing).
+			pt.Y += c.PointToFixed(*size * *spacing)
+			// Track the widest line, converting 26.6 fixed point to pixels.
+			x := int(ptr.X >> 6)
+			if x > width {
+				width = x
+			}
+		}
+		width += offset
+		height = int(pt.Y)>>6 - int(c.PointToFixed(*size)>>6)
+		// Use default size for the image
+	} else {
+		width = 640
+		height = 480
+	}
+
+	// Creates image with the specified size
+	fg, bg := image.Black, image.White
+	ruler := color.RGBA{0xdd, 0xdd, 0xdd, 0xff}
+	if *wonb {
+		fg, bg = image.White, image.Black
+		ruler = color.RGBA{0x22, 0x22, 0x22, 0xff}
+	}
+	rgba := image.NewRGBA(image.Rect(0, 0, width, height))
+	// Fill the whole image with the background colour before drawing.
+	draw.Draw(rgba, rgba.Bounds(), bg, image.ZP, draw.Src)
+	c.SetClip(rgba.Bounds())
+	c.SetDst(rgba)
+	c.SetSrc(fg)
+
+	// Draw the guidelines
+	// (ruler-coloured pixels along the column and the row that start at
+	// (offset, offset), for up to 200 pixels each).
+	for i := 0; i < 200; i++ {
+		rgba.Set(offset, offset+i, ruler)
+		rgba.Set(offset+i, offset, ruler)
+	}
+
+	// Draw the text.
+	pt := freetype.Pt(offset, offset+int(c.PointToFixed(*size)>>6))
+	for _, s := range text {
+		_, err = c.DrawString(s, pt)
+		if err != nil {
+			log.Println(err)
+			return nil
+		}
+		pt.Y += c.PointToFixed(*size * *spacing)
+	}
+
+	// Split the interleaved pixel bytes into three planes. Pix is walked four
+	// bytes at a time; the first three bytes of each group go to r, g and b
+	// and the fourth is skipped.
+	var rgb []float64
+	var r, g, b []float64
+	for i := 0; i < len(rgba.Pix); i += 4 {
+		start := i
+		r = append(r, float64(rgba.Pix[start])/255.0)
+		g = append(g, float64(rgba.Pix[start+1])/255.0)
+		b = append(b, float64(rgba.Pix[start+2])/255.0)
+	}
+
+	// Concatenate the planes so the flat slice can be viewed as {3, h, w}.
+	rgb = append(rgb, r...)
+	rgb = append(rgb, g...)
+	rgb = append(rgb, b...)
+
+	w := int64(rgba.Rect.Dx())
+	h := int64(rgba.Rect.Dy())
+
+	return ts.MustOfSlice(rgb).MustView([]int64{3, h, w}, false)
+}
--- a/example/yolo/freetype/freetype.go
+++ b/example/yolo/freetype/freetype.go
@ -0,0 +1,366 @@
+// Copyright 2010 The Freetype-Go Authors. All rights reserved.
+// Use of this source code is governed by your choice of either the
+// FreeType License or the GNU General Public License version 2 (or
+// any later version), both of which can be found in the LICENSE file.
+
+// The freetype package provides a convenient API to draw text onto an image.
+// Use the freetype/raster and freetype/truetype packages for lower level
+// control over rasterization and TrueType parsing.
+package freetype // import "github.com/golang/freetype"
+
+import (
+	"errors"
+	"image"
+	"image/draw"
+
+	"github.com/golang/freetype/raster"
+	"github.com/golang/freetype/truetype"
+	"golang.org/x/image/font"
+	"golang.org/x/image/math/fixed"
+)
+
+// These constants determine the size of the glyph cache. The cache is keyed
+// primarily by the glyph index modulo nGlyphs, and secondarily by sub-pixel
+// position for the mask image. Sub-pixel positions are quantized to
+// nXFractions possible values in the x direction and nYFractions possible
+// values in the y direction.
+const (
+	// nGlyphs is the number of glyph slots; glyph indices are reduced modulo
+	// this value when a cache slot is chosen.
+	nGlyphs = 256
+	// nXFractions is the number of quantized horizontal sub-pixel positions.
+	nXFractions = 4
+	// nYFractions is the number of quantized vertical sub-pixel positions.
+	nYFractions = 1
+)
+
+// An entry in the glyph cache is keyed explicitly by the glyph index and
+// implicitly by the quantized x and y fractional offset. It maps to a mask
+// image and an offset.
+type cacheEntry struct {
+	// valid is true once the entry has been filled in; the zero value marks
+	// an empty slot.
+	valid bool
+	// glyph is the index of the glyph this entry was rasterized for. It is
+	// compared on lookup because several glyphs can map to the same slot.
+	glyph truetype.Index
+	// advanceWidth is the advance width returned by rasterize.
+	advanceWidth fixed.Int26_6
+	// mask is the rasterized alpha mask of the glyph.
+	mask *image.Alpha
+	// offset is the integer-pixel offset of the mask returned by rasterize.
+	offset image.Point
+}
+
+// ParseFont is a thin convenience wrapper around truetype.Parse, so that
+// callers of this package do not have to import the truetype package
+// themselves. It returns whatever truetype.Parse returns for b.
+func ParseFont(b []byte) (*truetype.Font, error) {
+	f, err := truetype.Parse(b)
+	return f, err
+}
+
+// Pt turns a pixel co-ordinate pair into a fixed.Point26_6 by shifting each
+// component left by 6 bits (26.6 fixed point).
+func Pt(x, y int) fixed.Point26_6 {
+	var p fixed.Point26_6
+	p.X = fixed.Int26_6(x << 6)
+	p.Y = fixed.Int26_6(y << 6)
+	return p
+}
+
+// A Context holds the state for drawing text in a given font and size.
+// Create one with NewContext and configure it through the Set* methods.
+type Context struct {
+	// r is the rasterizer used to turn glyph contours into alpha masks.
+	r *raster.Rasterizer
+	// f is the current font; it is nil until SetFont is called.
+	f *truetype.Font
+	// glyphBuf is the scratch buffer a glyph is loaded into before it is
+	// rasterized.
+	glyphBuf truetype.GlyphBuf
+	// clip is the clip rectangle for drawing.
+	clip image.Rectangle
+	// dst and src are the destination and source images for drawing.
+	dst draw.Image
+	src image.Image
+	// fontSize and dpi are used to calculate scale. scale is the number of
+	// 26.6 fixed point units in 1 em. hinting is the hinting policy.
+	fontSize, dpi float64
+	scale         fixed.Int26_6
+	hinting       font.Hinting
+	// cache is the glyph cache.
+	cache [nGlyphs * nXFractions * nYFractions]cacheEntry
+}
+
+// PointToFixed converts a length in points (as in "a 12 point font") into
+// pixels expressed as a 26.6 fixed point number, using the context's dpi.
+func (c *Context) PointToFixed(x float64) fixed.Int26_6 {
+	// 64 fixed point units per pixel, 72 points per inch.
+	const unitsPerPoint = 64.0 / 72.0
+	units := x * float64(c.dpi) * unitsPerPoint
+	return fixed.Int26_6(units)
+}
+
+// drawContour draws the given closed contour with the given offset.
+//
+// ps holds the contour's points; dx and dy are added to every point (with the
+// Y axis flipped) before it is handed to the rasterizer c.r. An empty contour
+// is a no-op.
+func (c *Context) drawContour(ps []truetype.Point, dx, dy fixed.Int26_6) {
+	if len(ps) == 0 {
+		return
+	}
+
+	// The low bit of each point's Flags value is whether the point is on the
+	// curve. Truetype fonts only have quadratic Bézier curves, not cubics.
+	// Thus, two consecutive off-curve points imply an on-curve point in the
+	// middle of those two.
+	//
+	// See http://chanae.walon.org/pub/ttf/ttf_glyphs.htm for more details.
+
+	// ps[0] is a truetype.Point measured in FUnits and positive Y going
+	// upwards. start is the same thing measured in fixed point units and
+	// positive Y going downwards, and offset by (dx, dy).
+	start := fixed.Point26_6{
+		X: dx + ps[0].X,
+		Y: dy - ps[0].Y,
+	}
+	others := []truetype.Point(nil)
+	if ps[0].Flags&0x01 != 0 {
+		// First point is on the curve: start there and walk the rest.
+		others = ps[1:]
+	} else {
+		last := fixed.Point26_6{
+			X: dx + ps[len(ps)-1].X,
+			Y: dy - ps[len(ps)-1].Y,
+		}
+		if ps[len(ps)-1].Flags&0x01 != 0 {
+			// Last point is on the curve: start from it instead.
+			start = last
+			others = ps[:len(ps)-1]
+		} else {
+			// Both first and last are off the curve: start from the implied
+			// on-curve midpoint between them and walk every point.
+			start = fixed.Point26_6{
+				X: (start.X + last.X) / 2,
+				Y: (start.Y + last.Y) / 2,
+			}
+			others = ps
+		}
+	}
+	c.r.Start(start)
+	// q0 is the previously visited point and on0 whether it was on the curve.
+	q0, on0 := start, true
+	for _, p := range others {
+		q := fixed.Point26_6{
+			X: dx + p.X,
+			Y: dy - p.Y,
+		}
+		on := p.Flags&0x01 != 0
+		if on {
+			if on0 {
+				// on-curve -> on-curve: straight segment.
+				c.r.Add1(q)
+			} else {
+				// off-curve control point q0 followed by on-curve q.
+				c.r.Add2(q0, q)
+			}
+		} else {
+			if on0 {
+				// No-op.
+			} else {
+				// Two consecutive off-curve points: end the curve at their
+				// midpoint.
+				mid := fixed.Point26_6{
+					X: (q0.X + q.X) / 2,
+					Y: (q0.Y + q.Y) / 2,
+				}
+				c.r.Add2(q0, mid)
+			}
+		}
+		q0, on0 = q, on
+	}
+	// Close the curve.
+	if on0 {
+		c.r.Add1(start)
+	} else {
+		c.r.Add2(q0, start)
+	}
+}
+
+// rasterize returns the advance width, glyph mask and integer-pixel offset
+// to render the given glyph at the given sub-pixel offsets.
+// The 26.6 fixed point arguments fx and fy must be in the range [0, 1).
+//
+// It returns an error if the glyph cannot be loaded into c.glyphBuf or if the
+// computed pixel bounds are inverted ("negative sized glyph").
+func (c *Context) rasterize(glyph truetype.Index, fx, fy fixed.Int26_6) (
+	fixed.Int26_6, *image.Alpha, image.Point, error) {
+
+	if err := c.glyphBuf.Load(c.f, c.scale, glyph, c.hinting); err != nil {
+		return 0, nil, image.Point{}, err
+	}
+	// Calculate the integer-pixel bounds for the glyph.
+	// The minimums are rounded down; adding 0x3f before the shift rounds the
+	// maximums up to the next whole pixel.
+	xmin := int(fx+c.glyphBuf.Bounds.Min.X) >> 6
+	ymin := int(fy-c.glyphBuf.Bounds.Max.Y) >> 6
+	xmax := int(fx+c.glyphBuf.Bounds.Max.X+0x3f) >> 6
+	ymax := int(fy-c.glyphBuf.Bounds.Min.Y+0x3f) >> 6
+	if xmin > xmax || ymin > ymax {
+		return 0, nil, image.Point{}, errors.New("freetype: negative sized glyph")
+	}
+	// A TrueType's glyph's nodes can have negative co-ordinates, but the
+	// rasterizer clips anything left of x=0 or above y=0. xmin and ymin are
+	// the pixel offsets, based on the font's FUnit metrics, that let a
+	// negative co-ordinate in TrueType space be non-negative in rasterizer
+	// space. xmin and ymin are typically <= 0.
+	fx -= fixed.Int26_6(xmin << 6)
+	fy -= fixed.Int26_6(ymin << 6)
+	// Rasterize the glyph's vectors.
+	c.r.Clear()
+	// Ends holds the end index of each contour in Points; e0 tracks where the
+	// current contour starts.
+	e0 := 0
+	for _, e1 := range c.glyphBuf.Ends {
+		c.drawContour(c.glyphBuf.Points[e0:e1], fx, fy)
+		e0 = e1
+	}
+	a := image.NewAlpha(image.Rect(0, 0, xmax-xmin, ymax-ymin))
+	c.r.Rasterize(raster.NewAlphaSrcPainter(a))
+	return c.glyphBuf.AdvanceWidth, a, image.Point{xmin, ymin}, nil
+}
+
+// glyph returns the advance width, glyph mask and integer-pixel offset to
+// render the given glyph at the given sub-pixel point. It is a cache for the
+// rasterize method. Unlike rasterize, p's co-ordinates do not have to be in
+// the range [0, 1).
+//
+// An error from rasterize is returned unchanged, and nothing is cached in
+// that case.
+func (c *Context) glyph(glyph truetype.Index, p fixed.Point26_6) (
+	fixed.Int26_6, *image.Alpha, image.Point, error) {
+
+	// Split p.X and p.Y into their integer and fractional parts.
+	ix, fx := int(p.X>>6), p.X&0x3f
+	iy, fy := int(p.Y>>6), p.Y&0x3f
+	// Calculate the index t into the cache array.
+	// tg selects the glyph slot; tx and ty quantize the fractional offsets
+	// into nXFractions and nYFractions buckets.
+	tg := int(glyph) % nGlyphs
+	tx := int(fx) / (64 / nXFractions)
+	ty := int(fy) / (64 / nYFractions)
+	t := ((tg*nXFractions)+tx)*nYFractions + ty
+	// Check for a cache hit.
+	// The glyph index is compared as well because different glyphs can share
+	// a slot (indices are taken modulo nGlyphs).
+	if e := c.cache[t]; e.valid && e.glyph == glyph {
+		return e.advanceWidth, e.mask, e.offset.Add(image.Point{ix, iy}), nil
+	}
+	// Rasterize the glyph and put the result into the cache.
+	advanceWidth, mask, offset, err := c.rasterize(glyph, fx, fy)
+	if err != nil {
+		return 0, nil, image.Point{}, err
+	}
+	// The cached offset excludes the integer part of p, which is added back
+	// on every lookup.
+	c.cache[t] = cacheEntry{true, glyph, advanceWidth, mask, offset}
+	return advanceWidth, mask, offset.Add(image.Point{ix, iy}), nil
+}
+
+// DrawString draws s at p and returns p advanced by the text extent. The text
+// is placed so that the left edge of the em square of the first character of s
+// and the baseline intersect at p. The majority of the affected pixels will be
+// above and to the right of the point, but some may be below or to the left.
+// For example, drawing a string that starts with a 'J' in an italic font may
+// affect pixels below and left of the point.
+//
+// p is a fixed.Point26_6 and can therefore represent sub-pixel positions.
+//
+// An error is returned if no font has been set or if a glyph cannot be
+// produced; in both cases the returned point is the zero value.
+func (c *Context) DrawString(s string, p fixed.Point26_6) (fixed.Point26_6, error) {
+	if c.f == nil {
+		return fixed.Point26_6{}, errors.New("freetype: DrawString called with a nil font")
+	}
+	// prev is the previous glyph index, used for kerning between pairs.
+	prev, hasPrev := truetype.Index(0), false
+	for _, rune := range s {
+		index := c.f.Index(rune)
+		if hasPrev {
+			kern := c.f.Kern(c.scale, prev, index)
+			if c.hinting != font.HintingNone {
+				// With hinting on, round the kern to the nearest multiple of
+				// 64 fixed point units (a whole pixel).
+				kern = (kern + 32) &^ 63
+			}
+			p.X += kern
+		}
+		advanceWidth, mask, offset, err := c.glyph(index, p)
+		if err != nil {
+			return fixed.Point26_6{}, err
+		}
+		p.X += advanceWidth
+		// Only the part of the glyph inside the clip rectangle is drawn.
+		glyphRect := mask.Bounds().Add(offset)
+		dr := c.clip.Intersect(glyphRect)
+		if !dr.Empty() {
+			// mp is the point in the mask that corresponds to dr.Min; only
+			// its Y component is adjusted for clipping here.
+			mp := image.Point{0, dr.Min.Y - glyphRect.Min.Y}
+			draw.DrawMask(c.dst, dr, c.src, image.ZP, mask, mp, draw.Over)
+		}
+		prev, hasPrev = index, true
+	}
+	return p, nil
+}
+
+// MeasureString follows the same steps as DrawString but only measures the
+// text: it advances p by the kerning and advance width of every glyph in s
+// without drawing anything, and returns the advanced point.
+//
+// An error is returned if no font has been set or if a glyph cannot be
+// produced; in both cases the returned point is the zero value.
+func (c *Context) MeasureString(s string, p fixed.Point26_6) (fixed.Point26_6, error) {
+	if c.f == nil {
+		return fixed.Point26_6{}, errors.New("freetype: MeasureString called with a nil font")
+	}
+	// prev is the previous glyph index, used for kerning between pairs.
+	prev, hasPrev := truetype.Index(0), false
+	for _, rune := range s {
+		index := c.f.Index(rune)
+		if hasPrev {
+			kern := c.f.Kern(c.scale, prev, index)
+			if c.hinting != font.HintingNone {
+				// With hinting on, round the kern to the nearest multiple of
+				// 64 fixed point units (a whole pixel).
+				kern = (kern + 32) &^ 63
+			}
+			p.X += kern
+		}
+		// The mask and offset are not needed for measuring.
+		advanceWidth, _, _, err := c.glyph(index, p)
+		if err != nil {
+			return fixed.Point26_6{}, err
+		}
+		p.X += advanceWidth
+		prev, hasPrev = index, true
+	}
+	return p, nil
+}
+
+// recalc recalculates scale and bounds values from the font size, screen
+// resolution and font metrics, and invalidates the glyph cache.
+// It is called by SetDPI, SetFont and SetFontSize whenever their value changes.
+func (c *Context) recalc() {
+	c.scale = fixed.Int26_6(c.fontSize * c.dpi * (64.0 / 72.0))
+	if c.f == nil {
+		// No font yet: give the rasterizer empty bounds.
+		c.r.SetBounds(0, 0)
+	} else {
+		// Set the rasterizer's bounds to be big enough to handle the largest glyph.
+		// Y is negated because font co-ordinates have positive Y going up.
+		b := c.f.Bounds(c.scale)
+		xmin := +int(b.Min.X) >> 6
+		ymin := -int(b.Max.Y) >> 6
+		xmax := +int(b.Max.X+63) >> 6
+		ymax := -int(b.Min.Y-63) >> 6
+		c.r.SetBounds(xmax-xmin, ymax-ymin)
+	}
+	// Masks rasterized at the old scale are stale, so reset every slot.
+	for i := range c.cache {
+		c.cache[i] = cacheEntry{}
+	}
+}
+
+// SetDPI changes the screen resolution, in dots per inch. When the value
+// differs from the current one, the derived state is recomputed via recalc.
+func (c *Context) SetDPI(dpi float64) {
+	if c.dpi != dpi {
+		c.dpi = dpi
+		c.recalc()
+	}
+}
+
+// SetFont changes the font used to draw text. When f is not the font already
+// in use, the derived state is recomputed via recalc.
+func (c *Context) SetFont(f *truetype.Font) {
+	if c.f != f {
+		c.f = f
+		c.recalc()
+	}
+}
+
+// SetFontSize changes the font size, in points (as in "a 12 point font").
+// When the value differs from the current one, the derived state is
+// recomputed via recalc.
+func (c *Context) SetFontSize(fontSize float64) {
+	if c.fontSize != fontSize {
+		c.fontSize = fontSize
+		c.recalc()
+	}
+}
+
+// SetHinting changes the hinting policy and empties the glyph cache by
+// resetting every entry to its zero value.
+func (c *Context) SetHinting(hinting font.Hinting) {
+	c.hinting = hinting
+	for i := 0; i < len(c.cache); i++ {
+		c.cache[i] = cacheEntry{}
+	}
+}
+
+// SetDst sets the destination image for draw operations. DrawString draws
+// glyphs onto this image.
+func (c *Context) SetDst(dst draw.Image) {
+	c.dst = dst
+}
+
+// SetSrc sets the source image for draw operations. This is typically an
+// image.Uniform. DrawString uses it as the source that is drawn through each
+// glyph mask.
+func (c *Context) SetSrc(src image.Image) {
+	c.src = src
+}
+
+// SetClip sets the clip rectangle for drawing. DrawString intersects every
+// glyph rectangle with it and draws only the overlapping part.
+func (c *Context) SetClip(clip image.Rectangle) {
+	c.clip = clip
+}
+
+// TODO(nigeltao): implement Context.SetGamma.
+
+// NewContext returns a Context with an empty-bounds rasterizer, a 12 point
+// font size, 72 dpi and the matching scale of 12<<6. No font is set; call
+// SetFont before drawing or measuring.
+func NewContext() *Context {
+	c := new(Context)
+	c.r = raster.NewRasterizer(0, 0)
+	c.fontSize = 12
+	c.dpi = 72
+	c.scale = 12 << 6
+	return c
+}
--- a/example/yolo/luxisr.ttf
+++ b/example/yolo/luxisr.ttf
--- a/example/yolo/main.go
+++ b/example/yolo/main.go
@ -21,8 +21,8 @@ const (
 )

 var (
-	model string
-	image string
+	model     string
+	imageFile string
 )

 type Bbox struct {
@ -59,7 +59,7 @@ func Iou(b1, b2 Bbox) (retVal float64) {
 }

 // Assuming x1 <= x2 and y1 <= y2
-func drawRect(t ts.Tensor, x1, x2, y1, y2 int64) {
+func drawRect(t *ts.Tensor, x1, x2, y1, y2 int64) {
 	color := ts.MustOfSlice([]float64{0.0, 0.0, 1.0}).MustView([]int64{3, 1, 1}, true)

 	// NOTE: `narrow` will create a tensor (view) that share same storage with
@ -71,7 +71,39 @@ func drawRect(t ts.Tensor, x1, x2, y1, y2 int64) {
 	color.MustDrop()
 }

-func report(pred ts.Tensor, img ts.Tensor, w int64, h int64) (retVal ts.Tensor) {
+// drawLabel renders text as a small image and pastes it onto the image tensor
+// t with its top-left corner at column x, row y.
+//
+// t is indexed here as (channel, row, column): dimension 1 is narrowed with y
+// and dimension 2 with x. The label is clipped to the part that fits inside
+// t, and a negative x or y is clamped to 0. When the text cannot be rendered
+// (textToImageTs returns nil) or no part of the label fits, t is left
+// untouched.
+func drawLabel(t *ts.Tensor, text []string, x, y int64) {
+	device, err := t.Device()
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// textToImageTs logs and returns nil on failure; calling a method on that
+	// nil tensor would crash, so skip the label instead.
+	rendered := textToImageTs(text)
+	if rendered == nil {
+		return
+	}
+	label := rendered.MustTo(device, true)
+	defer label.MustDrop()
+
+	labelSize := label.MustSize()
+	imageSize := t.MustSize()
+
+	// Callers may ask for a position just above or left of the image (for
+	// example ymin-15); keep the label inside the image instead of passing a
+	// negative start to narrow.
+	if x < 0 {
+		x = 0
+	}
+	if y < 0 {
+		y = 0
+	}
+
+	// Clip against the space remaining from (x, y), not against the whole
+	// image size.
+	lenY := labelSize[1]
+	if y+lenY > imageSize[1] {
+		lenY = imageSize[1] - y
+	}
+	lenX := labelSize[2]
+	if x+lenX > imageSize[2] {
+		lenX = imageSize[2] - x
+	}
+	if lenX <= 0 || lenY <= 0 {
+		return
+	}
+
+	// NOTE: `narrow` will create a tensor (view) that share same storage with
+	// original one, so copying into dst writes into t. The label is narrowed
+	// to the same extent so that source and destination shapes match when the
+	// label had to be clipped.
+	srcCols := label.MustNarrow(2, 0, lenX, false)
+	src := srcCols.MustNarrow(1, 0, lenY, true)
+	dstCols := t.MustNarrow(2, x, lenX, false)
+	dst := dstCols.MustNarrow(1, y, lenY, true)
+	dst.Copy_(src)
+	dst.MustDrop()
+	src.MustDrop()
+}
+
+func report(pred *ts.Tensor, img *ts.Tensor, w int64, h int64) *ts.Tensor {
 	size2, err := pred.Size2()
 	if err != nil {
 		log.Fatal(err)
@ -176,18 +208,21 @@ func report(pred ts.Tensor, img ts.Tensor, w int64, h int64) (retVal ts.Tensor)
 			drawRect(image, xmin, xmax, max(ymin, ymax-2), ymax)
 			drawRect(image, xmin, min(xmax, xmin+2), ymin, ymax)
 			drawRect(image, max(xmin, xmax-2), xmax, ymin, ymax)
+
+			label := fmt.Sprintf("%v; %.3f\n", CocoClasses[classIndex], b.confidence)
+			drawLabel(image, []string{label}, xmin, ymin-15)
 		}
 	}

 	imgTmp := image.MustMul1(ts.FloatScalar(255.0), true)
-	retVal = imgTmp.MustTotype(gotch.Uint8, true)
+	retVal := imgTmp.MustTotype(gotch.Uint8, true)

 	return retVal
 }

 func init() {
 	flag.StringVar(&model, "model", "../../data/yolo/yolo-v3.pt", "Yolo model weights file")
-	flag.StringVar(&image, "image", "../../data/yolo/bondi.jpg", "image file to infer")
+	flag.StringVar(&imageFile, "image", "../../data/yolo/bondi.jpg", "image file to infer")
 }

 func main() {
@ -203,12 +238,12 @@ func main() {
 		log.Fatal(err)
 	}

-	imagePath, err := filepath.Abs(image)
+	imagePath, err := filepath.Abs(imageFile)
 	if err != nil {
 		log.Fatal(err)
 	}

-	var darknet Darknet = ParseConfig(configPath)
+	var darknet *Darknet = ParseConfig(configPath)

 	vs := nn.NewVarStore(gotch.CPU)
 	model := darknet.BuildModel(vs.Root())
@ -256,10 +291,6 @@ func main() {
 	if err != nil {
 		log.Fatal(err)
 	}
-
-	// TODO: write label/confidence val next to bouding boxes.
-	// Naive way is write 'write text on image' rather than on tensor.
-	// See this: https://stackoverflow.com/questions/38299930
 }

 func max(v1, v2 int64) (retVal int64) {
--- a/example/yolo/yolo_bondi.jpg
+++ b/example/yolo/yolo_bondi.jpg
--- a/gen/gen.ml
+++ b/gen/gen.ml
@ -1,7 +1,6 @@
 (* Automatically generate the C++ -> C -> Go bindings.
   This takes as input the Descriptions.yaml file that gets generated when
   building PyTorch from source.
-
   Run with: dune exec gen/gen.exe
 *)
 open Base
@ -347,15 +346,15 @@ module Func = struct
              | Bool -> "bool"
              | Int64 -> "int64"
              | Double -> "float64"
-              | Tensor -> "Tensor"
-              | TensorOption -> "Tensor"
+              | Tensor -> "*Tensor"
+              | TensorOption -> "*Tensor"
              | IntList -> "[]int64"
              | TensorList -> "[]Tensor"
              | String -> "string"
              (* TODO. Struct{Kind gotch.DType Device gotch.Device} *)
              (* E.g. `type KindDevice struct{}` *)
              | TensorOptions -> "gotch.KindDevice"
-              | Scalar -> "Scalar"
+              | Scalar -> "*Scalar"
              | ScalarType -> "gotch.DType"
              | Device -> "gotch.Device"
            in
@ -396,9 +395,9 @@ module Func = struct
    (* printf "t name: %s\n" t.name ; *)
    let returns =
      match t.returns with
-      | `fixed 1 -> "retVal Tensor"
+      | `fixed 1 -> "retVal *Tensor"
      | `fixed v ->
-          List.init v ~f:(fun i -> Printf.sprintf "retVal%d Tensor" i)
+          List.init v ~f:(fun i -> Printf.sprintf "retVal%d *Tensor" i)
          |> String.concat ~sep:", " |> Printf.sprintf "%s"
      | `dynamic -> "retVal []Tensor"
    in
@ -698,7 +697,7 @@ let write_wrapper funcs filename =
            match func.returns with
            | `dynamic ->
                pm "\n" ;
-                if is_method then pm "func(ts Tensor) %s(" gofunc_name
+                if is_method then pm "func(ts *Tensor) %s(" gofunc_name
                else pm "func %s(" gofunc_name ;
                pm "%s" go_args_list ;
                pm ")(%s) { \n" (Func.go_return_type func ~fallible:true) ;
@ -714,13 +713,13 @@ let write_wrapper funcs filename =
                pm "  }\n" ;
                (* NOTE. if in_place method, no retVal return *)
                if not (Func.is_inplace func) then
-                  pm "  retVal = Tensor{ctensor: *ptr}\n" ;
+                  pm "  retVal = &Tensor{ctensor: *ptr}\n" ;
                pm "  \n" ;
                pm "  return %s\n" (Func.go_return_notype func ~fallible:true) ;
                pm "} \n"
            | `fixed 1 ->
                pm "\n" ;
-                if is_method then pm "func(ts Tensor) %s(" gofunc_name
+                if is_method then pm "func(ts *Tensor) %s(" gofunc_name
                else pm "func %s(" gofunc_name ;
                pm "%s" go_args_list ;
                pm ")(%s) { \n" (Func.go_return_type func ~fallible:true) ;
@ -736,7 +735,7 @@ let write_wrapper funcs filename =
                pm "  }\n" ;
                (* NOTE. if in_place method, no retVal return *)
                if not (Func.is_inplace func) then
-                  pm "  retVal = Tensor{ctensor: *ptr}\n" ;
+                  pm "  retVal = &Tensor{ctensor: *ptr}\n" ;
                pm "  \n" ;
                pm "  return %s\n" (Func.go_return_notype func ~fallible:true) ;
                pm "} \n"
@ -804,7 +803,7 @@ let write_must_wrapper funcs filename =
            match func.returns with
            | `dynamic ->
                pm "\n" ;
-                if is_method then pm "func(ts Tensor) %s(" gofunc_name
+                if is_method then pm "func(ts *Tensor) %s(" gofunc_name
                else pm "func Must%s(" gofunc_name ;
                pm "%s" go_args_list ;
                pm ")(%s) { \n" (Func.go_return_type func ~fallible:false) ;
@ -821,7 +820,7 @@ let write_must_wrapper funcs filename =
                pm "} \n"
            | `fixed 1 ->
                pm "\n" ;
-                if is_method then pm "func(ts Tensor) Must%s(" gofunc_name
+                if is_method then pm "func(ts *Tensor) Must%s(" gofunc_name
                else pm "func Must%s(" gofunc_name ;
                pm "%s" go_args_list ;
                pm ")(%s) { \n" (Func.go_return_type func ~fallible:false) ;
--- a/go.mod
+++ b/go.mod
@ -1,3 +1,8 @@
 module github.com/sugarme/gotch

 go 1.14
+
+require (
+	github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0
+	golang.org/x/image v0.0.0-20200927104501-e162460cd6b5
+)
--- a/go.sum
+++ b/go.sum
@ -0,0 +1,6 @@
+github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
+github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
+github.com/sugarme/playgo v0.0.0-20200730185408-03b868cebe81 h1:s43waOvGVYyjw8i+Ll2Qb/ASt+etXG7LhWetEGTLjbc=
+golang.org/x/image v0.0.0-20200927104501-e162460cd6b5 h1:QelT11PB4FXiDEXucrfNckHoFxwt8USGY1ajP1ZF5lM=
+golang.org/x/image v0.0.0-20200927104501-e162460cd6b5/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
--- a/libtch/c-generated.go
+++ b/libtch/c-generated.go
--- a/nn/batch-norm.go
+++ b/nn/batch-norm.go
@ -17,8 +17,8 @@ type BatchNormConfig struct {
 	BsInit      Init
 }

-func DefaultBatchNormConfig() BatchNormConfig {
-	return BatchNormConfig{
+func DefaultBatchNormConfig() *BatchNormConfig {
+	return &BatchNormConfig{
 		CudnnEnable: true,
 		Eps:         1e-5,
 		Momentum:    0.1,
@ -29,17 +29,17 @@ func DefaultBatchNormConfig() BatchNormConfig {

 // A batch-normalization layer.
 type BatchNorm struct {
-	config      BatchNormConfig
-	RunningMean ts.Tensor
-	RunningVar  ts.Tensor
-	Ws          ts.Tensor
-	Bs          ts.Tensor
+	config      *BatchNormConfig
+	RunningMean *ts.Tensor
+	RunningVar  *ts.Tensor
+	Ws          *ts.Tensor
+	Bs          *ts.Tensor
 	Nd          uint
 }

 // NewBatchNorm creates a new BatchNorm layer
-func NewBatchNorm(vs Path, nd uint, outDim int64, config BatchNormConfig) BatchNorm {
-	return BatchNorm{
+func NewBatchNorm(vs *Path, nd uint, outDim int64, config *BatchNormConfig) *BatchNorm {
+	return &BatchNorm{
 		config:      config,
 		RunningMean: vs.ZerosNoTrain("running_mean", []int64{outDim}),
 		RunningVar:  vs.OnesNoTrain("running_var", []int64{outDim}),
@ -52,7 +52,7 @@ func NewBatchNorm(vs Path, nd uint, outDim int64, config BatchNormConfig) BatchN
 //
 // The input shape is assumed to be (N, C, L). Normalization
 // is performed over the first batch dimension N.
-func BatchNorm1D(vs Path, outDim int64, config BatchNormConfig) BatchNorm {
+func BatchNorm1D(vs *Path, outDim int64, config *BatchNormConfig) *BatchNorm {
 	return NewBatchNorm(vs, 1, outDim, config)
 }

@ -60,7 +60,7 @@ func BatchNorm1D(vs Path, outDim int64, config BatchNormConfig) BatchNorm {
 //
 // The input shape is assumed to be (N, C, H, W). Normalization
 // is performed over the first batch dimension N.
-func BatchNorm2D(vs Path, outDim int64, config BatchNormConfig) BatchNorm {
+func BatchNorm2D(vs *Path, outDim int64, config *BatchNormConfig) *BatchNorm {
 	return NewBatchNorm(vs, 2, outDim, config)
 }

@ -68,14 +68,14 @@ func BatchNorm2D(vs Path, outDim int64, config BatchNormConfig) BatchNorm {
 //
 // The input shape is assumed to be (N, C, D, H, W). Normalization
 // is performed over the first batch dimension N.
-func BatchNorm3D(vs Path, outDim int64, config BatchNormConfig) BatchNorm {
+func BatchNorm3D(vs *Path, outDim int64, config *BatchNormConfig) *BatchNorm {
 	return NewBatchNorm(vs, 3, outDim, config)
 }

 // Implement ModuleT interface for BatchNorm:
 // ==========================================

-func (bn BatchNorm) ForwardT(xs ts.Tensor, train bool) (retVal ts.Tensor) {
+func (bn *BatchNorm) ForwardT(xs *ts.Tensor, train bool) (retVal *ts.Tensor) {

 	dim := xs.Dim()

--- a/nn/conv-transpose.go
+++ b/nn/conv-transpose.go
@ -42,8 +42,8 @@ type ConvTranspose3DConfig struct {
 }

 // DefaultConvTranspose1DConfig creates a default 1D ConvTranspose config.
-func DefaultConvTranspose1DConfig() ConvTranspose1DConfig {
-	return ConvTranspose1DConfig{
+func DefaultConvTranspose1DConfig() *ConvTranspose1DConfig {
+	return &ConvTranspose1DConfig{
 		Stride:        []int64{1},
 		Padding:       []int64{0},
 		OutputPadding: []int64{0},
@ -56,83 +56,107 @@ func DefaultConvTranspose1DConfig() ConvTranspose1DConfig {
 }

 type ConvTranspose1D struct {
-	Ws     ts.Tensor
-	Bs     ts.Tensor // optional
-	Config ConvTranspose1DConfig
+	Ws     *ts.Tensor
+	Bs     *ts.Tensor // optional
+	Config *ConvTranspose1DConfig
 }

-func NewConvTranspose1D(vs *Path, inDim, outDim int64, ksizes []int64, cfg ConvTranspose1DConfig) ConvTranspose1D {
+func NewConvTranspose1D(vs *Path, inDim, outDim int64, ksizes []int64, cfg *ConvTranspose1DConfig) *ConvTranspose1D {
 	if len(ksizes) != 1 {
 		log.Fatalf("NewConvTranspose1D method call: Kernel size should be 1. Got %v\n", len(ksizes))
 	}

-	var conv ConvTranspose1D
-	conv.Config = cfg
-	if cfg.Bias {
-		conv.Bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
-	}
+	var (
+		ws *ts.Tensor
+		bs *ts.Tensor = ts.NewTensor()
+	)
+
 	weightSize := []int64{outDim, int64(inDim / cfg.Groups)}
 	weightSize = append(weightSize, ksizes...)
-	conv.Ws = vs.NewVar("weight", weightSize, cfg.WsInit)
+	ws = vs.NewVar("weight", weightSize, cfg.WsInit)

-	return conv
+	if cfg.Bias {
+		bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
+	}
+
+	return &ConvTranspose1D{
+		Ws:     ws,
+		Bs:     bs,
+		Config: cfg,
+	}
 }

 type ConvTranspose2D struct {
-	Ws     ts.Tensor
-	Bs     ts.Tensor // optional
-	Config ConvTranspose2DConfig
+	Ws     *ts.Tensor
+	Bs     *ts.Tensor // optional
+	Config *ConvTranspose2DConfig
 }

-func NewConvTranspose2D(vs *Path, inDim, outDim int64, ksizes []int64, cfg ConvTranspose2DConfig) ConvTranspose2D {
+func NewConvTranspose2D(vs *Path, inDim, outDim int64, ksizes []int64, cfg *ConvTranspose2DConfig) *ConvTranspose2D {

 	if len(ksizes) != 2 {
 		log.Fatalf("NewConvTranspose2D method call: Kernel size should be 2. Got %v\n", len(ksizes))
 	}
-	var conv ConvTranspose2D
-	conv.Config = cfg
+
+	var (
+		ws *ts.Tensor
+		bs *ts.Tensor = ts.NewTensor()
+	)
+
 	if cfg.Bias {
-		conv.Bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
+		bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
 	}
 	weightSize := []int64{outDim, int64(inDim / cfg.Groups)}
 	weightSize = append(weightSize, ksizes...)
-	conv.Ws = vs.NewVar("weight", weightSize, cfg.WsInit)
+	ws = vs.NewVar("weight", weightSize, cfg.WsInit)

-	return conv
+	return &ConvTranspose2D{
+		Ws:     ws,
+		Bs:     bs,
+		Config: cfg,
+	}
 }

 type ConvTranspose3D struct {
-	Ws     ts.Tensor
-	Bs     ts.Tensor // optional
-	Config ConvTranspose3DConfig
+	Ws     *ts.Tensor
+	Bs     *ts.Tensor // optional
+	Config *ConvTranspose3DConfig
 }

-func NewConvTranspose3D(vs *Path, inDim, outDim int64, ksizes []int64, cfg ConvTranspose3DConfig) ConvTranspose3D {
+func NewConvTranspose3D(vs *Path, inDim, outDim int64, ksizes []int64, cfg *ConvTranspose3DConfig) *ConvTranspose3D {
 	if len(ksizes) != 3 {
 		log.Fatalf("NewConvTranspose3D method call: Kernel size should be 3. Got %v\n", len(ksizes))
 	}
-	var conv ConvTranspose3D
-	conv.Config = cfg
+
+	var (
+		ws *ts.Tensor
+		bs *ts.Tensor = ts.NewTensor()
+	)
+
 	if cfg.Bias {
-		conv.Bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
+		bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
 	}
 	weightSize := []int64{outDim, int64(inDim / cfg.Groups)}
 	weightSize = append(weightSize, ksizes...)
-	conv.Ws = vs.NewVar("weight", weightSize, cfg.WsInit)
+	ws = vs.NewVar("weight", weightSize, cfg.WsInit)

-	return conv
+	return &ConvTranspose3D{
+		Ws:     ws,
+		Bs:     bs,
+		Config: cfg,
+	}
 }

 // Implement Module for ConvTranspose1D, ConvTranspose2D, ConvTranspose3D:
 // ============================================

-func (c ConvTranspose1D) Forward(xs ts.Tensor) ts.Tensor {
+func (c *ConvTranspose1D) Forward(xs *ts.Tensor) *ts.Tensor {
 	return ts.MustConvTranspose1d(xs, c.Ws, c.Bs, c.Config.Stride, c.Config.Padding, c.Config.OutputPadding, c.Config.Groups, c.Config.Dilation)
 }

-func (c ConvTranspose2D) Forward(xs ts.Tensor) ts.Tensor {
+func (c *ConvTranspose2D) Forward(xs *ts.Tensor) *ts.Tensor {
 	return ts.MustConvTranspose2d(xs, c.Ws, c.Bs, c.Config.Stride, c.Config.Padding, c.Config.OutputPadding, c.Config.Groups, c.Config.Dilation)
 }
-func (c ConvTranspose3D) Forward(xs ts.Tensor) ts.Tensor {
+func (c *ConvTranspose3D) Forward(xs *ts.Tensor) *ts.Tensor {
 	return ts.MustConvTranspose3d(xs, c.Ws, c.Bs, c.Config.Stride, c.Config.Padding, c.Config.OutputPadding, c.Config.Groups, c.Config.Dilation)
 }
--- a/nn/conv.go
+++ b/nn/conv.go
@ -40,8 +40,8 @@ type Conv3DConfig struct {
 }

 // DefaultConv1DConfig creates a default 1D ConvConfig.
-func DefaultConv1DConfig() Conv1DConfig {
-	return Conv1DConfig{
+func DefaultConv1DConfig() *Conv1DConfig {
+	return &Conv1DConfig{
 		Stride:   []int64{1},
 		Padding:  []int64{0},
 		Dilation: []int64{1},
@ -53,8 +53,8 @@ func DefaultConv1DConfig() Conv1DConfig {
 }

 // DefaultConv2DConfig creates a default 2D ConvConfig
-func DefaultConv2DConfig() Conv2DConfig {
-	return Conv2DConfig{
+func DefaultConv2DConfig() *Conv2DConfig {
+	return &Conv2DConfig{
 		Stride:   []int64{1, 1},
 		Padding:  []int64{0, 0},
 		Dilation: []int64{1, 1},
@ -66,60 +66,78 @@ func DefaultConv2DConfig() Conv2DConfig {
 }

 type Conv1D struct {
-	Ws     ts.Tensor
-	Bs     ts.Tensor // optional
-	Config Conv1DConfig
+	Ws     *ts.Tensor
+	Bs     *ts.Tensor // optional
+	Config *Conv1DConfig
 }

-func NewConv1D(vs *Path, inDim, outDim, k int64, cfg Conv1DConfig) Conv1D {
-	var conv Conv1D
-	conv.Config = cfg
+func NewConv1D(vs *Path, inDim, outDim, k int64, cfg *Conv1DConfig) *Conv1D {
+	var (
+		ws *ts.Tensor
+		bs *ts.Tensor = ts.NewTensor()
+	)
 	if cfg.Bias {
-		conv.Bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
+		bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
 	}
 	weightSize := []int64{outDim, int64(inDim / cfg.Groups)}
 	weightSize = append(weightSize, k)
-	conv.Ws = vs.NewVar("weight", weightSize, cfg.WsInit)
+	ws = vs.NewVar("weight", weightSize, cfg.WsInit)

-	return conv
+	return &Conv1D{
+		Ws:     ws,
+		Bs:     bs,
+		Config: cfg,
+	}
 }

 type Conv2D struct {
-	Ws     ts.Tensor
-	Bs     ts.Tensor // optional
-	Config Conv2DConfig
+	Ws     *ts.Tensor
+	Bs     *ts.Tensor // optional
+	Config *Conv2DConfig
 }

-func NewConv2D(vs Path, inDim, outDim int64, k int64, cfg Conv2DConfig) Conv2D {
-	var conv Conv2D
-	conv.Config = cfg
+func NewConv2D(vs *Path, inDim, outDim int64, k int64, cfg *Conv2DConfig) *Conv2D {
+	var (
+		ws *ts.Tensor
+		bs *ts.Tensor = ts.NewTensor()
+	)
 	if cfg.Bias {
-		conv.Bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
+		bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
 	}
 	weightSize := []int64{outDim, int64(inDim / cfg.Groups)}
 	weightSize = append(weightSize, k, k)
-	conv.Ws = vs.NewVar("weight", weightSize, cfg.WsInit)
+	ws = vs.NewVar("weight", weightSize, cfg.WsInit)

-	return conv
+	return &Conv2D{
+		Ws:     ws,
+		Bs:     bs,
+		Config: cfg,
+	}
 }

 type Conv3D struct {
-	Ws     ts.Tensor
-	Bs     ts.Tensor // optional
-	Config Conv3DConfig
+	Ws     *ts.Tensor
+	Bs     *ts.Tensor // optional
+	Config *Conv3DConfig
 }

-func NewConv3D(vs *Path, inDim, outDim, k int64, cfg Conv3DConfig) Conv3D {
-	var conv Conv3D
-	conv.Config = cfg
+func NewConv3D(vs *Path, inDim, outDim, k int64, cfg *Conv3DConfig) *Conv3D {
+	var (
+		ws *ts.Tensor
+		bs *ts.Tensor = ts.NewTensor()
+	)
 	if cfg.Bias {
-		conv.Bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
+		bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
 	}
 	weightSize := []int64{outDim, int64(inDim / cfg.Groups)}
 	weightSize = append(weightSize, k, k, k)
-	conv.Ws = vs.NewVar("weight", weightSize, cfg.WsInit)
+	ws = vs.NewVar("weight", weightSize, cfg.WsInit)

-	return conv
+	return &Conv3D{
+		Ws:     ws,
+		Bs:     bs,
+		Config: cfg,
+	}
 }

 type Conv interface{}
@ -172,43 +190,56 @@ func buildConvConfig(ksizes []int64) interface{} {

 // NewConv is a generic builder to build Conv1D, Conv2D, Conv3D. It returns
 // an interface Conv which might need a type assertion for further use.
-func NewConv(vs Path, inDim, outDim int64, ksizes []int64, config interface{}) Conv {
+func NewConv(vs *Path, inDim, outDim int64, ksizes []int64, config interface{}) Conv {

 	configT := reflect.TypeOf(config)
+	var (
+		ws *ts.Tensor
+		bs *ts.Tensor = ts.NewTensor()
+	)

 	switch {
-	case len(ksizes) == 1 && configT.Name() == "Conv1DConfig":
-		var conv Conv1D
-		conv.Config = config.(Conv1DConfig)
-		if config.(Conv1DConfig).Bias {
-			conv.Bs = vs.NewVar("bias", []int64{outDim}, config.(Conv1DConfig).BsInit)
+	case len(ksizes) == 1 && configT.String() == "*nn.Conv1DConfig":
+		cfg := config.(*Conv1DConfig)
+		if cfg.Bias {
+			bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
 		}
-		weightSize := []int64{outDim, int64(inDim / config.(Conv1DConfig).Groups)}
+		weightSize := []int64{outDim, int64(inDim / cfg.Groups)}
 		weightSize = append(weightSize, ksizes...)
-		conv.Ws = vs.NewVar("weight", weightSize, config.(Conv1DConfig).WsInit)
-		return conv
-	case len(ksizes) == 2 && configT.Name() == "Conv2DConfig":
-		var conv Conv2D
-		conv.Config = config.(Conv2DConfig)
-		if config.(Conv2DConfig).Bias {
-			conv.Bs = vs.NewVar("bias", []int64{outDim}, config.(Conv2DConfig).BsInit)
+		ws = vs.NewVar("weight", weightSize, cfg.WsInit)
+		return &Conv1D{
+			Ws:     ws,
+			Bs:     bs,
+			Config: cfg,
 		}
-		weightSize := []int64{outDim, int64(inDim / config.(Conv2DConfig).Groups)}
-		weightSize = append(weightSize, ksizes...)
-		conv.Ws = vs.NewVar("weight", weightSize, config.(Conv2DConfig).WsInit)
-		return conv
-	case len(ksizes) == 3 && configT.Name() == "Conv3DConfig":
-		var conv Conv3D
-		conv.Config = config.(Conv3DConfig)
-		if config.(Conv3DConfig).Bias {
-			conv.Bs = vs.NewVar("bias", []int64{outDim}, config.(Conv3DConfig).BsInit)
+	case len(ksizes) == 2 && configT.String() == "*nn.Conv2DConfig":
+		cfg := config.(*Conv2DConfig)
+		if cfg.Bias {
+			bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
 		}
-		weightSize := []int64{outDim, int64(inDim / config.(Conv3DConfig).Groups)}
+		weightSize := []int64{outDim, int64(inDim / cfg.Groups)}
 		weightSize = append(weightSize, ksizes...)
-		conv.Ws = vs.NewVar("weight", weightSize, config.(Conv3DConfig).WsInit)
-		return conv
+		ws = vs.NewVar("weight", weightSize, cfg.WsInit)
+		return &Conv2D{
+			Ws:     ws,
+			Bs:     bs,
+			Config: cfg,
+		}
+	case len(ksizes) == 3 && configT.String() == "*nn.Conv3DConfig":
+		cfg := config.(*Conv3DConfig)
+		if cfg.Bias {
+			bs = vs.NewVar("bias", []int64{outDim}, cfg.BsInit)
+		}
+		weightSize := []int64{outDim, int64(inDim / cfg.Groups)}
+		weightSize = append(weightSize, ksizes...)
+		ws = vs.NewVar("weight", weightSize, cfg.WsInit)
+		return &Conv3D{
+			Ws:     ws,
+			Bs:     bs,
+			Config: cfg,
+		}
 	default:
-		err := fmt.Errorf("Expected nd length from 1 to 3. Got %v\n", len(ksizes))
+		err := fmt.Errorf("Expected nd length from 1 to 3. Got %v - configT name: '%v'\n", len(ksizes), configT.String())
 		panic(err)
 	}
 }
@ -216,14 +247,14 @@ func NewConv(vs Path, inDim, outDim int64, ksizes []int64, config interface{}) C
 // Implement Module for Conv1D, Conv2D, Conv3D:
 // ============================================

-func (c Conv1D) Forward(xs ts.Tensor) ts.Tensor {
+func (c *Conv1D) Forward(xs *ts.Tensor) *ts.Tensor {
 	return ts.MustConv1d(xs, c.Ws, c.Bs, c.Config.Stride, c.Config.Padding, c.Config.Dilation, c.Config.Groups)
 }

-func (c Conv2D) Forward(xs ts.Tensor) ts.Tensor {
+func (c *Conv2D) Forward(xs *ts.Tensor) *ts.Tensor {
 	return ts.MustConv2d(xs, c.Ws, c.Bs, c.Config.Stride, c.Config.Padding, c.Config.Dilation, c.Config.Groups)
 }
-func (c Conv3D) Forward(xs ts.Tensor) ts.Tensor {
+func (c *Conv3D) Forward(xs *ts.Tensor) *ts.Tensor {
 	return ts.MustConv3d(xs, c.Ws, c.Bs, c.Config.Stride, c.Config.Padding, c.Config.Dilation, c.Config.Groups)
 }

@ -232,13 +263,13 @@ func (c Conv3D) Forward(xs ts.Tensor) ts.Tensor {

 // NOTE: the `train` param is currently unused; whether it will be used later is an open question.

-func (c Conv1D) ForwardT(xs ts.Tensor, train bool) ts.Tensor {
+func (c *Conv1D) ForwardT(xs *ts.Tensor, train bool) *ts.Tensor {
 	return ts.MustConv1d(xs, c.Ws, c.Bs, c.Config.Stride, c.Config.Padding, c.Config.Dilation, c.Config.Groups)
 }

-func (c Conv2D) ForwardT(xs ts.Tensor, train bool) ts.Tensor {
+func (c *Conv2D) ForwardT(xs *ts.Tensor, train bool) *ts.Tensor {
 	return ts.MustConv2d(xs, c.Ws, c.Bs, c.Config.Stride, c.Config.Padding, c.Config.Dilation, c.Config.Groups)
 }
-func (c Conv3D) ForwardT(xs ts.Tensor, train bool) ts.Tensor {
+func (c *Conv3D) ForwardT(xs *ts.Tensor, train bool) *ts.Tensor {
 	return ts.MustConv3d(xs, c.Ws, c.Bs, c.Config.Stride, c.Config.Padding, c.Config.Dilation, c.Config.Groups)
 }
--- a/nn/func.go
+++ b/nn/func.go
@ -7,36 +7,36 @@ import (
 )

 type Func struct {
-	f func(ts.Tensor) ts.Tensor
+	f func(*ts.Tensor) *ts.Tensor
 }

-func NewFunc(fn func(ts.Tensor) ts.Tensor) (retVal Func) {
+func NewFunc(fn func(*ts.Tensor) *ts.Tensor) (retVal Func) {
 	return Func{f: fn}
 }

 // Implement Module interface for Func:
 // ====================================
-func (fn Func) Forward(xs ts.Tensor) (retVal ts.Tensor) {
+func (fn Func) Forward(xs *ts.Tensor) (retVal *ts.Tensor) {
 	return fn.f(xs)
 }

 // ForwardT implements ModuleT for Func object as well.
 //
 // NOTE: train param will not be used.
-func (fn Func) ForwardT(xs ts.Tensor, train bool) (retVal ts.Tensor) {
+func (fn Func) ForwardT(xs *ts.Tensor, train bool) (retVal *ts.Tensor) {
 	return fn.f(xs)
 }

 type FuncT struct {
-	f func(ts.Tensor, bool) ts.Tensor
+	f func(*ts.Tensor, bool) *ts.Tensor
 }

-func NewFuncT(fn func(ts.Tensor, bool) ts.Tensor) (retVal FuncT) {
+func NewFuncT(fn func(*ts.Tensor, bool) *ts.Tensor) (retVal FuncT) {
 	return FuncT{f: fn}
 }

 // Implement ModuleT interface for FuncT:
 // ====================================
-func (fn FuncT) ForwardT(xs ts.Tensor, train bool) (retVal ts.Tensor) {
+func (fn FuncT) ForwardT(xs *ts.Tensor, train bool) (retVal *ts.Tensor) {
 	return fn.f(xs, train)
 }
--- a/nn/init.go
+++ b/nn/init.go
@ -11,10 +11,10 @@ import (

 type Init interface {
 	// creates a new tensor with the specified initialization
-	InitTensor(dims []int64, device gotch.Device) (retVal ts.Tensor)
+	InitTensor(dims []int64, device gotch.Device) (retVal *ts.Tensor)

 	// re-initializes (in-place) an existing tensor with the specified initialization
-	Set(tensor ts.Tensor)
+	Set(tensor *ts.Tensor)
 }

 // constInit:
@ -28,7 +28,7 @@ func NewConstInit(v float64) constInit {
 	return constInit{v}
 }

-func (c constInit) InitTensor(dims []int64, device gotch.Device) (retVal ts.Tensor) {
+func (c constInit) InitTensor(dims []int64, device gotch.Device) (retVal *ts.Tensor) {
 	var err error
 	kind := gotch.Float
 	switch {
@ -50,7 +50,7 @@ func (c constInit) InitTensor(dims []int64, device gotch.Device) (retVal ts.Tens
 	return retVal
 }

-func (c constInit) Set(tensor ts.Tensor) {
+func (c constInit) Set(tensor *ts.Tensor) {
 	var err error
 	scalarVal := ts.FloatScalar(c.value)
 	if err != nil {
@ -71,7 +71,7 @@ func NewRandnInit(mean, stdev float64) randnInit {
 	return randnInit{mean, stdev}
 }

-func (r randnInit) InitTensor(dims []int64, device gotch.Device) (retVal ts.Tensor) {
+func (r randnInit) InitTensor(dims []int64, device gotch.Device) (retVal *ts.Tensor) {
 	var err error
 	rand.Seed(86)

@ -92,9 +92,9 @@ func (r randnInit) InitTensor(dims []int64, device gotch.Device) (retVal ts.Tens

 }

-func (r randnInit) Set(tensor ts.Tensor) {
+func (r randnInit) Set(tensor *ts.Tensor) {
 	var (
-		randnTs ts.Tensor
+		randnTs *ts.Tensor
 		err     error
 	)

@ -128,7 +128,7 @@ func NewUniformInit(lo, up float64) uniformInit {
 	return uniformInit{lo, up}
 }

-func (u uniformInit) InitTensor(dims []int64, device gotch.Device) (retVal ts.Tensor) {
+func (u uniformInit) InitTensor(dims []int64, device gotch.Device) (retVal *ts.Tensor) {
 	var err error
 	kind := gotch.Float
 	retVal = ts.MustZeros(dims, kind, device)
@ -139,7 +139,7 @@ func (u uniformInit) InitTensor(dims []int64, device gotch.Device) (retVal ts.Te
 	return retVal
 }

-func (u uniformInit) Set(tensor ts.Tensor) {
+func (u uniformInit) Set(tensor *ts.Tensor) {
 	tensor.Uniform_(u.lo, u.up)
 }

@ -152,7 +152,7 @@ func NewKaimingUniformInit() kaimingUniformInit {
 	return kaimingUniformInit{}
 }

-func (k kaimingUniformInit) InitTensor(dims []int64, device gotch.Device) (retVal ts.Tensor) {
+func (k kaimingUniformInit) InitTensor(dims []int64, device gotch.Device) (retVal *ts.Tensor) {
 	var fanIn int64
 	if len(dims) == 0 {
 		log.Fatalf("KaimingUniformInit method call: dims (%v) should have length >= 1", dims)
@ -191,7 +191,7 @@ func factorial(n int64) (result int64) {
 	return 1
 }

-func (k kaimingUniformInit) Set(tensor ts.Tensor) {
+func (k kaimingUniformInit) Set(tensor *ts.Tensor) {
 	dims, err := tensor.Size()
 	if err != nil {
 		log.Fatalf("uniformInit - Set method call error: %v\n", err)
@ -218,12 +218,12 @@ func NewGlorotNInit() glorotNInit {
 	return glorotNInit{}
 }

-func (gl glorotNInit) InitTensor(dims []int64, device gotch.Device) (retVal ts.Tensor) {
+func (gl glorotNInit) InitTensor(dims []int64, device gotch.Device) (retVal *ts.Tensor) {
 	// TODO: implement

 	return
 }

-func (gl glorotNInit) Set(tensor ts.Tensor) {
+func (gl glorotNInit) Set(tensor *ts.Tensor) {
 	// TODO: implement
 }
--- a/nn/layer-norm.go
+++ b/nn/layer-norm.go
@ -14,8 +14,8 @@ type LayerNormConfig struct {
 	BsInit            Init
 }

-func DefaultLayerNormConfig() LayerNormConfig {
-	return LayerNormConfig{
+func DefaultLayerNormConfig() *LayerNormConfig {
+	return &LayerNormConfig{
 		CudnnEnable:       true,
 		Eps:               1e-5,
 		ElementwiseAffine: true,
@ -26,30 +26,30 @@ func DefaultLayerNormConfig() LayerNormConfig {

 // A layer-normalization layer.
 type LayerNorm struct {
-	Config          LayerNormConfig
-	Ws              ts.Tensor // optional
-	Bs              ts.Tensor // optional
+	Config          *LayerNormConfig
+	Ws              *ts.Tensor // optional
+	Bs              *ts.Tensor // optional
 	NormalizedShape []int64
 }

-func NewLayerNorm(vs Path, normalizedShape []int64, config LayerNormConfig) LayerNorm {
+func NewLayerNorm(vs Path, normalizedShape []int64, config *LayerNormConfig) *LayerNorm {

 	var (
-		ws ts.Tensor
-		bs ts.Tensor
+		ws *ts.Tensor
+		bs *ts.Tensor
 	)
 	if config.ElementwiseAffine {
 		ws = vs.NewVar("weight", normalizedShape, config.WsInit)
 		bs = vs.NewVar("bias", normalizedShape, config.BsInit)
 	}

-	return LayerNorm{config, ws, bs, normalizedShape}
+	return &LayerNorm{config, ws, bs, normalizedShape}
 }

 // Implement Module interface for LayerNorm:
 // =========================================

-func (ln LayerNorm) Forward(xs ts.Tensor) (retVal ts.Tensor) {
+func (ln *LayerNorm) Forward(xs *ts.Tensor) (retVal *ts.Tensor) {

 	return ts.MustLayerNorm(xs, ln.NormalizedShape, ln.Ws, ln.Bs, ln.Config.Eps, ln.Config.CudnnEnable)
 }
--- a/nn/linear.go
+++ b/nn/linear.go
@ -18,8 +18,8 @@ type LinearConfig struct {

 // DefaultLinearConfig creates the default LinearConfig, with
 // weights initialized using KaimingUniform and Bias set to true
-func DefaultLinearConfig() LinearConfig {
-	return LinearConfig{
+func DefaultLinearConfig() *LinearConfig {
+	return &LinearConfig{
 		WsInit: NewKaimingUniformInit(),
 		BsInit: nil,
 		Bias:   true,
@ -28,8 +28,8 @@ func DefaultLinearConfig() LinearConfig {

 // Linear is a linear fully-connected layer
 type Linear struct {
-	Ws ts.Tensor
-	Bs ts.Tensor
+	Ws *ts.Tensor
+	Bs *ts.Tensor
 }

 // NewLinear creates a new linear layer
@ -37,9 +37,9 @@ type Linear struct {
 // inDim - input dimension (x) [input features - columns]
 // outDim - output dimension (y) [output features - columns]
 // NOTE: w will have shape{outDim, inDim}; b will have shape{outDim}
-func NewLinear(vs Path, inDim, outDim int64, c LinearConfig) Linear {
+func NewLinear(vs *Path, inDim, outDim int64, c *LinearConfig) *Linear {

-	var bs ts.Tensor
+	var bs *ts.Tensor
 	// bs has size of output dimension
 	switch c.Bias {
 	case false:
@ -55,7 +55,7 @@ func NewLinear(vs Path, inDim, outDim int64, c LinearConfig) Linear {
 		}
 	}

-	return Linear{
+	return &Linear{
 		Ws: vs.NewVar("weight", []int64{outDim, inDim}, c.WsInit).MustT(false),
 		Bs: bs,
 	}
@ -89,7 +89,7 @@ func NewLinear(vs Path, inDim, outDim int64, c LinearConfig) Linear {
 // 	  1 1 1
 // 	  1 1 1
 // 		1 1 1 ]
-func (l Linear) Forward(xs ts.Tensor) (retVal ts.Tensor) {
+func (l *Linear) Forward(xs *ts.Tensor) (retVal *ts.Tensor) {

 	mul := xs.MustMatmul(l.Ws, false)
 	return mul.MustAdd(l.Bs, true)
@ -98,7 +98,7 @@ func (l Linear) Forward(xs ts.Tensor) (retVal ts.Tensor) {
 // ForwardT implements ModuleT interface for Linear layer.
 //
 // NOTE: train param will not be used.
-func (l Linear) ForwardT(xs ts.Tensor, train bool) (retVal ts.Tensor) {
+func (l *Linear) ForwardT(xs *ts.Tensor, train bool) (retVal *ts.Tensor) {

 	mul := xs.MustMatmul(l.Ws, false)
 	return mul.MustAdd(l.Bs, true)
--- a/nn/optimizer.go
+++ b/nn/optimizer.go
@ -10,7 +10,7 @@ import (

 // Optimizer is a struct object to run gradient descent.
 type Optimizer struct {
-	opt ts.COptimizer
+	opt *ts.COptimizer
 	// variables            Variables // having embedded sync.Mutex
 	variablesInOptimizer uint8
 	config               interface{}
@ -18,7 +18,7 @@ type Optimizer struct {

 // OptimizerConfig defines Optimizer configurations. These configs can be used to build optimizer.
 type OptimizerConfig interface {
-	buildCOpt(lr float64) (retVal ts.COptimizer, err error)
+	buildCOpt(lr float64) (*ts.COptimizer, error)

 	// Build builds an optimizer with the specified learning rate handling variables stored in `vs`.
 	//
@ -29,11 +29,11 @@ type OptimizerConfig interface {
 	// (config AdamOptimizerConfig) Build(vs VarStore, lr float64) (retVal Optimizer, err error){
 	//		return defaultBuild(config, vs, lr)
 	// }
-	Build(vs VarStore, lr float64) (retVal Optimizer, err error)
+	Build(vs *VarStore, lr float64) (*Optimizer, error)
 }

 // defaultBuild is `default` Build method for OptimizerConfig interface
-func defaultBuild(config OptimizerConfig, vs VarStore, lr float64) (retVal Optimizer, err error) {
+func defaultBuild(config OptimizerConfig, vs *VarStore, lr float64) (retVal *Optimizer, err error) {

 	opt, err := config.buildCOpt(lr)
 	if err != nil {
@ -43,7 +43,7 @@ func defaultBuild(config OptimizerConfig, vs VarStore, lr float64) (retVal Optim
 	var parameters []ts.Tensor
 	for _, v := range vs.Vars.TrainableVariables {
 		param := v.MustShallowClone()
-		parameters = append(parameters, param)
+		parameters = append(parameters, *param)
 	}

 	if len(vs.Vars.TrainableVariables) > 0 {
@ -54,7 +54,7 @@ func defaultBuild(config OptimizerConfig, vs VarStore, lr float64) (retVal Optim

 	// TODO: should we clone or copy?

-	return Optimizer{
+	return &Optimizer{
 		opt: opt,
 		// variables:            vs.Vars,
 		variablesInOptimizer: uint8(len(vs.Vars.TrainableVariables)),
@ -74,8 +74,8 @@ type SGDConfig struct {
 }

 // DefaultSGDConfig creates SGDConfig with default values.
-func DefaultSGDConfig() SGDConfig {
-	return SGDConfig{
+func DefaultSGDConfig() *SGDConfig {
+	return &SGDConfig{
 		Momentum:  0.0,
 		Dampening: 0.0,
 		Wd:        0.0,
@ -84,8 +84,8 @@ func DefaultSGDConfig() SGDConfig {
 }

 // NewSGDConfig creates the configuration for an SGD optimizer with the specified values
-func NewSGDConfig(momentum, dampening, wd float64, nesterov bool) (retVal SGDConfig) {
-	return SGDConfig{
+func NewSGDConfig(momentum, dampening, wd float64, nesterov bool) *SGDConfig {
+	return &SGDConfig{
 		Momentum:  momentum,
 		Dampening: dampening,
 		Wd:        wd,
@ -94,11 +94,11 @@ func NewSGDConfig(momentum, dampening, wd float64, nesterov bool) (retVal SGDCon
 }

 // Implement OptimizerConfig interface for SGDConfig
-func (c SGDConfig) buildCOpt(lr float64) (retVal ts.COptimizer, err error) {
+func (c *SGDConfig) buildCOpt(lr float64) (*ts.COptimizer, error) {
 	return ts.Sgd(lr, c.Momentum, c.Dampening, c.Wd, c.Nesterov)
 }

-func (c SGDConfig) Build(vs VarStore, lr float64) (retVal Optimizer, err error) {
+func (c *SGDConfig) Build(vs *VarStore, lr float64) (*Optimizer, error) {
 	return defaultBuild(c, vs, lr)
 }

@ -112,8 +112,8 @@ type AdamConfig struct {
 }

 // DefaultAdamConfig creates AdamConfig with default values
-func DefaultAdamConfig() AdamConfig {
-	return AdamConfig{
+func DefaultAdamConfig() *AdamConfig {
+	return &AdamConfig{
 		Beta1: 0.9,
 		Beta2: 0.999,
 		Wd:    0.0,
@ -121,8 +121,8 @@ func DefaultAdamConfig() AdamConfig {
 }

 // NewAdamConfig creates AdamConfig with specified values
-func NewAdamConfig(beta1, beta2, wd float64) AdamConfig {
-	return AdamConfig{
+func NewAdamConfig(beta1, beta2, wd float64) *AdamConfig {
+	return &AdamConfig{
 		Beta1: beta1,
 		Beta2: beta2,
 		Wd:    wd,
@ -130,11 +130,11 @@ func NewAdamConfig(beta1, beta2, wd float64) AdamConfig {
 }

 // Implement OptimizerConfig interface for AdamConfig
-func (c AdamConfig) buildCOpt(lr float64) (retVal ts.COptimizer, err error) {
+func (c *AdamConfig) buildCOpt(lr float64) (*ts.COptimizer, error) {
 	return ts.Adam(lr, c.Beta1, c.Beta2, c.Wd)
 }

-func (c AdamConfig) Build(vs VarStore, lr float64) (retVal Optimizer, err error) {
+func (c *AdamConfig) Build(vs *VarStore, lr float64) (*Optimizer, error) {
 	return defaultBuild(c, vs, lr)
 }

@ -150,8 +150,8 @@ type RMSPropConfig struct {
 }

 // DefaultRMSPropConfig creates RMSPropConfig with default values
-func DefaultRMSPropConfig() RMSPropConfig {
-	return RMSPropConfig{
+func DefaultRMSPropConfig() *RMSPropConfig {
+	return &RMSPropConfig{
 		Alpha:    0.99,
 		Eps:      1e-8,
 		Wd:       0.0,
@ -161,8 +161,8 @@ func DefaultRMSPropConfig() RMSPropConfig {
 }

 // NewRMSPropConfig creates RMSPropConfig with specified values
-func NewRMSPropConfig(alpha, eps, wd, momentum float64, centered bool) RMSPropConfig {
-	return RMSPropConfig{
+func NewRMSPropConfig(alpha, eps, wd, momentum float64, centered bool) *RMSPropConfig {
+	return &RMSPropConfig{
 		Alpha:    alpha,
 		Eps:      eps,
 		Wd:       wd,
@ -172,11 +172,11 @@ func NewRMSPropConfig(alpha, eps, wd, momentum float64, centered bool) RMSPropCo
 }

 // Implement OptimizerConfig interface for RMSPropConfig
-func (c RMSPropConfig) buildCOpt(lr float64) (retVal ts.COptimizer, err error) {
+func (c *RMSPropConfig) buildCOpt(lr float64) (*ts.COptimizer, error) {
 	return ts.RmsProp(lr, c.Alpha, c.Eps, c.Wd, c.Momentum, c.Centered)
 }

-func (c RMSPropConfig) Build(vs VarStore, lr float64) (retVal Optimizer, err error) {
+func (c *RMSPropConfig) Build(vs *VarStore, lr float64) (*Optimizer, error) {
 	return defaultBuild(c, vs, lr)
 }

@ -229,7 +229,7 @@ func (opt *Optimizer) Step() {
 }

 // BackwardStep applies a backward step pass, updates the gradients, and performs an optimization step.
-func (opt *Optimizer) BackwardStep(loss ts.Tensor) {
+func (opt *Optimizer) BackwardStep(loss *ts.Tensor) {

 	opt.addMissingVariables()

@ -250,7 +250,7 @@ func (opt *Optimizer) BackwardStep(loss ts.Tensor) {
 // BackwardStepClip applies a backward step pass, updates the gradients, and performs an optimization step.
 //
 // The gradients are clipped based on `max` before being applied.
-func (opt *Optimizer) BackwardStepClip(loss ts.Tensor, max float64) {
+func (opt *Optimizer) BackwardStepClip(loss *ts.Tensor, max float64) {
 	opt.addMissingVariables()

 	err := opt.opt.ZeroGrad()
--- a/nn/rnn.go
+++ b/nn/rnn.go
@ -15,33 +15,33 @@ type RNN interface {
 	// Applies a single step of the recurrent network.
 	//
 	// The input should have dimensions [batch_size, features].
-	Step(input ts.Tensor, inState State) State
+	Step(input *ts.Tensor, inState State) State

 	// Applies multiple steps of the recurrent network.
 	//
 	// The input should have dimensions [batch_size, seq_len, features].
 	// The initial state is the result of applying zero_state.
-	Seq(input ts.Tensor) (ts.Tensor, State)
+	Seq(input *ts.Tensor) (*ts.Tensor, State)

 	// Applies multiple steps of the recurrent network.
 	//
 	// The input should have dimensions [batch_size, seq_len, features].
-	SeqInit(input ts.Tensor, inState State) (ts.Tensor, State)
+	SeqInit(input *ts.Tensor, inState State) (*ts.Tensor, State)
 }

 // LSTMState is the state for an LSTM network; it contains two tensors.
 type LSTMState struct {
-	Tensor1 ts.Tensor
-	Tensor2 ts.Tensor
+	Tensor1 *ts.Tensor
+	Tensor2 *ts.Tensor
 }

 // The hidden state vector, which is also the output of the LSTM.
-func (ls LSTMState) H() (retVal ts.Tensor) {
+func (ls *LSTMState) H() *ts.Tensor {
 	return ls.Tensor1.MustShallowClone()
 }

 // The cell state vector.
-func (ls LSTMState) C() (retVal ts.Tensor) {
+func (ls *LSTMState) C() *ts.Tensor {
 	return ls.Tensor2.MustShallowClone()
 }

@ -57,8 +57,8 @@ type RNNConfig struct {
 }

 // DefaultRNNConfig creates the default RNN configuration
-func DefaultRNNConfig() RNNConfig {
-	return RNNConfig{
+func DefaultRNNConfig() *RNNConfig {
+	return &RNNConfig{
 		HasBiases:     true,
 		NumLayers:     1,
 		Dropout:       float64(0.0),
@ -74,12 +74,12 @@ func DefaultRNNConfig() RNNConfig {
 type LSTM struct {
 	flatWeights []ts.Tensor
 	hiddenDim   int64
-	config      RNNConfig
+	config      *RNNConfig
 	device      gotch.Device
 }

 // NewLSTM creates a LSTM layer.
-func NewLSTM(vs Path, inDim, hiddenDim int64, cfg RNNConfig) (retVal LSTM) {
+func NewLSTM(vs *Path, inDim, hiddenDim int64, cfg *RNNConfig) *LSTM {

 	var numDirections int64 = 1
 	if cfg.Bidirectional {
@ -100,7 +100,7 @@ func NewLSTM(vs Path, inDim, hiddenDim int64, cfg RNNConfig) (retVal LSTM) {
 			bIh := vs.Zeros("b_ih", []int64{gateDim})
 			bHh := vs.Zeros("b_hh", []int64{gateDim})

-			flatWeights = append(flatWeights, wIh, wHh, bIh, bHh)
+			flatWeights = append(flatWeights, *wIh, *wHh, *bIh, *bHh)
 		}
 	}

@ -112,7 +112,7 @@ func NewLSTM(vs Path, inDim, hiddenDim int64, cfg RNNConfig) (retVal LSTM) {
 		ts.Must_CudnnRnnFlattenWeight(flatWeights, 4, inDim, 2, hiddenDim, cfg.NumLayers, cfg.BatchFirst, cfg.Bidirectional)
 	}

-	return LSTM{
+	return &LSTM{
 		flatWeights: flatWeights,
 		hiddenDim:   hiddenDim,
 		config:      cfg,
@ -124,7 +124,7 @@ func NewLSTM(vs Path, inDim, hiddenDim int64, cfg RNNConfig) (retVal LSTM) {
 // Implement RNN interface for LSTM:
 // =================================

-func (l LSTM) ZeroState(batchDim int64) (retVal State) {
+func (l *LSTM) ZeroState(batchDim int64) State {
 	var numDirections int64 = 1
 	if l.config.Bidirectional {
 		numDirections = 2
@ -134,7 +134,7 @@ func (l LSTM) ZeroState(batchDim int64) (retVal State) {
 	shape := []int64{layerDim, batchDim, l.hiddenDim}
 	zeros := ts.MustZeros(shape, gotch.Float, l.device)

-	retVal = LSTMState{
+	retVal := &LSTMState{
 		Tensor1: zeros.MustShallowClone(),
 		Tensor2: zeros.MustShallowClone(),
 	}
@ -144,7 +144,7 @@ func (l LSTM) ZeroState(batchDim int64) (retVal State) {
 	return retVal
 }

-func (l LSTM) Step(input ts.Tensor, inState State) (retVal State) {
+func (l *LSTM) Step(input *ts.Tensor, inState State) State {
 	ip := input.MustUnsqueeze(1, false)

 	output, state := l.SeqInit(ip, inState)
@ -156,24 +156,24 @@ func (l LSTM) Step(input ts.Tensor, inState State) (retVal State) {
 	return state
 }

-func (l LSTM) Seq(input ts.Tensor) (output ts.Tensor, state State) {
+func (l *LSTM) Seq(input *ts.Tensor) (*ts.Tensor, State) {
 	batchDim := input.MustSize()[0]
 	inState := l.ZeroState(batchDim)

-	output, state = l.SeqInit(input, inState)
+	output, state := l.SeqInit(input, inState)

 	// Delete intermediate tensors in inState
-	inState.(LSTMState).Tensor1.MustDrop()
-	inState.(LSTMState).Tensor2.MustDrop()
+	inState.(*LSTMState).Tensor1.MustDrop()
+	inState.(*LSTMState).Tensor2.MustDrop()

 	return output, state
 }

-func (l LSTM) SeqInit(input ts.Tensor, inState State) (ts.Tensor, State) {
+func (l *LSTM) SeqInit(input *ts.Tensor, inState State) (*ts.Tensor, State) {

-	output, h, c := input.MustLstm([]ts.Tensor{inState.(LSTMState).Tensor1, inState.(LSTMState).Tensor2}, l.flatWeights, l.config.HasBiases, l.config.NumLayers, l.config.Dropout, l.config.Train, l.config.Bidirectional, l.config.BatchFirst)
+	output, h, c := input.MustLstm([]ts.Tensor{*inState.(*LSTMState).Tensor1, *inState.(*LSTMState).Tensor2}, l.flatWeights, l.config.HasBiases, l.config.NumLayers, l.config.Dropout, l.config.Train, l.config.Bidirectional, l.config.BatchFirst)

-	return output, LSTMState{
+	return output, &LSTMState{
 		Tensor1: h,
 		Tensor2: c,
 	}
@ -181,10 +181,10 @@ func (l LSTM) SeqInit(input ts.Tensor, inState State) (ts.Tensor, State) {

 // GRUState is a GRU state. It contains a single tensor.
 type GRUState struct {
-	Tensor ts.Tensor
+	Tensor *ts.Tensor
 }

-func (gs GRUState) Value() ts.Tensor {
+func (gs *GRUState) Value() *ts.Tensor {
 	return gs.Tensor
 }

@ -194,12 +194,12 @@ func (gs GRUState) Value() ts.Tensor {
 type GRU struct {
 	flatWeights []ts.Tensor
 	hiddenDim   int64
-	config      RNNConfig
+	config      *RNNConfig
 	device      gotch.Device
 }

 // NewGRU creates a new GRU layer
-func NewGRU(vs Path, inDim, hiddenDim int64, cfg RNNConfig) (retVal GRU) {
+func NewGRU(vs *Path, inDim, hiddenDim int64, cfg *RNNConfig) (retVal *GRU) {
 	var numDirections int64 = 1
 	if cfg.Bidirectional {
 		numDirections = 2
@ -222,7 +222,7 @@ func NewGRU(vs Path, inDim, hiddenDim int64, cfg RNNConfig) (retVal GRU) {
 			bIh := vs.Zeros("b_ih", []int64{gateDim})
 			bHh := vs.Zeros("b_hh", []int64{gateDim})

-			flatWeights = append(flatWeights, wIh, wHh, bIh, bHh)
+			flatWeights = append(flatWeights, *wIh, *wHh, *bIh, *bHh)
 		}
 	}

@ -232,7 +232,7 @@ func NewGRU(vs Path, inDim, hiddenDim int64, cfg RNNConfig) (retVal GRU) {
 		ts.Must_CudnnRnnFlattenWeight(flatWeights, 4, inDim, 3, hiddenDim, cfg.NumLayers, cfg.BatchFirst, cfg.Bidirectional)
 	}

-	return GRU{
+	return &GRU{
 		flatWeights: flatWeights,
 		hiddenDim:   hiddenDim,
 		config:      cfg,
@ -243,7 +243,7 @@ func NewGRU(vs Path, inDim, hiddenDim int64, cfg RNNConfig) (retVal GRU) {
 // Implement RNN interface for GRU:
 // ================================

-func (g GRU) ZeroState(batchDim int64) (retVal State) {
+func (g *GRU) ZeroState(batchDim int64) State {
 	var numDirections int64 = 1
 	if g.config.Bidirectional {
 		numDirections = 2
@ -254,10 +254,10 @@ func (g GRU) ZeroState(batchDim int64) (retVal State) {

 	tensor := ts.MustZeros(shape, gotch.Float, g.device)

-	return GRUState{Tensor: tensor}
+	return &GRUState{Tensor: tensor}
 }

-func (g GRU) Step(input ts.Tensor, inState State) (retVal State) {
+func (g *GRU) Step(input *ts.Tensor, inState State) State {
 	unsqueezedInput := input.MustUnsqueeze(1, false)
 	output, state := g.SeqInit(unsqueezedInput, inState)

@ -269,21 +269,21 @@ func (g GRU) Step(input ts.Tensor, inState State) (retVal State) {
 	return state
 }

-func (g GRU) Seq(input ts.Tensor) (output ts.Tensor, state State) {
+func (g *GRU) Seq(input *ts.Tensor) (*ts.Tensor, State) {
 	batchDim := input.MustSize()[0]
 	inState := g.ZeroState(batchDim)

-	output, state = g.SeqInit(input, inState)
+	output, state := g.SeqInit(input, inState)

 	// Delete intermediate tensors in inState
-	inState.(GRUState).Tensor.MustDrop()
+	inState.(*GRUState).Tensor.MustDrop()

 	return output, state
 }

-func (g GRU) SeqInit(input ts.Tensor, inState State) (ts.Tensor, State) {
+func (g *GRU) SeqInit(input *ts.Tensor, inState State) (*ts.Tensor, State) {

-	output, h := input.MustGru(inState.(GRUState).Tensor, g.flatWeights, g.config.HasBiases, g.config.NumLayers, g.config.Dropout, g.config.Train, g.config.Bidirectional, g.config.BatchFirst)
+	output, h := input.MustGru(inState.(*GRUState).Tensor, g.flatWeights, g.config.HasBiases, g.config.NumLayers, g.config.Dropout, g.config.Train, g.config.Bidirectional, g.config.BatchFirst)

-	return output, GRUState{Tensor: h}
+	return output, &GRUState{Tensor: h}
 }
--- a/nn/rnn_test.go
+++ b/nn/rnn_test.go
@ -10,7 +10,7 @@ import (
 	ts "github.com/sugarme/gotch/tensor"
 )

-func gruTest(rnnConfig nn.RNNConfig, t *testing.T) {
+func gruTest(rnnConfig *nn.RNNConfig, t *testing.T) {

 	var (
 		batchDim  int64 = 5
@ -32,10 +32,10 @@ func gruTest(rnnConfig nn.RNNConfig, t *testing.T) {

 	// Step test
 	input := ts.MustRandn([]int64{batchDim, inputDim}, gotch.Float, gotch.CPU)
-	output := gru.Step(input, gru.ZeroState(batchDim).(nn.GRUState))
+	output := gru.Step(input, gru.ZeroState(batchDim).(*nn.GRUState))

 	want := []int64{layerDim, batchDim, outputDim}
-	got := output.(nn.GRUState).Tensor.MustSize()
+	got := output.(*nn.GRUState).Tensor.MustSize()

 	if !reflect.DeepEqual(want, got) {
 		fmt.Println("Step test:")
@ -47,7 +47,7 @@ func gruTest(rnnConfig nn.RNNConfig, t *testing.T) {
 	input = ts.MustRandn([]int64{batchDim, seqLen, inputDim}, gotch.Float, gotch.CPU)
 	output, _ = gru.Seq(input)
 	wantSeq := []int64{batchDim, seqLen, outputDim * numDirections}
-	gotSeq := output.(ts.Tensor).MustSize()
+	gotSeq := output.(*ts.Tensor).MustSize()

 	if !reflect.DeepEqual(wantSeq, gotSeq) {
 		fmt.Println("Seq test:")
@ -75,7 +75,7 @@ func TestGRU(t *testing.T) {
 	gruTest(cfg, t)
 }

-func lstmTest(rnnConfig nn.RNNConfig, t *testing.T) {
+func lstmTest(rnnConfig *nn.RNNConfig, t *testing.T) {

 	var (
 		batchDim  int64 = 5
@ -97,12 +97,12 @@ func lstmTest(rnnConfig nn.RNNConfig, t *testing.T) {

 	// Step test
 	input := ts.MustRandn([]int64{batchDim, inputDim}, gotch.Float, gotch.CPU)
-	output := lstm.Step(input, lstm.ZeroState(batchDim).(nn.LSTMState))
+	output := lstm.Step(input, lstm.ZeroState(batchDim).(*nn.LSTMState))

 	wantH := []int64{layerDim, batchDim, outputDim}
-	gotH := output.(nn.LSTMState).Tensor1.MustSize()
+	gotH := output.(*nn.LSTMState).Tensor1.MustSize()
 	wantC := []int64{layerDim, batchDim, outputDim}
-	gotC := output.(nn.LSTMState).Tensor2.MustSize()
+	gotC := output.(*nn.LSTMState).Tensor2.MustSize()

 	if !reflect.DeepEqual(wantH, gotH) {
 		fmt.Println("Step test:")
@ -121,7 +121,7 @@ func lstmTest(rnnConfig nn.RNNConfig, t *testing.T) {
 	output, _ = lstm.Seq(input)

 	wantSeq := []int64{batchDim, seqLen, outputDim * numDirections}
-	gotSeq := output.(ts.Tensor).MustSize()
+	gotSeq := output.(*ts.Tensor).MustSize()

 	if !reflect.DeepEqual(wantSeq, gotSeq) {
 		fmt.Println("Seq test:")
--- a/nn/sequential.go
+++ b/nn/sequential.go
@ -14,15 +14,15 @@ type Sequential struct {
 }

 // Seq creates a new empty sequential layer
-func Seq() Sequential {
-	return Sequential{layers: make([]ts.Module, 0)}
+func Seq() *Sequential {
+	return &Sequential{layers: make([]ts.Module, 0)}
 }

 // Sequential methods:
 //====================

 // Len returns number of sub-layers embedded in this layer
-func (s Sequential) Len() (retVal int64) {
+func (s *Sequential) Len() (retVal int64) {
 	return int64(len(s.layers))
 }

@ -47,7 +47,7 @@ func (s *Sequential) AddFn(fn ts.Module) {
 }

 // ForwardAll applies the forward pass and returns the output for each layer.
-func (s *Sequential) ForwardAll(xs ts.Tensor, opts ...uint8) (retVal []ts.Tensor) {
+func (s *Sequential) ForwardAll(xs *ts.Tensor, opts ...uint8) (retVal []ts.Tensor) {

 	var n uint8 = uint8(len(s.layers))
 	if len(opts) > 0 {
@ -55,11 +55,11 @@ func (s *Sequential) ForwardAll(xs ts.Tensor, opts ...uint8) (retVal []ts.Tensor
 	}

 	if s.IsEmpty() {
-		return []ts.Tensor{xs.MustShallowClone()}
+		return []ts.Tensor{*xs.MustShallowClone()}
 	}

 	for i := 0; i < int(n); i++ {
-		retVal = append(retVal, s.layers[i].Forward(xs))
+		retVal = append(retVal, *s.layers[i].Forward(xs))
 	}

 	return retVal
@ -76,7 +76,7 @@ func WithUint8(n uint8) func() uint8 {
 // ==========================================

 // Forward implements Module interface for Sequential
-func (s *Sequential) Forward(xs ts.Tensor) (retVal ts.Tensor) {
+func (s *Sequential) Forward(xs *ts.Tensor) (retVal *ts.Tensor) {
 	if s.IsEmpty() {
 		return xs.MustShallowClone()
 	}
@ -85,12 +85,12 @@ func (s *Sequential) Forward(xs ts.Tensor) (retVal ts.Tensor) {
 	outs := make([]ts.Tensor, len(s.layers))
 	for i := 0; i < len(s.layers); i++ {
 		if i == 0 {
-			outs[0] = s.layers[i].Forward(xs)
+			outs[0] = *s.layers[i].Forward(xs)
 			defer outs[0].MustDrop()
 		} else if i == len(s.layers)-1 {
-			return s.layers[i].Forward(outs[i-1])
+			return s.layers[i].Forward(&outs[i-1])
 		} else {
-			outs[i] = s.layers[i].Forward(outs[i-1])
+			outs[i] = *s.layers[i].Forward(&outs[i-1])
 			defer outs[i].MustDrop()
 		}
 	}
@ -104,8 +104,8 @@ type SequentialT struct {
 }

 // SeqT creates a new empty sequential layer.
-func SeqT() SequentialT {
-	return SequentialT{
+func SeqT() *SequentialT {
+	return &SequentialT{
 		layers: make([]ts.ModuleT, 0),
 	}
 }
@ -125,22 +125,8 @@ func (s *SequentialT) IsEmpty() (retVal bool) {

 // Implement ModuleT interface for SequentialT:
 // ==========================================
-/*
- * func (s SequentialT) Forward(xs ts.Tensor) (retVal ts.Tensor) {
- *   if s.IsEmpty() {
- *     return xs.MustShallowClone()
- *   }
- *
- *   // forward sequentially
- *   var currTs ts.Tensor = xs
- *   for i := 0; i < len(s.layers); i++ {
- *     currTs = s.layers[i].Forward(currTs)
- *   }
- *
- *   return currTs
- * }
- *  */
-func (s SequentialT) ForwardT(xs ts.Tensor, train bool) (retVal ts.Tensor) {
+
+func (s *SequentialT) ForwardT(xs *ts.Tensor, train bool) *ts.Tensor {
 	if s.IsEmpty() {
 		return xs.MustShallowClone()
 	}
@ -149,18 +135,17 @@ func (s SequentialT) ForwardT(xs ts.Tensor, train bool) (retVal ts.Tensor) {
 	outs := make([]ts.Tensor, len(s.layers))
 	for i := 0; i < len(s.layers); i++ {
 		if i == 0 {
-			outs[0] = s.layers[i].ForwardT(xs, train)
+			outs[0] = *s.layers[i].ForwardT(xs, train)
 			defer outs[0].MustDrop()
 		} else if i == len(s.layers)-1 {
-			return s.layers[i].ForwardT(outs[i-1], train)
+			return s.layers[i].ForwardT(&outs[i-1], train)
 		} else {
-			outs[i] = s.layers[i].ForwardT(outs[i-1], train)
+			outs[i] = *s.layers[i].ForwardT(&outs[i-1], train)
 			defer outs[i].MustDrop()
 		}
 	}

-	return
-
+	panic("Shouldn't reached here.")
 }

 // Add appends a layer after all the current layers.
@ -187,7 +172,7 @@ func (s *SequentialT) AddFnT(fn ts.ModuleT) {
 }

 // ForwardAllT applies the forward pass and returns the output for each layer.
-func (s *SequentialT) ForwardAllT(xs ts.Tensor, train bool, opts ...uint8) (retVal []ts.Tensor) {
+func (s *SequentialT) ForwardAllT(xs *ts.Tensor, train bool, opts ...uint8) (retVal []ts.Tensor) {

 	var n uint8 = uint8(len(s.layers))
 	if len(opts) > 0 {
@ -195,13 +180,13 @@ func (s *SequentialT) ForwardAllT(xs ts.Tensor, train bool, opts ...uint8) (retV
 	}

 	if s.IsEmpty() {
-		return []ts.Tensor{xs.MustShallowClone()}
+		return []ts.Tensor{*xs.MustShallowClone()}
 	}

 	currTs := xs
 	for i := 0; i < int(n); i++ {
 		res := s.layers[i].ForwardT(currTs, train)
-		retVal = append(retVal, res)
+		retVal = append(retVal, *res)
 		currTs = res
 	}

@ -214,15 +199,15 @@ func (s *SequentialT) ForwardAllT(xs ts.Tensor, train bool, opts ...uint8) (retV
 // Ref. https://stackoverflow.com/a/42182987
 // NOTE: Specifically, `ForwardWith` is used to wrap anonymous function
 // as input parameter of `AddFn` Sequential method.
-type ForwardWith func(ts.Tensor) ts.Tensor
+type ForwardWith func(*ts.Tensor) *ts.Tensor

-func (fw ForwardWith) Forward(xs ts.Tensor) ts.Tensor {
+func (fw ForwardWith) Forward(xs *ts.Tensor) *ts.Tensor {
 	return fw(xs)
 }

-type ForwardTWith func(ts.Tensor, bool) ts.Tensor
+type ForwardTWith func(*ts.Tensor, bool) *ts.Tensor

-func (fw ForwardTWith) ForwardT(xs ts.Tensor, train bool) ts.Tensor {
+func (fw ForwardTWith) ForwardT(xs *ts.Tensor, train bool) *ts.Tensor {
 	return fw(xs, train)
 }

@ -235,7 +220,7 @@ func (fw ForwardTWith) ForwardT(xs ts.Tensor, train bool) ts.Tensor {
 // This does not seem to work in Go.
 // There are 2 ways to work around it. One is to freeze the VarStore, the other is
 // to manually set AutoGrad on the `loss` tensor, i.e., `loss = loss.MustSetRequiresGrad(true)`
-func BatchAccuracyForLogits(vs VarStore, m ts.ModuleT, xs, ys ts.Tensor, d gotch.Device, batchSize int) (retVal float64) {
+func BatchAccuracyForLogits(vs *VarStore, m ts.ModuleT, xs, ys *ts.Tensor, d gotch.Device, batchSize int) (retVal float64) {

 	var (
 		sumAccuracy float64 = 0.0
@ -272,7 +257,7 @@ func BatchAccuracyForLogits(vs VarStore, m ts.ModuleT, xs, ys ts.Tensor, d gotch
 // BatchAccuracyForLogitsIdx is an alternative to BatchAccuracyForLogits that
 // calculates accuracy for specified batch on module weight. It uses tensor
 // indexing instead of Iter2.
-func BatchAccuracyForLogitsIdx(vs VarStore, m ts.ModuleT, xs, ys ts.Tensor, d gotch.Device, batchSize int) (retVal float64) {
+func BatchAccuracyForLogitsIdx(vs *VarStore, m ts.ModuleT, xs, ys *ts.Tensor, d gotch.Device, batchSize int) (retVal float64) {
 	var (
 		sumAccuracy float64 = 0.0
 		sampleCount float64 = 0.0
--- a/nn/sparse.go
+++ b/nn/sparse.go
@ -14,8 +14,8 @@ type EmbeddingConfig struct {
 	PaddingIdx      int64
 }

-func DefaultEmbeddingConfig() EmbeddingConfig {
-	return EmbeddingConfig{
+func DefaultEmbeddingConfig() *EmbeddingConfig {
+	return &EmbeddingConfig{
 		Sparse:          false,
 		ScaleGradByFreq: false,
 		WsInit:          NewRandnInit(0.0, 1.0),
@ -28,13 +28,13 @@ func DefaultEmbeddingConfig() EmbeddingConfig {
 // An embedding layer acts as a simple lookup table that stores embeddings.
 // This is commonly used to store word embeddings.
 type Embedding struct {
-	Ws     ts.Tensor
-	config EmbeddingConfig
+	Ws     *ts.Tensor
+	config *EmbeddingConfig
 }

 // NewEmbedding creates a new Embedding
-func NewEmbedding(vs Path, numEmbeddings int64, embeddingDim int64, config EmbeddingConfig) Embedding {
-	return Embedding{
+func NewEmbedding(vs *Path, numEmbeddings int64, embeddingDim int64, config *EmbeddingConfig) *Embedding {
+	return &Embedding{
 		Ws:     vs.NewVar("weight", []int64{numEmbeddings, embeddingDim}, config.WsInit),
 		config: config,
 	}
@ -44,11 +44,11 @@ func NewEmbedding(vs Path, numEmbeddings int64, embeddingDim int64, config Embed
 // =========================================

 // Forward implements Module interface for Embedding
-func (e Embedding) Forward(xs ts.Tensor) (retVal ts.Tensor) {
+func (e *Embedding) Forward(xs *ts.Tensor) *ts.Tensor {
 	return ts.MustEmbedding(e.Ws, xs, e.config.PaddingIdx, e.config.ScaleGradByFreq, e.config.Sparse)
 }

 // ForwardT implements ModuleT interface for Embedding
-func (e Embedding) ForwardT(xs ts.Tensor, train bool) (retVal ts.Tensor) {
+func (e *Embedding) ForwardT(xs *ts.Tensor, train bool) *ts.Tensor {
 	return ts.MustEmbedding(e.Ws, xs, e.config.PaddingIdx, e.config.ScaleGradByFreq, e.config.Sparse)
 }
--- a/nn/sparse_test.go
+++ b/nn/sparse_test.go
@ -9,7 +9,7 @@ import (
 	ts "github.com/sugarme/gotch/tensor"
 )

-func embeddingTest(embeddingConfig nn.EmbeddingConfig, t *testing.T) {
+func embeddingTest(embeddingConfig *nn.EmbeddingConfig, t *testing.T) {

 	var (
 		batchDim  int64 = 5
--- a/nn/varstore.go
+++ b/nn/varstore.go
@ -20,7 +20,7 @@ const SEP = "."
 // however the tensor is not set to require gradients.
 type Variables struct {
 	mutex              *sync.Mutex
-	NamedVariables     map[string]ts.Tensor
+	NamedVariables     map[string]*ts.Tensor
 	TrainableVariables []ts.Tensor
 }

@ -45,14 +45,14 @@ type Entry struct {
 }

 // NewVarStore creates a new variable store located on the specified device
-func NewVarStore(device gotch.Device) VarStore {
+func NewVarStore(device gotch.Device) *VarStore {
 	variables := Variables{
 		mutex:              &sync.Mutex{},
-		NamedVariables:     make(map[string]ts.Tensor, 0),
+		NamedVariables:     make(map[string]*ts.Tensor, 0),
 		TrainableVariables: make([]ts.Tensor, 0),
 	}

-	return VarStore{
+	return &VarStore{
 		device: device,
 		Vars:   variables,
 	}
@ -94,7 +94,7 @@ func (vs *VarStore) TrainableVariables() (retVal []ts.Tensor) {

 	retVal = vs.Vars.TrainableVariables
 	for _, t := range vs.Vars.TrainableVariables {
-		retVal = append(retVal, t.MustShallowClone())
+		retVal = append(retVal, *t.MustShallowClone())
 	}

 	return retVal
@ -108,7 +108,7 @@ func (vs *VarStore) Variables() (retVal map[string]ts.Tensor) {
 	retVal = make(map[string]ts.Tensor, 0)

 	for k, v := range vs.Vars.NamedVariables {
-		retVal[k] = v.MustShallowClone()
+		retVal[k] = *v.MustShallowClone()
 	}

 	return retVal
@ -119,8 +119,8 @@ func (vs *VarStore) Variables() (retVal map[string]ts.Tensor) {
 // NOTE: Variables are named and organized using paths. This function returns
 // the top level path for the var store and can be combined with '/'
 // to create sub-paths.
-func (vs *VarStore) Root() (retVal Path) {
-	return Path{
+func (vs *VarStore) Root() *Path {
+	return &Path{
 		path:     []string{},
 		varstore: vs,
 	}
@ -130,7 +130,7 @@ func (vs *VarStore) Root() (retVal Path) {
 //
 // NOTE: Weight values for all the tensors currently stored in the
 // var-store get saved in the given file.
-func (vs *VarStore) Save(filepath string) (err error) {
+func (vs *VarStore) Save(filepath string) error {
 	vs.Vars.mutex.Lock()
 	defer vs.Vars.mutex.Unlock()

@ -155,7 +155,7 @@ func (vs *VarStore) Save(filepath string) (err error) {
 // for these tensors are modified.
 // It will throw error if name of the loaded tensors can not find
 // in the current var-store named tensors set.
-func (vs *VarStore) Load(filepath string) (err error) {
+func (vs *VarStore) Load(filepath string) error {
 	namedTensors, err := ts.LoadMultiWithDevice(filepath, vs.device)
 	if err != nil {
 		return err
@ -163,7 +163,7 @@ func (vs *VarStore) Load(filepath string) (err error) {

 	var namedTensorsMap map[string]ts.Tensor = make(map[string]ts.Tensor, 0)
 	for _, namedTensor := range namedTensors {
-		namedTensorsMap[namedTensor.Name] = namedTensor.Tensor
+		namedTensorsMap[namedTensor.Name] = *namedTensor.Tensor
 	}

 	// Match and in-place copy value (update) from newly loaded tensors
@ -190,7 +190,7 @@ func (vs *VarStore) Load(filepath string) (err error) {
 		}

 		ts.NoGrad(func() {
-			vs.Vars.NamedVariables[tsName].Copy_(currTs)
+			vs.Vars.NamedVariables[tsName].Copy_(&currTs)
 		})
 	}
 	return nil
@ -213,7 +213,7 @@ func (vs *VarStore) LoadPartial(filepath string) (retVal []string, err error) {
 		return nil, err
 	}

-	var namedTensorsMap map[string]ts.Tensor = make(map[string]ts.Tensor, 0)
+	var namedTensorsMap map[string]*ts.Tensor = make(map[string]*ts.Tensor, 0)
 	for _, namedTensor := range namedTensors {
 		namedTensorsMap[namedTensor.Name] = namedTensor.Tensor
 	}
@ -226,7 +226,7 @@ func (vs *VarStore) LoadPartial(filepath string) (retVal []string, err error) {
 	defer vs.Vars.mutex.Unlock()

 	for tsName := range vs.Vars.NamedVariables {
-		var currTs ts.Tensor
+		var currTs *ts.Tensor
 		var ok bool

 		// missing variable
@ -320,7 +320,7 @@ func (vs *VarStore) Copy(src VarStore) (err error) {
 // =============

 // Sub gets a sub-path of the given path.
-func (p *Path) Sub(str string) (retVal Path) {
+func (p *Path) Sub(str string) *Path {

 	if strings.Contains(str, SEP) {
 		log.Fatalf("Sub name cannot contain %v (%v)\n", SEP, str)
@ -328,7 +328,7 @@ func (p *Path) Sub(str string) (retVal Path) {

 	path := p.path
 	path = append(path, str)
-	return Path{
+	return &Path{
 		path:     path,
 		varstore: p.varstore,
 	}
@ -355,7 +355,7 @@ func (p *Path) getpath(name string) (retVal string) {
 	}
 }

-func (p *Path) add(name string, newTs ts.Tensor, trainable bool) (retVal ts.Tensor) {
+func (p *Path) add(name string, newTs *ts.Tensor, trainable bool) (retVal *ts.Tensor) {
 	path := p.getpath(name)

 	p.varstore.Vars.mutex.Lock()
@ -366,7 +366,7 @@ func (p *Path) add(name string, newTs ts.Tensor, trainable bool) (retVal ts.Tens
 	}

 	var (
-		tensor ts.Tensor
+		tensor *ts.Tensor
 		err    error
 	)
 	if trainable {
@ -379,7 +379,7 @@ func (p *Path) add(name string, newTs ts.Tensor, trainable bool) (retVal ts.Tens
 	}

 	if trainable {
-		p.varstore.Vars.TrainableVariables = append(p.varstore.Vars.TrainableVariables, tensor)
+		p.varstore.Vars.TrainableVariables = append(p.varstore.Vars.TrainableVariables, *tensor)
 	}

 	p.varstore.Vars.NamedVariables[path] = tensor
@ -387,7 +387,7 @@ func (p *Path) add(name string, newTs ts.Tensor, trainable bool) (retVal ts.Tens
 	return tensor
 }

-func (p *Path) getOrAddWithLock(name string, tensor ts.Tensor, trainable bool, variables Variables) (retVal ts.Tensor) {
+func (p *Path) getOrAddWithLock(name string, tensor *ts.Tensor, trainable bool, variables Variables) (retVal *ts.Tensor) {
 	path := p.getpath(name)

 	// if found, return it
@ -397,7 +397,7 @@ func (p *Path) getOrAddWithLock(name string, tensor ts.Tensor, trainable bool, v

 	// not found, add it
 	var err error
-	var ttensor ts.Tensor
+	var ttensor *ts.Tensor
 	if trainable {
 		ttensor, err = tensor.SetRequiresGrad(true, false)
 		if err != nil {
@ -408,7 +408,7 @@ func (p *Path) getOrAddWithLock(name string, tensor ts.Tensor, trainable bool, v
 	}

 	if trainable {
-		variables.TrainableVariables = append(variables.TrainableVariables, ttensor)
+		variables.TrainableVariables = append(variables.TrainableVariables, *ttensor)
 	}

 	variables.NamedVariables[path] = ttensor
@ -422,7 +422,7 @@ func (p *Path) getOrAddWithLock(name string, tensor ts.Tensor, trainable bool, v
 // has the specified shape. The variable will not be trainable so
 // gradients will not be tracked.
 // The variable uses a float tensor initialized with zeros.
-func (p *Path) ZerosNoTrain(name string, dims []int64) (retVal ts.Tensor) {
+func (p *Path) ZerosNoTrain(name string, dims []int64) (retVal *ts.Tensor) {

 	device := p.Device()
 	z, err := ts.Zeros(dims, gotch.Float, device)
@ -439,7 +439,7 @@ func (p *Path) ZerosNoTrain(name string, dims []int64) (retVal ts.Tensor) {
 // has the specified shape. The variable will not be trainable so
 // gradients will not be tracked.
 // The variable uses a float tensor initialized with ones.
-func (p *Path) OnesNoTrain(name string, dims []int64) (retVal ts.Tensor) {
+func (p *Path) OnesNoTrain(name string, dims []int64) (retVal *ts.Tensor) {

 	device := p.Device()
 	z, err := ts.Ones(dims, gotch.Float, device)
@ -457,7 +457,7 @@ func (p *Path) OnesNoTrain(name string, dims []int64) (retVal ts.Tensor) {
 // will be tracked.
 // The variable uses a float tensor initialized as per the
 // related argument.
-func (p *Path) NewVar(name string, dims []int64, ini Init) (retVal ts.Tensor) {
+func (p *Path) NewVar(name string, dims []int64, ini Init) (retVal *ts.Tensor) {

 	v := ini.InitTensor(dims, p.varstore.device)

@ -470,7 +470,7 @@ func (p *Path) NewVar(name string, dims []int64, ini Init) (retVal ts.Tensor) {
 // has the specified shape. The variable is trainable, its gradient
 // will be tracked.
 // The variable uses a float tensor initialized with zeros.
-func (p *Path) Zeros(name string, dims []int64) (retVal ts.Tensor) {
+func (p *Path) Zeros(name string, dims []int64) (retVal *ts.Tensor) {

 	return p.NewVar(name, dims, NewConstInit(0.0))
 }
@ -481,7 +481,7 @@ func (p *Path) Zeros(name string, dims []int64) (retVal ts.Tensor) {
 // has the specified shape. The variable is trainable, its gradient
 // will be tracked.
 // The variable uses a float tensor initialized with ones.
-func (p *Path) Ones(name string, dims []int64) (retVal ts.Tensor) {
+func (p *Path) Ones(name string, dims []int64) (retVal *ts.Tensor) {

 	return p.NewVar(name, dims, NewConstInit(1.0))
 }
@ -493,7 +493,7 @@ func (p *Path) Ones(name string, dims []int64) (retVal ts.Tensor) {
 // will be tracked.
 // The variable uses a float tensor initialized randomly using a
 // standard normal distribution.
-func (p *Path) RandnStandard(name string, dims []int64) (retVal ts.Tensor) {
+func (p *Path) RandnStandard(name string, dims []int64) (retVal *ts.Tensor) {

 	return p.NewVar(name, dims, NewRandnInit(0.0, 1.0))
 }
@ -505,7 +505,7 @@ func (p *Path) RandnStandard(name string, dims []int64) (retVal ts.Tensor) {
 // will be tracked.
 // The variable uses a float tensor initialized randomly using a
 // normal distribution with the specified mean and standard deviation.
-func (p *Path) Randn(name string, dims []int64, mean float64, stdev float64) (retVal ts.Tensor) {
+func (p *Path) Randn(name string, dims []int64, mean float64, stdev float64) (retVal *ts.Tensor) {

 	return p.NewVar(name, dims, NewRandnInit(mean, stdev))
 }
@ -517,7 +517,7 @@ func (p *Path) Randn(name string, dims []int64, mean float64, stdev float64) (re
 // will be tracked.
 // The variable uses a float tensor initialized randomly using a
 // uniform distribution between the specified bounds.
-func (p *Path) Uniform(name string, dims []int64, lo, up float64) (retVal ts.Tensor) {
+func (p *Path) Uniform(name string, dims []int64, lo, up float64) (retVal *ts.Tensor) {

 	return p.NewVar(name, dims, NewUniformInit(lo, up))
 }
@ -529,7 +529,7 @@ func (p *Path) Uniform(name string, dims []int64, lo, up float64) (retVal ts.Ten
 // will be tracked.
 // The variable uses a float tensor initialized randomly using a
 // uniform distribution which bounds follow Kaiming initialization.
-func (p *Path) KaimingUniform(name string, dims []int64) (retVal ts.Tensor) {
+func (p *Path) KaimingUniform(name string, dims []int64) (retVal *ts.Tensor) {

 	return p.NewVar(name, dims, NewKaimingUniformInit())
 }
@ -541,7 +541,7 @@ func (p *Path) KaimingUniform(name string, dims []int64) (retVal ts.Tensor) {
 // will be tracked.
 // The variable uses a float tensor initialized by copying some
 // given tensor.
-func (p *Path) VarCopy(name string, t ts.Tensor) (retVal ts.Tensor) {
+func (p *Path) VarCopy(name string, t *ts.Tensor) (retVal *ts.Tensor) {

 	size, err := t.Size()
 	if err != nil {
@ -557,7 +557,7 @@ func (p *Path) VarCopy(name string, t ts.Tensor) (retVal ts.Tensor) {
 }

 // Get gets the tensor corresponding to a given name if present.
-func (p *Path) Get(name string) (retVal ts.Tensor, err error) {
+func (p *Path) Get(name string) (retVal *ts.Tensor, err error) {

 	p.varstore.Vars.mutex.Lock()
 	defer p.varstore.Vars.mutex.Unlock()
@ -572,11 +572,11 @@ func (p *Path) Get(name string) (retVal ts.Tensor, err error) {
 }

 // Entry gets the entry corresponding to a given name for in-place manipulation.
-func (p *Path) Entry(name string) (retVal Entry) {
+func (p *Path) Entry(name string) *Entry {
 	p.varstore.Vars.mutex.Lock()
 	defer p.varstore.Vars.mutex.Unlock()

-	return Entry{
+	return &Entry{
 		name:      name,
 		variables: &p.varstore.Vars,
 		path:      p,
@ -592,14 +592,14 @@ func (p *Path) Entry(name string) (retVal Entry) {
 // var store, the corresponding tensor is returned. Otherwise a new
 // variable is added to the var-store with the entry name and is
 // initialized according to the init parameter.
-func (e *Entry) OrVar(dims []int64, init Init) (retVal ts.Tensor) {
+func (e *Entry) OrVar(dims []int64, init Init) (retVal *ts.Tensor) {

 	v := init.InitTensor(dims, e.path.varstore.device)
 	return e.path.getOrAddWithLock(e.name, v, true, *e.variables)
 }

 // OrVarCopy returns the existing entry if found, otherwise creates a new variable.
-func (e *Entry) OrVarCopy(tensor ts.Tensor) (retVal ts.Tensor) {
+func (e *Entry) OrVarCopy(tensor *ts.Tensor) (retVal *ts.Tensor) {

 	size, err := tensor.Size()
 	if err != nil {
@ -615,50 +615,50 @@ func (e *Entry) OrVarCopy(tensor ts.Tensor) (retVal ts.Tensor) {
 }

 // OrKaimingUniform returns the existing entry if found, otherwise creates a new variable.
-func (e *Entry) OrKaimingUniform(dims []int64) (retVal ts.Tensor) {
+func (e *Entry) OrKaimingUniform(dims []int64) (retVal *ts.Tensor) {

 	return e.OrVar(dims, NewKaimingUniformInit())
 }

 // OrOnes returns the existing entry if found, otherwise creates a new variable.
-func (e *Entry) OrOnes(dims []int64) (retVal ts.Tensor) {
+func (e *Entry) OrOnes(dims []int64) (retVal *ts.Tensor) {

 	return e.OrVar(dims, NewConstInit(1.0))
 }

 // OrOnesNoTrain returns the existing entry if found, otherwise creates a new variable.
-func (e *Entry) OrOnesNoTrain(dims []int64) (retVal ts.Tensor) {
+func (e *Entry) OrOnesNoTrain(dims []int64) (retVal *ts.Tensor) {

 	o := ts.MustOnes(dims, gotch.Float, e.path.Device())
 	return e.path.getOrAddWithLock(e.name, o, true, *e.variables)
 }

 // OrRandn returns the existing entry if found, otherwise creates a new variable.
-func (e *Entry) OrRandn(dims []int64, mean, stdev float64) (retVal ts.Tensor) {
+func (e *Entry) OrRandn(dims []int64, mean, stdev float64) (retVal *ts.Tensor) {

 	return e.OrVar(dims, NewRandnInit(mean, stdev))
 }

 // OrRandnStandard returns the existing entry if found, otherwise creates a new variable.
-func (e *Entry) OrRandnStandard(dims []int64) (retVal ts.Tensor) {
+func (e *Entry) OrRandnStandard(dims []int64) (retVal *ts.Tensor) {

 	return e.OrVar(dims, NewRandnInit(0.0, 1.0))
 }

 // OrUniform returns the existing entry if found, otherwise creates a new variable.
-func (e *Entry) OrUniform(dims []int64, lo, up float64) (retVal ts.Tensor) {
+func (e *Entry) OrUniform(dims []int64, lo, up float64) (retVal *ts.Tensor) {

 	return e.OrVar(dims, NewUniformInit(lo, up))
 }

 // OrZeros returns the existing entry if found, otherwise creates a new variable.
-func (e *Entry) OrZeros(dims []int64) (retVal ts.Tensor) {
+func (e *Entry) OrZeros(dims []int64) (retVal *ts.Tensor) {

 	return e.OrVar(dims, NewConstInit(0.0))
 }

 // OrZerosNoTrain returns the existing entry if found, otherwise creates a new variable.
-func (e *Entry) OrZerosNoTrain(dims []int64) (retVal ts.Tensor) {
+func (e *Entry) OrZerosNoTrain(dims []int64) (retVal *ts.Tensor) {

 	z := ts.MustZeros(dims, gotch.Float, e.path.Device())
 	return e.path.getOrAddWithLock(e.name, z, true, *e.variables)
--- a/nn/varstore_test.go
+++ b/nn/varstore_test.go
@ -46,7 +46,7 @@ func TestSaveLoad(t *testing.T) {
 		panic(err)
 	}

-	add := func(vs nn.Path) (ts.Tensor, ts.Tensor) {
+	add := func(vs *nn.Path) (*ts.Tensor, *ts.Tensor) {
 		subA := vs.Sub("a")
 		subB := subA.Sub("b")
 		v := subB.Ones("t2", []int64{3})
--- a/setup-cpu.sh
+++ b/setup-cpu.sh
@ -1,7 +1,7 @@
 #!/bin/bash

 # Env
-GOTCH_VERSION="${GOTCH_VER:-v0.1.10}"
+GOTCH_VERSION="${GOTCH_VER:-v0.2.0}"
 LIBTORCH_VERSION="${LIBTORCH_VER:-1.5.1}"

 GOTCH="$GOPATH/pkg/mod/github.com/sugarme/gotch@$GOTCH_VERSION"
--- a/setup-gpu.sh
+++ b/setup-gpu.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-GOTCH_VERSION="${GOTCH_VER:-v0.1.10}"
+GOTCH_VERSION="${GOTCH_VER:-v0.2.0}"
 LIBTORCH_VERSION="${LIBTORCH_VER:-1.5.1}"
 CUDA_VERSION="${CUDA_VER:-10.1}"
 CU_VERSION="${CUDA_VERSION//./}"
--- a/tensor/data.go
+++ b/tensor/data.go
@ -16,8 +16,8 @@ import (
 // containing a (potentially random) slice of each of the two input
 // tensors.
 type Iter2 struct {
-	xs                   Tensor
-	ys                   Tensor
+	xs                   *Tensor
+	ys                   *Tensor
 	batchIndex           int64
 	batchSize            int64
 	totalSize            int64
@ -38,12 +38,16 @@ type Iter2 struct {
 // * `xs` - the features to be used by the model.
 // * `ys` - the targets that the model attempts to predict.
 // * `batch_size` - the size of batches to be returned.
-func NewIter2(xs, ys Tensor, batchSize int64) (retVal Iter2, err error) {
+func NewIter2(xs, ys *Tensor, batchSize int64) (*Iter2, error) {
+	var (
+		iter *Iter2
+		err  error
+	)

 	totalSize := xs.MustSize()[0]
 	if ys.MustSize()[0] != totalSize {
 		err = fmt.Errorf("Different dimension for the two inputs: %v - %v", xs.MustSize(), ys.MustSize())
-		return retVal, err
+		return nil, err
 	}

 	// xsClone, err := xs.ZerosLike(false)
@ -58,7 +62,7 @@ func NewIter2(xs, ys Tensor, batchSize int64) (retVal Iter2, err error) {
 	// }
 	// ysClone.Copy_(ys)

-	retVal = Iter2{
+	iter = &Iter2{
 		xs: xs.MustShallowClone(),
 		ys: ys.MustShallowClone(),
 		// xs:                   xsClone,
@ -69,7 +73,7 @@ func NewIter2(xs, ys Tensor, batchSize int64) (retVal Iter2, err error) {
 		returnSmallLastBatch: false,
 	}

-	return retVal, nil
+	return iter, nil
 }

 // MustNewIter2 returns a new iterator.
@ -84,14 +88,14 @@ func NewIter2(xs, ys Tensor, batchSize int64) (retVal Iter2, err error) {
 // * `xs` - the features to be used by the model.
 // * `ys` - the targets that the model attempts to predict.
 // * `batch_size` - the size of batches to be returned.
-func MustNewIter2(xs, ys Tensor, batchSize int64) (retVal Iter2) {
-	retVal, err := NewIter2(xs, ys, batchSize)
+func MustNewIter2(xs, ys *Tensor, batchSize int64) *Iter2 {
+	iter, err := NewIter2(xs, ys, batchSize)

 	if err != nil {
 		log.Fatal(err)
 	}

-	return retVal
+	return iter
 }

 // Shuffle shuffles the dataset.
@ -108,20 +112,20 @@ func (it *Iter2) Shuffle() {
 }

 // ToDevice transfers the mini-batches to a specified device.
-func (it Iter2) ToDevice(device gotch.Device) (retVal Iter2) {
+func (it *Iter2) ToDevice(device gotch.Device) *Iter2 {
 	it.device = device
 	return it
 }

 // ReturnSmallLastBatch when set, returns the last batch even if smaller than the batch size.
-func (it Iter2) ReturnSmallLastBatch() (retVal Iter2) {
+func (it *Iter2) ReturnSmallLastBatch() *Iter2 {
 	it.returnSmallLastBatch = true
 	return it
 }

 type Iter2Item struct {
-	Data  Tensor
-	Label Tensor
+	Data  *Tensor
+	Label *Tensor
 }

 // Next implements iterator for Iter2
@ -148,7 +152,7 @@ func (it *Iter2) Next() (item Iter2Item, ok bool) {
 	}
 }

-func (it Iter2) Drop() {
+func (it *Iter2) Drop() {
 	it.xs.MustDrop()
 	it.ys.MustDrop()
 }
@ -156,17 +160,17 @@ func (it Iter2) Drop() {
 // TextData represents text data as a tensor of runes (uint8)
 // and its corresponding string.
 type TextData struct {
-	Data         Tensor // frequency (occurence) of byte value from input text
-	CharForLabel []rune // unique rune values from input text
+	Data         *Tensor // frequency (occurence) of byte value from input text
+	CharForLabel []rune  // unique rune values from input text
 }

 // TextDataIter is a text data iterator
 type TextDataIter struct {
-	Data       Tensor
+	Data       *Tensor
 	SeqLen     int64
 	BatchIndex int64
 	BatchSize  int64
-	Indexes    Tensor
+	Indexes    *Tensor
 	IndexesLen int64
 }

@ -179,17 +183,17 @@ type TextDataIter struct {
 // will labelled with new label(index)
 // Data: tensor of labels
 // CharForLabel: []rune (unique runes from text input)
-func NewTextData(filename string) (retVal TextData, err error) {
+func NewTextData(filename string) (*TextData, error) {
 	filePath, err := filepath.Abs(filename)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

 	r, err := os.Open(filePath)

 	buffer, err := ioutil.ReadAll(r)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

 	var labelForChar map[byte]uint8 = make(map[byte]uint8, 0)
@ -216,35 +220,35 @@ func NewTextData(filename string) (retVal TextData, err error) {

 	data := MustOfSlice(dataIndexes)

-	return TextData{
+	return &TextData{
 		Data:         data,
 		CharForLabel: charForLabel,
 	}, nil
 }

 // Labels returns the number of different `character` (rune) used by the dataset.
-func (td TextData) Labels() (retVal int64) {
+func (td *TextData) Labels() (retVal int64) {
 	return int64(len(td.CharForLabel))
 }

 // CloneData returns a shallow copy of the data.
-func (td TextData) CloneData() (retVal Tensor) {
+func (td *TextData) CloneData() *Tensor {
 	return td.Data.MustShallowClone()
 }

 // LabelForChar returns the corresponding `char` (rune) for
 // the specified label input.
-func (td TextData) LabelForChar(label int64) (retVal rune) {
+func (td *TextData) LabelForChar(label int64) rune {
 	return td.CharForLabel[int(label)]
 }

 // IterShuffle returns a batch iterator over the dataset.
 // Each sample is made of seq_len characters.
-func (td TextData) IterShuffle(seqLen int64, batchSize int64) (retVal TextDataIter) {
+func (td *TextData) IterShuffle(seqLen int64, batchSize int64) *TextDataIter {

 	indexesLen := td.Data.MustSize()[0] - seqLen + 1

-	return TextDataIter{
+	return &TextDataIter{
 		Data:       td.Data.MustShallowClone(),
 		SeqLen:     seqLen,
 		BatchIndex: 0,
@ -255,12 +259,12 @@ func (td TextData) IterShuffle(seqLen int64, batchSize int64) (retVal TextDataIt
 }

 // Next implements iterator for TextDataIter
-func (tdi *TextDataIter) Next() (retVal Tensor, ok bool) {
+func (tdi *TextDataIter) Next() (*Tensor, bool) {
 	start := tdi.BatchIndex * tdi.BatchSize
 	size := min(tdi.BatchSize, tdi.IndexesLen-start)

 	if size < tdi.BatchSize {
-		return retVal, false
+		return nil, false
 	}

 	tdi.BatchIndex += 1
@ -276,10 +280,10 @@ func (tdi *TextDataIter) Next() (retVal Tensor, ok bool) {
 	for _, idx := range indexes {
 		narrowIdx := NewNarrow(idx, idx+tdi.SeqLen)
 		idxTs := tdi.Data.Idx(narrowIdx)
-		batch = append(batch, idxTs)
+		batch = append(batch, *idxTs)
 	}

-	retVal = MustStack(batch, 0)
+	retVal := MustStack(batch, 0)

 	// Delete intermediate tensors
 	for _, xs := range batch {
@ -289,7 +293,7 @@ func (tdi *TextDataIter) Next() (retVal Tensor, ok bool) {
 	return retVal, true
 }

-func min(v1, v2 int64) (retVal int64) {
+func min(v1, v2 int64) int64 {
 	if v1 < v2 {
 		return v1
 	}
--- a/tensor/image.go
+++ b/tensor/image.go
@ -9,22 +9,20 @@ import (
 )

 // LoadHwc returns a tensor of shape [height, width, channels] on success.
-func LoadHwc(path string) (retVal Tensor, err error) {
+func LoadHwc(path string) (*Tensor, error) {

 	ctensor := lib.AtLoadImage(path)
-	err = TorchErr()
+	err := TorchErr()
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

-	retVal = Tensor{ctensor}
-
-	return retVal, nil
+	return &Tensor{ctensor}, nil
 }

 // SaveHwc saves an image from a tensor. It expects a tensor of shape [height,
 // width, channels].
-func SaveHwc(ts Tensor, path string) (err error) {
+func SaveHwc(ts *Tensor, path string) error {

 	lib.AtSaveImage(ts.ctensor, path)
 	return TorchErr()
@ -32,14 +30,13 @@ func SaveHwc(ts Tensor, path string) (err error) {

 // ResizeHwc expects a tensor of shape [height, width, channels].
 // On success returns a tensor of shape [height, width, channels].
-func ResizeHwc(ts Tensor, outWidth, outHeight int64) (retVal Tensor, err error) {
+func ResizeHwc(ts *Tensor, outWidth, outHeight int64) (*Tensor, error) {

 	ctensor := lib.AtResizeImage(ts.ctensor, outWidth, outHeight)
-	err = TorchErr()
+	err := TorchErr()
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}
-	retVal = Tensor{ctensor}

-	return retVal, nil
+	return &Tensor{ctensor}, nil
 }
--- a/tensor/index.go
+++ b/tensor/index.go
@ -79,7 +79,7 @@ type Narrow struct {
 	Start int64
 	End   int64
 }
-type IndexSelect struct{ Index Tensor }
+type IndexSelect struct{ Index *Tensor }
 type InsertNewAxis struct{}

 // NewSelect creates a tensor indexer with the given index.
@ -93,7 +93,7 @@ func NewNarrow(start, end int64) Narrow {
 	return Narrow{Start: start, End: end}
 }

-func NewIndexSelect(ts Tensor) IndexSelect {
+func NewIndexSelect(ts *Tensor) IndexSelect {
 	return IndexSelect{Index: ts}
 }

@ -130,7 +130,7 @@ type IndexOp interface {
 //
 // NOTE:
 // - `index`: expects type `TensorIndexer` or `[]TensorIndexer`
-func (ts *Tensor) Idx(index interface{}) (retVal Tensor) {
+func (ts *Tensor) Idx(index interface{}) (retVal *Tensor) {

 	// indexTyp := reflect.TypeOf(index)
 	indexVal := reflect.ValueOf(index)
@ -196,7 +196,7 @@ func (ts *Tensor) Idx(index interface{}) (retVal Tensor) {

 // Tensor Methods:
 // ===============
-func (ts Tensor) indexer(indexSpec []TensorIndexer) (retVal Tensor, err error) {
+func (ts *Tensor) indexer(indexSpec []TensorIndexer) (retVal *Tensor, err error) {

 	// Make sure number of non-newaxis is not exceed number of dimensions
 	var numNewAxis int = 0
@ -221,7 +221,7 @@ func (ts Tensor) indexer(indexSpec []TensorIndexer) (retVal Tensor, err error) {
 		// If `spec` is `IndexSelect` type and
 		if reflect.TypeOf(spec).Name() == "IndexSelect" {
 			if reflect.ValueOf(spec).Kind() == reflect.Struct {
-				inputTensor := reflect.ValueOf(spec).FieldByName("Index").Interface().(Tensor)
+				inputTensor := reflect.ValueOf(spec).FieldByName("Index").Interface().(*Tensor)

 				// 1. Either its input tensor has dimension > 1, throw error.
 				inputTensorShape, err := inputTensor.Size()
@ -249,9 +249,9 @@ func (ts Tensor) indexer(indexSpec []TensorIndexer) (retVal Tensor, err error) {

 	// Now, apply indexing from left to right.
 	var (
-		currTensor Tensor = ts.MustShallowClone()
-		currIdx    int64  = 0
-		nextTensor Tensor
+		currTensor *Tensor = ts.MustShallowClone()
+		currIdx    int64   = 0
+		nextTensor *Tensor
 		nextIdx    int64
 	)

@ -282,8 +282,8 @@ func (ts Tensor) indexer(indexSpec []TensorIndexer) (retVal Tensor, err error) {
 				return retVal, err
 			}
 			nextIdx = currIdx + 1
-		case "IndexSelect": // 1 field `(Index Tensor)`
-			indexTensor := reflect.ValueOf(spec).FieldByName("Index").Interface().(Tensor)
+		case "IndexSelect": // 1 field `(Index *Tensor)`
+			indexTensor := reflect.ValueOf(spec).FieldByName("Index").Interface().(*Tensor)
 			device, err := currTensor.Device()
 			if err != nil {
 				return retVal, err
@ -307,7 +307,7 @@ func (ts Tensor) indexer(indexSpec []TensorIndexer) (retVal Tensor, err error) {
 	return retVal, nil
 }

-func (ts Tensor) mustIndexer(indexSpec []TensorIndexer) (retVal Tensor) {
+func (ts *Tensor) mustIndexer(indexSpec []TensorIndexer) (retVal *Tensor) {
 	retVal, err := ts.indexer(indexSpec)
 	if err != nil {
 		panic(err)
--- a/tensor/iter.go
+++ b/tensor/iter.go
@ -14,27 +14,27 @@ type Iterator interface {
 type Iterable struct {
 	Index    int64
 	Len      int64
-	Content  Tensor
+	Content  *Tensor
 	ItemKind gotch.DType
 }

 // Next implements Iterator interface
-func (it *Iterable) Next() (retVal interface{}, ok bool) {
+func (it *Iterable) Next() (item interface{}, ok bool) {

 	if it.Index == it.Len {
-		return retVal, false
+		return nil, false
 	}

 	var err error
 	switch it.ItemKind.Kind().String() {
 	case "int64":
-		retVal, err = it.Content.Int64Value([]int64{it.Index})
+		item, err = it.Content.Int64Value([]int64{it.Index})
 		if err != nil {
 			log.Fatal(err)
 		}
 		it.Index += 1
 	case "float64":
-		retVal, err = it.Content.Float64Value([]int64{it.Index})
+		item, err = it.Content.Float64Value([]int64{it.Index})
 		if err != nil {
 			log.Fatal(err)
 		}
@ -44,22 +44,22 @@ func (it *Iterable) Next() (retVal interface{}, ok bool) {
 		log.Fatal(err)
 	}

-	return retVal, true
+	return item, true
 }

 // Iter creates an iterable object with specified item type.
-func (ts Tensor) Iter(dtype gotch.DType) (retVal Iterable, err error) {
+func (ts *Tensor) Iter(dtype gotch.DType) (*Iterable, error) {
 	num, err := ts.Size1() // size for 1D tensor
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}
 	tmp, err := ts.ShallowClone()
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}
 	content := tmp.MustTotype(dtype, true)

-	return Iterable{
+	return &Iterable{
 		Index:    0,
 		Len:      num,
 		Content:  content,
--- a/tensor/jit.go
+++ b/tensor/jit.go
@ -950,7 +950,7 @@ func ModuleLoadDataOnDevice(stream io.Reader, device gotch.Device) (retVal CModu
 }

 // Performs the forward pass for a model on some specified tensor inputs.
-func (cm CModule) ForwardTs(tensors []Tensor) (retVal Tensor, err error) {
+func (cm CModule) ForwardTs(tensors []Tensor) (retVal *Tensor, err error) {
 	var ctensors []lib.Ctensor
 	for _, t := range tensors {
 		ctensors = append(ctensors, t.ctensor)
@ -994,7 +994,7 @@ func (cm CModule) ForwardTs(tensors []Tensor) (retVal Tensor, err error) {
 		return retVal, err
 	}

-	return Tensor{ctensor}, nil
+	return &Tensor{ctensor}, nil
 }

 // Performs the forward pass for a model on some specified ivalue input.
@ -1066,9 +1066,9 @@ func (cm CModule) To(device gotch.Device, kind gotch.DType, nonBlocking bool) {
 // Implement Module for CModule:
 // =============================

-func (cm CModule) Forward(tensor Tensor) (retVal Tensor, err error) {
+func (cm CModule) Forward(tensor *Tensor) (retVal *Tensor, err error) {

-	var tensors []Tensor = []Tensor{tensor}
+	var tensors []Tensor = []Tensor{*tensor}
 	return cm.ForwardTs(tensors)
 }

@ -1076,7 +1076,7 @@ func (cm CModule) Forward(tensor Tensor) (retVal Tensor, err error) {
 // ======================================

 // Apply forwards tensor itself through a module.
-func (ts Tensor) ApplyCModule(m CModule) (retVal Tensor) {
+func (ts *Tensor) ApplyCModule(m CModule) (retVal *Tensor) {
 	retVal, err := m.Forward(ts)
 	if err != nil {
 		log.Fatal(err)
--- a/tensor/jit_test.go
+++ b/tensor/jit_test.go
@ -59,7 +59,7 @@ func TestModuleForwardTs(t *testing.T) {
 	ts1 := ts.TensorFrom([]int64{42})
 	ts2 := ts.TensorFrom([]int64{1337})

-	res, err := foo.ForwardTs([]ts.Tensor{ts1, ts2})
+	res, err := foo.ForwardTs([]ts.Tensor{*ts1, *ts2})
 	if err != nil {
 		t.Error(err)
 	}
@ -83,8 +83,8 @@ func TestModuleForwardIValue(t *testing.T) {
 	ts1 := ts.TensorFrom([]int64{42})
 	ts2 := ts.TensorFrom([]int64{1337})

-	iv1 := ts.NewIValue(ts1)
-	iv2 := ts.NewIValue(ts2)
+	iv1 := ts.NewIValue(*ts1)
+	iv2 := ts.NewIValue(*ts2)

 	got, err := foo.ForwardIs([]ts.IValue{iv1, iv2})
 	if err != nil {
@ -93,7 +93,7 @@ func TestModuleForwardIValue(t *testing.T) {

 	expectedTs1 := ts.TensorFrom([]int64{1421})
 	expectedTs2 := ts.TensorFrom([]int64{-1295})
-	want := ts.NewIValue([]ts.Tensor{expectedTs1, expectedTs2})
+	want := ts.NewIValue([]ts.Tensor{*expectedTs1, *expectedTs2})

 	if !reflect.DeepEqual(want.Name(), got.Name()) {
 		t.Errorf("Expected Ivalue Name: %v\n", want.Name())
--- a/tensor/module.go
+++ b/tensor/module.go
@ -9,7 +9,7 @@ package tensor
 // be registered, and will have their parameters converted too when you call .cuda(), etc.
 type Module interface {
 	// ModuleT
-	Forward(xs Tensor) Tensor
+	Forward(xs *Tensor) *Tensor
 }

 // ModuleT is a `Module` with an additional train parameter
@ -17,7 +17,7 @@ type Module interface {
 // between training and evaluation. E.g. When using dropout or batch-normalization.
 type ModuleT interface {
 	// Forward(xs Tensor) Tensor
-	ForwardT(xs Tensor, train bool) Tensor
+	ForwardT(xs *Tensor, train bool) *Tensor
 }

 /*
@ -99,18 +99,18 @@ type ModuleT interface {
 // ======================================

 // Apply forwards tensor itself through a module.
-func (ts Tensor) Apply(m Module) (retVal Tensor) {
+func (ts *Tensor) Apply(m Module) (retVal *Tensor) {
 	return m.Forward(ts)
 }

 // Apply forwards tensor itself through a module T.
-func (ts Tensor) ApplyT(m ModuleT, train bool) (retVal Tensor) {
+func (ts *Tensor) ApplyT(m ModuleT, train bool) (retVal *Tensor) {
 	return m.ForwardT(ts, train)
 }

 // ApplyOpt forwards a tensor itself through a module if given, shallow-copies
 // the tensor otherwise.
-func (ts Tensor) ApplyOpt(opts ...ModuleOption) (retVal Tensor) {
+func (ts *Tensor) ApplyOpt(opts ...ModuleOption) (retVal *Tensor) {

 	switch {
 	case len(opts) > 0:
@ -131,7 +131,7 @@ func WithModule(m Module) ModuleOption {

 // ApplyOptT forwards a tensor itself through a module T if given, shallow-copies
 // the tensor otherwise.
-func (ts Tensor) ApplyOptT(train bool, opts ...ModuleTOption) (retVal Tensor) {
+func (ts *Tensor) ApplyOptT(train bool, opts ...ModuleTOption) (retVal *Tensor) {

 	switch {
 	case len(opts) > 0:
--- a/tensor/must-tensor-generated.go
+++ b/tensor/must-tensor-generated.go
--- a/tensor/optimizer.go
+++ b/tensor/optimizer.go
@ -11,20 +11,18 @@ type COptimizer struct {
 }

 // Adam returns Adam optimizer
-func Adam(lr, beta1, beta2, weightDecay float64) (retVal COptimizer, err error) {
+func Adam(lr, beta1, beta2, weightDecay float64) (*COptimizer, error) {
 	coptimizer := lib.AtoAdam(lr, beta1, beta2, weightDecay)

-	err = TorchErr()
-	if err != nil {
-		return retVal, err
+	if err := TorchErr(); err != nil {
+		return nil, err
 	}

-	retVal = COptimizer{coptimizer}
-	return retVal, nil
+	return &COptimizer{coptimizer}, nil
 }

 // RmsProp returns RMSProp optimizer
-func RmsProp(lr, alpha, eps, wd, momentum float64, centered bool) (retVal COptimizer, err error) {
+func RmsProp(lr, alpha, eps, wd, momentum float64, centered bool) (*COptimizer, error) {
 	var centeredCInt int
 	switch centered {
 	case true:
@ -34,19 +32,15 @@ func RmsProp(lr, alpha, eps, wd, momentum float64, centered bool) (retVal COptim
 	}

 	coptimizer := lib.AtoRmsProp(lr, alpha, eps, wd, momentum, centeredCInt)
-	err = TorchErr()
-	if err != nil {
-		return retVal, err
+	if err := TorchErr(); err != nil {
+		return nil, err
 	}

-	retVal = COptimizer{coptimizer}
-
-	return retVal, nil
-
+	return &COptimizer{coptimizer}, nil
 }

 // Sgd returns SGD optimizer
-func Sgd(lr, momentum, dampening, wd float64, nesterov bool) (retVal COptimizer, err error) {
+func Sgd(lr, momentum, dampening, wd float64, nesterov bool) (*COptimizer, error) {
 	var nesterovCInt int
 	switch nesterov {
 	case true:
@ -56,18 +50,15 @@ func Sgd(lr, momentum, dampening, wd float64, nesterov bool) (retVal COptimizer,
 	}

 	coptimizer := lib.AtoSgd(lr, momentum, dampening, wd, nesterovCInt)
-	err = TorchErr()
-	if err != nil {
-		return retVal, err
+	if err := TorchErr(); err != nil {
+		return nil, err
 	}

-	retVal = COptimizer{coptimizer}
-
-	return retVal, nil
+	return &COptimizer{coptimizer}, nil
 }

 // AddParameters adds parameters as a slice of tensors to optimizer
-func (co COptimizer) AddParameters(tensors []Tensor) (err error) {
+func (co *COptimizer) AddParameters(tensors []Tensor) error {

 	var ctensors []lib.Ctensor
 	for _, t := range tensors {
@ -82,35 +73,35 @@ func (co COptimizer) AddParameters(tensors []Tensor) (err error) {
 }

 // SetLearningRate sets the learning rate for the optimizer
-func (co COptimizer) SetLearningRate(lr float64) (err error) {
+func (co *COptimizer) SetLearningRate(lr float64) error {
 	lib.AtoSetLearningRate(co.coptimizer, lr)

 	return TorchErr()
 }

 // SetMomentum sets a momentum for the optimizer
-func (co COptimizer) SetMomentum(m float64) (err error) {
+func (co *COptimizer) SetMomentum(m float64) error {
 	lib.AtoSetMomentum(co.coptimizer, m)

 	return TorchErr()
 }

 // ZeroGrad sets gradients to zero
-func (co COptimizer) ZeroGrad() (err error) {
+func (co *COptimizer) ZeroGrad() error {
 	lib.AtoZeroGrad(co.coptimizer)

 	return TorchErr()
 }

 // Step runs one step of the optimizer
-func (co COptimizer) Step() (err error) {
+func (co *COptimizer) Step() error {
 	lib.AtoStep(co.coptimizer)

 	return TorchErr()
 }

 // Drop removes optimizer and frees up memory.
-func (co COptimizer) Drop() {
+func (co *COptimizer) Drop() {
 	lib.AtoFree(co.coptimizer)

 	if err := TorchErr(); err != nil {
--- a/tensor/other.go
+++ b/tensor/other.go
@ -7,7 +7,7 @@ import (
 )

 // CrossEntropyForLogits computes the cross-entropy loss based on some logits and targets.
-func (ts Tensor) CrossEntropyForLogits(targets Tensor) (retVal Tensor) {
+func (ts *Tensor) CrossEntropyForLogits(targets *Tensor) (retVal *Tensor) {
 	weight := NewTensor()
 	reduction := int64(1) // Mean of loss
 	ignoreIndex := int64(-100)
@ -18,13 +18,13 @@ func (ts Tensor) CrossEntropyForLogits(targets Tensor) (retVal Tensor) {

 // AccuracyForLogits returns the average accuracy for some given logits assuming that
 // targets represent ground-truth.
-func (ts Tensor) AccuracyForLogits(targets Tensor) (retVal Tensor) {
+func (ts *Tensor) AccuracyForLogits(targets *Tensor) (retVal *Tensor) {
 	argmax := ts.MustArgmax(-1, false, true)
 	eq1 := argmax.MustEq1(targets, true)
 	return eq1.MustTotype(gotch.Float, true).MustMean(gotch.Float, true)
 }

-func (ts Tensor) MaxPool2DDefault(ksize int64, del bool) (retVal Tensor) {
+func (ts *Tensor) MaxPool2DDefault(ksize int64, del bool) (retVal *Tensor) {
 	return ts.MustMaxPool2d([]int64{ksize, ksize}, []int64{ksize, ksize}, []int64{0, 0}, []int64{1, 1}, false, del)
 }

--- a/tensor/patch.go
+++ b/tensor/patch.go
@ -13,7 +13,7 @@ import (
 // NOTE. This is a temporary patch to make it run.
 // TODO. make change at generator for []Tensor input

-func (ts Tensor) Lstm(hxData []Tensor, paramsData []Tensor, hasBiases bool, numLayers int64, dropout float64, train bool, bidirectional bool, batchFirst bool) (output, h, c Tensor, err error) {
+func (ts *Tensor) Lstm(hxData []Tensor, paramsData []Tensor, hasBiases bool, numLayers int64, dropout float64, train bool, bidirectional bool, batchFirst bool) (output, h, c *Tensor, err error) {

 	// NOTE: `atg_lstm` will create 3 consecutive Ctensors in memory of C land. The first
 	// Ctensor will have address given by `ctensorPtr1` here.
@ -55,11 +55,11 @@ func (ts Tensor) Lstm(hxData []Tensor, paramsData []Tensor, hasBiases bool, numL
 		return output, h, c, err
 	}

-	return Tensor{ctensor: *ctensorPtr1}, Tensor{ctensor: *ctensorPtr2}, Tensor{ctensor: *ctensorPtr3}, nil
+	return &Tensor{ctensor: *ctensorPtr1}, &Tensor{ctensor: *ctensorPtr2}, &Tensor{ctensor: *ctensorPtr3}, nil

 }

-func (ts Tensor) MustLstm(hxData []Tensor, paramsData []Tensor, hasBiases bool, numLayers int64, dropout float64, train bool, bidirectional bool, batchFirst bool) (output, h, c Tensor) {
+func (ts *Tensor) MustLstm(hxData []Tensor, paramsData []Tensor, hasBiases bool, numLayers int64, dropout float64, train bool, bidirectional bool, batchFirst bool) (output, h, c *Tensor) {
 	output, h, c, err := ts.Lstm(hxData, paramsData, hasBiases, numLayers, dropout, train, bidirectional, batchFirst)

 	if err != nil {
@ -69,7 +69,7 @@ func (ts Tensor) MustLstm(hxData []Tensor, paramsData []Tensor, hasBiases bool,
 	return output, h, c
 }

-func (ts Tensor) Gru(hx Tensor, paramsData []Tensor, hasBiases bool, numLayers int64, dropout float64, train bool, bidirectional bool, batchFirst bool) (output, h Tensor, err error) {
+func (ts *Tensor) Gru(hx *Tensor, paramsData []Tensor, hasBiases bool, numLayers int64, dropout float64, train bool, bidirectional bool, batchFirst bool) (output, h *Tensor, err error) {

 	// NOTE: `atg_gru` will create 2 consecutive Ctensors in memory of C land.
 	// The first Ctensor will have address given by `ctensorPtr1` here.
@ -105,11 +105,11 @@ func (ts Tensor) Gru(hx Tensor, paramsData []Tensor, hasBiases bool, numLayers i
 		return output, h, err
 	}

-	return Tensor{ctensor: *ctensorPtr1}, Tensor{ctensor: *ctensorPtr2}, nil
+	return &Tensor{ctensor: *ctensorPtr1}, &Tensor{ctensor: *ctensorPtr2}, nil

 }

-func (ts Tensor) MustGru(hx Tensor, paramsData []Tensor, hasBiases bool, numLayers int64, dropout float64, train bool, bidirectional bool, batchFirst bool) (output, h Tensor) {
+func (ts *Tensor) MustGru(hx *Tensor, paramsData []Tensor, hasBiases bool, numLayers int64, dropout float64, train bool, bidirectional bool, batchFirst bool) (output, h *Tensor) {
 	output, h, err := ts.Gru(hx, paramsData, hasBiases, numLayers, dropout, train, bidirectional, batchFirst)
 	if err != nil {
 		log.Fatal(err)
@ -118,7 +118,7 @@ func (ts Tensor) MustGru(hx Tensor, paramsData []Tensor, hasBiases bool, numLaye
 	return output, h
 }

-func (ts Tensor) TopK(k int64, dim int64, largest bool, sorted bool) (ts1 Tensor, ts2 Tensor, err error) {
+func (ts *Tensor) TopK(k int64, dim int64, largest bool, sorted bool) (ts1, ts2 *Tensor, err error) {

 	// NOTE: `lib.AtgTopk` will return 2 tensors in C memory. First tensor pointer
 	// is given by ctensorPtr1
@ -139,10 +139,10 @@ func (ts Tensor) TopK(k int64, dim int64, largest bool, sorted bool) (ts1 Tensor
 		return ts1, ts2, err
 	}

-	return Tensor{ctensor: *ctensorPtr1}, Tensor{ctensor: *ctensorPtr2}, nil
+	return &Tensor{ctensor: *ctensorPtr1}, &Tensor{ctensor: *ctensorPtr2}, nil
 }

-func (ts Tensor) MustTopK(k int64, dim int64, largest bool, sorted bool) (ts1 Tensor, ts2 Tensor) {
+func (ts *Tensor) MustTopK(k int64, dim int64, largest bool, sorted bool) (ts1, ts2 *Tensor) {

 	ts1, ts2, err := ts.TopK(k, dim, largest, sorted)
 	if err != nil {
@ -154,7 +154,7 @@ func (ts Tensor) MustTopK(k int64, dim int64, largest bool, sorted bool) (ts1 Te

 // NOTE. `NLLLoss` is a version of `NllLoss` in tensor-generated
 // with default weight, reduction and ignoreIndex
-func (ts Tensor) NLLLoss(target Tensor, del bool) (retVal Tensor, err error) {
+func (ts *Tensor) NLLLoss(target *Tensor, del bool) (retVal *Tensor, err error) {
 	ptr := (*lib.Ctensor)(unsafe.Pointer(C.malloc(0)))
 	if del {
 		defer ts.MustDrop()
@ -169,12 +169,12 @@ func (ts Tensor) NLLLoss(target Tensor, del bool) (retVal Tensor, err error) {
 		return retVal, err
 	}

-	retVal = Tensor{ctensor: *ptr}
+	retVal = &Tensor{ctensor: *ptr}

 	return retVal, nil
 }

-func (ts Tensor) MustNLLLoss(target Tensor, del bool) (retVal Tensor) {
+func (ts *Tensor) MustNLLLoss(target *Tensor, del bool) (retVal *Tensor) {
 	retVal, err := ts.NLLLoss(target, del)
 	if err != nil {
 		log.Fatal(err)
@ -285,7 +285,7 @@ func MustBroadcastTensors(tensors []Tensor, del bool) (retVal []Tensor) {
 }

 // tensor *atg_chunk(tensor self, int64_t chunks, int64_t dim);
-func (ts Tensor) Chunk(chunks int64, dim int64) (retVal []Tensor, err error) {
+func (ts *Tensor) Chunk(chunks int64, dim int64) (retVal []Tensor, err error) {
 	ctensorsPtr := lib.AtgChunk(ts.ctensor, chunks, dim)
 	if err = TorchErr(); err != nil {
 		return retVal, err
@ -307,7 +307,7 @@ func (ts Tensor) Chunk(chunks int64, dim int64) (retVal []Tensor, err error) {
 	return retVal, nil
 }

-func (ts Tensor) MustChunk(chunks int64, dim int64, del bool) (retVal []Tensor) {
+func (ts *Tensor) MustChunk(chunks int64, dim int64, del bool) (retVal []Tensor) {
 	if del {
 		defer ts.MustDrop()
 	}
@ -321,7 +321,7 @@ func (ts Tensor) MustChunk(chunks int64, dim int64, del bool) (retVal []Tensor)
 }

 // tensor *atg_meshgrid(tensor *tensors_data, int tensors_len);
-func (ts Tensor) Meshgrid(tensors []Tensor) (retVal []Tensor, err error) {
+func (ts *Tensor) Meshgrid(tensors []Tensor) (retVal []Tensor, err error) {

 	var ctensors []lib.Ctensor
 	for _, t := range tensors {
@ -348,7 +348,7 @@ func (ts Tensor) Meshgrid(tensors []Tensor) (retVal []Tensor, err error) {
 	return retVal, nil
 }

-func (ts Tensor) MustMeshgrid(tensors []Tensor, del bool) (retVal []Tensor) {
+func (ts *Tensor) MustMeshgrid(tensors []Tensor, del bool) (retVal []Tensor) {
 	if del {
 		defer ts.MustDrop()
 	}
@ -362,7 +362,7 @@ func (ts Tensor) MustMeshgrid(tensors []Tensor, del bool) (retVal []Tensor) {
 }

 // tensor *atg_nonzero_numpy(tensor self);
-func (ts Tensor) NonzeroNumpy() (retVal []Tensor, err error) {
+func (ts *Tensor) NonzeroNumpy() (retVal []Tensor, err error) {

 	ctensorsPtr := lib.AtgNonzeroNumpy(ts.ctensor)
 	if err = TorchErr(); err != nil {
@ -384,7 +384,7 @@ func (ts Tensor) NonzeroNumpy() (retVal []Tensor, err error) {
 	return retVal, nil
 }

-func (ts Tensor) MustNonzeroNumpy(del bool) (retVal []Tensor) {
+func (ts *Tensor) MustNonzeroNumpy(del bool) (retVal []Tensor) {
 	if del {
 		defer ts.MustDrop()
 	}
@ -403,7 +403,7 @@ func (ts Tensor) MustNonzeroNumpy(del bool) (retVal []Tensor) {
 //  - splitSize – size of a single chunk
 //  - dim – dimension along which to split the tensor.
 // Ref. https://pytorch.org/docs/stable/generated/torch.split.html
-func (ts Tensor) Split(splitSize, dim int64) (retVal []Tensor, err error) {
+func (ts *Tensor) Split(splitSize, dim int64) (retVal []Tensor, err error) {

 	ctensorsPtr := lib.AtgSplit(ts.ctensor, splitSize, dim)
 	if err = TorchErr(); err != nil {
@ -430,7 +430,7 @@ func (ts Tensor) Split(splitSize, dim int64) (retVal []Tensor, err error) {
 	return retVal, nil
 }

-func (ts Tensor) MustSplit(splitSize, dim int64, del bool) (retVal []Tensor) {
+func (ts *Tensor) MustSplit(splitSize, dim int64, del bool) (retVal []Tensor) {
 	if del {
 		defer ts.MustDrop()
 	}
@ -449,7 +449,7 @@ func (ts Tensor) MustSplit(splitSize, dim int64, del bool) (retVal []Tensor) {
 //  - splitSizes – slice of sizes for each chunk
 //  - dim – dimension along which to split the tensor.
 // Ref. https://pytorch.org/docs/stable/generated/torch.split.html
-func (ts Tensor) SplitWithSizes(splitSizes []int64, dim int64) (retVal []Tensor, err error) {
+func (ts *Tensor) SplitWithSizes(splitSizes []int64, dim int64) (retVal []Tensor, err error) {

 	ctensorsPtr := lib.AtgSplitWithSizes(ts.ctensor, splitSizes, len(splitSizes), dim)
 	if err = TorchErr(); err != nil {
@ -476,7 +476,7 @@ func (ts Tensor) SplitWithSizes(splitSizes []int64, dim int64) (retVal []Tensor,
 	return retVal, nil
 }

-func (ts Tensor) MustSplitWithSizes(splitSizes []int64, dim int64, del bool) (retVal []Tensor) {
+func (ts *Tensor) MustSplitWithSizes(splitSizes []int64, dim int64, del bool) (retVal []Tensor) {
 	if del {
 		defer ts.MustDrop()
 	}
@ -490,7 +490,7 @@ func (ts Tensor) MustSplitWithSizes(splitSizes []int64, dim int64, del bool) (re
 }

 // tensor *atg_unbind(tensor self, int64_t dim);
-func (ts Tensor) Unbind(dim int64) (retVal []Tensor, err error) {
+func (ts *Tensor) Unbind(dim int64) (retVal []Tensor, err error) {

 	ctensorsPtr := lib.AtgUnbind(ts.ctensor, dim)
 	if err = TorchErr(); err != nil {
@ -512,7 +512,7 @@ func (ts Tensor) Unbind(dim int64) (retVal []Tensor, err error) {
 	return retVal, nil
 }

-func (ts Tensor) MustUnbind(dim int64, del bool) (retVal []Tensor) {
+func (ts *Tensor) MustUnbind(dim int64, del bool) (retVal []Tensor) {
 	if del {
 		defer ts.MustDrop()
 	}
--- a/tensor/scalar.go
+++ b/tensor/scalar.go
@ -12,19 +12,19 @@ type Scalar struct {
 }

 // IntScalar creates an integer scalar
-func IntScalar(v int64) Scalar {
+func IntScalar(v int64) *Scalar {
 	cscalar := lib.AtsInt(v)
-	return Scalar{cscalar}
+	return &Scalar{cscalar}
 }

 // FloatScalar creates a float scalar
-func FloatScalar(v float64) Scalar {
+func FloatScalar(v float64) *Scalar {
 	cscalar := lib.AtsFloat(v)
-	return Scalar{cscalar}
+	return &Scalar{cscalar}
 }

 // ToInt returns an integer value
-func (sc Scalar) ToInt() (retVal int64, err error) {
+func (sc *Scalar) ToInt() (retVal int64, err error) {
 	retVal = lib.AtsToInt(sc.cscalar)
 	err = TorchErr()
 	if err != nil {
@ -35,7 +35,7 @@ func (sc Scalar) ToInt() (retVal int64, err error) {
 }

 // ToFloat returns a float value
-func (sc Scalar) ToFloat() (retVal float64, err error) {
+func (sc *Scalar) ToFloat() (retVal float64, err error) {
 	retVal = lib.AtsToFloat(sc.cscalar)
 	err = TorchErr()
 	if err != nil {
@ -46,7 +46,7 @@ func (sc Scalar) ToFloat() (retVal float64, err error) {
 }

 // ToString returns a string representation of scalar value
-func (sc Scalar) ToString() (retVal string, err error) {
+func (sc *Scalar) ToString() (retVal string, err error) {
 	retVal = lib.AtsToString(sc.cscalar)
 	err = TorchErr()
 	if err != nil {
@ -60,12 +60,12 @@ func (sc Scalar) ToString() (retVal string, err error) {
 //
 // TODO: Really? after running s.Drop() and s.ToInt()
 // it returns Zero.
-func (sc Scalar) Drop() (err error) {
+func (sc *Scalar) Drop() (err error) {
 	lib.AtsFree(sc.cscalar)
 	return TorchErr()
 }

-func (sc Scalar) MustDrop() {
+func (sc *Scalar) MustDrop() {
 	lib.AtsFree(sc.cscalar)
 	if err := TorchErr(); err != nil {
 		log.Fatal(err)
--- a/tensor/tensor-generated.go
+++ b/tensor/tensor-generated.go
--- a/tensor/tensor.go
+++ b/tensor/tensor.go
--- a/vision/alexnet.go
+++ b/vision/alexnet.go
@ -8,7 +8,7 @@ import (
 // AlexNet implementation
 // https://arxiv.org/abs/1404.5997

-func anConv2d(p nn.Path, cIn, cOut, ksize, padding, stride int64) (retVal nn.Conv2D) {
+func anConv2d(p *nn.Path, cIn, cOut, ksize, padding, stride int64) *nn.Conv2D {
 	config := nn.DefaultConv2DConfig()
 	config.Stride = []int64{stride, stride}
 	config.Padding = []int64{padding, padding}
@ -16,15 +16,15 @@ func anConv2d(p nn.Path, cIn, cOut, ksize, padding, stride int64) (retVal nn.Con
 	return nn.NewConv2D(p, cIn, cOut, ksize, config)
 }

-func anMaxPool2d(xs ts.Tensor, ksize, stride int64) (retVal ts.Tensor) {
+func anMaxPool2d(xs *ts.Tensor, ksize, stride int64) *ts.Tensor {
 	return xs.MustMaxPool2d([]int64{ksize, ksize}, []int64{stride, stride}, []int64{0, 0}, []int64{1, 1}, false, false)
 }

-func features(p nn.Path) (retVal ts.ModuleT) {
+func features(p *nn.Path) ts.ModuleT {
 	seq := nn.SeqT()
 	seq.Add(anConv2d(p.Sub("0"), 3, 64, 11, 2, 4))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		tmp1 := xs.MustRelu(false)
 		res := anMaxPool2d(tmp1, 3, 2)
 		tmp1.MustDrop()
@ -33,7 +33,7 @@ func features(p nn.Path) (retVal ts.ModuleT) {

 	seq.Add(anConv2d(p.Sub("3"), 64, 192, 5, 1, 2))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		tmp1 := xs.MustRelu(false)
 		res := anMaxPool2d(tmp1, 3, 2)
 		tmp1.MustDrop()
@ -42,19 +42,19 @@ func features(p nn.Path) (retVal ts.ModuleT) {

 	seq.Add(anConv2d(p.Sub("6"), 192, 384, 3, 1, 1))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustRelu(false)
 	}))

 	seq.Add(anConv2d(p.Sub("8"), 384, 256, 3, 1, 1))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustRelu(false)
 	}))

 	seq.Add(anConv2d(p.Sub("10"), 256, 256, 3, 1, 1))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		tmp1 := xs.MustRelu(false)
 		res := anMaxPool2d(tmp1, 3, 2)
 		tmp1.MustDrop()
@ -64,26 +64,26 @@ func features(p nn.Path) (retVal ts.ModuleT) {
 	return seq
 }

-func classifier(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func classifier(p *nn.Path, nclasses int64) ts.ModuleT {
 	seq := nn.SeqT()

-	seq.AddFnT(nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	seq.AddFnT(nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		return ts.MustDropout(xs, 0.5, train)
 	}))

 	seq.Add(nn.NewLinear(p.Sub("1"), 256*6*6, 4096, nn.DefaultLinearConfig()))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustRelu(false)
 	}))

-	seq.AddFnT(nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	seq.AddFnT(nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		return ts.MustDropout(xs, 0.5, train)
 	}))

 	seq.Add(nn.NewLinear(p.Sub("4"), 4096, 4096, nn.DefaultLinearConfig()))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustRelu(false)
 	}))

@ -92,12 +92,12 @@ func classifier(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
 	return seq
 }

-func AlexNet(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func AlexNet(p *nn.Path, nclasses int64) ts.ModuleT {
 	seq := nn.SeqT()

 	seq.Add(features(p.Sub("features")))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		tmp1 := xs.MustAdaptiveAvgPool2d([]int64{6, 6}, false)
 		res := tmp1.FlatView()
 		tmp1.MustDrop()
--- a/vision/cifar.go
+++ b/vision/cifar.go
@ -24,7 +24,7 @@ const (
 	samplesPerFile int64 = 10000
 )

-func readFile(filename string) (imagesTs ts.Tensor, labelsTs ts.Tensor) {
+func readFile(filename string) (imagesTs *ts.Tensor, labelsTs *ts.Tensor) {
 	f, err := os.Open(filename)
 	if err != nil {
 		log.Fatalf("readImages errors: %v\n", err)
@ -74,7 +74,7 @@ func readFile(filename string) (imagesTs ts.Tensor, labelsTs ts.Tensor) {
 	return imagesTs, labelsTs
 }

-func CFLoadDir(dir string) (retVal Dataset) {
+func CFLoadDir(dir string) *Dataset {

 	dirAbs, err := filepath.Abs(dir)
 	if err != nil {
@ -96,11 +96,11 @@ func CFLoadDir(dir string) (retVal Dataset) {

 	for _, f := range trainFiles {
 		img, l := readFile(fmt.Sprintf("%v/%v", dirAbs, f))
-		trainImages = append(trainImages, img)
-		trainLabels = append(trainLabels, l)
+		trainImages = append(trainImages, *img)
+		trainLabels = append(trainLabels, *l)
 	}

-	return Dataset{
+	return &Dataset{
 		TrainImages: ts.MustCat(trainImages, 0),
 		TrainLabels: ts.MustCat(trainLabels, 0),
 		TestImages:  testImages,
--- a/vision/dataset.go
+++ b/vision/dataset.go
@ -12,10 +12,10 @@ import (
 )

 type Dataset struct {
-	TrainImages ts.Tensor
-	TrainLabels ts.Tensor
-	TestImages  ts.Tensor
-	TestLabels  ts.Tensor
+	TrainImages *ts.Tensor
+	TrainLabels *ts.Tensor
+	TestImages  *ts.Tensor
+	TestLabels  *ts.Tensor
 	Labels      int64
 }

@ -23,20 +23,20 @@ type Dataset struct {
 //=================

 // TrainIter creates an iterator of Iter type for train images and labels
-func (ds Dataset) TrainIter(batchSize int64) (retVal ts.Iter2) {
+func (ds *Dataset) TrainIter(batchSize int64) *ts.Iter2 {
 	return ts.MustNewIter2(ds.TrainImages, ds.TrainLabels, batchSize)

 }

 // TestIter creates an iterator of Iter type for test images and labels
-func (ds Dataset) TestIter(batchSize int64) (retVal ts.Iter2) {
+func (ds *Dataset) TestIter(batchSize int64) *ts.Iter2 {
 	return ts.MustNewIter2(ds.TestImages, ds.TestLabels, batchSize)
 }

 // RandomFlip randomly applies horizontal flips
 // This expects a 4 dimension NCHW tensor and returns a tensor with
 // an identical shape.
-func RandomFlip(t ts.Tensor) (retVal ts.Tensor) {
+func RandomFlip(t *ts.Tensor) *ts.Tensor {

 	size := t.MustSize()

@ -53,7 +53,7 @@ func RandomFlip(t ts.Tensor) (retVal ts.Tensor) {
 		outputView := output.Idx(ts.NewSelect(int64(batchIdx)))
 		tView := t.Idx(ts.NewSelect(int64(batchIdx)))

-		var src ts.Tensor
+		var src *ts.Tensor
 		if rand.Float64() == 1.0 {
 			src = tView
 		} else {
@ -72,7 +72,7 @@ func RandomFlip(t ts.Tensor) (retVal ts.Tensor) {
 // Pad the image using reflections and take some random crops.
 // This expects a 4 dimension NCHW tensor and returns a tensor with
 // an identical shape.
-func RandomCrop(t ts.Tensor, pad int64) (retVal ts.Tensor) {
+func RandomCrop(t *ts.Tensor, pad int64) *ts.Tensor {

 	size := t.MustSize()

@ -115,7 +115,7 @@ func RandomCrop(t ts.Tensor, pad int64) (retVal ts.Tensor) {

 // Applies cutout: randomly remove some square areas in the original images.
 // https://arxiv.org/abs/1708.04552
-func RandomCutout(t ts.Tensor, sz int64) (retVal ts.Tensor) {
+func RandomCutout(t *ts.Tensor, sz int64) *ts.Tensor {

 	size := t.MustSize()

@ -168,11 +168,11 @@ func RandomCutout(t ts.Tensor, sz int64) (retVal ts.Tensor) {
 	return output
 }

-func Augmentation(t ts.Tensor, flip bool, crop int64, cutout int64) (retVal ts.Tensor) {
+func Augmentation(t *ts.Tensor, flip bool, crop int64, cutout int64) *ts.Tensor {

 	tclone := t.MustShallowClone()

-	var flipTs ts.Tensor
+	var flipTs *ts.Tensor
 	if flip {
 		flipTs = RandomFlip(tclone)
 		tclone.MustDrop()
@ -180,7 +180,7 @@ func Augmentation(t ts.Tensor, flip bool, crop int64, cutout int64) (retVal ts.T
 		flipTs = tclone
 	}

-	var cropTs ts.Tensor
+	var cropTs *ts.Tensor
 	if crop > 0 {
 		cropTs = RandomCrop(flipTs, crop)
 		flipTs.MustDrop()
@ -188,12 +188,13 @@ func Augmentation(t ts.Tensor, flip bool, crop int64, cutout int64) (retVal ts.T
 		cropTs = flipTs
 	}

+	var output *ts.Tensor
 	if cutout > 0 {
-		retVal = RandomCutout(cropTs, cutout)
+		output = RandomCutout(cropTs, cutout)
 		cropTs.MustDrop()
 	} else {
-		retVal = cropTs
+		output = cropTs
 	}

-	return retVal
+	return output
 }
--- a/vision/densenet.go
+++ b/vision/densenet.go
@ -12,7 +12,7 @@ import (
 	ts "github.com/sugarme/gotch/tensor"
 )

-func dnConv2d(p nn.Path, cIn, cOut, ksize, padding, stride int64) (retVal nn.Conv2D) {
+func dnConv2d(p *nn.Path, cIn, cOut, ksize, padding, stride int64) *nn.Conv2D {
 	config := nn.DefaultConv2DConfig()
 	config.Stride = []int64{stride, stride}
 	config.Padding = []int64{padding, padding}
@ -21,14 +21,14 @@ func dnConv2d(p nn.Path, cIn, cOut, ksize, padding, stride int64) (retVal nn.Con
 	return nn.NewConv2D(p, cIn, cOut, ksize, config)
 }

-func denseLayer(p nn.Path, cIn, bnSize, growth int64) (retVal ts.ModuleT) {
+func denseLayer(p *nn.Path, cIn, bnSize, growth int64) ts.ModuleT {
 	cInter := bnSize * growth
 	bn1 := nn.BatchNorm2D(p.Sub("norm1"), cIn, nn.DefaultBatchNormConfig())
 	conv1 := dnConv2d(p.Sub("conv1"), cIn, cInter, 1, 0, 1)
 	bn2 := nn.BatchNorm2D(p.Sub("norm2"), cInter, nn.DefaultBatchNormConfig())
 	conv2 := dnConv2d(p.Sub("conv2"), cInter, growth, 3, 1, 1)

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		ys1 := xs.ApplyT(bn1, train)
 		ys2 := ys1.MustRelu(true)
 		ys3 := ys2.Apply(conv1)
@ -39,14 +39,14 @@ func denseLayer(p nn.Path, cIn, bnSize, growth int64) (retVal ts.ModuleT) {
 		ys := ys5.Apply(conv2)
 		ys5.MustDrop()

-		res := ts.MustCat([]ts.Tensor{xs, ys}, 1)
+		res := ts.MustCat([]ts.Tensor{*xs, *ys}, 1)
 		ys.MustDrop()

 		return res
 	})
 }

-func denseBlock(p nn.Path, cIn, bnSize, growth, nlayers int64) (retVal ts.ModuleT) {
+func denseBlock(p *nn.Path, cIn, bnSize, growth, nlayers int64) ts.ModuleT {
 	seq := nn.SeqT()

 	for i := 0; i < int(nlayers); i++ {
@ -56,25 +56,25 @@ func denseBlock(p nn.Path, cIn, bnSize, growth, nlayers int64) (retVal ts.Module
 	return seq
 }

-func transition(p nn.Path, cIn, cOut int64) (retVal ts.ModuleT) {
+func transition(p *nn.Path, cIn, cOut int64) ts.ModuleT {
 	seq := nn.SeqT()

 	seq.Add(nn.BatchNorm2D(p.Sub("norm"), cIn, nn.DefaultBatchNormConfig()))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustRelu(false)
 	}))

 	seq.Add(dnConv2d(p.Sub("conv"), cIn, cOut, 1, 0, 1))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.AvgPool2DDefault(2, false)
 	}))

 	return seq
 }

-func densenet(p nn.Path, cIn, cOut, bnSize int64, blockConfig []int64, growth int64) (retVal ts.ModuleT) {
+func densenet(p *nn.Path, cIn, cOut, bnSize int64, blockConfig []int64, growth int64) ts.ModuleT {
 	fp := p.Sub("features")
 	seq := nn.SeqT()

@ -82,7 +82,7 @@ func densenet(p nn.Path, cIn, cOut, bnSize int64, blockConfig []int64, growth in

 	seq.Add(nn.BatchNorm2D(fp.Sub("norm0"), cIn, nn.DefaultBatchNormConfig()))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		tmp := xs.MustRelu(false)
 		return tmp.MustMaxPool2d([]int64{3, 3}, []int64{2, 2}, []int64{1, 1}, []int64{1, 1}, false, true)
 	}))
@ -101,7 +101,7 @@ func densenet(p nn.Path, cIn, cOut, bnSize int64, blockConfig []int64, growth in

 	seq.Add(nn.BatchNorm2D(fp.Sub("norm5"), nfeat, nn.DefaultBatchNormConfig()))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		tmp1 := xs.MustRelu(false)
 		tmp2 := tmp1.MustAvgPool2d([]int64{7, 7}, []int64{1, 1}, []int64{0, 0}, false, true, 1, true)
 		res := tmp2.FlatView()
@ -114,18 +114,18 @@ func densenet(p nn.Path, cIn, cOut, bnSize int64, blockConfig []int64, growth in
 	return seq
 }

-func DenseNet121(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func DenseNet121(p *nn.Path, nclasses int64) ts.ModuleT {
 	return densenet(p, 64, 4, 32, []int64{6, 12, 24, 16}, nclasses)
 }

-func DenseNet161(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func DenseNet161(p *nn.Path, nclasses int64) ts.ModuleT {
 	return densenet(p, 96, 4, 48, []int64{6, 12, 36, 24}, nclasses)
 }

-func DenseNet169(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func DenseNet169(p *nn.Path, nclasses int64) ts.ModuleT {
 	return densenet(p, 64, 4, 32, []int64{6, 12, 32, 32}, nclasses)
 }

-func DenseNet201(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func DenseNet201(p *nn.Path, nclasses int64) ts.ModuleT {
 	return densenet(p, 64, 4, 32, []int64{6, 12, 48, 32}, nclasses)
 }
--- a/vision/efficientnet.go
+++ b/vision/efficientnet.go
@ -23,8 +23,8 @@ type BlockArgs struct {
 	Stride       int64
 }

-func ba(k, r, i, o, er int64, sr float64, s int64) (retVal BlockArgs) {
-	return BlockArgs{
+func ba(k, r, i, o, er int64, sr float64, s int64) *BlockArgs {
+	return &BlockArgs{
 		KernelSize:   k,
 		NumRepeat:    r,
 		InputFilters: i,
@ -37,13 +37,13 @@ func ba(k, r, i, o, er int64, sr float64, s int64) (retVal BlockArgs) {

 func blockArgs() (retVal []BlockArgs) {
 	return []BlockArgs{
-		ba(3, 1, 32, 16, 1, 0.25, 1),
-		ba(3, 2, 16, 24, 6, 0.25, 2),
-		ba(5, 2, 24, 40, 6, 0.25, 2),
-		ba(3, 3, 40, 80, 6, 0.25, 2),
-		ba(5, 3, 80, 112, 6, 0.25, 1),
-		ba(5, 4, 112, 192, 6, 0.25, 2),
-		ba(3, 1, 192, 320, 6, 0.25, 1),
+		*ba(3, 1, 32, 16, 1, 0.25, 1),
+		*ba(3, 2, 16, 24, 6, 0.25, 2),
+		*ba(5, 2, 24, 40, 6, 0.25, 2),
+		*ba(3, 3, 40, 80, 6, 0.25, 2),
+		*ba(5, 3, 80, 112, 6, 0.25, 1),
+		*ba(5, 4, 112, 192, 6, 0.25, 2),
+		*ba(3, 1, 192, 320, 6, 0.25, 1),
 	}
 }

@ -54,12 +54,12 @@ type params struct {
 	Dropout float64
 }

-func (p params) roundRepeats(repeats int64) (retVal int64) {
+func (p *params) roundRepeats(repeats int64) int64 {

 	return int64(math.Ceil(p.Depth * float64(repeats)))
 }

-func (p params) roundFilters(filters int64) (retVal int64) {
+func (p *params) roundFilters(filters int64) int64 {
 	var divisor int64 = 8
 	filF := p.Width * float64(filters)
 	filI := int64(filF + float64(divisor)/2.0)
@ -74,11 +74,11 @@ func (p params) roundFilters(filters int64) (retVal int64) {
 }

 // Conv2D with same padding
-func enConv2d(vs nn.Path, i, o, k int64, c nn.Conv2DConfig, train bool) (retVal ts.ModuleT) {
+func enConv2d(vs *nn.Path, i, o, k int64, c *nn.Conv2DConfig, train bool) ts.ModuleT {
 	conv2d := nn.NewConv2D(vs, i, o, k, c)
 	s := c.Stride

-	return nn.NewFunc(func(xs ts.Tensor) (res ts.Tensor) {
+	return nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		size := xs.MustSize()
 		ih := size[2]
 		iw := size[3]
@ -94,6 +94,7 @@ func enConv2d(vs nn.Path, i, o, k int64, c nn.Conv2DConfig, train bool) (retVal
 			padW = ((ow - 1) * s[0]) + k - iw
 		}

+		var res *ts.Tensor
 		if padW > 0 || padH > 0 {
 			zeroP2D := xs.MustZeroPad2d(padW/2, padW-padW/2, padH/2, padH-padH/2, false)
 			res = zeroP2D.ApplyT(conv2d, train)
@ -106,8 +107,8 @@ func enConv2d(vs nn.Path, i, o, k int64, c nn.Conv2DConfig, train bool) (retVal
 	})
 }

-func newParams(width, depth float64, res int64, dropout float64) (retVal params) {
-	return params{
+func newParams(width, depth float64, res int64, dropout float64) *params {
+	return &params{
 		width,
 		depth,
 		res,
@ -115,39 +116,39 @@ func newParams(width, depth float64, res int64, dropout float64) (retVal params)
 	}
 }

-func b0() (retVal params) {
+func b0() *params {
 	return newParams(1.0, 1.0, 224, 0.2)
 }

-func b1() (retVal params) {
+func b1() *params {
 	return newParams(1.0, 1.1, 240, 0.2)
 }

-func b2() (retVal params) {
+func b2() *params {
 	return newParams(1.1, 1.2, 260, 0.3)
 }

-func b3() (retVal params) {
+func b3() *params {
 	return newParams(1.2, 1.4, 300, 0.3)
 }

-func b4() (retVal params) {
+func b4() *params {
 	return newParams(1.4, 1.8, 380, 0.4)
 }

-func b5() (retVal params) {
+func b5() *params {
 	return newParams(1.6, 2.2, 456, 0.4)
 }

-func b6() (retVal params) {
+func b6() *params {
 	return newParams(1.8, 2.6, 528, 0.5)
 }

-func b7() (retVal params) {
+func b7() *params {
 	return newParams(2.0, 3.1, 600, 0.5)
 }

-func block(p nn.Path, args BlockArgs) (retVal ts.ModuleT) {
+func block(p *nn.Path, args BlockArgs) ts.ModuleT {

 	inp := args.InputFilters
 	oup := args.InputFilters * args.ExpandRatio
@ -169,7 +170,7 @@ func block(p nn.Path, args BlockArgs) (retVal ts.ModuleT) {
 	if args.ExpandRatio != 1 {
 		expansion.Add(enConv2d(p.Sub("_expand_conv"), inp, oup, 1, convConfigNoBias, false))
 		expansion.Add(nn.BatchNorm2D(p.Sub("_bn0"), oup, bn2d))
-		expansion.AddFn(nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+		expansion.AddFn(nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 			return xs.Swish()
 		}))
 	}
@ -178,7 +179,7 @@ func block(p nn.Path, args BlockArgs) (retVal ts.ModuleT) {
 	depthwiseBn := nn.BatchNorm2D(p.Sub("_bn1"), oup, bn2d)

 	// NOTE: args.SeRatio is optional float64. Default = 0
-	var se nn.SequentialT // se will be nil if args.SeRatio == 0
+	var se *nn.SequentialT // se will be nil if args.SeRatio == 0
 	if args.SeRatio > 0 {
 		var nsc int64 = 1
 		if (float64(inp) * args.SeRatio) > 1 {
@ -188,7 +189,7 @@ func block(p nn.Path, args BlockArgs) (retVal ts.ModuleT) {
 		se = nn.SeqT()
 		se.Add(enConv2d(p.Sub("_se_reduce"), oup, nsc, 1, nn.DefaultConv2DConfig(), false))

-		se.AddFn(nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+		se.AddFn(nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 			return xs.Swish()
 		}))

@ -199,8 +200,8 @@ func block(p nn.Path, args BlockArgs) (retVal ts.ModuleT) {

 	projectBn := nn.BatchNorm2D(p.Sub("_bn2"), finalOup, bn2d)

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
-		var ys ts.Tensor
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
+		var ys *ts.Tensor
 		if args.ExpandRatio == 1 {
 			ys = xs.MustShallowClone()
 		} else {
@ -213,7 +214,7 @@ func block(p nn.Path, args BlockArgs) (retVal ts.ModuleT) {
 		ys3 := ys2.Swish()
 		ys2.MustDrop()

-		var ys4 ts.Tensor
+		var ys4 *ts.Tensor
 		// NOTE: args.SeRatio is optional value.
 		if args.SeRatio == 0 {
 			ys4 = ys3
@ -238,7 +239,7 @@ func block(p nn.Path, args BlockArgs) (retVal ts.ModuleT) {
 	})
 }

-func efficientnet(p nn.Path, params params, nclasses int64) (retVal ts.ModuleT) {
+func efficientnet(p *nn.Path, params *params, nclasses int64) ts.ModuleT {

 	args := blockArgs()

@ -287,13 +288,13 @@ func efficientnet(p nn.Path, params params, nclasses int64) (retVal ts.ModuleT)

 	classifier := nn.SeqT()

-	classifier.AddFnT(nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	classifier.AddFnT(nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		return ts.MustDropout(xs, 0.2, train)
 	}))

 	classifier.Add(nn.NewLinear(p.Sub("_fc"), outC, nclasses, nn.DefaultLinearConfig()))

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		tmp1 := xs.ApplyT(convStem, false)
 		tmp2 := tmp1.ApplyT(bn0, train)
 		tmp1.MustDrop()
@ -318,34 +319,34 @@ func efficientnet(p nn.Path, params params, nclasses int64) (retVal ts.ModuleT)

 }

-func EfficientNetB0(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func EfficientNetB0(p *nn.Path, nclasses int64) ts.ModuleT {
 	return efficientnet(p, b0(), nclasses)
 }

-func EfficientNetB1(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func EfficientNetB1(p *nn.Path, nclasses int64) ts.ModuleT {
 	return efficientnet(p, b1(), nclasses)
 }

-func EfficientNetB2(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func EfficientNetB2(p *nn.Path, nclasses int64) ts.ModuleT {
 	return efficientnet(p, b2(), nclasses)
 }

-func EfficientNetB3(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func EfficientNetB3(p *nn.Path, nclasses int64) ts.ModuleT {
 	return efficientnet(p, b3(), nclasses)
 }

-func EfficientNetB4(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func EfficientNetB4(p *nn.Path, nclasses int64) ts.ModuleT {
 	return efficientnet(p, b4(), nclasses)
 }

-func EfficientNetB5(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func EfficientNetB5(p *nn.Path, nclasses int64) ts.ModuleT {
 	return efficientnet(p, b5(), nclasses)
 }

-func EfficientNetB6(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func EfficientNetB6(p *nn.Path, nclasses int64) ts.ModuleT {
 	return efficientnet(p, b6(), nclasses)
 }

-func EfficientNetB7(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func EfficientNetB7(p *nn.Path, nclasses int64) ts.ModuleT {
 	return efficientnet(p, b7(), nclasses)
 }
--- a/vision/image.go
+++ b/vision/image.go
@ -14,18 +14,16 @@ import (
 )

 // (height, width, channel) -> (channel, height, width)
-func hwcToCHW(tensor ts.Tensor) (retVal ts.Tensor) {
-	var err error
-	retVal, err = tensor.Permute([]int64{2, 0, 1}, true)
+func hwcToCHW(tensor *ts.Tensor) *ts.Tensor {
+	retVal, err := tensor.Permute([]int64{2, 0, 1}, true)
 	if err != nil {
 		log.Fatalf("hwcToCHW error: %v\n", err)
 	}
 	return retVal
 }

-func chwToHWC(tensor ts.Tensor) (retVal ts.Tensor) {
-	var err error
-	retVal, err = tensor.Permute([]int64{1, 2, 0}, true)
+func chwToHWC(tensor *ts.Tensor) *ts.Tensor {
+	retVal, err := tensor.Permute([]int64{1, 2, 0}, true)
 	if err != nil {
 		log.Fatalf("hwcToCHW error: %v\n", err)
 	}
@ -35,15 +33,14 @@ func chwToHWC(tensor ts.Tensor) (retVal ts.Tensor) {
 // Load loads an image from a file.
 //
 // On success returns a tensor of shape [channel, height, width].
-func Load(path string) (retVal ts.Tensor, err error) {
-	var tensor ts.Tensor
-	tensor, err = ts.LoadHwc(path)
+func Load(path string) (*ts.Tensor, error) {
+	var tensor *ts.Tensor
+	tensor, err := ts.LoadHwc(path)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

-	retVal = hwcToCHW(tensor)
-	return retVal, nil
+	return hwcToCHW(tensor), nil
 }

 // Save saves an image to a file.
@ -53,7 +50,7 @@ func Load(path string) (retVal ts.Tensor, err error) {
 // are jpg, png, tga, and bmp.
 // The tensor input should be of kind UInt8 with values ranging from
 // 0 to 255.
-func Save(tensor ts.Tensor, path string) (err error) {
+func Save(tensor *ts.Tensor, path string) error {
 	t, err := tensor.Totype(gotch.Uint8, false) // false to keep the input tensor
 	if err != nil {
 		err = fmt.Errorf("Save - Tensor.Totype() error: %v\n", err)
@ -81,21 +78,19 @@ func Save(tensor ts.Tensor, path string) (err error) {
 //
 // This expects as input a tensor of shape [channel, height, width] and returns
 // a tensor of shape [channel, out_h, out_w].
-func Resize(t ts.Tensor, outW int64, outH int64) (retVal ts.Tensor, err error) {
+func Resize(t *ts.Tensor, outW int64, outH int64) (*ts.Tensor, error) {
 	tmpTs, err := ts.ResizeHwc(chwToHWC(t), outW, outH)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}
-	retVal = hwcToCHW(tmpTs)
-
-	return retVal, nil
+	return hwcToCHW(tmpTs), nil
 }

-func resizePreserveAspectRatioHWC(t ts.Tensor, outW int64, outH int64) (retVal ts.Tensor, err error) {
+func resizePreserveAspectRatioHWC(t *ts.Tensor, outW int64, outH int64) (*ts.Tensor, error) {
 	tsSize, err := t.Size()
 	if err != nil {
 		err = fmt.Errorf("resizePreserveAspectRatioHWC - ts.Size() method call err: %v\n", err)
-		return retVal, err
+		return nil, err
 	}

 	// TODO: check it
@ -106,7 +101,7 @@ func resizePreserveAspectRatioHWC(t ts.Tensor, outW int64, outH int64) (retVal t
 		tmpTs, err := ts.ResizeHwc(t, outW, outH)
 		if err != nil {
 			err = fmt.Errorf("resizePreserveAspectRatioHWC - ts.ResizeHwc() method call err: %v\n", err)
-			return retVal, err
+			return nil, err
 		}
 		return hwcToCHW(tmpTs), nil
 	} else {
@ -123,18 +118,19 @@ func resizePreserveAspectRatioHWC(t ts.Tensor, outW int64, outH int64) (retVal t
 		tmpTs, err := ts.ResizeHwc(t, resizeW, resizeH)
 		tensor := hwcToCHW(tmpTs)

-		var tensorW ts.Tensor
-		var tensorH ts.Tensor
+		var tensorW *ts.Tensor
+		var tensorH *ts.Tensor
 		if resizeW == outW {
 			tensorW = tensor
 		} else {
 			tensorW, err = tensor.Narrow(2, (resizeW-outW)/2, outW, true)
 			if err != nil {
 				err = fmt.Errorf("resizePreserveAspectRatioHWC - ts.Narrow() method call err: %v\n", err)
-				return retVal, err
+				return nil, err
 			}
 		}

+		var retVal *ts.Tensor
 		if int64(resizeH) == outH {
 			retVal = tensorW
 		} else {
@ -153,28 +149,28 @@ func resizePreserveAspectRatioHWC(t ts.Tensor, outW int64, outH int64) (retVal t
 // ResizePreserveAspectRatio resizes an image, preserve the aspect ratio by taking a center crop.
 //
 // This expects as input a tensor of shape [channel, height, width] and returns
-func ResizePreserveAspectRatio(t ts.Tensor, outW int64, outH int64) (retVal ts.Tensor, err error) {
+func ResizePreserveAspectRatio(t *ts.Tensor, outW int64, outH int64) (*ts.Tensor, error) {
 	return resizePreserveAspectRatioHWC(chwToHWC(t), outW, outH)
 }

 // LoadAndResize loads and resizes an image, preserve the aspect ratio by taking a center crop.
-func LoadAndResize(path string, outW int64, outH int64) (retVal ts.Tensor, err error) {
+func LoadAndResize(path string, outW int64, outH int64) (*ts.Tensor, error) {
 	tensor, err := ts.LoadHwc(path)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

 	return resizePreserveAspectRatioHWC(tensor, outW, outH)
 }

 // LoadDir loads all the images in a directory.
-func LoadDir(dir string, outW int64, outH int64) (retVal ts.Tensor, err error) {
+func LoadDir(dir string, outW int64, outH int64) (*ts.Tensor, error) {
 	var filePaths []string // "dir/filename.ext"
 	var tensors []ts.Tensor
 	files, err := ioutil.ReadDir(dir)
 	if err != nil {
 		err = fmt.Errorf("LoadDir - Read directory error: %v\n", err)
-		return retVal, err
+		return nil, err
 	}
 	for _, f := range files {
 		filePaths = append(filePaths, fmt.Sprintf("%v%v", dir, f.Name()))
@ -184,9 +180,9 @@ func LoadDir(dir string, outW int64, outH int64) (retVal ts.Tensor, err error) {
 		tensor, err := LoadAndResize(path, outW, outH)
 		if err != nil {
 			err = fmt.Errorf("LoadDir - LoadAndResize method call error: %v\n", err)
-			return retVal, err
+			return nil, err
 		}
-		tensors = append(tensors, tensor)
+		tensors = append(tensors, *tensor)
 	}

 	return ts.Stack(tensors, 0)
--- a/vision/imagenet.go
+++ b/vision/imagenet.go
@ -17,71 +17,71 @@ import (

 type ImageNet struct {
 	mutex *sync.Mutex
-	mean  ts.Tensor
-	std   ts.Tensor
+	mean  *ts.Tensor
+	std   *ts.Tensor
 }

-func NewImageNet() ImageNet {
-	return ImageNet{
+func NewImageNet() *ImageNet {
+	return &ImageNet{
 		mutex: &sync.Mutex{},
 		mean:  ts.MustOfSlice([]float32{0.485, 0.456, 0.406}).MustView([]int64{3, 1, 1}, true),
 		std:   ts.MustOfSlice([]float32{0.229, 0.224, 0.225}).MustView([]int64{3, 1, 1}, true),
 	}
 }

-func (in ImageNet) Normalize(tensor ts.Tensor) (retVal ts.Tensor, err error) {
+func (in *ImageNet) Normalize(tensor *ts.Tensor) (*ts.Tensor, error) {
 	in.mutex.Lock()
 	defer in.mutex.Unlock()

 	res, err := tensor.Totype(gotch.Float, false)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

 	resDiv1, err := res.Div1(ts.FloatScalar(float64(255.0)), true)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

 	resMean, err := resDiv1.Sub(in.mean, true)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

 	resStd, err := resMean.Div(in.std, true)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

 	return resStd, nil
 }

-func (in ImageNet) UnNormalize(tensor ts.Tensor) (retVal ts.Tensor, err error) {
+func (in *ImageNet) UnNormalize(tensor *ts.Tensor) (*ts.Tensor, error) {
 	in.mutex.Lock()
 	defer in.mutex.Unlock()

 	resMul, err := tensor.Mul(in.std, true)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}
 	resAdd, err := resMul.Add(in.mean, true)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

 	resMul1, err := resAdd.Mul1(ts.FloatScalar(float64(255.0)), true)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

 	resClamp, err := resMul1.Clamp(ts.FloatScalar(float64(0.0)), ts.FloatScalar(float64(255.0)), true)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

 	res, err := resClamp.Totype(gotch.Uint8, true)
 	if err != nil {
-		return retVal, err
+		return nil, err
 	}

 	return res, nil
@ -90,7 +90,7 @@ func (in ImageNet) UnNormalize(tensor ts.Tensor) (retVal ts.Tensor, err error) {
 // SaveImage saves a tensor image to a path.
 //
 // NOTE: This will carry out the ImageNet unnormalization.
-func (in ImageNet) SaveImage(tensor ts.Tensor, path string) (err error) {
+func (in *ImageNet) SaveImage(tensor *ts.Tensor, path string) error {
 	unnormTs, err := in.UnNormalize(tensor)
 	if err != nil {
 		err = fmt.Errorf("ImageNet - SaveImage method call: %v", err)
@ -101,11 +101,11 @@ func (in ImageNet) SaveImage(tensor ts.Tensor, path string) (err error) {
 }

 // Load loads an image from a file and applies the ImageNet normalization.
-func (in ImageNet) LoadImage(path string) (retVal ts.Tensor, err error) {
+func (in *ImageNet) LoadImage(path string) (*ts.Tensor, error) {
 	tensor, err := Load(path)
 	if err != nil {
 		err = fmt.Errorf("ImageNet - LoadImage method call: %v", err)
-		return retVal, err
+		return nil, err
 	}

 	return in.Normalize(tensor)
@ -114,11 +114,11 @@ func (in ImageNet) LoadImage(path string) (retVal ts.Tensor, err error) {
 // LoadImageAndResize loads an image from a file and resize it to the specified width and height.
 //
 // NOTE: This will apply the ImageNet normalization.
-func (in ImageNet) LoadImageAndResize(path string, w, h int64) (retVal ts.Tensor, err error) {
+func (in *ImageNet) LoadImageAndResize(path string, w, h int64) (*ts.Tensor, error) {
 	tensor, err := LoadAndResize(path, w, h)
 	if err != nil {
 		err = fmt.Errorf("ImageNet - LoadImageAndResize method call: %v", err)
-		return retVal, err
+		return nil, err
 	}

 	return tensor, nil
@ -127,17 +127,17 @@ func (in ImageNet) LoadImageAndResize(path string, w, h int64) (retVal ts.Tensor
 // LoadImageAndResize224 loads an image from a file and resize it to 224x224.
 //
 // NOTE: This will apply the ImageNet normalization.
-func (in ImageNet) LoadImageAndResize224(path string) (retVal ts.Tensor, err error) {
+func (in *ImageNet) LoadImageAndResize224(path string) (*ts.Tensor, error) {
 	tensor, err := in.LoadImageAndResize(path, int64(224), int64(224))
 	if err != nil {
 		err = fmt.Errorf("ImageNet - LoadImageAndResize224/LoadImageAndResize method call: %v", err)
-		return retVal, err
+		return nil, err
 	}

 	return in.Normalize(tensor)
 }

-func (in ImageNet) hasSuffix(path string) (retVal bool) {
+func (in *ImageNet) hasSuffix(path string) bool {

 	ext := filepath.Ext(path)

@ -149,13 +149,13 @@ func (in ImageNet) hasSuffix(path string) (retVal bool) {
 	}
 }

-func (in ImageNet) loadImageFromDir(dir string) (retVal ts.Tensor, err error) {
+func (in *ImageNet) loadImageFromDir(dir string) (*ts.Tensor, error) {
 	var images []ts.Tensor

 	files, err := ioutil.ReadDir(dir)
 	if err != nil {
 		err = fmt.Errorf("ImageNet - loadImageFromDir method call: %v", err)
-		return retVal, err
+		return nil, err
 	}

 	for _, file := range files {
@ -166,15 +166,15 @@ func (in ImageNet) loadImageFromDir(dir string) (retVal ts.Tensor, err error) {
 		img, err := in.LoadImageAndResize224(fmt.Sprintf("%v/%v", dir, file.Name()))
 		if err != nil {
 			err = fmt.Errorf("ImageNet - loadImageFromDir method call: %v", err)
-			return retVal, err
+			return nil, err
 		}

-		images = append(images, img)
+		images = append(images, *img)
 	}

 	if len(images) == 0 {
 		err = fmt.Errorf("There no supported image files in specified directory (%v)", dir)
-		return retVal, err
+		return nil, err
 	}

 	return ts.Stack(images, int64(0))
@ -186,7 +186,7 @@ func (in ImageNet) loadImageFromDir(dir string) (retVal ts.Tensor, err error) {
 // In each of these datasets, there should be a subdirectory per class named
 // in the same way.
 // The ImageNet normalization is applied, image are resized to 224x224.
-func (in ImageNet) LoadFromDir(path string) (retVal Dataset, err error) {
+func (in *ImageNet) LoadFromDir(path string) (*Dataset, error) {

 	absPath, err := filepath.Abs(path)
 	if err != nil {
@ -203,7 +203,7 @@ func (in ImageNet) LoadFromDir(path string) (retVal Dataset, err error) {
 	subs, err := ioutil.ReadDir(validPath)
 	if err != nil {
 		err := fmt.Errorf("ImageNet - LoadFromDir method call: %v\n", err)
-		return retVal, err
+		return nil, err
 	}

 	for _, sub := range subs {
@ -230,30 +230,30 @@ func (in ImageNet) LoadFromDir(path string) (retVal Dataset, err error) {
 		trainTs, err := in.loadImageFromDir(trainDir)
 		if err != nil {
 			err := fmt.Errorf("ImageNet - LoadFromDir method call - Err at classes iterating: %v\n", err)
-			return retVal, err
+			return nil, err
 		}

 		ntrainTs := trainTs.MustSize()[0]
-		trainImages = append(trainImages, trainTs)
+		trainImages = append(trainImages, *trainTs)

 		trainLabelOnes := ts.MustOnes([]int64{ntrainTs}, gotch.Int64, gotch.CPU)
-		trainLabels = append(trainLabels, trainLabelOnes.MustMul1(ts.IntScalar(labelIndex), true))
+		trainLabels = append(trainLabels, *trainLabelOnes.MustMul1(ts.IntScalar(labelIndex), true))

 		// test
 		testDir := fmt.Sprintf("%v/%v", validPath, labelDir)
 		testTs, err := in.loadImageFromDir(testDir)
 		if err != nil {
 			err := fmt.Errorf("ImageNet - LoadFromDir method call - Err at classes interating: %v\n", err)
-			return retVal, err
+			return nil, err
 		}
 		ntestTs := testTs.MustSize()[0]
-		testImages = append(testImages, testTs)
+		testImages = append(testImages, *testTs)

 		testLabelOnes := ts.MustOnes([]int64{ntestTs}, gotch.Int64, gotch.CPU)
-		testLabels = append(testLabels, testLabelOnes.MustMul1(ts.IntScalar(labelIndex), true))
+		testLabels = append(testLabels, *testLabelOnes.MustMul1(ts.IntScalar(labelIndex), true))
 	}

-	return Dataset{
+	return &Dataset{
 		TrainImages: ts.MustCat(trainImages, 0),
 		TrainLabels: ts.MustCat(trainLabels, 0),
 		TestImages:  ts.MustCat(testImages, 0),
@ -264,7 +264,7 @@ func (in ImageNet) LoadFromDir(path string) (retVal Dataset, err error) {

 const imagenetClassCount int64 = 1000

-func (in ImageNet) ClassCount() (retVal int64) {
+func (in *ImageNet) ClassCount() int64 {
 	return imagenetClassCount
 }

@ -1271,7 +1271,7 @@ var imagenetClasses []string = []string{
 	"toilet tissue, toilet paper, bathroom tissue",
 }

-func (in ImageNet) Classes() (retVal []string) {
+func (in *ImageNet) Classes() []string {
 	return imagenetClasses
 }

@ -1281,9 +1281,9 @@ type TopItem struct {
 }

 // Returns the top k classes as well as the associated scores.
-func (in ImageNet) Top(input ts.Tensor, k int64) (retVal []TopItem) {
+func (in *ImageNet) Top(input *ts.Tensor, k int64) []TopItem {

-	var tensor ts.Tensor
+	var tensor *ts.Tensor
 	shape := input.MustSize()

 	switch {
--- a/vision/inception.go
+++ b/vision/inception.go
@ -7,7 +7,7 @@ import (
 	ts "github.com/sugarme/gotch/tensor"
 )

-func convBn(p nn.Path, cIn, cOut, ksize, pad, stride int64) (retVal ts.ModuleT) {
+func convBn(p *nn.Path, cIn, cOut, ksize, pad, stride int64) ts.ModuleT {

 	convConfig := nn.DefaultConv2DConfig()
 	convConfig.Stride = []int64{stride, stride}
@ -24,14 +24,14 @@ func convBn(p nn.Path, cIn, cOut, ksize, pad, stride int64) (retVal ts.ModuleT)

 	seq.Add(nn.BatchNorm2D(p.Sub("bn"), cOut, bnConfig))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustRelu(false)
 	}))

 	return seq
 }

-func convBn2(p nn.Path, cIn, cOut int64, ksize []int64, pad []int64) (retVal ts.ModuleT) {
+func convBn2(p *nn.Path, cIn, cOut int64, ksize []int64, pad []int64) ts.ModuleT {
 	convConfig := nn.DefaultConv2DConfig()
 	convConfig.Padding = pad
 	convConfig.Bias = false
@ -41,22 +41,22 @@ func convBn2(p nn.Path, cIn, cOut int64, ksize []int64, pad []int64) (retVal ts.

 	seq := nn.SeqT()

-	seq.Add(nn.NewConv(p.Sub("conv"), cIn, cOut, ksize, convConfig).(nn.Conv2D))
+	seq.Add(nn.NewConv(p.Sub("conv"), cIn, cOut, ksize, convConfig).(*nn.Conv2D))

 	seq.Add(nn.BatchNorm2D(p.Sub("bn"), cOut, bnConfig))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustRelu(false)
 	}))

 	return seq
 }

-func inMaxPool2D(xs ts.Tensor, ksize, stride int64) (retVal ts.Tensor) {
+func inMaxPool2D(xs *ts.Tensor, ksize, stride int64) *ts.Tensor {
 	return xs.MustMaxPool2d([]int64{ksize, ksize}, []int64{stride, stride}, []int64{0, 0}, []int64{1, 1}, false, false)
 }

-func inceptionA(p nn.Path, cIn, cPool int64) (retVal ts.ModuleT) {
+func inceptionA(p *nn.Path, cIn, cPool int64) ts.ModuleT {
 	b1 := convBn(p.Sub("branch1x1"), cIn, 64, 1, 0, 1)
 	b21 := convBn(p.Sub("branch5x5_1"), cIn, 48, 1, 0, 1)
 	b22 := convBn(p.Sub("branch5x5_2"), 48, 64, 5, 2, 1)
@ -65,7 +65,7 @@ func inceptionA(p nn.Path, cIn, cPool int64) (retVal ts.ModuleT) {
 	b33 := convBn(p.Sub("branch3x3dbl_3"), 96, 96, 3, 1, 1)
 	bpool := convBn(p.Sub("branch_pool"), cIn, cPool, 1, 0, 1)

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		b1Ts := xs.ApplyT(b1, train)

 		b2Tmp := xs.ApplyT(b21, train)
@ -81,19 +81,19 @@ func inceptionA(p nn.Path, cIn, cPool int64) (retVal ts.ModuleT) {
 		bpoolTmp := xs.MustAvgPool2d([]int64{3, 3}, []int64{1, 1}, []int64{1, 1}, false, true, 9, false)
 		bpoolTs := bpoolTmp.ApplyT(bpool, train)

-		res := ts.MustCat([]ts.Tensor{b1Ts, b2Ts, b3Ts, bpoolTs}, 1)
+		res := ts.MustCat([]ts.Tensor{*b1Ts, *b2Ts, *b3Ts, *bpoolTs}, 1)

 		return res
 	})
 }

-func inceptionB(p nn.Path, cIn int64) (retVal ts.ModuleT) {
+func inceptionB(p *nn.Path, cIn int64) ts.ModuleT {
 	b1 := convBn(p.Sub("branch3x3"), cIn, 384, 3, 0, 2)
 	b21 := convBn(p.Sub("branch3x3dbl_1"), cIn, 64, 1, 0, 1)
 	b22 := convBn(p.Sub("branch3x3dbl_2"), 64, 96, 3, 1, 1)
 	b23 := convBn(p.Sub("branch3x3dbl_3"), 96, 96, 3, 0, 2)

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		b1Ts := xs.ApplyT(b1, train)

 		b2Tmp1 := xs.ApplyT(b21, train)
@ -104,13 +104,13 @@ func inceptionB(p nn.Path, cIn int64) (retVal ts.ModuleT) {

 		bpoolTs := inMaxPool2D(xs, 3, 2)

-		res := ts.MustCat([]ts.Tensor{b1Ts, b2Ts, bpoolTs}, 1)
+		res := ts.MustCat([]ts.Tensor{*b1Ts, *b2Ts, *bpoolTs}, 1)

 		return res
 	})
 }

-func inceptionC(p nn.Path, cIn int64, c7 int64) (retVal ts.ModuleT) {
+func inceptionC(p *nn.Path, cIn int64, c7 int64) ts.ModuleT {

 	b1 := convBn(p.Sub("branch1x1"), cIn, 192, 1, 0, 1)

@ -126,7 +126,7 @@ func inceptionC(p nn.Path, cIn int64, c7 int64) (retVal ts.ModuleT) {

 	bpool := convBn(p.Sub("branch_pool"), cIn, 192, 1, 0, 1)

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) (res ts.Tensor) {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		b1Ts := xs.ApplyT(b1, train)

 		b2Tmp1 := xs.ApplyT(b21, train)
@ -148,14 +148,11 @@ func inceptionC(p nn.Path, cIn int64, c7 int64) (retVal ts.ModuleT) {
 		bpTmp1 := xs.MustAvgPool2d([]int64{3, 3}, []int64{1, 1}, []int64{1, 1}, false, true, 9, false)
 		bpoolTs := bpTmp1.ApplyT(bpool, train)

-		res = ts.MustCat([]ts.Tensor{b1Ts, b2Ts, b3Ts, bpoolTs}, 1)
-
-		return res
-
+		return ts.MustCat([]ts.Tensor{*b1Ts, *b2Ts, *b3Ts, *bpoolTs}, 1)
 	})
 }

-func inceptionD(p nn.Path, cIn int64) (retVal ts.ModuleT) {
+func inceptionD(p *nn.Path, cIn int64) ts.ModuleT {

 	b11 := convBn(p.Sub("branch3x3_1"), cIn, 192, 1, 0, 1)
 	b12 := convBn(p.Sub("branch3x3_2"), 192, 320, 3, 0, 2)
@ -165,7 +162,7 @@ func inceptionD(p nn.Path, cIn int64) (retVal ts.ModuleT) {
 	b23 := convBn2(p.Sub("branch7x7x3_3"), 192, 192, []int64{7, 1}, []int64{3, 0})
 	b24 := convBn(p.Sub("branch7x7x3_4"), 192, 192, 3, 0, 2)

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		b1Tmp := xs.ApplyT(b11, train)
 		b1Ts := b1Tmp.ApplyT(b12, train)
 		b1Tmp.MustDrop()
@ -180,12 +177,12 @@ func inceptionD(p nn.Path, cIn int64) (retVal ts.ModuleT) {

 		bpoolTs := inMaxPool2D(xs, 3, 2)

-		return ts.MustCat([]ts.Tensor{b1Ts, b2Ts, bpoolTs}, 1)
+		return ts.MustCat([]ts.Tensor{*b1Ts, *b2Ts, *bpoolTs}, 1)

 	})
 }

-func inceptionE(p nn.Path, cIn int64) (retVal ts.ModuleT) {
+func inceptionE(p *nn.Path, cIn int64) ts.ModuleT {
 	b1 := convBn(p.Sub("branch1x1"), cIn, 320, 1, 0, 1)

 	b21 := convBn(p.Sub("branch3x3_1"), cIn, 384, 1, 0, 1)
@ -199,37 +196,37 @@ func inceptionE(p nn.Path, cIn int64) (retVal ts.ModuleT) {

 	bpool := convBn(p.Sub("branch_pool"), cIn, 192, 1, 0, 1)

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		b1Ts := xs.ApplyT(b1, train)

 		b2Tmp := xs.ApplyT(b21, train)
 		b2aTs := b2Tmp.ApplyT(b22a, train)
 		b2bTs := b2Tmp.ApplyT(b22b, train)
-		b2Ts := ts.MustCat([]ts.Tensor{b2aTs, b2bTs}, 1)
+		b2Ts := ts.MustCat([]ts.Tensor{*b2aTs, *b2bTs}, 1)

 		b3Tmp1 := xs.ApplyT(b31, train)
 		b3Tmp2 := b3Tmp1.ApplyT(b32, train)
 		b3Tmp1.MustDrop()
 		b3aTs := b3Tmp2.ApplyT(b33a, train)
 		b3bTs := b3Tmp2.ApplyT(b33b, train)
-		b3Ts := ts.MustCat([]ts.Tensor{b3aTs, b3bTs}, 1)
+		b3Ts := ts.MustCat([]ts.Tensor{*b3aTs, *b3bTs}, 1)

 		bpTmp1 := xs.MustAvgPool2d([]int64{3, 3}, []int64{1, 1}, []int64{1, 1}, false, true, 9, false)
 		bpoolTs := bpTmp1.ApplyT(bpool, train)

-		return ts.MustCat([]ts.Tensor{b1Ts, b2Ts, b3Ts, bpoolTs}, 1)
+		return ts.MustCat([]ts.Tensor{*b1Ts, *b2Ts, *b3Ts, *bpoolTs}, 1)
 	})

 }

-func InceptionV3(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func InceptionV3(p *nn.Path, nclasses int64) ts.ModuleT {
 	seq := nn.SeqT()

 	seq.Add(convBn(p.Sub("Conv2d_1a_3x3"), 3, 32, 3, 0, 2))
 	seq.Add(convBn(p.Sub("Conv2d_2a_3x3"), 32, 32, 3, 0, 1))
 	seq.Add(convBn(p.Sub("Conv2d_2b_3x3"), 32, 64, 3, 1, 1))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		tmp := xs.MustRelu(false)
 		res := inMaxPool2D(tmp, 3, 2)
 		tmp.MustDrop()
@ -239,7 +236,7 @@ func InceptionV3(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
 	seq.Add(convBn(p.Sub("Conv2d_3b_1x1"), 64, 80, 1, 0, 1))
 	seq.Add(convBn(p.Sub("Conv2d_4a_3x3"), 80, 192, 3, 0, 1))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		tmp := xs.MustRelu(false)
 		res := inMaxPool2D(tmp, 3, 2)
 		tmp.MustDrop()
@ -262,7 +259,7 @@ func InceptionV3(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
 	seq.Add(inceptionE(p.Sub("Mixed_7b"), 1280))
 	seq.Add(inceptionE(p.Sub("Mixed_7c"), 2048))

-	seq.AddFnT(nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	seq.AddFnT(nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		tmp1 := xs.MustAdaptiveAvgPool2d([]int64{1, 1}, false)
 		tmp2 := ts.MustDropout(tmp1, 0.5, train)
 		tmp1.MustDrop()
--- a/vision/mnist.go
+++ b/vision/mnist.go
@ -52,7 +52,7 @@ func checkMagicNumber(f *os.File, wantNumber int) (err error) {
 	return nil
 }

-func readLabels(filename string) (retVal ts.Tensor) {
+func readLabels(filename string) *ts.Tensor {

 	f, err := os.Open(filename)
 	if err != nil {
@ -82,12 +82,10 @@ func readLabels(filename string) (retVal ts.Tensor) {
 		log.Fatal(err)
 	}

-	retVal = labelsTs.MustTotype(gotch.Int64, true)
-
-	return retVal
+	return labelsTs.MustTotype(gotch.Int64, true)
 }

-func readImages(filename string) (retVal ts.Tensor) {
+func readImages(filename string) *ts.Tensor {
 	f, err := os.Open(filename)
 	if err != nil {
 		log.Fatalf("readImages errors: %v\n", err)
@ -125,13 +123,12 @@ func readImages(filename string) (retVal ts.Tensor) {
 		err = fmt.Errorf("create images tensor err.")
 		log.Fatal(err)
 	}
-	retVal = imagesTs.MustView([]int64{int64(samples), int64(rows * cols)}, true).MustTotype(gotch.Float, true).MustDiv1(ts.FloatScalar(255.0), true)

-	return retVal
+	return imagesTs.MustView([]int64{int64(samples), int64(rows * cols)}, true).MustTotype(gotch.Float, true).MustDiv1(ts.FloatScalar(255.0), true)
 }

 // LoadMNISTDir loads all MNIST data from a given directory to Dataset
-func LoadMNISTDir(dir string) (retVal Dataset) {
+func LoadMNISTDir(dir string) *Dataset {
 	const (
 		trainLabels = "train-labels-idx1-ubyte"
 		trainImages = "train-images-idx3-ubyte"
@ -149,7 +146,7 @@ func LoadMNISTDir(dir string) (retVal Dataset) {
 	testImagesTs := readImages(testImagesFile)
 	testLabelsTs := readLabels(testLabelsFile)

-	return Dataset{
+	return &Dataset{
 		TrainImages: trainImagesTs,
 		TrainLabels: trainLabelsTs,
 		TestImages:  testImagesTs,
--- a/vision/mobilenet.go
+++ b/vision/mobilenet.go
@ -12,7 +12,7 @@ import (
 )

 // Conv2D + BatchNorm2D + ReLU6
-func cbr(p nn.Path, cIn, cOut, ks, stride, g int64) (retVal ts.ModuleT) {
+func cbr(p *nn.Path, cIn, cOut, ks, stride, g int64) ts.ModuleT {
 	config := nn.DefaultConv2DConfig()
 	config.Stride = []int64{stride, stride}
 	pad := (ks - 1) / 2
@ -26,7 +26,7 @@ func cbr(p nn.Path, cIn, cOut, ks, stride, g int64) (retVal ts.ModuleT) {

 	seq.Add(nn.BatchNorm2D(p.Sub("1"), cOut, nn.DefaultBatchNormConfig()))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		tmp := xs.MustRelu(false)
 		res := tmp.MustClampMax(ts.FloatScalar(6.0), true)
 		return res
@ -36,7 +36,7 @@ func cbr(p nn.Path, cIn, cOut, ks, stride, g int64) (retVal ts.ModuleT) {
 }

 // Inverted Residual block.
-func inv(p nn.Path, cIn, cOut, stride, er int64) (retVal ts.ModuleT) {
+func inv(p *nn.Path, cIn, cOut, stride, er int64) ts.ModuleT {
 	cHidden := er * cIn
 	seq := nn.SeqT()

@ -54,7 +54,7 @@ func inv(p nn.Path, cIn, cOut, stride, er int64) (retVal ts.ModuleT) {

 	seq.Add(nn.BatchNorm2D(p.Sub(fmt.Sprintf("%v", id+2)), cOut, nn.DefaultBatchNormConfig()))

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		ys := xs.ApplyT(seq, train)
 		if stride == 1 && cIn == cOut {
 			res := ys.MustAdd(xs, true)
@ -75,7 +75,7 @@ var invertedResidualSettings [][]int64 = [][]int64{
 	{6, 320, 1, 1},
 }

-func MobileNetV2(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func MobileNetV2(p *nn.Path, nclasses int64) ts.ModuleT {
 	fp := p.Sub("features")
 	cp := p.Sub("classifier")
 	cIn := int64(32)
@ -108,13 +108,13 @@ func MobileNetV2(p nn.Path, nclasses int64) (retVal ts.ModuleT) {

 	classifier := nn.SeqT()

-	classifier.AddFnT(nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	classifier.AddFnT(nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		return ts.MustDropout(xs, 0.5, train)
 	}))

 	classifier.Add(nn.NewLinear(cp.Sub("1"), 1280, nclasses, nn.DefaultLinearConfig()))

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		tmp1 := xs.ApplyT(features, train)

 		tmp2 := tmp1.MustMean1([]int64{2}, false, gotch.Float, true)
--- a/vision/resnet.go
+++ b/vision/resnet.go
@ -12,7 +12,7 @@ import (
 // See "Deep Residual Learning for Image Recognition" He et al. 2015
 // https://arxiv.org/abs/1512.03385

-func conv2d(path nn.Path, cIn, cOut, ksize, padding, stride int64) (retVal nn.Conv2D) {
+func conv2d(path *nn.Path, cIn, cOut, ksize, padding, stride int64) *nn.Conv2D {
 	config := nn.DefaultConv2DConfig()
 	config.Stride = []int64{stride, stride}
 	config.Padding = []int64{padding, padding}
@ -21,21 +21,20 @@ func conv2d(path nn.Path, cIn, cOut, ksize, padding, stride int64) (retVal nn.Co
 	return nn.NewConv2D(path, cIn, cOut, ksize, config)
 }

-func downSample(path nn.Path, cIn, cOut, stride int64) (retVal ts.ModuleT) {
+func downSample(path *nn.Path, cIn, cOut, stride int64) ts.ModuleT {

 	if stride != 1 || cIn != cOut {
 		seq := nn.SeqT()
 		seq.Add(conv2d(path.Sub("0"), cIn, cOut, 1, 0, stride))
 		seq.Add(nn.BatchNorm2D(path.Sub("1"), cOut, nn.DefaultBatchNormConfig()))
-		retVal = seq
-	} else {
-		retVal = nn.SeqT()
+
+		return seq
 	}

-	return retVal
+	return nn.SeqT()
 }

-func basicBlock(path nn.Path, cIn, cOut, stride int64) (retVal ts.ModuleT) {
+func basicBlock(path *nn.Path, cIn, cOut, stride int64) ts.ModuleT {

 	conv1 := conv2d(path.Sub("conv1"), cIn, cOut, 3, 1, stride)
 	bn1 := nn.BatchNorm2D(path.Sub("bn1"), cOut, nn.DefaultBatchNormConfig())
@ -43,7 +42,7 @@ func basicBlock(path nn.Path, cIn, cOut, stride int64) (retVal ts.ModuleT) {
 	bn2 := nn.BatchNorm2D(path.Sub("bn2"), cOut, nn.DefaultBatchNormConfig())
 	downsample := downSample(path.Sub("downsample"), cIn, cOut, stride)

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		c1 := xs.Apply(conv1)
 		bn1 := c1.ApplyT(bn1, train)
 		c1.MustDrop()
@ -61,7 +60,7 @@ func basicBlock(path nn.Path, cIn, cOut, stride int64) (retVal ts.ModuleT) {
 	})
 }

-func basicLayer(path nn.Path, cIn, cOut, stride, cnt int64) (retVal ts.ModuleT) {
+func basicLayer(path *nn.Path, cIn, cOut, stride, cnt int64) ts.ModuleT {

 	layer := nn.SeqT()
 	layer.Add(basicBlock(path.Sub("0"), cIn, cOut, stride))
@ -73,7 +72,7 @@ func basicLayer(path nn.Path, cIn, cOut, stride, cnt int64) (retVal ts.ModuleT)
 	return layer
 }

-func resnet(path nn.Path, nclasses int64, c1, c2, c3, c4 int64) (retVal nn.FuncT) {
+func resnet(path *nn.Path, nclasses int64, c1, c2, c3, c4 int64) nn.FuncT {
 	conv1 := conv2d(path.Sub("conv1"), 3, 64, 7, 3, 2)
 	bn1 := nn.BatchNorm2D(path.Sub("bn1"), 64, nn.DefaultBatchNormConfig())
 	layer1 := basicLayer(path.Sub("layer1"), 64, 64, 1, c1)
@ -86,7 +85,7 @@ func resnet(path nn.Path, nclasses int64, c1, c2, c3, c4 int64) (retVal nn.FuncT
 		linearConfig := nn.DefaultLinearConfig()
 		fc := nn.NewLinear(path.Sub("fc"), 512, nclasses, linearConfig)

-		return nn.NewFuncT(func(xs ts.Tensor, train bool) (retVal ts.Tensor) {
+		return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 			c1 := xs.Apply(conv1)
 			xs.MustDrop()
 			bn1 := c1.ApplyT(bn1, train)
@ -105,14 +104,14 @@ func resnet(path nn.Path, nclasses int64, c1, c2, c3, c4 int64) (retVal nn.FuncT
 			fv := avgpool.FlatView()
 			avgpool.MustDrop()

-			retVal = fv.ApplyOpt(ts.WithModule(fc))
+			retVal := fv.ApplyOpt(ts.WithModule(fc))
 			fv.MustDrop()
 			return retVal
 		})

 	} else {
 		// No final layer
-		return nn.NewFuncT(func(xs ts.Tensor, train bool) (retVal ts.Tensor) {
+		return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 			c1 := xs.Apply(conv1)
 			xs.MustDrop()
 			bn1 := c1.ApplyT(bn1, train)
@ -129,7 +128,7 @@ func resnet(path nn.Path, nclasses int64, c1, c2, c3, c4 int64) (retVal nn.FuncT
 			l3.MustDrop()
 			avgpool := l4.MustAdaptiveAvgPool2d([]int64{1, 1}, false)
 			l4.MustDrop()
-			retVal = avgpool.FlatView()
+			retVal := avgpool.FlatView()
 			avgpool.MustDrop()

 			return retVal
@ -138,24 +137,24 @@ func resnet(path nn.Path, nclasses int64, c1, c2, c3, c4 int64) (retVal nn.FuncT
 }

 // Creates a ResNet-18 model.
-func ResNet18(path nn.Path, numClasses int64) (retVal nn.FuncT) {
+func ResNet18(path *nn.Path, numClasses int64) nn.FuncT {
 	return resnet(path, numClasses, 2, 2, 2, 2)
 }

-func ResNet18NoFinalLayer(path nn.Path) (retVal nn.FuncT) {
+func ResNet18NoFinalLayer(path *nn.Path) nn.FuncT {
 	return resnet(path, 0, 2, 2, 2, 2)
 }

-func ResNet34(path nn.Path, numClasses int64) (retVal nn.FuncT) {
+func ResNet34(path *nn.Path, numClasses int64) nn.FuncT {
 	return resnet(path, numClasses, 3, 4, 6, 3)
 }

-func ResNet34NoFinalLayer(path nn.Path) (retVal nn.FuncT) {
+func ResNet34NoFinalLayer(path *nn.Path) nn.FuncT {
 	return resnet(path, 0, 3, 4, 6, 3)
 }

 // Bottleneck versions for ResNet 50, 101, and 152.
-func bottleneckBlock(path nn.Path, cIn, cOut, stride, e int64) (retVal ts.ModuleT) {
+func bottleneckBlock(path *nn.Path, cIn, cOut, stride, e int64) ts.ModuleT {

 	eDim := e * cOut
 	conv1 := conv2d(path.Sub("conv1"), cIn, cOut, 1, 0, 1)
@ -166,7 +165,7 @@ func bottleneckBlock(path nn.Path, cIn, cOut, stride, e int64) (retVal ts.Module
 	bn3 := nn.BatchNorm2D(path.Sub("bn3"), eDim, nn.DefaultBatchNormConfig())
 	downsample := downSample(path.Sub("downsample"), cIn, eDim, stride)

-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		c1 := xs.Apply(conv1)
 		bn1 := c1.ApplyT(bn1, train)
 		c1.MustDrop()
@ -187,7 +186,7 @@ func bottleneckBlock(path nn.Path, cIn, cOut, stride, e int64) (retVal ts.Module
 	})
 }

-func bottleneckLayer(path nn.Path, cIn, cOut, stride, cnt int64) (retVal ts.ModuleT) {
+func bottleneckLayer(path *nn.Path, cIn, cOut, stride, cnt int64) ts.ModuleT {

 	layer := nn.SeqT()
 	layer.Add(bottleneckBlock(path.Sub("0"), cIn, cOut, stride, 4))
@ -198,7 +197,7 @@ func bottleneckLayer(path nn.Path, cIn, cOut, stride, cnt int64) (retVal ts.Modu
 	return layer
 }

-func bottleneckResnet(path nn.Path, nclasses int64, c1, c2, c3, c4 int64) (retVal ts.ModuleT) {
+func bottleneckResnet(path *nn.Path, nclasses int64, c1, c2, c3, c4 int64) ts.ModuleT {
 	conv1 := conv2d(path.Sub("conv1"), 3, 64, 7, 3, 2)
 	bn1 := nn.BatchNorm2D(path.Sub("bn1"), 64, nn.DefaultBatchNormConfig())
 	layer1 := bottleneckLayer(path.Sub("layer1"), 64, 64, 1, c1)
@ -209,7 +208,7 @@ func bottleneckResnet(path nn.Path, nclasses int64, c1, c2, c3, c4 int64) (retVa
 	if nclasses > 0 {
 		fc := nn.NewLinear(path.Sub("fc"), 4*512, nclasses, nn.DefaultLinearConfig())

-		return nn.NewFuncT(func(xs ts.Tensor, train bool) (retVal ts.Tensor) {
+		return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 			c1 := xs.Apply(conv1)
 			xs.MustDrop()
 			bn1 := c1.ApplyT(bn1, train)
@ -228,12 +227,12 @@ func bottleneckResnet(path nn.Path, nclasses int64, c1, c2, c3, c4 int64) (retVa
 			fv := avgpool.FlatView()
 			avgpool.MustDrop()

-			retVal = fv.ApplyOpt(ts.WithModule(fc))
+			retVal := fv.ApplyOpt(ts.WithModule(fc))
 			fv.MustDrop()
 			return retVal
 		})
 	} else {
-		return nn.NewFuncT(func(xs ts.Tensor, train bool) (retVal ts.Tensor) {
+		return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 			c1 := xs.Apply(conv1)
 			xs.MustDrop()
 			bn1 := c1.ApplyT(bn1, train)
@ -250,7 +249,7 @@ func bottleneckResnet(path nn.Path, nclasses int64, c1, c2, c3, c4 int64) (retVa
 			l3.MustDrop()
 			avgpool := l4.MustAdaptiveAvgPool2d([]int64{1, 1}, false)
 			l4.MustDrop()
-			retVal = avgpool.FlatView()
+			retVal := avgpool.FlatView()
 			avgpool.MustDrop()

 			return retVal
@ -258,26 +257,26 @@ func bottleneckResnet(path nn.Path, nclasses int64, c1, c2, c3, c4 int64) (retVa
 	}
 }

-func ResNet50(path nn.Path, numClasses int64) (retVal ts.ModuleT) {
+func ResNet50(path *nn.Path, numClasses int64) ts.ModuleT {
 	return bottleneckResnet(path, numClasses, 3, 4, 6, 3)
 }

-func ResNet50NoFinalLayer(path nn.Path) (retVal ts.ModuleT) {
+func ResNet50NoFinalLayer(path *nn.Path) ts.ModuleT {
 	return bottleneckResnet(path, 0, 3, 4, 6, 3)
 }

-func ResNet101(path nn.Path, numClasses int64) (retVal ts.ModuleT) {
+func ResNet101(path *nn.Path, numClasses int64) ts.ModuleT {
 	return bottleneckResnet(path, numClasses, 3, 4, 23, 3)
 }

-func ResNet101NoFinalLayer(path nn.Path) (retVal ts.ModuleT) {
+func ResNet101NoFinalLayer(path *nn.Path) ts.ModuleT {
 	return bottleneckResnet(path, 0, 3, 4, 23, 3)
 }

-func ResNet152(path nn.Path, numClasses int64) (retVal ts.ModuleT) {
+func ResNet152(path *nn.Path, numClasses int64) ts.ModuleT {
 	return bottleneckResnet(path, numClasses, 3, 8, 36, 3)
 }

-func ResNet150NoFinalLayer(path nn.Path) (retVal ts.ModuleT) {
+func ResNet150NoFinalLayer(path *nn.Path) ts.ModuleT {
 	return bottleneckResnet(path, 0, 3, 8, 36, 3)
 }
--- a/vision/squeezenet.go
+++ b/vision/squeezenet.go
@ -7,11 +7,11 @@ import (
 	ts "github.com/sugarme/gotch/tensor"
 )

-func snMaxPool2D(xs ts.Tensor) (retVal ts.Tensor) {
+func snMaxPool2D(xs *ts.Tensor) *ts.Tensor {
 	return xs.MustMaxPool2d([]int64{3, 3}, []int64{2, 2}, []int64{0, 0}, []int64{1, 1}, true, false)
 }

-func fire(p nn.Path, cIn int64, cSqueeze int64, cExp1 int64, cExp3 int64) (retVal ts.ModuleT) {
+func fire(p *nn.Path, cIn int64, cSqueeze int64, cExp1 int64, cExp3 int64) ts.ModuleT {

 	cfg3 := nn.DefaultConv2DConfig()
 	cfg3.Padding = []int64{1, 1}
@ -21,7 +21,7 @@ func fire(p nn.Path, cIn int64, cSqueeze int64, cExp1 int64, cExp3 int64) (retVa
 	exp3 := nn.NewConv2D(p.Sub("expand3x3"), cSqueeze, cExp3, 3, cfg3)

 	// NOTE: train will not be used
-	return nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	return nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		tmp1 := xs.Apply(squeeze)
 		tmp2 := tmp1.MustRelu(true)

@ -31,11 +31,11 @@ func fire(p nn.Path, cIn int64, cSqueeze int64, cExp1 int64, cExp3 int64) (retVa
 		exp3Tmp := tmp2.Apply(exp3)
 		exp3Ts := exp3Tmp.MustRelu(true)

-		return ts.MustCat([]ts.Tensor{exp1Ts, exp3Ts}, 1)
+		return ts.MustCat([]ts.Tensor{*exp1Ts, *exp3Ts}, 1)
 	})
 }

-func squeezenet(p nn.Path, v1_0 bool, nclasses int64) (retVal ts.ModuleT) {
+func squeezenet(p *nn.Path, v1_0 bool, nclasses int64) ts.ModuleT {
 	fp := p.Sub("features")
 	cp := p.Sub("classifier")

@ -50,11 +50,11 @@ func squeezenet(p nn.Path, v1_0 bool, nclasses int64) (retVal ts.ModuleT) {
 	if v1_0 {
 		features.Add(nn.NewConv2D(fp.Sub("0"), 3, 96, 7, initialConvConfig))

-		features.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+		features.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 			return xs.MustRelu(false)
 		}))

-		features.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+		features.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 			return snMaxPool2D(xs)
 		}))

@ -64,7 +64,7 @@ func squeezenet(p nn.Path, v1_0 bool, nclasses int64) (retVal ts.ModuleT) {

 		features.Add(fire(fp.Sub("5"), 128, 32, 128, 128))

-		features.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+		features.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 			return snMaxPool2D(xs)
 		}))

@ -76,7 +76,7 @@ func squeezenet(p nn.Path, v1_0 bool, nclasses int64) (retVal ts.ModuleT) {

 		features.Add(fire(fp.Sub("10"), 384, 64, 256, 256))

-		features.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+		features.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 			return snMaxPool2D(xs)
 		}))

@ -85,11 +85,11 @@ func squeezenet(p nn.Path, v1_0 bool, nclasses int64) (retVal ts.ModuleT) {
 	} else {
 		features.Add(nn.NewConv2D(fp.Sub("0"), 3, 64, 3, initialConvConfig))

-		features.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+		features.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 			return xs.MustRelu(false)
 		}))

-		features.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+		features.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 			return snMaxPool2D(xs)
 		}))

@ -97,7 +97,7 @@ func squeezenet(p nn.Path, v1_0 bool, nclasses int64) (retVal ts.ModuleT) {

 		features.Add(fire(fp.Sub("4"), 128, 16, 64, 64))

-		features.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+		features.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 			return snMaxPool2D(xs)
 		}))

@ -105,7 +105,7 @@ func squeezenet(p nn.Path, v1_0 bool, nclasses int64) (retVal ts.ModuleT) {

 		features.Add(fire(fp.Sub("7"), 256, 32, 128, 128))

-		features.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+		features.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 			return snMaxPool2D(xs)
 		}))

@ -118,13 +118,13 @@ func squeezenet(p nn.Path, v1_0 bool, nclasses int64) (retVal ts.ModuleT) {
 		features.Add(fire(fp.Sub("12"), 512, 64, 256, 256))
 	}

-	features.AddFnT(nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	features.AddFnT(nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		return ts.MustDropout(xs, 0.5, train)
 	}))

 	features.Add(nn.NewConv2D(cp.Sub("1"), 512, nclasses, 1, finalConvConfig))

-	features.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	features.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		tmp1 := xs.MustRelu(false)
 		tmp2 := tmp1.MustAdaptiveAvgPool2d([]int64{1, 1}, false)
 		tmp1.MustDrop()
@ -136,10 +136,10 @@ func squeezenet(p nn.Path, v1_0 bool, nclasses int64) (retVal ts.ModuleT) {
 	return features
 }

-func SqueezeNetV1_0(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func SqueezeNetV1_0(p *nn.Path, nclasses int64) ts.ModuleT {
 	return squeezenet(p, true, nclasses)
 }

-func SqueezeNetV1_1(p nn.Path, nclasses int64) (retVal ts.ModuleT) {
+func SqueezeNetV1_1(p *nn.Path, nclasses int64) ts.ModuleT {
 	return squeezenet(p, false, nclasses)
 }
--- a/vision/vgg.go
+++ b/vision/vgg.go
@ -11,7 +11,7 @@ import (

 // NOTE: each list element contains multiple convolutions with some specified number
 // of features followed by a single max-pool layer.
-func layersA() (retVal [][]int64) {
+func layersA() [][]int64 {
 	return [][]int64{
 		{64},
 		{128},
@ -21,7 +21,7 @@ func layersA() (retVal [][]int64) {
 	}
 }

-func layersB() (retVal [][]int64) {
+func layersB() [][]int64 {
 	return [][]int64{
 		{64, 64},
 		{128, 128},
@ -31,7 +31,7 @@ func layersB() (retVal [][]int64) {
 	}
 }

-func layersD() (retVal [][]int64) {
+func layersD() [][]int64 {
 	return [][]int64{
 		{64, 64},
 		{128, 128},
@ -41,7 +41,7 @@ func layersD() (retVal [][]int64) {
 	}
 }

-func layersE() (retVal [][]int64) {
+func layersE() [][]int64 {
 	return [][]int64{
 		{64, 64},
 		{128, 128},
@ -51,7 +51,7 @@ func layersE() (retVal [][]int64) {
 	}
 }

-func vggConv2d(path nn.Path, cIn, cOut int64) (retVal nn.Conv2D) {
+func vggConv2d(path *nn.Path, cIn, cOut int64) *nn.Conv2D {

 	config := nn.DefaultConv2DConfig()
 	config.Stride = []int64{1, 1}
@ -60,7 +60,7 @@ func vggConv2d(path nn.Path, cIn, cOut int64) (retVal nn.Conv2D) {
 	return nn.NewConv2D(path, cIn, cOut, 3, config)
 }

-func vgg(path nn.Path, config [][]int64, nclasses int64, batchNorm bool) nn.SequentialT {
+func vgg(path *nn.Path, config [][]int64, nclasses int64, batchNorm bool) *nn.SequentialT {

 	c := path.Sub("classifier")
 	seq := nn.SeqT()
@ -77,40 +77,40 @@ func vgg(path nn.Path, config [][]int64, nclasses int64, batchNorm bool) nn.Sequ
 				seq.Add(nn.BatchNorm2D(f.Sub(fmt.Sprintf("%v", bnLen)), cOut, nn.DefaultBatchNormConfig()))
 			}

-			seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+			seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 				return xs.MustRelu(false)
 			}))

 			cIn = cOut
 		} // end of inner For loop

-		seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+		seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 			return xs.MaxPool2DDefault(2, false)
 		}))

 	} // end of outer For loop

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.FlatView()
 	}))

 	seq.Add(nn.NewLinear(c.Sub(fmt.Sprint("0")), 512*7*7, 4096, nn.DefaultLinearConfig()))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustRelu(false)
 	}))

-	seq.AddFn(nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	seq.AddFn(nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		return ts.MustDropout(xs, 0.5, train)
 	}))

 	seq.Add(nn.NewLinear(c.Sub(fmt.Sprint("3")), 4096, 4096, nn.DefaultLinearConfig()))

-	seq.AddFn(nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
+	seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
 		return xs.MustRelu(false)
 	}))

-	seq.AddFn(nn.NewFuncT(func(xs ts.Tensor, train bool) ts.Tensor {
+	seq.AddFn(nn.NewFuncT(func(xs *ts.Tensor, train bool) *ts.Tensor {
 		return ts.MustDropout(xs, 0.5, train)
 	}))

@ -119,34 +119,34 @@ func vgg(path nn.Path, config [][]int64, nclasses int64, batchNorm bool) nn.Sequ
 	return seq
 }

-func VGG11(path nn.Path, nclasses int64) (retVal nn.SequentialT) {
+func VGG11(path *nn.Path, nclasses int64) *nn.SequentialT {
 	return vgg(path, layersA(), nclasses, false)
 }

-func VGG11BN(path nn.Path, nclasses int64) (retVal nn.SequentialT) {
+func VGG11BN(path *nn.Path, nclasses int64) *nn.SequentialT {
 	return vgg(path, layersA(), nclasses, true)
 }

-func VGG13(path nn.Path, nclasses int64) (retVal nn.SequentialT) {
+func VGG13(path *nn.Path, nclasses int64) *nn.SequentialT {
 	return vgg(path, layersB(), nclasses, false)
 }

-func VGG13BN(path nn.Path, nclasses int64) (retVal nn.SequentialT) {
+func VGG13BN(path *nn.Path, nclasses int64) *nn.SequentialT {
 	return vgg(path, layersB(), nclasses, true)
 }

-func VGG16(path nn.Path, nclasses int64) (retVal nn.SequentialT) {
+func VGG16(path *nn.Path, nclasses int64) *nn.SequentialT {
 	return vgg(path, layersD(), nclasses, false)
 }

-func VGG16BN(path nn.Path, nclasses int64) (retVal nn.SequentialT) {
+func VGG16BN(path *nn.Path, nclasses int64) *nn.SequentialT {
 	return vgg(path, layersD(), nclasses, true)
 }

-func VGG19(path nn.Path, nclasses int64) (retVal nn.SequentialT) {
+func VGG19(path *nn.Path, nclasses int64) *nn.SequentialT {
 	return vgg(path, layersE(), nclasses, false)
 }

-func VGG19BN(path nn.Path, nclasses int64) (retVal nn.SequentialT) {
+func VGG19BN(path *nn.Path, nclasses int64) *nn.SequentialT {
 	return vgg(path, layersE(), nclasses, true)
 }