fix(tensor/module): fix and move BatchAccuracyForLogits to nn/sequential; chore(example): clean up

sugarme 2020-07-11 12:53:23 +10:00
parent 44ef7776e5
commit 8b05753eb4
29 changed files with 281 additions and 1552 deletions

View File

@ -1,87 +0,0 @@
package main
// Training various models on the CIFAR-10 dataset.
//
// The dataset can be downloaded from https://www.cs.toronto.edu/~kriz/cifar.html; files
// should be placed in the data/ directory.
//
// The resnet model reaches 95.4% accuracy.
import (
"fmt"
// "log"
// "os/exec"
"github.com/sugarme/gotch"
"github.com/sugarme/gotch/nn"
ts "github.com/sugarme/gotch/tensor"
"github.com/sugarme/gotch/vision"
)
func main() {
dir := "../../data/cifar10"
ds := vision.CFLoadDir(dir)
fmt.Printf("TrainImages shape: %v\n", ds.TrainImages.MustSize())
fmt.Printf("TrainLabel shape: %v\n", ds.TrainLabels.MustSize())
fmt.Printf("TestImages shape: %v\n", ds.TestImages.MustSize())
fmt.Printf("TestLabel shape: %v\n", ds.TestLabels.MustSize())
fmt.Printf("Number of labels: %v\n", ds.Labels)
// cuda := gotch.CudaBuilder(0)
// device := cuda.CudaIfAvailable()
device := gotch.CPU
var si *gotch.SI
si = gotch.GetSysInfo()
fmt.Printf("Total RAM (MB):\t %8.2f\n", float64(si.TotalRam)/1024)
fmt.Printf("Used RAM (MB):\t %8.2f\n", float64(si.TotalRam-si.FreeRam)/1024)
startRAM := si.TotalRam - si.FreeRam
vs := nn.NewVarStore(device)
for epoch := 0; epoch < 150; epoch++ {
iter := ts.MustNewIter2(ds.TrainImages, ds.TrainLabels, int64(64))
iter.Shuffle()
for {
item, ok := iter.Next()
if !ok {
item.Data.MustDrop()
item.Label.MustDrop()
break
}
devicedData := item.Data.MustTo(vs.Device(), true)
devicedLabel := item.Label.MustTo(vs.Device(), true)
bimages := vision.Augmentation(devicedData, true, 4, 8)
devicedData.MustDrop()
devicedLabel.MustDrop()
bimages.MustDrop()
}
iter.Drop()
si = gotch.GetSysInfo()
memUsed := (float64(si.TotalRam-si.FreeRam) - float64(startRAM)) / 1024
fmt.Printf("Epoch:\t %v\t Memory Used:\t [%8.2f MiB]\n", epoch, memUsed)
/*
* // Print out GPU used
* nvidia := "nvidia-smi"
* cmd := exec.Command(nvidia)
* stdout, err := cmd.Output()
*
* if err != nil {
* log.Fatal(err.Error())
* }
*
* fmt.Println(string(stdout))
* */
}
}

View File

@ -10,7 +10,6 @@ package main
import (
"fmt"
"log"
// "os/exec"
"time"
"github.com/sugarme/gotch"
@ -80,17 +79,6 @@ func fastResnet(p nn.Path) (retVal nn.SequentialT) {
return seq
}
func learningRate(epoch int) (retVal float64) {
switch {
case epoch < 50:
return 0.1
case epoch < 100:
return 0.01
default:
return 0.001
}
}
func main() {
dir := "../../data/cifar10"
ds := vision.CFLoadDir(dir)
@ -103,50 +91,42 @@ func main() {
cuda := gotch.CudaBuilder(0)
device := cuda.CudaIfAvailable()
// device := gotch.CPU
vs := nn.NewVarStore(device)
net := fastResnet(vs.Root())
// optConfig := nn.NewSGDConfig(0.9, 0.0, 5e-4, true)
// opt, err := optConfig.Build(vs, 0.01)
// if err != nil {
// log.Fatal(err)
// }
var lossVal float64
startTime := time.Now()
var bestAccuracy float64
for epoch := 0; epoch < 350; epoch++ {
// opt.SetLR(learningRate(epoch))
for epoch := 0; epoch < 150; epoch++ {
optConfig := nn.NewSGDConfig(0.9, 0.0, 5e-4, true)
var opt nn.Optimizer
var err error
var (
opt nn.Optimizer
err error
)
switch {
case epoch < 150:
case epoch < 50:
opt, err = optConfig.Build(vs, 0.1)
if err != nil {
log.Fatal(err)
}
case epoch < 250:
case epoch < 100:
opt, err = optConfig.Build(vs, 0.01)
if err != nil {
log.Fatal(err)
}
case epoch >= 250:
case epoch >= 100:
opt, err = optConfig.Build(vs, 0.001)
if err != nil {
log.Fatal(err)
}
}
// iter := ts.MustNewIter2(ds.TrainImages, ds.TrainLabels, int64(64))
iter := ts.MustNewIter2(ds.TrainImages, ds.TrainLabels, int64(128))
iter := ts.MustNewIter2(ds.TrainImages, ds.TrainLabels, int64(64))
iter.Shuffle()
// iter = iter.ToDevice(device)
for {
item, ok := iter.Next()
@ -171,63 +151,14 @@ func main() {
loss.MustDrop()
}
vs.Freeze()
testAcc := batchAccuracyForLogits(net, ds.TestImages, ds.TestLabels, vs.Device(), 100)
vs.Unfreeze()
testAcc := nn.BatchAccuracyForLogits(vs, net, ds.TestImages, ds.TestLabels, vs.Device(), 512)
fmt.Printf("Epoch:\t %v\t Loss: \t %.3f \tAcc: %10.2f%%\n", epoch, lossVal, testAcc*100.0)
// fmt.Printf("Epoch: %10.0d\tLoss:%10.3f\n", epoch, lossVal)
if testAcc > bestAccuracy {
bestAccuracy = testAcc
}
iter.Drop()
/*
* // Print out GPU used
* nvidia := "nvidia-smi"
* cmd := exec.Command(nvidia)
* stdout, err := cmd.Output()
*
* if err != nil {
* log.Fatal(err.Error())
* }
*
* fmt.Println(string(stdout))
* */
}
// testAcc := ts.BatchAccuracyForLogits(net, ds.TestImages, ds.TestLabels, vs.Device(), 512)
fmt.Printf("Best Accuracy: %10.2f%%\n", bestAccuracy*100.0)
fmt.Printf("Taken time:\t%.2f mins\n", time.Since(startTime).Minutes())
}
func batchAccuracyForLogits(m ts.ModuleT, xs, ys ts.Tensor, d gotch.Device, batchSize int) (retVal float64) {
var (
sumAccuracy float64 = 0.0
sampleCount float64 = 0.0
)
iter2 := ts.MustNewIter2(xs, ys, int64(batchSize))
for {
item, ok := iter2.Next()
if !ok {
break
}
size := float64(item.Data.MustSize()[0])
bImages := item.Data.MustTo(d, true)
bLabels := item.Label.MustTo(d, true)
logits := m.ForwardT(bImages, false)
acc := logits.AccuracyForLogits(bLabels)
sumAccuracy += acc.Values()[0] * size
sampleCount += size
bImages.MustDrop()
bLabels.MustDrop()
acc.MustDrop()
}
return sumAccuracy / sampleCount
}

View File

@ -1,16 +0,0 @@
package main
import (
"fmt"
"github.com/sugarme/gotch"
)
func main() {
var d gotch.Cuda
fmt.Printf("Cuda device count: %v\n", d.DeviceCount())
fmt.Printf("Cuda is available: %v\n", d.IsAvailable())
fmt.Printf("Cudnn is available: %v\n", d.CudnnIsAvailable())
}

View File

@ -1,8 +1,8 @@
// A wrapper around the linux syscall sysinfo(2).
package main
// helper to debug memory blow-up
import (
"fmt"
"sync"
"syscall"
"time"
@ -34,7 +34,7 @@ var sis = &SI{}
// https://github.com/capnm/golang/blob/go1.1.1/src/pkg/syscall/zsyscall_linux_amd64.go#L1050
// https://github.com/capnm/golang/blob/go1.1.1/src/pkg/syscall/ztypes_linux_amd64.go#L528
// https://github.com/capnm/golang/blob/go1.1.1/src/pkg/syscall/ztypes_linux_arm.go#L502
func Get() *SI {
func CPUInfo() *SI {
/*
// Note: uint64 is uint32 on 32 bit CPUs
@ -89,36 +89,3 @@ func Get() *SI {
return sis
}
// Make the "fmt" Stringer interface happy.
func (si SI) String() string {
// XXX: Is the copy of SI done atomically? Not sure.
// Without an outer lock this may print junk.
return fmt.Sprintf("uptime\t\t%v\nload\t\t%2.2f %2.2f %2.2f\nprocs\t\t%d\n"+
"ram total\t%d kB\nram free\t%d kB\nram buffer\t%d kB\n"+
"swap total\t%d kB\nswap free\t%d kB",
//"high ram total\t%d kB\nhigh ram free\t%d kB\n"
si.Uptime, si.Loads[0], si.Loads[1], si.Loads[2], si.Procs,
si.TotalRam, si.FreeRam, si.BufferRam,
si.TotalSwap, si.FreeSwap,
// archaic si.TotalHighRam, si.FreeHighRam
)
}
/*
Convert to string in a thread safe way.
Output:
uptime 279h6m21s
load 0.12 0.04 0.05
procs 143
ram total 383752 kB
ram free 254980 kB
ram buffer 7640 kB
swap total 887800 kB
swap free 879356 kB
*/
func (si *SI) ToString() string {
defer si.mu.Unlock()
si.mu.Lock()
return si.String()
}

View File

@ -0,0 +1,20 @@
package main
import (
"fmt"
"log"
"os/exec"
)
func GPUInfo() {
// Print out GPU usage via nvidia-smi
nvidia := "nvidia-smi"
cmd := exec.Command(nvidia)
stdout, err := cmd.Output()
if err != nil {
log.Fatal(err.Error())
}
fmt.Println(string(stdout))
}

View File

@ -0,0 +1,90 @@
package main
import (
"flag"
"fmt"
"log"
"github.com/sugarme/gotch"
ts "github.com/sugarme/gotch/tensor"
)
var device string
func createTensors(samples int) []ts.Tensor {
n := int(10e6)
var data []float64
for i := 0; i < n; i++ {
data = append(data, float64(i))
}
var tensors []ts.Tensor
s := ts.FloatScalar(float64(0.23))
for i := 0; i < 1; i++ {
t := ts.MustOfSlice(data).MustMul1(s, true)
tensors = append(tensors, t)
}
return tensors
}
func dropTensors(tensors []ts.Tensor) {
for _, t := range tensors {
t.MustDrop()
}
}
func init() {
flag.StringVar(&device, "device", "CPU", "Select CPU or GPU to use")
}
func main() {
// TODO: create flags to load tensors onto a device (CPU or GPU) and report CPU or GPU
// info accordingly
flag.Parse()
switch device {
case "CPU":
var si *SI
si = CPUInfo()
fmt.Printf("Total RAM (MB):\t %8.2f\n", float64(si.TotalRam)/1024)
fmt.Printf("Used RAM (MB):\t %8.2f\n", float64(si.TotalRam-si.FreeRam)/1024)
startRAM := si.TotalRam - si.FreeRam
epochs := 50
for i := 0; i < epochs; i++ {
tensors := createTensors(10000)
dropTensors(tensors)
si = CPUInfo()
fmt.Printf("Epoch %v\t Used: [%8.2f MiB]\n", i, (float64(si.TotalRam-si.FreeRam)-float64(startRAM))/1024)
}
case "GPU":
cuda := gotch.CudaBuilder(0)
gpu := cuda.CudaIfAvailable()
epochs := 50
for i := 0; i < epochs; i++ {
tensors := createTensors(10000)
var gpuTensors []ts.Tensor
for _, t := range tensors {
gpuTensors = append(gpuTensors, t.MustTo(gpu, true))
}
for _, t := range gpuTensors {
t.MustDrop()
}
fmt.Printf("Epoch %v\n", i)
GPUInfo()
}
default:
log.Fatalf("Invalid device flag (%v). It should be either CPU or GPU.", device)
}
}
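Assuming the example is compiled in place, it can be invoked for either device via the `-device` flag defined in `init` (a hypothetical invocation, following the run commands shown in the README below):

go run . -device="CPU"
go run . -device="GPU"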

View File

@ -1,53 +0,0 @@
package main
import (
"fmt"
"log"
"github.com/sugarme/gotch/tensor"
)
func main() {
// Try to compare 2 tensor with incompatible dimensions
// and check this returns an error
dx := []int32{1, 2, 3}
dy := []int32{1, 2, 3, 4}
// dy := []int32{1, 2, 5}
xs, err := tensor.OfSlice(dx)
if err != nil {
log.Fatal(err)
}
ys, err := tensor.OfSlice(dy)
if err != nil {
log.Fatal(err)
}
xs.Print()
ys.Print()
fmt.Printf("xs num of dimensions: %v\n", xs.Dim())
fmt.Printf("ys num of dimensions: %v\n", ys.Dim())
xsize, err := xs.Size()
if err != nil {
log.Fatal(err)
}
ysize, err := ys.Size()
if err != nil {
log.Fatal(err)
}
fmt.Printf("xs shape: %v\n", xsize)
fmt.Printf("ys shape: %v\n", ysize)
res, err := xs.Eq1(ys)
if err != nil {
log.Fatal(err)
}
res.Print()
}

View File

@ -1,48 +0,0 @@
package main
import (
"fmt"
"log"
"github.com/sugarme/gotch"
ts "github.com/sugarme/gotch/tensor"
)
func main() {
// mockup data
var (
n int = 20
xvals []float32
yvals []float32
epochs = 10
)
for i := 0; i < n; i++ {
xvals = append(xvals, float32(i))
yvals = append(yvals, float32(2*i+1))
}
xtrain, err := ts.NewTensorFromData(xvals, []int64{int64(n), 1})
if err != nil {
log.Fatal(err)
}
ytrain, err := ts.NewTensorFromData(yvals, []int64{int64(n), 1})
if err != nil {
log.Fatal(err)
}
ws := ts.MustZeros([]int64{1, int64(n)}, gotch.Float.CInt(), gotch.CPU.CInt())
bs := ts.MustZeros([]int64{1, int64(n)}, gotch.Float.CInt(), gotch.CPU.CInt())
for epoch := 0; epoch < epochs; epoch++ {
logit := ws.MustMatMul(xtrain).MustAdd(bs)
loss := ts.NewTensor().MustLogSoftmax(-1, gotch.Float.CInt())
ws.MustGrad()
bs.MustGrad()
loss.MustBackward()
}
}

View File

@ -1,17 +0,0 @@
package main
import (
"github.com/sugarme/gotch"
"github.com/sugarme/gotch/nn"
)
func main() {
vs := nn.NewVarStore(gotch.CPU)
path := vs.Root()
l := nn.NewLinear(path, 4, 3, nn.DefaultLinearConfig())
l.Bs.Print()
}

View File

@ -13,21 +13,21 @@
- Run with `go clean -cache -testcache && go run . -model="linear"`
- Accuraccy should be about **91.68%**.
- Accuracy should be about **91.68%**.
## Neural Network (NN)
- Run with `go clean -cache -testcache && go run . -model="nn"`
- Accuraccy should be about **TODO: update%**.
- Accuracy should be about **94%**.
## Convolutional Neural Network (CNN)
- Run with `go clean -cache -testcache && go run . -model="cnn"`
- Accuraccy should be about **TODO: update%**.
- Accuracy should be about **99.3%**.

View File

@ -46,26 +46,21 @@ func (n Net) ForwardT(xs ts.Tensor, train bool) (retVal ts.Tensor) {
defer outView1.MustDrop()
outC1 := outView1.Apply(n.conv1)
// defer outC1.MustDrop()
outMP1 := outC1.MaxPool2DDefault(2, true)
defer outMP1.MustDrop()
outC2 := outMP1.Apply(n.conv2)
// defer outC2.MustDrop()
outMP2 := outC2.MaxPool2DDefault(2, true)
// defer outMP2.MustDrop()
outView2 := outMP2.MustView([]int64{-1, 1024}, true)
defer outView2.MustDrop()
outFC1 := outView2.Apply(&n.fc1)
// defer outFC1.MustDrop()
outRelu := outFC1.MustRelu(true)
defer outRelu.MustDrop()
// outRelu.Dropout_(0.5, train)
outDropout := ts.MustDropout(outRelu, 0.5, train)
defer outDropout.MustDrop()
@ -83,12 +78,14 @@ func runCNN1() {
cuda := gotch.CudaBuilder(0)
vs := nn.NewVarStore(cuda.CudaIfAvailable())
// vs := nn.NewVarStore(gotch.CPU)
net := newNet(vs.Root())
opt, err := nn.DefaultAdamConfig().Build(vs, LrCNN)
if err != nil {
log.Fatal(err)
}
var bestAccuracy float64 = 0.0
startTime := time.Now()
for epoch := 0; epoch < epochsCNN; epoch++ {
@ -102,20 +99,16 @@ func runCNN1() {
batches := samples / batchSize
batchIndex := 0
var epocLoss ts.Tensor
// var loss ts.Tensor
for i := 0; i < batches; i++ {
start := batchIndex * batchSize
size := batchSize
if samples-start < batchSize {
// size = samples - start
break
}
batchIndex += 1
// Indexing
narrowIndex := ts.NewNarrow(int64(start), int64(start+size))
// bImages := ds.TrainImages.Idx(narrowIndex)
// bLabels := ds.TrainLabels.Idx(narrowIndex)
bImages := imagesTs.Idx(narrowIndex)
bLabels := labelsTs.Idx(narrowIndex)
@ -126,7 +119,6 @@ func runCNN1() {
loss := logits.CrossEntropyForLogits(bLabels)
// loss = loss.MustSetRequiresGrad(true)
opt.BackwardStep(loss)
epocLoss = loss.MustShallowClone()
@ -136,112 +128,21 @@ func runCNN1() {
bImages.MustDrop()
bLabels.MustDrop()
// logits.MustDrop()
// loss.MustDrop()
}
vs.Freeze()
testAccuracy := batchAccuracyForLogits(net, testImages, testLabels, vs.Device(), 1024)
testAccuracy := nn.BatchAccuracyForLogits(vs, net, testImages, testLabels, vs.Device(), 1024)
vs.Unfreeze()
fmt.Printf("Epoch: %v\t Loss: %.2f \t Test accuracy: %.2f%%\n", epoch, epocLoss.Values()[0], testAccuracy*100.0)
if testAccuracy > bestAccuracy {
bestAccuracy = testAccuracy
}
// fmt.Printf("Epoch:\t %v\tLoss: \t %.2f\n", epoch, epocLoss.Values()[0])
epocLoss.MustDrop()
imagesTs.MustDrop()
labelsTs.MustDrop()
}
testAccuracy := ts.BatchAccuracyForLogitsIdx(net, testImages, testLabels, vs.Device(), 1024)
fmt.Printf("Test accuracy: %.2f%%\n", testAccuracy*100)
fmt.Printf("Best test accuracy: %.2f%%\n", bestAccuracy*100.0)
fmt.Printf("Taken time:\t%.2f mins\n", time.Since(startTime).Minutes())
}
func runCNN2() {
var ds vision.Dataset
ds = vision.LoadMNISTDir(MnistDirNN)
cuda := gotch.CudaBuilder(0)
vs := nn.NewVarStore(cuda.CudaIfAvailable())
net := newNet(vs.Root())
opt, err := nn.DefaultAdamConfig().Build(vs, LrNN)
if err != nil {
log.Fatal(err)
}
startTime := time.Now()
var lossVal float64
for epoch := 0; epoch < epochsCNN; epoch++ {
iter := ts.MustNewIter2(ds.TrainImages, ds.TrainLabels, batchCNN)
// iter.Shuffle()
for {
item, ok := iter.Next()
if !ok {
break
}
bImages := item.Data.MustTo(vs.Device(), true)
bLabels := item.Label.MustTo(vs.Device(), true)
// _ = ts.MustGradSetEnabled(true)
logits := net.ForwardT(bImages, true)
loss := logits.CrossEntropyForLogits(bLabels)
opt.BackwardStep(loss)
lossVal = loss.Values()[0]
bImages.MustDrop()
bLabels.MustDrop()
loss.MustDrop()
}
// fmt.Printf("Epoch:\t %v\tLoss: \t %.2f\n", epoch, lossVal)
vs.Freeze()
testAcc := batchAccuracyForLogits(net, ds.TestImages, ds.TestLabels, vs.Device(), batchCNN)
vs.Unfreeze()
fmt.Printf("Epoch:\t %v\tLoss: \t %.2f\t Accuracy: %.2f\n", epoch, lossVal, testAcc*100.0)
}
testAcc := ts.BatchAccuracyForLogits(net, ds.TestImages, ds.TestLabels, vs.Device(), batchCNN)
fmt.Printf("Loss: \t %.2f\t Accuracy: %.2f\n", lossVal, testAcc*100)
fmt.Printf("Taken time:\t%.2f mins\n", time.Since(startTime).Minutes())
}
func batchAccuracyForLogits(m ts.ModuleT, xs, ys ts.Tensor, d gotch.Device, batchSize int) (retVal float64) {
var (
sumAccuracy float64 = 0.0
sampleCount float64 = 0.0
)
iter2 := ts.MustNewIter2(xs, ys, int64(batchSize))
for {
item, ok := iter2.Next()
if !ok {
break
}
size := float64(item.Data.MustSize()[0])
bImages := item.Data.MustTo(d, true)
bLabels := item.Label.MustTo(d, true)
logits := m.ForwardT(bImages, false)
acc := logits.AccuracyForLogits(bLabels)
sumAccuracy += acc.Values()[0] * size
sampleCount += size
bImages.MustDrop()
bLabels.MustDrop()
acc.MustDrop()
}
return sumAccuracy / sampleCount
}

View File

@ -1,64 +0,0 @@
package main
import (
"fmt"
"log"
"github.com/sugarme/gotch"
"github.com/sugarme/gotch/nn"
ts "github.com/sugarme/gotch/tensor"
)
func testOptimizer() {
var data []float64
for i := 0; i < 15; i++ {
data = append(data, float64(i))
}
xs, err := ts.NewTensorFromData(data, []int64{int64(len(data)), 1})
if err != nil {
log.Fatal(err)
}
ys := xs.MustMul1(ts.FloatScalar(0.42)).MustAdd1(ts.FloatScalar(1.337))
vs := nn.NewVarStore(gotch.CPU)
cfg := nn.LinearConfig{
WsInit: nn.NewConstInit(0.001),
BsInit: nn.NewConstInit(0.001),
Bias: true,
}
// fmt.Printf("Number of trainable variables: %v\n", vs.Len())
linear := nn.NewLinear(vs.Root(), 1, 1, cfg)
// fmt.Printf("Trainable variables at app: %v\n", vs.TrainableVariable())
loss := xs.Apply(linear).MustMseLoss(ys, ts.ReductionMean.ToInt())
initialLoss := loss.MustView([]int64{-1}).MustFloat64Value([]int64{0})
fmt.Printf("Initial Loss: %.3f\n", initialLoss)
opt, err := nn.DefaultSGDConfig().Build(vs, 1e-2)
if err != nil {
log.Fatal("Failed building SGD optimizer")
}
for i := 0; i < 50; i++ {
// loss = xs.Apply(linear)
loss = linear.Forward(xs)
loss = loss.MustMseLoss(ys, ts.ReductionMean.ToInt())
fmt.Printf("Loss: %.3f\n", loss.MustView([]int64{-1}).MustFloat64Value([]int64{0}))
opt.BackwardStep(loss)
fmt.Printf("Bs: %.3f - Bs Grad: %.3f\n", linear.Bs.MustView([]int64{-1}).MustFloat64Value([]int64{0}), linear.Bs.MustGrad().MustFloat64Value([]int64{0}))
fmt.Printf("Ws: %.3f - Ws Grad: %.3f\n", linear.Ws.MustView([]int64{-1}).MustFloat64Value([]int64{0}), linear.Ws.MustGrad().MustFloat64Value([]int64{0}))
}
}
func main() {
testOptimizer()
}

View File

@ -1,41 +0,0 @@
package main
import (
"fmt"
"log"
"github.com/sugarme/gotch/tensor"
)
func main() {
s := tensor.FloatScalar(float64(1.23))
fmt.Printf("scalar value: %v\n", s)
intVal, err := s.ToInt()
if err != nil {
panic(err)
}
floatVal, err := s.ToFloat()
if err != nil {
panic(err)
}
strVal, err := s.ToString()
if err != nil {
panic(err)
}
fmt.Printf("scalar to int64 value: %v\n", intVal)
fmt.Printf("scalar to float64 value: %v\n", floatVal)
fmt.Printf("scalar to string value: %v\n", strVal)
s.Drop() // will set scalar to zero
fmt.Printf("scalar value: %v\n", s)
zeroVal, err := s.ToInt()
if err != nil {
log.Fatalf("Panic: %v\n", err)
}
fmt.Printf("Won't expect this val: %v\n", zeroVal)
}

View File

@ -1,205 +0,0 @@
package main
import (
"fmt"
"log"
"math"
"github.com/sugarme/gotch"
"github.com/sugarme/gotch/nn"
ts "github.com/sugarme/gotch/tensor"
"github.com/sugarme/gotch/vision"
)
func main() {
// noSeq()
withSeq()
// noSeq2Layers()
// seqNoVarStore()
}
func noSeq() {
ds := vision.LoadMNISTDir("../../data/mnist")
wsInit := nn.NewKaimingUniformInit()
ws := wsInit.InitTensor([]int64{10, 784}, gotch.CPU).MustT(true)
bound := 1.0 / math.Sqrt(float64(784))
bsInit := nn.NewUniformInit(-bound, bound)
bs := bsInit.InitTensor([]int64{10}, gotch.CPU)
for i := 0; i < 2000; i++ {
mul := ds.TrainImages.MustMatMul(ws, false)
logits := mul.MustAdd(bs, true)
loss := logits.AccuracyForLogits(ds.TrainLabels)
fmt.Printf("Epoch %v\t Loss: %.3f\n", i, loss.Values()[0])
loss.MustDrop()
}
}
func withSeq() {
seq := nn.Seq()
vs := nn.NewVarStore(gotch.CPU)
// seq.Add(nn.NewLinear(vs.Root(), 784, 10, *nn.DefaultLinearConfig()))
seq.Add(nn.NewLinear(vs.Root(), 784, 128, *nn.DefaultLinearConfig()))
seq.Add(nn.NewLinear(vs.Root(), 128, 10, *nn.DefaultLinearConfig()))
opt, err := nn.DefaultAdamConfig().Build(vs, 1e-2)
if err != nil {
log.Fatal(err)
}
ds := vision.LoadMNISTDir("../../data/mnist")
for i := 0; i < 2000; i++ {
logits := seq.Forward(ds.TrainImages)
loss := logits.CrossEntropyForLogits(ds.TrainLabels)
opt.BackwardStep(loss)
testLogits := seq.Forward(ds.TestImages)
testAccuracy := testLogits.AccuracyForLogits(ds.TestLabels)
fmt.Printf("Epoch: %v \t Loss: %.3f \t Test accuracy: %.2f%%\n", i, loss.Values()[0], testAccuracy.Values()[0]*100)
loss.MustDrop()
testAccuracy.MustDrop()
}
}
func noSeq2Layers() {
ds := vision.LoadMNISTDir("../../data/mnist")
wsInit := nn.NewKaimingUniformInit()
ws1 := wsInit.InitTensor([]int64{1024, 784}, gotch.CPU).MustT(true)
ws2 := wsInit.InitTensor([]int64{10, 1024}, gotch.CPU).MustT(true)
bound1 := 1.0 / math.Sqrt(float64(784))
bsInit1 := nn.NewUniformInit(-bound1, bound1)
bs1 := bsInit1.InitTensor([]int64{1024}, gotch.CPU)
bound2 := 1.0 / math.Sqrt(float64(1024))
bsInit2 := nn.NewUniformInit(-bound2, bound2)
bs2 := bsInit2.InitTensor([]int64{10}, gotch.CPU)
for i := 0; i < 2000; i++ {
mul1 := ds.TrainImages.MustMatMul(ws1, false)
out1 := mul1.MustAdd(bs1, true)
mul2 := out1.MustMatMul(ws2, true)
logits := mul2.MustAdd(bs2, true)
loss := logits.AccuracyForLogits(ds.TrainLabels)
fmt.Printf("Epoch %v\t Loss: %.3f\n", i, loss.Values()[0])
loss.MustDrop()
}
}
func seqNoVarStore() {
ds := vision.LoadMNISTDir("../../data/mnist")
wsInit := nn.NewKaimingUniformInit()
ws1 := wsInit.InitTensor([]int64{1024, 784}, gotch.CPU).MustT(true)
ws2 := wsInit.InitTensor([]int64{10, 1024}, gotch.CPU).MustT(true)
bound1 := 1.0 / math.Sqrt(float64(784))
bsInit1 := nn.NewUniformInit(-bound1, bound1)
bs1 := bsInit1.InitTensor([]int64{1024}, gotch.CPU)
bound2 := 1.0 / math.Sqrt(float64(1024))
bsInit2 := nn.NewUniformInit(-bound2, bound2)
bs2 := bsInit2.InitTensor([]int64{10}, gotch.CPU)
l1 := Linear{&ws1, &bs1}
l2 := Linear{&ws2, &bs2}
seq := Seq()
seq.Add(l1)
seq.Add(l2)
// seq.Add1(l1)
// seq.Add2(l2)
for i := 0; i < 2000; i++ {
logits := seq.Forward(ds.TrainImages)
logits.MustDrop()
}
}
type Linear struct {
Ws *ts.Tensor
Bs *ts.Tensor
}
func (l Linear) Forward(xs ts.Tensor) ts.Tensor {
mul := xs.MustMatMul(*l.Ws, false)
return mul.MustAdd(*l.Bs, true)
}
type Sequential struct {
layers []ts.Module
l1 ts.Module
l2 ts.Module
}
func Seq() Sequential {
return Sequential{layers: make([]ts.Module, 0)}
}
// Len returns the number of sub-layers embedded in this layer.
func (s *Sequential) Len() (retVal int64) {
return int64(len(s.layers))
}
// IsEmpty returns true if this layer does not have any sub-layers.
func (s *Sequential) IsEmpty() (retVal bool) {
return len(s.layers) == 0
}
// Add appends a layer after all the current layers.
func (s *Sequential) Add(l ts.Module) {
s.layers = append(s.layers, l)
}
func (s *Sequential) Add1(l ts.Module) {
s.l1 = l
}
func (s *Sequential) Add2(l ts.Module) {
s.l2 = l
}
func (s Sequential) Forward(xs ts.Tensor) (retVal ts.Tensor) {
if s.IsEmpty() {
return xs.MustShallowClone()
}
// forward sequentially
outs := make([]ts.Tensor, len(s.layers))
for i := 0; i < len(s.layers); i++ {
if i == 0 {
outs[0] = s.layers[i].Forward(xs)
defer outs[0].MustDrop()
} else if i == len(s.layers)-1 {
return s.layers[i].Forward(outs[i-1])
} else {
outs[i+1] = s.layers[i].Forward(outs[i-1])
defer outs[i+1].MustDrop()
}
}
return
// out1 := s.l1.Forward(xs)
// defer out1.MustDrop()
//
// return s.l2.Forward(out1)
}

View File

@ -1,45 +0,0 @@
package main
import (
"fmt"
"log"
"github.com/sugarme/gotch"
"github.com/sugarme/gotch/nn"
ts "github.com/sugarme/gotch/tensor"
)
func myModule(p nn.Path, dim int64) ts.Module {
x1 := p.Zeros("x1", []int64{dim})
x2 := p.Zeros("x1", []int64{dim})
return nn.NewFunc(func(xs ts.Tensor) ts.Tensor {
return xs.MustMul(x1).MustAdd(xs.MustExp().MustMul(x2))
})
}
func main() {
vs := nn.NewVarStore(gotch.CPU)
m := myModule(vs.Root(), 7)
opt, err := nn.DefaultSGDConfig().Build(vs, 1e-2)
if err != nil {
log.Fatal(err)
}
for i := 0; i < 50; i++ {
xs := ts.MustZeros([]int64{7}, gotch.Float.CInt(), gotch.CPU.CInt())
ys := ts.MustZeros([]int64{7}, gotch.Float.CInt(), gotch.CPU.CInt())
loss := m.Forward(xs).MustSub(ys).MustPow(ts.IntScalar(2)).MustSum(gotch.Float.CInt())
opt.BackwardStep(loss)
fmt.Printf("Loss: %v\n", loss.MustView([]int64{-1}).MustFloat64Value([]int64{0}))
}
}

View File

@ -1,39 +0,0 @@
package main
import (
"fmt"
"log"
"github.com/sugarme/gotch/tensor"
)
func main() {
// TODO: check Go type of data against tensor DType.
// E.g. if data is []int and DType is Bool,
// it still runs but produces a wrong result.
data := [][]int64{
{1, 1, 1, 2, 2, 2, 3, 3},
{1, 1, 1, 2, 2, 2, 4, 4},
}
shape := []int64{2, 8}
// data := []int16{1, 1, 1, 2, 2, 2, 3, 3}
// shape := []int64{1, 8}
ts, err := tensor.NewTensorFromData(data, shape)
if err != nil {
log.Fatal(err)
}
ts.Print()
numel := uint(6)
// dst := make([]uint8, numel)
var dst = make([]int64, 6)
ts.MustCopyData(dst, numel)
fmt.Println(dst)
}

View File

@ -15,8 +15,8 @@ func main() {
xy := tensor.TensorFrom([]float64{2.0})
xz := tensor.TensorFrom([]float64{3.0})
y := x.MustMul(xy)
z := x.MustMul(xz)
y := x.MustMul(xy, false)
z := x.MustMul(xz, false)
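// NOTE: the added boolean is gotch's `del` flag; passing false keeps the input
// tensor alive for reuse (an assumption based on its usage elsewhere in this commit).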
y.Backward()
xgrad := x.MustGrad()
@ -31,14 +31,3 @@ func main() {
fmt.Printf("Previous GradMode enabled state: %v\n", isGradEnabled)
}
/* // Compute a second order derivative using run_backward.
* let mut x = Tensor::from(42.0).set_requires_grad(true);
* let y = &x * &x * &x + &x + &x * &x;
* x.zero_grad();
* let dy_over_dx = Tensor::run_backward(&[y], &[&x], true, true);
* assert_eq!(dy_over_dx.len(), 1);
* let dy_over_dx = &dy_over_dx[0];
* dy_over_dx.backward();
* let dy_over_dx2 = x.grad();
* assert_eq!(f64::from(&dy_over_dx2), 254.0); */

View File

@ -1,41 +0,0 @@
package main
import (
"fmt"
"log"
"github.com/sugarme/gotch"
"github.com/sugarme/gotch/tensor"
)
func main() {
// TODO: check Go type of data against tensor DType.
// E.g. if data is []int and DType is Bool,
// it still runs but produces a wrong result.
data := [][]int64{
{1, 1, 1, 2, 2, 2, 3, 3},
{1, 1, 1, 2, 2, 2, 4, 4},
}
shape := []int64{2, 8}
ts, err := tensor.NewTensorFromData(data, shape)
if err != nil {
log.Fatal(err)
}
ts, err = ts.To(gotch.CPU)
if err != nil {
log.Fatal(err)
}
fmt.Printf("Tensor value BEFORE: %v\n", ts)
ts.Print()
scalarVal := tensor.IntScalar(int64(5))
ts.Fill_(scalarVal)
fmt.Printf("Tensor value AFTER: %v\n", ts)
ts.Print()
}

View File

@ -1,60 +0,0 @@
package main
import (
"github.com/sugarme/gotch"
"github.com/sugarme/gotch/tensor"
)
func main() {
data := [][]int64{
{1, 1, 1, 2, 2, 2, 3, 3},
{1, 1, 1, 2, 2, 2, 4, 4},
}
shape := []int64{2, 8}
// shape := []int64{2, 2, 4}
ts, err := tensor.NewTensorFromData(data, shape)
if err != nil {
panic(err)
}
ts.Print()
// Select
s := tensor.NewSelect(7)
// selectedTs := ts.Idx(s)
// selectedTs.Print()
// Narrow (start inclusive, end exclusive)
n := tensor.NewNarrow(0, 1)
// narrowedTs := ts.Idx(n)
// narrowedTs.Print()
// InsertNewAxis
// i := tensor.NewInsertNewAxis()
// newAxisTs := ts.Idx(i)
// newAxisTs.Print()
// IndexSelect
// idxTensor := tensor.MustOfSlice([]int64{0, 1})
// is := tensor.NewIndexSelect(idxTensor)
// isTs := ts.Idx(is)
// isTs.Print()
// Combined
var tsIndexes []tensor.TensorIndexer = []tensor.TensorIndexer{n, s}
combinedTs := ts.Idx(tsIndexes)
combinedTs.Print()
// Copy to index
desTs := tensor.MustZeros([]int64{5}, gotch.Float.CInt(), gotch.CPU.CInt())
srcTs := tensor.MustOnes([]int64{1}, gotch.Float.CInt(), gotch.CPU.CInt())
idx := tensor.NewNarrow(0, 3)
// NOTE: indexing operations return a view on the same memory
desTs.Print()
desTs.Idx(idx).MustView([]int64{-1}, false).Copy_(srcTs)
desTs.Print()
}

View File

@ -1,25 +0,0 @@
package main
import (
"fmt"
"github.com/sugarme/gotch"
ts "github.com/sugarme/gotch/tensor"
)
func main() {
tensor := ts.MustArange1(ts.IntScalar(0), ts.IntScalar(2*3), gotch.Int64, gotch.CPU).MustView([]int64{2, 3}, true)
var idxs []ts.TensorIndexer = []ts.TensorIndexer{
// ts.NewNarrow(0, tensor.MustSize()[0]),
// ts.NewNarrow(0, tensor.MustSize()[1]),
ts.NewInsertNewAxis(),
}
result := tensor.Idx(idxs)
fmt.Printf("Original Ts shape: %v\n", tensor.MustSize())
fmt.Printf("Result Ts shape: %v\n", result.MustSize())
}

View File

@ -1,32 +0,0 @@
package main
import (
"fmt"
"reflect"
"github.com/sugarme/gotch/tensor"
)
func main() {
data := [][]int64{
{1, 1, 1, 2, 2, 2, 3, 3},
{1, 1, 1, 2, 2, 2, 4, 4},
}
shape := []int64{16}
ts, err := tensor.NewTensorFromData(data, shape)
if err != nil {
panic(err)
}
it, err := ts.Iter(reflect.Float64)
if err != nil {
panic(err)
}
for i := 0; i < int(it.Len); i++ {
v := it.Next()
fmt.Println(v)
}
}

View File

@ -1,74 +0,0 @@
package main
import (
"fmt"
// "runtime"
ts "github.com/sugarme/gotch/tensor"
)
func createTensors(samples int) []ts.Tensor {
n := int(10e6)
var data []float64
for i := 0; i < n; i++ {
data = append(data, float64(i))
}
var tensors []ts.Tensor
s := ts.FloatScalar(float64(0.23))
// for i := 0; i < samples; i++ {
for i := 0; i < 1; i++ {
t := ts.MustOfSlice(data).MustMul1(s, true)
// t1.MustDrop()
// t.MustDrop()
// t1 = ts.Tensor{}
// t = ts.Tensor{}
// runtime.GC()
// fmt.Printf("t values: %v", t.Values())
// fmt.Printf("t1 values: %v", t1.Values())
tensors = append(tensors, t)
}
return tensors
}
func dropTensors(tensors []ts.Tensor) {
for _, t := range tensors {
t.MustDrop()
}
}
func main() {
var si *SI
si = Get()
fmt.Printf("Total RAM (MB):\t %8.2f\n", float64(si.TotalRam)/1024)
fmt.Printf("Used RAM (MB):\t %8.2f\n", float64(si.TotalRam-si.FreeRam)/1024)
startRAM := si.TotalRam - si.FreeRam
epochs := 50
// var m runtime.MemStats
for i := 0; i < epochs; i++ {
// runtime.ReadMemStats(&m)
// t0 := float64(m.Sys) / 1024 / 1024
tensors := createTensors(10000)
// runtime.ReadMemStats(&m)
// t1 := float64(m.Sys) / 1024 / 1024
dropTensors(tensors)
// runtime.ReadMemStats(&m)
// t2 := float64(m.Sys) / 1024 / 1024
// fmt.Printf("Epoch: %v \t Start Mem [%.3f MiB] \t Alloc Mem [%.3f MiB] \t Free Mem [%.3f MiB]\n", i, t0, t1, t2)
si = Get()
fmt.Printf("Epoch %v\t Used: [%8.2f MiB]\n", i, (float64(si.TotalRam-si.FreeRam)-float64(startRAM))/1024)
}
}

View File

@ -1,54 +0,0 @@
package main
import (
"fmt"
"log"
"github.com/sugarme/gotch/tensor"
)
func main() {
x := tensor.TensorFrom([]float64{2.0})
x = x.MustSetRequiresGrad(true)
x.ZeroGrad()
xmul := tensor.TensorFrom([]float64{3.0})
xadd := tensor.TensorFrom([]float64{5.0})
x1 := x.MustMul(xmul)
x2 := x1.MustMul(xmul)
x3 := x2.MustMul(xmul)
y := x3.MustAdd(xadd)
inputs := []tensor.Tensor{x}
dy_over_dx, err := tensor.RunBackward([]tensor.Tensor{y}, inputs, true, true)
if err != nil {
log.Fatal(err)
}
fmt.Printf("dy_over_dx length: %v\n", len(dy_over_dx))
// dy_over_dx1 := dy_over_dx[0]
// err = dy_over_dx1.Backward()
// if err != nil {
// log.Fatalf("Errors:\n, %v", err)
// }
dy_over_dx[0].MustBackward()
x.MustGrad().Print()
}
/* // Compute a second order derivative using run_backward.
* let mut x = Tensor::from(42.0).set_requires_grad(true);
* let y = &x * &x * &x + &x + &x * &x;
* x.zero_grad();
* let dy_over_dx = Tensor::run_backward(&[y], &[&x], true, true);
* assert_eq!(dy_over_dx.len(), 1);
* let dy_over_dx = &dy_over_dx[0];
* dy_over_dx.backward();
* let dy_over_dx2 = x.grad();
* assert_eq!(f64::from(&dy_over_dx2), 254.0); */

View File

@ -1,89 +0,0 @@
package main
import (
"fmt"
"log"
"time"
"github.com/sugarme/gotch"
"github.com/sugarme/gotch/tensor"
)
func main() {
// TODO: check Go type of data against tensor DType.
// E.g. if data is []int and DType is Bool,
// it still runs but produces a wrong result.
data := [][]int64{
{1, 1, 1, 2, 2, 2, 3, 3},
{1, 1, 1, 2, 2, 2, 4, 4},
}
shape := []int64{2, 8}
// shape := []int64{2, 2, 4}
// dtype := gotch.Int
// ts := tensor.NewTensor()
// sliceTensor, err := ts.FOfSlice(data, dtype)
// if err != nil {
// log.Fatal(err)
// }
ts, err := tensor.NewTensorFromData(data, shape)
if err != nil {
log.Fatal(err)
}
ts.Print()
sz, err := ts.Size2()
if err != nil {
log.Fatal(err)
}
fmt.Printf("Shape: %v\n", sz)
fmt.Printf("DType: %v\n", ts.DType())
dx := [][]float64{
{1, 1},
{1, 1},
{1, 1},
}
dy := [][]float64{
{1, 2, 3},
{1, 1, 1},
}
xs, err := tensor.NewTensorFromData(dx, []int64{3, 2})
if err != nil {
log.Fatal(err)
}
ys, err := tensor.NewTensorFromData(dy, []int64{2, 3})
if err != nil {
log.Fatal(err)
}
// CPU
startCPUTime := time.Now()
for i := 1; i < 100000; i++ {
xs.Matmul(ys)
}
fmt.Printf("CPU time: %v\n", time.Since(startCPUTime))
// Cuda
device := gotch.NewCuda()
startGPUTime := time.Now()
for i := 1; i < 100000; i++ {
cx, err := xs.To(device)
if err != nil {
log.Fatal(err)
}
cy, err := ys.To(device)
if err != nil {
log.Fatal(err)
}
cx.Matmul(cy)
}
fmt.Printf("GPU time: %v\n", time.Since(startGPUTime))
}

View File

@ -1,69 +0,0 @@
package main
import (
"fmt"
"log"
"github.com/sugarme/gotch/tensor"
)
func main() {
ts, err := tensor.OfSlice([]float64{1.3, 29.7})
if err != nil {
log.Fatal(err)
}
res, err := ts.Float64Value([]int64{1})
if err != nil {
log.Fatal(err)
}
fmt.Println(res)
resInt64, err := ts.Int64Value([]int64{1})
if err != nil {
log.Fatal(err)
}
fmt.Println(resInt64)
grad, err := ts.RequiresGrad()
if err != nil {
log.Fatal(err)
}
fmt.Printf("Requires Grad: %v\n", grad)
ele1, err := ts.DataPtr()
if err != nil {
log.Fatal(err)
}
fmt.Printf("First element address: %v\n", ele1)
fmt.Printf("Number of tensor elements: %v\n", ts.Numel())
clone := ts.MustShallowClone()
clone.Print()
atGet := ts.MustGet(1)
atGet.Print() // 29.7
atGet = ts.MustGet(0)
atGet.Print() // 1.3
dst, err := tensor.NewTensorFromData([]int64{1, 2}, []int64{1, 2})
if err != nil {
panic(err)
}
dst = dst.MustTotype(ts.DType())
tensor.MustCopy_(dst, ts)
dst.Print()
ts.MustDrop()
// The statement below will panic as `ts` has been dropped.
// ts.Print()
}

View File

@ -1,26 +0,0 @@
package main
import (
"fmt"
"github.com/sugarme/gotch"
"github.com/sugarme/gotch/nn"
)
func main() {
vs := nn.NewVarStore(gotch.CPU)
fmt.Printf("Is VarStore emptry? %v\n ", vs.IsEmpty())
path := vs.Root()
init := nn.NewKaimingUniformInit()
init.InitTensor([]int64{1, 4}, gotch.CPU).Print()
path.NewVar("layer1", []int64{1, 10}, nn.NewKaimingUniformInit())
fmt.Printf("Is VarStore emptry? %v\n ", vs.IsEmpty())
}

View File

@ -3,6 +3,7 @@ package nn
// A sequential layer used to chain multiple layers and closures.
import (
"github.com/sugarme/gotch"
ts "github.com/sugarme/gotch/tensor"
// "reflect"
)
@ -224,3 +225,104 @@ type ForwardTWith func(ts.Tensor, bool) ts.Tensor
func (fw ForwardTWith) ForwardT(xs ts.Tensor, train bool) ts.Tensor {
return fw(xs, train)
}
// BatchAccuracyForLogits calculates the average accuracy over test batches.
//
// NOTE: Pytorch uses `NoGradGuard`, a thread-local scope that sets a global flag
// checked by the backend whenever an op is done on a variable.
// The guard saves the current status and sets it to false in its constructor,
// then restores the saved status in its destructor, similar to a `with torch.no_grad():` block in Python.
// This does not seem to work in Go.
// There are two ways around it: one is to freeze the VarStore, the other is to
// set AutoGrad manually on the `loss` tensor, i.e. `loss = loss.MustSetRequiresGrad(true)`.
func BatchAccuracyForLogits(vs VarStore, m ts.ModuleT, xs, ys ts.Tensor, d gotch.Device, batchSize int) (retVal float64) {
var (
sumAccuracy float64 = 0.0
sampleCount float64 = 0.0
)
vs.Freeze()
defer vs.Unfreeze()
iter2 := ts.MustNewIter2(xs, ys, int64(batchSize))
for {
item, ok := iter2.Next()
if !ok {
break
}
size := float64(item.Data.MustSize()[0])
bImages := item.Data.MustTo(d, true)
bLabels := item.Label.MustTo(d, true)
logits := m.ForwardT(bImages, false)
acc := logits.AccuracyForLogits(bLabels)
sumAccuracy += acc.Values()[0] * size
sampleCount += size
bImages.MustDrop()
bLabels.MustDrop()
acc.MustDrop()
}
return sumAccuracy / sampleCount
}
// BatchAccuracyForLogitsIdx is an alternative to BatchAccuracyForLogits for
// calculating accuracy on a specified batch of the module's output. It uses tensor
// indexing instead of Iter2.
func BatchAccuracyForLogitsIdx(vs VarStore, m ts.ModuleT, xs, ys ts.Tensor, d gotch.Device, batchSize int) (retVal float64) {
var (
sumAccuracy float64 = 0.0
sampleCount float64 = 0.0
)
totalSize := xs.MustSize()[0]
samples := int(totalSize)
index := ts.MustRandperm(int64(totalSize), gotch.Int64, gotch.CPU)
imagesTs := xs.MustIndexSelect(0, index, false)
labelsTs := ys.MustIndexSelect(0, index, false)
batches := samples / batchSize
batchIndex := 0
vs.Freeze()
defer vs.Unfreeze()
for i := 0; i < batches; i++ {
start := batchIndex * batchSize
size := batchSize
if samples-start < batchSize {
break
}
batchIndex += 1
// Indexing
narrowIndex := ts.NewNarrow(int64(start), int64(start+size))
bImages := imagesTs.Idx(narrowIndex)
bLabels := labelsTs.Idx(narrowIndex)
bImages = bImages.MustTo(d, true)
bLabels = bLabels.MustTo(d, true)
logits := m.ForwardT(bImages, true)
bAccuracy := logits.AccuracyForLogits(bLabels)
accuVal := bAccuracy.Values()[0]
bSamples := float64(xs.MustSize()[0])
sumAccuracy += accuVal * bSamples
sampleCount += bSamples
// Free up tensors on C memory
bImages.MustDrop()
bLabels.MustDrop()
bAccuracy.MustDrop()
}
imagesTs.MustDrop()
labelsTs.MustDrop()
return sumAccuracy / sampleCount
}
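For reference, a minimal sketch of how the relocated helper is called (assuming `vs`, `net`, and `ds` are set up as in the CIFAR-10 example above; the helper freezes and unfreezes the VarStore internally):

testAcc := nn.BatchAccuracyForLogits(vs, net, ds.TestImages, ds.TestLabels, vs.Device(), 512)
fmt.Printf("Test accuracy: %.2f%%\n", testAcc*100.0)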

View File

@ -1,126 +0,0 @@
// A wrapper around the linux syscall sysinfo(2).
package gotch
// helper to debug memory blow-up
import (
"fmt"
"sync"
"syscall"
"time"
)
// Go-ized http://man7.org/linux/man-pages/man2/sysinfo.2.html
type SI struct {
Uptime time.Duration // time since boot
Loads [3]float64 // 1, 5, and 15 minute load averages, see e.g. UPTIME(1)
Procs uint64 // number of current processes
TotalRam uint64 // total usable main memory size [kB]
FreeRam uint64 // available memory size [kB]
SharedRam uint64 // amount of shared memory [kB]
BufferRam uint64 // memory used by buffers [kB]
TotalSwap uint64 // total swap space size [kB]
FreeSwap uint64 // swap space still available [kB]
TotalHighRam uint64 // total high memory size [kB]
FreeHighRam uint64 // available high memory size [kB]
mu sync.Mutex // ensures atomic writes; protects the following fields
}
var sis = &SI{}
// Get the linux sysinfo data structure.
//
// Useful links in the wild web:
// http://man7.org/linux/man-pages/man2/sysinfo.2.html
// http://man7.org/linux/man-pages/man1/uptime.1.html
// https://github.com/capnm/golang/blob/go1.1.1/src/pkg/syscall/zsyscall_linux_amd64.go#L1050
// https://github.com/capnm/golang/blob/go1.1.1/src/pkg/syscall/ztypes_linux_amd64.go#L528
// https://github.com/capnm/golang/blob/go1.1.1/src/pkg/syscall/ztypes_linux_arm.go#L502
func GetSysInfo() *SI {
/*
// Note: uint64 is uint32 on 32 bit CPUs
type Sysinfo_t struct {
Uptime int64 // Seconds since boot
Loads [3]uint64 // 1, 5, and 15 minute load averages
Totalram uint64 // Total usable main memory size
Freeram uint64 // Available memory size
Sharedram uint64 // Amount of shared memory
Bufferram uint64 // Memory used by buffers
Totalswap uint64 // Total swap space size
Freeswap uint64 // swap space still available
Procs uint16 // Number of current processes
Pad uint16
Pad_cgo_0 [4]byte
Totalhigh uint64 // Total high memory size
Freehigh uint64 // Available high memory size
Unit uint32 // Memory unit size in bytes
X_f [0]byte
Pad_cgo_1 [4]byte // Padding to 64 bytes
}
*/
// ~1kB garbage
si := &syscall.Sysinfo_t{}
// XXX is a raw syscall thread safe?
err := syscall.Sysinfo(si)
if err != nil {
panic("Commander, we have a problem. syscall.Sysinfo:" + err.Error())
}
scale := 65536.0 // magic
defer sis.mu.Unlock()
sis.mu.Lock()
unit := uint64(si.Unit) * 1024 // kB
sis.Uptime = time.Duration(si.Uptime) * time.Second
sis.Loads[0] = float64(si.Loads[0]) / scale
sis.Loads[1] = float64(si.Loads[1]) / scale
sis.Loads[2] = float64(si.Loads[2]) / scale
sis.Procs = uint64(si.Procs)
sis.TotalRam = uint64(si.Totalram) / unit
sis.FreeRam = uint64(si.Freeram) / unit
sis.BufferRam = uint64(si.Bufferram) / unit
sis.TotalSwap = uint64(si.Totalswap) / unit
sis.FreeSwap = uint64(si.Freeswap) / unit
sis.TotalHighRam = uint64(si.Totalhigh) / unit
sis.FreeHighRam = uint64(si.Freehigh) / unit
return sis
}
// Make the "fmt" Stringer interface happy.
func (si SI) String() string {
// XXX: Is the copy of SI done atomically? Not sure.
// Without an outer lock this may print junk.
return fmt.Sprintf("uptime\t\t%v\nload\t\t%2.2f %2.2f %2.2f\nprocs\t\t%d\n"+
"ram total\t%d kB\nram free\t%d kB\nram buffer\t%d kB\n"+
"swap total\t%d kB\nswap free\t%d kB",
//"high ram total\t%d kB\nhigh ram free\t%d kB\n"
si.Uptime, si.Loads[0], si.Loads[1], si.Loads[2], si.Procs,
si.TotalRam, si.FreeRam, si.BufferRam,
si.TotalSwap, si.FreeSwap,
// archaic si.TotalHighRam, si.FreeHighRam
)
}
/*
Convert to string in a thread safe way.
Output:
uptime 279h6m21s
load 0.12 0.04 0.05
procs 143
ram total 383752 kB
ram free 254980 kB
ram buffer 7640 kB
swap total 887800 kB
swap free 879356 kB
*/
func (si *SI) ToString() string {
defer si.mu.Unlock()
si.mu.Lock()
return si.String()
}

View File

@ -1,7 +1,5 @@
package tensor
import "github.com/sugarme/gotch"
// Module interface is a container with only one method, `Forward`.
//
// The following is the `module` concept from the Pytorch documentation:
@ -52,108 +50,50 @@ type ModuleT interface {
* }
* */
// BatchAccuracyForLogits calculates accuracy in batches.
//
// TODO: it would be nice if this were a method on an object that implements the ModuleT
// interface.
func BatchAccuracyForLogits(m ModuleT, xs, ys Tensor, d gotch.Device, batchSize int) (retVal float64) {
var (
sumAccuracy float64 = 0.0
sampleCount float64 = 0.0
)
_ = MustGradSetEnabled(false)
iter2 := MustNewIter2(xs, ys, int64(batchSize))
for {
item, ok := iter2.Next()
if !ok {
break
}
size := float64(item.Data.MustSize()[0])
bImages := item.Data.MustTo(d, true)
bLabels := item.Label.MustTo(d, true)
logits := m.ForwardT(bImages, false)
acc := logits.AccuracyForLogits(bLabels)
sumAccuracy += acc.Values()[0] * size
sampleCount += size
bImages.MustDrop()
bLabels.MustDrop()
acc.MustDrop()
}
_ = MustGradSetEnabled(true)
return sumAccuracy / sampleCount
}
// BatchAccuracyForLogitsIdx is an alternative to BatchAccuracyForLogits for
// calculating accuracy on a specified batch of the module's output. It uses tensor
// indexing instead of Iter2.
func BatchAccuracyForLogitsIdx(m ModuleT, xs, ys Tensor, d gotch.Device, batchSize int) (retVal float64) {
var (
sumAccuracy float64 = 0.0
sampleCount float64 = 0.0
)
// Switch Grad off
_ = NewNoGradGuard()
totalSize := xs.MustSize()[0]
samples := int(totalSize)
index := MustRandperm(int64(totalSize), gotch.Int64, gotch.CPU)
imagesTs := xs.MustIndexSelect(0, index, false)
labelsTs := ys.MustIndexSelect(0, index, false)
batches := samples / batchSize
batchIndex := 0
for i := 0; i < batches; i++ {
start := batchIndex * batchSize
size := batchSize
if samples-start < batchSize {
// size = samples - start
break
}
batchIndex += 1
// Indexing
narrowIndex := NewNarrow(int64(start), int64(start+size))
bImages := imagesTs.Idx(narrowIndex)
bLabels := labelsTs.Idx(narrowIndex)
bImages = bImages.MustTo(d, true)
bLabels = bLabels.MustTo(d, true)
logits := m.ForwardT(bImages, true)
bAccuracy := logits.AccuracyForLogits(bLabels)
accuVal := bAccuracy.Values()[0]
bSamples := float64(xs.MustSize()[0])
sumAccuracy += accuVal * bSamples
sampleCount += bSamples
// Free up tensors on C memory
bImages.MustDrop()
bLabels.MustDrop()
// logits.MustDrop()
bAccuracy.MustDrop()
}
imagesTs.MustDrop()
labelsTs.MustDrop()
// Switch Grad on
// _ = MustGradSetEnabled(true)
return sumAccuracy / sampleCount
}
// NOTE: this func has been moved to `nn/sequential` as `NoGradGuard`
// does not seem to work in Go, and the function needs an additional VarStore
// parameter. Hence, it is moved to `nn` to avoid a cyclic reference.
/*
 * // BatchAccuracyForLogits calculates accuracy in batches.
 * //
 * // TODO: it would be nice if this were a method on an object that implements the ModuleT
 * // interface.
* func BatchAccuracyForLogits(m ModuleT, xs, ys Tensor, d gotch.Device, batchSize int) (retVal float64) {
*
* var (
* sumAccuracy float64 = 0.0
* sampleCount float64 = 0.0
* )
*
* _ = MustGradSetEnabled(false)
*
* iter2 := MustNewIter2(xs, ys, int64(batchSize))
* for {
* item, ok := iter2.Next()
* if !ok {
* break
* }
*
* size := float64(item.Data.MustSize()[0])
* bImages := item.Data.MustTo(d, true)
* bLabels := item.Label.MustTo(d, true)
*
* logits := m.ForwardT(bImages, false)
* acc := logits.AccuracyForLogits(bLabels)
* sumAccuracy += acc.Values()[0] * size
* sampleCount += size
*
* bImages.MustDrop()
* bLabels.MustDrop()
* acc.MustDrop()
* }
*
* _ = MustGradSetEnabled(true)
*
* return sumAccuracy / sampleCount
*
* }
* */
// Tensor methods for Module and ModuleT:
// ======================================
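To illustrate the interface, here is a minimal sketch of a custom module inside this package (the `Double` type is hypothetical; `MustMul1` with a scalar and a `del` flag is used as in the examples elsewhere in this commit):

// Double is a hypothetical module that multiplies its input by 2.
type Double struct{}

// Forward implements the Module interface.
func (d Double) Forward(xs Tensor) (retVal Tensor) {
	// `false` keeps the input tensor alive rather than dropping it
	return xs.MustMul1(FloatScalar(2.0), false)
}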