diff --git a/CHANGELOG.md b/CHANGELOG.md
index a216a06..98b1cdb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed incorrect indexing at `dutil/Dataset.Next()`
 - Added `nn.MSELoss()`
 - reworked `ts.Format()`
+- Added conv2d benchmark
 
 ## [Nofix]
 - ctype `long` caused compiling error in MacOS as noted on [#44]. Not working on linux box.
diff --git a/ts/README.md b/ts/README.md
new file mode 100644
index 0000000..3207bc8
--- /dev/null
+++ b/ts/README.md
@@ -0,0 +1,63 @@
+# BENCHMARK
+
+## Convolution 2D
+
+Ref.
+1. https://tigress-web.princeton.edu/~jdh4/PyTorchPerformanceTuningGuide_GTC2021.pdf
+2. https://github.com/soumith/convnet-benchmarks
+
+Benchmark of the tensor operation `conv2d` (forward pass only):
+- input shape: `[32, 64, 64, 64]`
+- kernel shape: `[1, 64, 3, 3]`
+```text
+goos: linux
+goarch: amd64
+pkg: github.com/sugarme/gotch/ts
+cpu: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
+BenchmarkConv2dCPU-8                 100          21198303 ns/op
+BenchmarkConv2dCUDA-8                100           2201213 ns/op
+```
+```bash
+name          time/op
+Conv2dCPU-8   21.2ms ± 0%
+Conv2dCUDA-8  2.20ms ± 0%
+```
+
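+Benchmark against PyTorch 1.11 with CUDA 11. Note that the two are not directly comparable: the Python script below builds a `Conv2d(64, 64, ...)` layer (64 output channels, versus 1 in the Go kernel) and re-creates it, and for CUDA also copies `x` to the GPU, inside every timed call.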
+
+```bash
+conv2d-CPU(x):   56.7 ms
+conv2d-CUDA(x):   38.0 ms
+```
+
+The Python benchmark code is shown below:
+
+```python
+import torch
+import torch.nn.functional as F  # NOTE(review): F is not referenced anywhere in this script
+import timeit
+
+x = torch.randn(32, 64, 64, 64)  # input built once, outside the timed statements
+
+def conv2dCPU(x):  # one timed CPU call: construct the layer, then run the forward pass
+    conv1 = torch.nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0, bias=False)
+    return conv1(x)
+
+def conv2dCUDA(x):  # one timed CUDA call: copy x, construct and move the layer, run forward
+    x = x.cuda()
+    conv1 = torch.nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0, bias=False).cuda()
+    return conv1(x)
+
+t0 = timeit.Timer(
+    stmt='conv2dCPU(x)',
+    setup='from __main__ import conv2dCPU',
+    globals={'x': x})
+
+t1 = timeit.Timer(
+    stmt='conv2dCUDA(x)',
+    setup='from __main__ import conv2dCUDA',
+    globals={'x': x})
+
+print(f'conv2d-CPU(x):  {t0.timeit(100) / 100 * 1e3:>5.1f} ms')  # mean ms per call over 100 calls
+print(f'conv2d-CUDA(x):  {t1.timeit(100) / 100 * 1e3:>5.1f} ms')  # mean ms per call over 100 calls
+```
diff --git a/ts/benchmark-conv2d_test.go b/ts/benchmark-conv2d_test.go
new file mode 100644
index 0000000..d53dfc1
--- /dev/null
+++ b/ts/benchmark-conv2d_test.go
@@ -0,0 +1,72 @@
+package ts_test
+
+import (
+	"testing"
+
+	"github.com/sugarme/gotch"
+	"github.com/sugarme/gotch/ts"
+)
+
+// GOMAXPROCS=8 go test -bench=BenchmarkConv2d -benchtime=100x -run=^a | tee op-conv-bench.txt
+// benchstat op-conv-bench.txt
+func BenchmarkConv2dCPU(b *testing.B) {
+	// var shape []int64 = []int64{64, 3, 224, 224}
+	var shape []int64 = []int64{32, 64, 64, 64}
+	device := gotch.CPU
+	x := ts.MustRandn(shape, gotch.Float, device)
+	// kDims := []int64{1, 3, 3, 3}
+	kDims := []int64{1, 64, 3, 3}
+	kernelTemplate := []int64{
+		1, 1, 1,
+		1, -8, 1,
+		1, 1, 1,
+	}
+	var kernelData []int64
+	for i := 0; i < int(kDims[0]*kDims[1]); i++ {
+		kernelData = append(kernelData, kernelTemplate...)
+	}
+	weight := ts.MustOfSlice(kernelData).MustView(kDims, true).MustTotype(gotch.Float, true).MustTo(device, true)
+
+	stride := []int64{1, 1}
+	padding := []int64{0, 0}
+	dilation := []int64{1, 1}
+	for i := 0; i < b.N; i++ {
+		out, err := ts.Conv2d(x, weight, ts.NewTensor(), stride, padding, dilation, 1)
+		if err != nil {
+			panic(err)
+		}
+		out.MustDrop()
+	}
+}
+
+// GOMAXPROCS=8 go test -bench=BenchmarkConv2d -benchtime=100x -run=^a | tee op-conv-bench.txt
+// benchstat op-conv-bench.txt
+func BenchmarkConv2dCUDA(b *testing.B) {
+	// var shape []int64 = []int64{64, 3, 224, 224}
+	var shape []int64 = []int64{32, 64, 64, 64}
+	device := gotch.CudaIfAvailable()
+	x := ts.MustRandn(shape, gotch.Float, device)
+	// kDims := []int64{1, 3, 3, 3}
+	kDims := []int64{1, 64, 3, 3}
+	kernelTemplate := []int64{
+		1, 1, 1,
+		1, -8, 1,
+		1, 1, 1,
+	}
+	var kernelData []int64
+	for i := 0; i < int(kDims[0]*kDims[1]); i++ {
+		kernelData = append(kernelData, kernelTemplate...)
+	}
+	weight := ts.MustOfSlice(kernelData).MustView(kDims, true).MustTotype(gotch.Float, true).MustTo(device, true)
+
+	stride := []int64{1, 1}
+	padding := []int64{0, 0}
+	dilation := []int64{1, 1}
+	for i := 0; i < b.N; i++ {
+		out, err := ts.Conv2d(x, weight, ts.NewTensor(), stride, padding, dilation, 1)
+		if err != nil {
+			panic(err)
+		}
+		out.MustDrop()
+	}
+}