added Conv2D benchmark

This commit is contained in:
sugarme 2022-05-06 18:10:38 +10:00
parent 7e4799eb52
commit 670d1e9cdf
3 changed files with 136 additions and 0 deletions

View File

@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed incorrect indexing at `dutil/Dataset.Next()`
- Added `nn.MSELoss()`
- reworked `ts.Format()`
- Added conv2d benchmark
## [Nofix]
- ctype `long` caused compiling error in MacOS as noted on [#44]. Not working on linux box.

63
ts/README.md Normal file
View File

@ -0,0 +1,63 @@
# BENCHMARK
## Convolution 2D
Ref.
1. https://tigress-web.princeton.edu/~jdh4/PyTorchPerformanceTuningGuide_GTC2021.pdf
2. https://github.com/soumith/convnet-benchmarks
Benchmark tensor operation `conv2d` forward propagation:
- input shape: `[32, 64, 64, 64]`
- kernel (weight) shape: `[1, 64, 3, 3]` (out-channels, in-channels, kH, kW)
goos: linux
goarch: amd64
pkg: github.com/sugarme/gotch/ts
cpu: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
BenchmarkConv2dCPU-8 100 21198303 ns/op
BenchmarkConv2dCUDA-8 100 2201213 ns/op
```bash
name time/op
Conv2dCPU-8 21.2ms ± 0%
Conv2dCUDA-8 2.20ms ± 0%
```
Benchmark against PyTorch 1.11 (CUDA 11):
```bash
conv2d-CPU(x): 56.7 ms
conv2d-CUDA(x): 38.0 ms
```
The Python benchmark code used for comparison is shown below:
```python
import torch
import torch.nn.functional as F
import timeit
x = torch.randn(32, 64, 64, 64)
def conv2dCPU(x):
conv1 = torch.nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0, bias=False)
return conv1(x)
def conv2dCUDA(x):
x = x.cuda()
conv1 = torch.nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0, bias=False).cuda()
return conv1(x)
t0 = timeit.Timer(
stmt='conv2dCPU(x)',
setup='from __main__ import conv2dCPU',
globals={'x': x})
t1 = timeit.Timer(
stmt='conv2dCUDA(x)',
setup='from __main__ import conv2dCUDA',
globals={'x': x})
print(f'conv2d-CPU(x): {t0.timeit(100) / 100 * 1e3:>5.1f} ms')
print(f'conv2d-CUDA(x): {t1.timeit(100) / 100 * 1e3:>5.1f} ms')
```

View File

@ -0,0 +1,72 @@
package ts_test
import (
"testing"
"github.com/sugarme/gotch"
"github.com/sugarme/gotch/ts"
)
// GOMAXPROCS=8 go test -bench=BenchmarkConv2d -benchtime=100x -run=^a | tee op-conv-bench.txt
// benchstat op-conv-bench.txt
// BenchmarkConv2dCPU measures the forward pass of ts.Conv2d on CPU with an
// input of shape [32, 64, 64, 64] and a weight of shape [1, 64, 3, 3]
// (out-channels, in-channels, kH, kW), stride 1, no padding, dilation 1.
//
// Run with:
//
//	GOMAXPROCS=8 go test -bench=BenchmarkConv2d -benchtime=100x -run=^a | tee op-conv-bench.txt
//	benchstat op-conv-bench.txt
func BenchmarkConv2dCPU(b *testing.B) {
	// var shape []int64 = []int64{64, 3, 224, 224}
	var shape []int64 = []int64{32, 64, 64, 64}
	device := gotch.CPU
	x := ts.MustRandn(shape, gotch.Float, device)

	// NOTE(review): out-channels is 1 here, while the PyTorch comparison in
	// ts/README.md uses Conv2d(64, 64, ...) (64 output channels) — the two
	// workloads differ; confirm the intended shape before comparing numbers.
	// kDims := []int64{1, 3, 3, 3}
	kDims := []int64{1, 64, 3, 3}
	// 3x3 Laplacian-style kernel, tiled across every (out, in) channel pair.
	kernelTemplate := []int64{
		1, 1, 1,
		1, -8, 1,
		1, 1, 1,
	}
	numTiles := int(kDims[0] * kDims[1])
	kernelData := make([]int64, 0, numTiles*len(kernelTemplate))
	for i := 0; i < numTiles; i++ {
		kernelData = append(kernelData, kernelTemplate...)
	}
	weight := ts.MustOfSlice(kernelData).MustView(kDims, true).MustTotype(gotch.Float, true).MustTo(device, true)

	stride := []int64{1, 1}
	padding := []int64{0, 0}
	dilation := []int64{1, 1}

	// Exclude the tensor/weight setup above from the measured time;
	// without this the reported ns/op includes one-off allocation cost.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		out, err := ts.Conv2d(x, weight, ts.NewTensor(), stride, padding, dilation, 1)
		if err != nil {
			// b.Fatal stops this benchmark cleanly instead of panicking
			// the whole test binary.
			b.Fatal(err)
		}
		// Free the C-allocated tensor each iteration to avoid leaking.
		out.MustDrop()
	}
}
// GOMAXPROCS=8 go test -bench=BenchmarkConv2d -benchtime=100x -run=^a | tee op-conv-bench.txt
// benchstat op-conv-bench.txt
// BenchmarkConv2dCUDA measures the forward pass of ts.Conv2d on the CUDA
// device (falling back to CPU when unavailable) with an input of shape
// [32, 64, 64, 64] and a weight of shape [1, 64, 3, 3], stride 1, no
// padding, dilation 1.
//
// Run with:
//
//	GOMAXPROCS=8 go test -bench=BenchmarkConv2d -benchtime=100x -run=^a | tee op-conv-bench.txt
//	benchstat op-conv-bench.txt
//
// NOTE(review): no explicit device synchronization is performed after each
// Conv2d call; if the backend launches kernels asynchronously the measured
// time may understate actual GPU execution time — verify.
func BenchmarkConv2dCUDA(b *testing.B) {
	// var shape []int64 = []int64{64, 3, 224, 224}
	var shape []int64 = []int64{32, 64, 64, 64}
	device := gotch.CudaIfAvailable()
	x := ts.MustRandn(shape, gotch.Float, device)

	// NOTE(review): out-channels is 1 here, while the PyTorch comparison in
	// ts/README.md uses Conv2d(64, 64, ...) (64 output channels) — the two
	// workloads differ; confirm the intended shape before comparing numbers.
	// kDims := []int64{1, 3, 3, 3}
	kDims := []int64{1, 64, 3, 3}
	// 3x3 Laplacian-style kernel, tiled across every (out, in) channel pair.
	kernelTemplate := []int64{
		1, 1, 1,
		1, -8, 1,
		1, 1, 1,
	}
	numTiles := int(kDims[0] * kDims[1])
	kernelData := make([]int64, 0, numTiles*len(kernelTemplate))
	for i := 0; i < numTiles; i++ {
		kernelData = append(kernelData, kernelTemplate...)
	}
	weight := ts.MustOfSlice(kernelData).MustView(kDims, true).MustTotype(gotch.Float, true).MustTo(device, true)

	stride := []int64{1, 1}
	padding := []int64{0, 0}
	dilation := []int64{1, 1}

	// Exclude the tensor/weight setup above from the measured time;
	// without this the reported ns/op includes one-off allocation cost.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		out, err := ts.Conv2d(x, weight, ts.NewTensor(), stride, padding, dilation, 1)
		if err != nil {
			// b.Fatal stops this benchmark cleanly instead of panicking
			// the whole test binary.
			b.Fatal(err)
		}
		// Free the C-allocated tensor each iteration to avoid leaking.
		out.MustDrop()
	}
}