added Conv2D benchmark
This commit is contained in:
parent
7e4799eb52
commit
670d1e9cdf
|
@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||||
- Fixed incorrect indexing at `dutil/Dataset.Next()`
|
- Fixed incorrect indexing at `dutil/Dataset.Next()`
|
||||||
- Added `nn.MSELoss()`
|
- Added `nn.MSELoss()`
|
||||||
- reworked `ts.Format()`
|
- reworked `ts.Format()`
|
||||||
|
- Added conv2d benchmark
|
||||||
|
|
||||||
## [Nofix]
|
## [Nofix]
|
||||||
- ctype `long` caused compiling error in MacOS as noted on [#44]. Not working on linux box.
|
- ctype `long` caused compiling error in MacOS as noted on [#44]. Not working on linux box.
|
||||||
|
|
63
ts/README.md
Normal file
63
ts/README.md
Normal file
|
@ -0,0 +1,63 @@
|
||||||
|
# BENCHMARK
|
||||||
|
|
||||||
|
## Convolution 2D
|
||||||
|
|
||||||
|
Ref.
|
||||||
|
1. https://tigress-web.princeton.edu/~jdh4/PyTorchPerformanceTuningGuide_GTC2021.pdf
|
||||||
|
2. https://github.com/soumith/convnet-benchmarks
|
||||||
|
|
||||||
|
Benchmark tensor operation `conv2d` forward propagation:
|
||||||
|
- input shape: `[32, 64, 64, 64]`
|
||||||
|
- kernel (weight) shape: `[1, 64, 3, 3]` (1 output channel, 64 input channels, 3x3 window)
|
||||||
|
|
||||||
|
goos: linux
|
||||||
|
goarch: amd64
|
||||||
|
pkg: github.com/sugarme/gotch/ts
|
||||||
|
cpu: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
|
||||||
|
BenchmarkConv2dCPU-8 100 21198303 ns/op
|
||||||
|
BenchmarkConv2dCUDA-8 100 2201213 ns/op
|
||||||
|
|
||||||
|
```bash
|
||||||
|
name time/op
|
||||||
|
Conv2dCPU-8 21.2ms ± 0%
|
||||||
|
Conv2dCUDA-8 2.20ms ± 0%
|
||||||
|
```
|
||||||
|
|
||||||
|
Benchmark against PyTorch 1.11 with CUDA 11:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
conv2d-CPU(x): 56.7 ms
|
||||||
|
conv2d-CUDA(x): 38.0 ms
|
||||||
|
```
|
||||||
|
|
||||||
|
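The Python benchmark code is below. Note that each timed call also constructs the `Conv2d` layer (with 64 output channels) and, in the CUDA case, copies `x` to the GPU, so these timings are not directly comparable to the Go numbers above.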
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
import timeit
|
||||||
|
|
||||||
|
x = torch.randn(32, 64, 64, 64)
|
||||||
|
|
||||||
|
def conv2dCPU(x):
|
||||||
|
conv1 = torch.nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0, bias=False)
|
||||||
|
return conv1(x)
|
||||||
|
|
||||||
|
def conv2dCUDA(x):
|
||||||
|
x = x.cuda()
|
||||||
|
conv1 = torch.nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0, bias=False).cuda()
|
||||||
|
return conv1(x)
|
||||||
|
|
||||||
|
t0 = timeit.Timer(
|
||||||
|
stmt='conv2dCPU(x)',
|
||||||
|
setup='from __main__ import conv2dCPU',
|
||||||
|
globals={'x': x})
|
||||||
|
|
||||||
|
t1 = timeit.Timer(
|
||||||
|
stmt='conv2dCUDA(x)',
|
||||||
|
setup='from __main__ import conv2dCUDA',
|
||||||
|
globals={'x': x})
|
||||||
|
|
||||||
|
print(f'conv2d-CPU(x): {t0.timeit(100) / 100 * 1e3:>5.1f} ms')
|
||||||
|
print(f'conv2d-CUDA(x): {t1.timeit(100) / 100 * 1e3:>5.1f} ms')
|
||||||
|
```
|
72
ts/benchmark-conv2d_test.go
Normal file
72
ts/benchmark-conv2d_test.go
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
package ts_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/sugarme/gotch"
|
||||||
|
"github.com/sugarme/gotch/ts"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GOMAXPROCS=8 go test -bench=BenchmarkConv2d -benchtime=100x -run=^a | tee op-conv-bench.txt
|
||||||
|
// benchstat op-conv-bench.txt
|
||||||
|
func BenchmarkConv2dCPU(b *testing.B) {
|
||||||
|
// var shape []int64 = []int64{64, 3, 224, 224}
|
||||||
|
var shape []int64 = []int64{32, 64, 64, 64}
|
||||||
|
device := gotch.CPU
|
||||||
|
x := ts.MustRandn(shape, gotch.Float, device)
|
||||||
|
// kDims := []int64{1, 3, 3, 3}
|
||||||
|
kDims := []int64{1, 64, 3, 3}
|
||||||
|
kernelTemplate := []int64{
|
||||||
|
1, 1, 1,
|
||||||
|
1, -8, 1,
|
||||||
|
1, 1, 1,
|
||||||
|
}
|
||||||
|
var kernelData []int64
|
||||||
|
for i := 0; i < int(kDims[0]*kDims[1]); i++ {
|
||||||
|
kernelData = append(kernelData, kernelTemplate...)
|
||||||
|
}
|
||||||
|
weight := ts.MustOfSlice(kernelData).MustView(kDims, true).MustTotype(gotch.Float, true).MustTo(device, true)
|
||||||
|
|
||||||
|
stride := []int64{1, 1}
|
||||||
|
padding := []int64{0, 0}
|
||||||
|
dilation := []int64{1, 1}
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
out, err := ts.Conv2d(x, weight, ts.NewTensor(), stride, padding, dilation, 1)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
out.MustDrop()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GOMAXPROCS=8 go test -bench=BenchmarkConv2d -benchtime=100x -run=^a | tee op-conv-bench.txt
|
||||||
|
// benchstat op-conv-bench.txt
|
||||||
|
func BenchmarkConv2dCUDA(b *testing.B) {
|
||||||
|
// var shape []int64 = []int64{64, 3, 224, 224}
|
||||||
|
var shape []int64 = []int64{32, 64, 64, 64}
|
||||||
|
device := gotch.CudaIfAvailable()
|
||||||
|
x := ts.MustRandn(shape, gotch.Float, device)
|
||||||
|
// kDims := []int64{1, 3, 3, 3}
|
||||||
|
kDims := []int64{1, 64, 3, 3}
|
||||||
|
kernelTemplate := []int64{
|
||||||
|
1, 1, 1,
|
||||||
|
1, -8, 1,
|
||||||
|
1, 1, 1,
|
||||||
|
}
|
||||||
|
var kernelData []int64
|
||||||
|
for i := 0; i < int(kDims[0]*kDims[1]); i++ {
|
||||||
|
kernelData = append(kernelData, kernelTemplate...)
|
||||||
|
}
|
||||||
|
weight := ts.MustOfSlice(kernelData).MustView(kDims, true).MustTotype(gotch.Float, true).MustTo(device, true)
|
||||||
|
|
||||||
|
stride := []int64{1, 1}
|
||||||
|
padding := []int64{0, 0}
|
||||||
|
dilation := []int64{1, 1}
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
out, err := ts.Conv2d(x, weight, ts.NewTensor(), stride, padding, dilation, 1)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
out.MustDrop()
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user