added Conv2D benchmark
This commit is contained in:
parent
7e4799eb52
commit
670d1e9cdf
|
@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
- Fixed incorrect indexing at `dutil/Dataset.Next()`
|
||||
- Added `nn.MSELoss()`
|
||||
- Reworked `ts.Format()`
|
||||
- Added conv2d benchmark
|
||||
|
||||
## [Nofix]
|
||||
- ctype `long` caused a compile error on macOS, as noted in [#44]. Not working on a Linux box.
|
||||
|
|
63
ts/README.md
Normal file
63
ts/README.md
Normal file
|
@ -0,0 +1,63 @@
|
|||
# BENCHMARK
|
||||
|
||||
## Convolution 2D
|
||||
|
||||
Ref.
|
||||
1. https://tigress-web.princeton.edu/~jdh4/PyTorchPerformanceTuningGuide_GTC2021.pdf
|
||||
2. https://github.com/soumith/convnet-benchmarks
|
||||
|
||||
Benchmark tensor operation `conv2d` forward propagation:
|
||||
- input shape: `[32, 64, 64, 64]`
|
||||
- kernel (weight) shape: `[1, 64, 3, 3]` (1 output channel, 64 input channels, 3x3 window)
|
||||
|
||||
```
goos: linux
goarch: amd64
pkg: github.com/sugarme/gotch/ts
cpu: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
BenchmarkConv2dCPU-8 100 21198303 ns/op
BenchmarkConv2dCUDA-8 100 2201213 ns/op
```
|
||||
|
||||
```bash
|
||||
name time/op
|
||||
Conv2dCPU-8 21.2ms ± 0%
|
||||
Conv2dCUDA-8 2.20ms ± 0%
|
||||
```
|
||||
|
||||
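Benchmark against PyTorch 1.11 (CUDA 11). Note that the Python functions below also time the construction of the `Conv2d` layer (and, for CUDA, the host-to-device copy of `x` and of the layer) and use 64 output channels, whereas the Go benchmark uses a `[1, 64, 3, 3]` weight, so the figures are not directly comparable with the Go results above.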
|
||||
|
||||
```bash
|
||||
conv2d-CPU(x): 56.7 ms
|
||||
conv2d-CUDA(x): 38.0 ms
|
||||
```
|
||||
|
||||
Python benchmark code:
|
||||
|
||||
```python
|
||||
import torch
import torch.nn.functional as F
import timeit

# Input batch shared by both timers: 32 samples, 64 channels, 64x64.
x = torch.randn(32, 64, 64, 64)


def conv2dCPU(x):
    """Build a 64->64 channel, 3x3, bias-free Conv2d layer and apply it to x.

    The layer is constructed on every call, so its construction is part of
    whatever the caller times.
    """
    conv1 = torch.nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0, bias=False)
    return conv1(x)


def conv2dCUDA(x):
    """Same as conv2dCPU, but x and the layer are moved to the CUDA device first.

    The `.cuda()` transfers of both the input and the freshly built layer
    happen inside this function, so they are included in the timing.
    NOTE(review): CUDA kernels are launched asynchronously and there is no
    torch.cuda.synchronize() here, so the measured time may not cover kernel
    completion -- confirm before quoting these numbers.
    """
    x = x.cuda()
    conv1 = torch.nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0, bias=False).cuda()
    return conv1(x)


t0 = timeit.Timer(
    stmt='conv2dCPU(x)',
    setup='from __main__ import conv2dCPU',
    globals={'x': x})

t1 = timeit.Timer(
    stmt='conv2dCUDA(x)',
    setup='from __main__ import conv2dCUDA',
    globals={'x': x})

# Only run the (slow, CUDA-dependent) timings when executed as a script; the
# timers' setup strings import from __main__, which also requires script use.
if __name__ == '__main__':
    # Average over 100 runs, reported in milliseconds.
    print(f'conv2d-CPU(x): {t0.timeit(100) / 100 * 1e3:>5.1f} ms')
    print(f'conv2d-CUDA(x): {t1.timeit(100) / 100 * 1e3:>5.1f} ms')
|
||||
```
|
72
ts/benchmark-conv2d_test.go
Normal file
72
ts/benchmark-conv2d_test.go
Normal file
|
@ -0,0 +1,72 @@
|
|||
package ts_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/sugarme/gotch"
|
||||
"github.com/sugarme/gotch/ts"
|
||||
)
|
||||
|
||||
// GOMAXPROCS=8 go test -bench=BenchmarkConv2d -benchtime=100x -run=^a | tee op-conv-bench.txt
|
||||
// benchstat op-conv-bench.txt
|
||||
func BenchmarkConv2dCPU(b *testing.B) {
|
||||
// var shape []int64 = []int64{64, 3, 224, 224}
|
||||
var shape []int64 = []int64{32, 64, 64, 64}
|
||||
device := gotch.CPU
|
||||
x := ts.MustRandn(shape, gotch.Float, device)
|
||||
// kDims := []int64{1, 3, 3, 3}
|
||||
kDims := []int64{1, 64, 3, 3}
|
||||
kernelTemplate := []int64{
|
||||
1, 1, 1,
|
||||
1, -8, 1,
|
||||
1, 1, 1,
|
||||
}
|
||||
var kernelData []int64
|
||||
for i := 0; i < int(kDims[0]*kDims[1]); i++ {
|
||||
kernelData = append(kernelData, kernelTemplate...)
|
||||
}
|
||||
weight := ts.MustOfSlice(kernelData).MustView(kDims, true).MustTotype(gotch.Float, true).MustTo(device, true)
|
||||
|
||||
stride := []int64{1, 1}
|
||||
padding := []int64{0, 0}
|
||||
dilation := []int64{1, 1}
|
||||
for i := 0; i < b.N; i++ {
|
||||
out, err := ts.Conv2d(x, weight, ts.NewTensor(), stride, padding, dilation, 1)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
out.MustDrop()
|
||||
}
|
||||
}
|
||||
|
||||
// GOMAXPROCS=8 go test -bench=BenchmarkConv2d -benchtime=100x -run=^a | tee op-conv-bench.txt
|
||||
// benchstat op-conv-bench.txt
|
||||
func BenchmarkConv2dCUDA(b *testing.B) {
|
||||
// var shape []int64 = []int64{64, 3, 224, 224}
|
||||
var shape []int64 = []int64{32, 64, 64, 64}
|
||||
device := gotch.CudaIfAvailable()
|
||||
x := ts.MustRandn(shape, gotch.Float, device)
|
||||
// kDims := []int64{1, 3, 3, 3}
|
||||
kDims := []int64{1, 64, 3, 3}
|
||||
kernelTemplate := []int64{
|
||||
1, 1, 1,
|
||||
1, -8, 1,
|
||||
1, 1, 1,
|
||||
}
|
||||
var kernelData []int64
|
||||
for i := 0; i < int(kDims[0]*kDims[1]); i++ {
|
||||
kernelData = append(kernelData, kernelTemplate...)
|
||||
}
|
||||
weight := ts.MustOfSlice(kernelData).MustView(kDims, true).MustTotype(gotch.Float, true).MustTo(device, true)
|
||||
|
||||
stride := []int64{1, 1}
|
||||
padding := []int64{0, 0}
|
||||
dilation := []int64{1, 1}
|
||||
for i := 0; i < b.N; i++ {
|
||||
out, err := ts.Conv2d(x, weight, ts.NewTensor(), stride, padding, dilation, 1)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
out.MustDrop()
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user