// gotch/vision/resnet.go

package vision

import (
	"fmt"

	nn "github.com/sugarme/gotch/nn"
	ts "github.com/sugarme/gotch/tensor"
)

// ResNet implementation.
//
// See "Deep Residual Learning for Image Recognition" He et al. 2015
// https://arxiv.org/abs/1512.03385

// layerZero builds the input stem used in front of the residual stages.
//
// The returned sequential module contains, in order:
//   - a bias-free 7x7 convolution from 3 to 64 channels with padding 3 and
//     stride 2, registered under p.Sub("conv1");
//   - a 2-D batch norm over those 64 channels, registered under p.Sub("bn1");
//   - a ReLU;
//   - a max pool called with {3, 3}, {2, 2}, {1, 1}, {1, 1} (presumably
//     kernel, stride, padding and dilation -- confirm against the gotch
//     tensor API).
func layerZero(p *nn.Path) ts.ModuleT {
	stem := nn.SeqT()
	stem.Add(conv2dNoBias(p.Sub("conv1"), 3, 64, 7, 3, 2))
	stem.Add(nn.BatchNorm2D(p.Sub("bn1"), 64, nn.DefaultBatchNormConfig()))

	// Both closures pass false as the trailing flag, so they do not ask gotch
	// to drop their input tensor themselves.
	relu := func(in *ts.Tensor) *ts.Tensor {
		return in.MustRelu(false)
	}
	maxPool := func(in *ts.Tensor) *ts.Tensor {
		return in.MustMaxPool2d([]int64{3, 3}, []int64{2, 2}, []int64{1, 1}, []int64{1, 1}, false, false)
	}
	stem.AddFn(nn.NewFunc(relu))
	stem.AddFn(nn.NewFunc(maxPool))

	return stem
}

// basicLayer stacks basic residual blocks into one sequential stage.
//
// Block "0" maps cIn to cOut channels and carries the stage's stride. Blocks
// "1" .. "cnt-1" keep cOut channels and use stride 1. Each block is registered
// under path.Sub(<index>). At least one block is always created, even when
// cnt is less than 1.
func basicLayer(path *nn.Path, cIn, cOut, stride, cnt int64) ts.ModuleT {
	stage := nn.SeqT()

	// Only the first block may change the channel count or the resolution.
	stage.Add(newBasicBlock(path.Sub("0"), cIn, cOut, stride))

	for i, total := 1, int(cnt); i < total; i++ {
		name := fmt.Sprint(i)
		stage.Add(newBasicBlock(path.Sub(name), cOut, cOut, 1))
	}

	return stage
}

// conv2d creates a 2-D convolution under p with a square ksize kernel.
//
// It starts from gotch's default Conv2D configuration and overrides only the
// padding and the stride, using the same value for both spatial dimensions.
// The bias setting is left at whatever the default configuration specifies.
func conv2d(p *nn.Path, cIn, cOut, ksize, padding, stride int64) *nn.Conv2D {
	cfg := nn.DefaultConv2DConfig()
	cfg.Padding = []int64{padding, padding}
	cfg.Stride = []int64{stride, stride}

	layer := nn.NewConv2D(p, cIn, cOut, ksize, cfg)
	return layer
}

// conv2dNoBias creates a 2-D convolution under p with a square ksize kernel
// and the bias term explicitly disabled.
//
// Apart from Bias, only the padding and the stride of gotch's default Conv2D
// configuration are overridden; both use the same value for the two spatial
// dimensions.
func conv2dNoBias(p *nn.Path, cIn, cOut, ksize, padding, stride int64) *nn.Conv2D {
	cfg := nn.DefaultConv2DConfig()
	cfg.Padding = []int64{padding, padding}
	cfg.Stride = []int64{stride, stride}
	cfg.Bias = false

	layer := nn.NewConv2D(p, cIn, cOut, ksize, cfg)
	return layer
}

// downSample builds the shortcut branch of a residual block.
//
// When the block keeps both the resolution (stride == 1) and the channel
// count (cIn == cOut), an empty sequential module is returned.
// NOTE(review): an empty nn.SeqT() presumably forwards its input unchanged --
// confirm against the gotch nn package.
//
// Otherwise the shortcut is a bias-free 1x1 convolution with the given stride
// (registered under path.Sub("0")) followed by a batch norm over cOut
// channels (registered under path.Sub("1")).
func downSample(path *nn.Path, cIn, cOut, stride int64) ts.ModuleT {
	shortcut := nn.SeqT()

	// Nothing to project: input and output already have the same shape.
	if stride == 1 && cIn == cOut {
		return shortcut
	}

	shortcut.Add(conv2dNoBias(path.Sub("0"), cIn, cOut, 1, 0, stride))
	shortcut.Add(nn.BatchNorm2D(path.Sub("1"), cOut, nn.DefaultBatchNormConfig()))

	return shortcut
}

// basicBlock is the two-convolution residual block created by newBasicBlock
// and stacked by basicLayer. Its ForwardT method adds the output of the
// Conv1/Bn1/Conv2/Bn2 branch to the output of Downsample.
//
// NOTE: newBasicBlock fills this struct with a positional literal, so the
// order of the fields below must not change.
type basicBlock struct {
	// Conv1 is the first convolution of the main branch.
	Conv1      *nn.Conv2D
	// Bn1 is the batch norm applied to the output of Conv1.
	Bn1        *nn.BatchNorm
	// Conv2 is the second convolution of the main branch.
	Conv2      *nn.Conv2D
	// Bn2 is the batch norm applied to the output of Conv2.
	Bn2        *nn.BatchNorm
	// Downsample is the shortcut branch applied to the block input.
	Downsample ts.ModuleT
}

// newBasicBlock creates a basic residual block whose layers are registered
// under path as "conv1", "bn1", "conv2", "bn2" and "downsample".
//
// Both convolutions are bias-free 3x3 convolutions with padding 1. The first
// maps cIn to cOut channels using the given stride; the second keeps cOut
// channels at stride 1. The shortcut comes from downSample(cIn, cOut, stride).
func newBasicBlock(path *nn.Path, cIn, cOut, stride int64) *basicBlock {
	// Field initialisers are evaluated top to bottom, so the layers are
	// created in the order conv1, bn1, conv2, bn2, downsample.
	return &basicBlock{
		Conv1:      conv2dNoBias(path.Sub("conv1"), cIn, cOut, 3, 1, stride),
		Bn1:        nn.BatchNorm2D(path.Sub("bn1"), cOut, nn.DefaultBatchNormConfig()),
		Conv2:      conv2dNoBias(path.Sub("conv2"), cOut, cOut, 3, 1, 1),
		Bn2:        nn.BatchNorm2D(path.Sub("bn2"), cOut, nn.DefaultBatchNormConfig()),
		Downsample: downSample(path.Sub("downsample"), cIn, cOut, stride),
	}
}

// ForwardT runs the basic residual block on x.
//
// The main branch is Conv1 -> Bn1 -> ReLU -> Conv2 -> Bn2. The shortcut
// branch is Downsample applied to x. The two results are added and passed
// through a final ReLU. train is forwarded to every layer.
//
// x itself is never dropped here. Intermediate tensors are released either
// with MustDrop or by passing true as the trailing flag of the Must* call.
func (bb *basicBlock) ForwardT(x *ts.Tensor, train bool) *ts.Tensor {
	// Main branch, first half: convolution, batch norm, ReLU.
	conv1Out := x.ApplyT(bb.Conv1, train)
	norm1Out := conv1Out.ApplyT(bb.Bn1, train)
	conv1Out.MustDrop()
	act1 := norm1Out.MustRelu(true)

	// Main branch, second half: convolution and batch norm only; the ReLU is
	// applied after the shortcut has been added.
	conv2Out := act1.ApplyT(bb.Conv2, train)
	act1.MustDrop()
	norm2Out := conv2Out.ApplyT(bb.Bn2, train)
	conv2Out.MustDrop()

	// Shortcut branch, then the residual sum.
	shortcut := x.ApplyT(bb.Downsample, train)
	sum := shortcut.MustAdd(norm2Out, true)
	norm2Out.MustDrop()

	return sum.MustRelu(true)
}

// resnet builds a basic-block ResNet whose layers are registered under p.
//
// The network is the layerZero stem followed by four basicLayer stages
// ("layer1" .. "layer4") of 64, 128, 256 and 512 channels. Stage 1 uses
// stride 1 and the remaining stages use stride 2. c1, c2, c3 and c4 are the
// number of basic blocks in stages 1 to 4 respectively.
//
// The output of the last stage is adaptively average pooled to 1x1 and
// flattened with FlatView. When nclasses > 0, a linear layer from 512 features
// to nclasses, registered under p.Sub("fc"), is applied on top. Otherwise the
// flattened features are returned as they are.
func resnet(p *nn.Path, nclasses int64, c1, c2, c3, c4 int64) nn.FuncT {
	seq := nn.SeqT()
	seq.Add(layerZero(p))
	// The per-stage block counts come from the caller. They used to be
	// hard-coded to 3, 4, 6, 3, which made every variant a ResNet-34.
	seq.Add(basicLayer(p.Sub("layer1"), 64, 64, 1, c1))
	seq.Add(basicLayer(p.Sub("layer2"), 64, 128, 2, c2))
	seq.Add(basicLayer(p.Sub("layer3"), 128, 256, 2, c3))
	seq.Add(basicLayer(p.Sub("layer4"), 256, 512, 2, c4))

	// features runs the backbone and returns the pooled, flattened features.
	// The backbone output and the pooled tensor are released before returning.
	features := func(x *ts.Tensor, train bool) *ts.Tensor {
		output := seq.ForwardT(x, train)
		avgpool := output.MustAdaptiveAvgPool2d([]int64{1, 1}, true)
		fv := avgpool.FlatView()
		avgpool.MustDrop()

		return fv
	}

	if nclasses <= 0 {
		// No final layer: expose the flattened features directly.
		return nn.NewFuncT(features)
	}

	// With final layer.
	fc := nn.NewLinear(p.Sub("fc"), 512, nclasses, nn.DefaultLinearConfig())

	return nn.NewFuncT(func(x *ts.Tensor, train bool) *ts.Tensor {
		fv := features(x, train)
		retVal := fv.ApplyOpt(ts.WithModule(fc))
		fv.MustDrop()

		return retVal
	})
}

// bottleneckBlock is the three-convolution residual block created by
// newBottleneckBlock and stacked by bottleneckLayer. Its ForwardT method adds
// the output of the Conv1/Bn1/Conv2/Bn2/Conv3/Bn3 branch to the output of
// Downsample.
type bottleneckBlock struct {
	// Conv1 is the first convolution of the main branch.
	Conv1      *nn.Conv2D
	// Bn1 is the batch norm applied to the output of Conv1.
	Bn1        *nn.BatchNorm
	// Conv2 is the second convolution of the main branch.
	Conv2      *nn.Conv2D
	// Bn2 is the batch norm applied to the output of Conv2.
	Bn2        *nn.BatchNorm
	// Conv3 is the third convolution of the main branch.
	Conv3      *nn.Conv2D
	// Bn3 is the batch norm applied to the output of Conv3.
	Bn3        *nn.BatchNorm
	// Downsample is the shortcut branch applied to the block input.
	Downsample ts.ModuleT
}

// ForwardT implements ModuleT for bottleneckBlock.
//
// The main branch is Conv1 -> Bn1 -> ReLU -> Conv2 -> Bn2 -> ReLU -> Conv3 ->
// Bn3. The shortcut branch is Downsample applied to xs. The two results are
// added and passed through a final ReLU. train is forwarded to the batch-norm
// layers and to Downsample.
//
// xs itself is never dropped here. Every intermediate tensor is released,
// either with MustDrop or by passing true as the trailing flag of the Must*
// call.
func (b *bottleneckBlock) ForwardT(xs *ts.Tensor, train bool) *ts.Tensor {
	c1 := xs.Apply(b.Conv1)
	bn1 := c1.ApplyT(b.Bn1, train)
	c1.MustDrop()
	relu1 := bn1.MustRelu(true)

	c2 := relu1.Apply(b.Conv2)
	relu1.MustDrop()
	bn2 := c2.ApplyT(b.Bn2, train)
	// Release the second convolution output; it used to be left alive.
	c2.MustDrop()
	relu2 := bn2.MustRelu(true)

	c3 := relu2.Apply(b.Conv3)
	relu2.MustDrop()
	bn3 := c3.ApplyT(b.Bn3, train)
	// Release the third convolution output; it used to be left alive.
	c3.MustDrop()

	dsl := xs.ApplyT(b.Downsample, train)
	add := dsl.MustAdd(bn3, true)
	bn3.MustDrop()

	return add.MustRelu(true)
}

// newBottleneckBlock creates the bottleneck residual block used by the
// ResNet 50, 101 and 152 variants. Its layers are registered under path as
// "conv1", "bn1", "conv2", "bn2", "conv3", "bn3" and "downsample".
//
// The main branch is a 1x1 convolution from cIn to cOut channels, a 3x3
// convolution (padding 1) that keeps cOut channels and carries the given
// stride, and a 1x1 convolution that expands to e*cOut channels. Each
// convolution is followed by a batch norm. The shortcut comes from
// downSample(cIn, e*cOut, stride).
//
// The convolutions are created without a bias term, like the ones in
// layerZero, newBasicBlock and downSample: each one feeds a batch norm, which
// already has its own learnable shift.
func newBottleneckBlock(path *nn.Path, cIn, cOut, stride, e int64) *bottleneckBlock {
	eDim := e * cOut
	conv1 := conv2dNoBias(path.Sub("conv1"), cIn, cOut, 1, 0, 1)
	bn1 := nn.BatchNorm2D(path.Sub("bn1"), cOut, nn.DefaultBatchNormConfig())
	conv2 := conv2dNoBias(path.Sub("conv2"), cOut, cOut, 3, 1, stride)
	bn2 := nn.BatchNorm2D(path.Sub("bn2"), cOut, nn.DefaultBatchNormConfig())
	conv3 := conv2dNoBias(path.Sub("conv3"), cOut, eDim, 1, 0, 1)
	bn3 := nn.BatchNorm2D(path.Sub("bn3"), eDim, nn.DefaultBatchNormConfig())
	downsample := downSample(path.Sub("downsample"), cIn, eDim, stride)

	return &bottleneckBlock{
		Conv1:      conv1,
		Bn1:        bn1,
		Conv2:      conv2,
		Bn2:        bn2,
		Conv3:      conv3,
		Bn3:        bn3,
		Downsample: downsample,
	}
}

// bottleneckLayer stacks bottleneck blocks into one sequential stage.
//
// Every block is created with an expansion factor of 4. Block "0" takes cIn
// input channels and carries the stage's stride. Blocks "1" .. "cnt-1" take
// 4*cOut input channels and use stride 1. Each block is registered under
// path.Sub(<index>). At least one block is always created, even when cnt is
// less than 1.
func bottleneckLayer(path *nn.Path, cIn, cOut, stride, cnt int64) ts.ModuleT {
	const expansion int64 = 4

	stage := nn.SeqT()
	stage.Add(newBottleneckBlock(path.Sub("0"), cIn, cOut, stride, expansion))

	// Input width of every follow-up block.
	widened := cOut * expansion
	for i, total := 1, int(cnt); i < total; i++ {
		name := fmt.Sprint(i)
		stage.Add(newBottleneckBlock(path.Sub(name), widened, cOut, 1, expansion))
	}

	return stage
}

// bottleneckResnet builds a bottleneck-block ResNet whose layers are
// registered under path.
//
// The network is the layerZero stem (bias-free 7x7 convolution, batch norm,
// ReLU, max pool) followed by four bottleneckLayer stages ("layer1" ..
// "layer4") with base widths 64, 128, 256 and 512. Stage 1 uses stride 1 and
// the remaining stages use stride 2. c1, c2, c3 and c4 are the number of
// bottleneck blocks in stages 1 to 4 respectively.
//
// The output of the last stage is adaptively average pooled to 1x1 and
// flattened with FlatView. When nclasses > 0, a linear layer from 4*512
// features to nclasses, registered under path.Sub("fc"), is applied on top.
// Otherwise the flattened features are returned as they are.
func bottleneckResnet(path *nn.Path, nclasses int64, c1, c2, c3, c4 int64) ts.ModuleT {
	seq := nn.SeqT()
	// Use the same stem as resnet. The stem previously stopped after the
	// batch norm, so the ReLU and the max pool were missing.
	seq.Add(layerZero(path))
	seq.Add(bottleneckLayer(path.Sub("layer1"), 64, 64, 1, c1))
	seq.Add(bottleneckLayer(path.Sub("layer2"), 4*64, 128, 2, c2))
	seq.Add(bottleneckLayer(path.Sub("layer3"), 4*128, 256, 2, c3))
	seq.Add(bottleneckLayer(path.Sub("layer4"), 4*256, 512, 2, c4))

	// features runs the backbone and returns the pooled, flattened features.
	// The backbone output and the pooled tensor are released before returning.
	features := func(x *ts.Tensor, train bool) *ts.Tensor {
		output := seq.ForwardT(x, train)
		avgpool := output.MustAdaptiveAvgPool2d([]int64{1, 1}, true)
		fv := avgpool.FlatView()
		avgpool.MustDrop()

		return fv
	}

	if nclasses <= 0 {
		// No final layer: expose the flattened features directly.
		return nn.NewFuncT(features)
	}

	// With final layer.
	fc := nn.NewLinear(path.Sub("fc"), 4*512, nclasses, nn.DefaultLinearConfig())

	return nn.NewFuncT(func(x *ts.Tensor, train bool) *ts.Tensor {
		fv := features(x, train)
		retVal := fv.ApplyOpt(ts.WithModule(fc))
		fv.MustDrop()

		return retVal
	})
}

// ResNet18 creates the ResNet-18 configuration of the basic-block network:
// it asks resnet for two blocks in each of the four stages and a final layer
// with numClasses outputs.
func ResNet18(path *nn.Path, numClasses int64) nn.FuncT {
	const blocksPerStage = 2

	return resnet(path, numClasses, blocksPerStage, blocksPerStage, blocksPerStage, blocksPerStage)
}

// ResNet18NoFinalLayer creates the ResNet-18 configuration of the basic-block
// network without the final fully connected layer: it asks resnet for two
// blocks in each of the four stages and passes a class count of 0.
func ResNet18NoFinalLayer(path *nn.Path) nn.FuncT {
	const (
		noClasses      = 0 // resnet only adds the fc layer when nclasses > 0
		blocksPerStage = 2
	)

	return resnet(path, noClasses, blocksPerStage, blocksPerStage, blocksPerStage, blocksPerStage)
}

// ResNet34 creates the ResNet-34 configuration of the basic-block network:
// it asks resnet for 3, 4, 6 and 3 blocks in stages 1 to 4 and a final layer
// with numClasses outputs.
func ResNet34(path *nn.Path, numClasses int64) nn.FuncT {
	const (
		stage1 = 3
		stage2 = 4
		stage3 = 6
		stage4 = 3
	)

	return resnet(path, numClasses, stage1, stage2, stage3, stage4)
}

// ResNet34NoFinalLayer creates the ResNet-34 configuration of the basic-block
// network without the final fully connected layer: it asks resnet for 3, 4, 6
// and 3 blocks in stages 1 to 4 and passes a class count of 0.
func ResNet34NoFinalLayer(path *nn.Path) nn.FuncT {
	const (
		noClasses = 0 // resnet only adds the fc layer when nclasses > 0
		stage1    = 3
		stage2    = 4
		stage3    = 6
		stage4    = 3
	)

	return resnet(path, noClasses, stage1, stage2, stage3, stage4)
}

// ResNet50 creates the ResNet-50 configuration of the bottleneck network:
// it asks bottleneckResnet for 3, 4, 6 and 3 blocks in stages 1 to 4 and a
// final layer with numClasses outputs.
func ResNet50(path *nn.Path, numClasses int64) ts.ModuleT {
	var stage1, stage2, stage3, stage4 int64 = 3, 4, 6, 3

	return bottleneckResnet(path, numClasses, stage1, stage2, stage3, stage4)
}

// ResNet50NoFinalLayer creates the ResNet-50 configuration of the bottleneck
// network without the final fully connected layer: it asks bottleneckResnet
// for 3, 4, 6 and 3 blocks in stages 1 to 4 and passes a class count of 0.
func ResNet50NoFinalLayer(path *nn.Path) ts.ModuleT {
	// bottleneckResnet only adds the fc layer when nclasses > 0.
	const noClasses = 0
	var stage1, stage2, stage3, stage4 int64 = 3, 4, 6, 3

	return bottleneckResnet(path, noClasses, stage1, stage2, stage3, stage4)
}

// ResNet101 creates the ResNet-101 configuration of the bottleneck network:
// it asks bottleneckResnet for 3, 4, 23 and 3 blocks in stages 1 to 4 and a
// final layer with numClasses outputs.
func ResNet101(path *nn.Path, numClasses int64) ts.ModuleT {
	var stage1, stage2, stage3, stage4 int64 = 3, 4, 23, 3

	return bottleneckResnet(path, numClasses, stage1, stage2, stage3, stage4)
}

// ResNet101NoFinalLayer creates the ResNet-101 configuration of the
// bottleneck network without the final fully connected layer: it asks
// bottleneckResnet for 3, 4, 23 and 3 blocks in stages 1 to 4 and passes a
// class count of 0.
func ResNet101NoFinalLayer(path *nn.Path) ts.ModuleT {
	// bottleneckResnet only adds the fc layer when nclasses > 0.
	const noClasses = 0
	var stage1, stage2, stage3, stage4 int64 = 3, 4, 23, 3

	return bottleneckResnet(path, noClasses, stage1, stage2, stage3, stage4)
}

// ResNet152 creates the ResNet-152 configuration of the bottleneck network:
// it asks bottleneckResnet for 3, 8, 36 and 3 blocks in stages 1 to 4 and a
// final layer with numClasses outputs.
func ResNet152(path *nn.Path, numClasses int64) ts.ModuleT {
	var stage1, stage2, stage3, stage4 int64 = 3, 8, 36, 3

	return bottleneckResnet(path, numClasses, stage1, stage2, stage3, stage4)
}

// ResNet152NoFinalLayer creates the ResNet-152 configuration of the
// bottleneck network without the final fully connected layer: it asks
// bottleneckResnet for 3, 8, 36 and 3 blocks in stages 1 to 4 and passes a
// class count of 0.
func ResNet152NoFinalLayer(path *nn.Path) ts.ModuleT {
	return bottleneckResnet(path, 0, 3, 8, 36, 3)
}

// ResNet150NoFinalLayer is the original, misspelled name of
// ResNet152NoFinalLayer. It builds the same ResNet-152 configuration and is
// kept so that existing callers continue to compile.
//
// Deprecated: use ResNet152NoFinalLayer instead.
func ResNet150NoFinalLayer(path *nn.Path) ts.ModuleT {
	return ResNet152NoFinalLayer(path)
}