diff --git a/CHANGELOG.md b/CHANGELOG.md index ace57f7..12e7217 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] + +## [0.3.9] - [#24], [#26]: fixed memory leak. - [#30]: fixed varstore.Save() randomly panic - segmentfault - [#32]: nn.Seq Forward return nil tensor if length of layers = 1 +- [#36]: resolved image augmentation ## [0.3.8] diff --git a/example/augmentation/README.md b/example/augmentation/README.md new file mode 100644 index 0000000..f20d32e --- /dev/null +++ b/example/augmentation/README.md @@ -0,0 +1,31 @@ +# Image Augmentation Example + +This example demonstrates how to use image augmentation functions. It is implemented as similar as possible to [original Pytorch vision/transform](https://pytorch.org/vision/stable/transforms.html#). + +There are 2 APIs (`aug.Compose` and `aug.OneOf`) to compose augmentation methods as shown in the example: + +```go + t, err := aug.Compose( + aug.WithRandomVFlip(0.5), + aug.WithRandomHFlip(0.5), + aug.WithRandomCutout(), + aug.OneOf( + 0.3, + aug.WithColorJitter(0.3, 0.3, 0.3, 0.4), + aug.WithRandomGrayscale(1.0), + ), + aug.OneOf( + 0.3, + aug.WithGaussianBlur([]int64{5, 5}, []float64{1.0, 2.0}), + aug.WithRandomAffine(), + ), + ) + if err != nil { + panic(err) + } + + out := t.Transform(imgTs) +``` + + + diff --git a/example/augmentation/bb.png b/example/augmentation/bb.png new file mode 100644 index 0000000..6b13541 Binary files /dev/null and b/example/augmentation/bb.png differ diff --git a/example/augmentation/main.go b/example/augmentation/main.go new file mode 100644 index 0000000..c50ec1d --- /dev/null +++ b/example/augmentation/main.go @@ -0,0 +1,69 @@ +package main + +import ( + "fmt" + + "github.com/sugarme/gotch" + "github.com/sugarme/gotch/vision" + "github.com/sugarme/gotch/vision/aug" +) + +func main() { + 
n := 360 + for i := 1; i <= n; i++ { + img, err := vision.Load("./bb.png") + if err != nil { + panic(err) + } + + device := gotch.CudaIfAvailable() + // device := gotch.CPU + imgTs := img.MustTo(device, true) + // t, err := aug.Compose(aug.WithResize(512, 512)) // NOTE. WithResize just works on CPU. + // t, err := aug.Compose(aug.WithRandRotate(0, 360), aug.WithColorJitter(0.3, 0.3, 0.3, 0.4)) + // t, err := aug.Compose(aug.WithGaussianBlur([]int64{5, 5}, []float64{1.0, 2.0}), aug.WithRandRotate(0, 360), aug.WithColorJitter(0.3, 0.3, 0.3, 0.3)) + // t, err := aug.Compose(aug.WithRandomCrop([]int64{320, 320}, []int64{10, 10}, true, "constant")) + // t, err := aug.Compose(aug.WithCenterCrop([]int64{320, 320})) + // t, err := aug.Compose(aug.WithRandomCutout(aug.WithCutoutValue([]int64{124, 96, 255}), aug.WithCutoutScale([]float64{0.01, 0.1}), aug.WithCutoutRatio([]float64{0.5, 0.5}))) + // t, err := aug.Compose(aug.WithRandomPerspective(aug.WithPerspectiveScale(0.6), aug.WithPerspectivePvalue(0.8))) + // t, err := aug.Compose(aug.WithRandomAffine(aug.WithAffineDegree([]int64{0, 15}), aug.WithAffineShear([]float64{0, 15}))) + // t, err := aug.Compose(aug.WithRandomGrayscale(0.5)) + // t, err := aug.Compose(aug.WithRandomSolarize(aug.WithSolarizeThreshold(125), aug.WithSolarizePvalue(0.5))) + // t, err := aug.Compose(aug.WithRandomInvert(0.5)) + // t, err := aug.Compose(aug.WithRandomPosterize(aug.WithPosterizeBits(2), aug.WithPosterizePvalue(1.0))) + // t, err := aug.Compose(aug.WithRandomAutocontrast()) + // t, err := aug.Compose(aug.WithRandomAdjustSharpness(aug.WithSharpnessPvalue(0.3), aug.WithSharpnessFactor(10))) + // t, err := aug.Compose(aug.WithRandomEqualize(1.0)) + // t, err := aug.Compose(aug.WithNormalize(aug.WithNormalizeMean([]float64{0.485, 0.456, 0.406}), aug.WithNormalizeStd([]float64{0.229, 0.224, 0.225}))) + + t, err := aug.Compose( + aug.WithRandomVFlip(0.5), + aug.WithRandomHFlip(0.5), + aug.WithRandomCutout(), + aug.OneOf( + 0.3, + 
aug.WithColorJitter(0.3, 0.3, 0.3, 0.4), + aug.WithRandomGrayscale(1.0), + ), + aug.OneOf( + 0.3, + aug.WithGaussianBlur([]int64{5, 5}, []float64{1.0, 2.0}), + aug.WithRandomAffine(), + ), + ) + if err != nil { + panic(err) + } + + out := t.Transform(imgTs) + fname := fmt.Sprintf("./output/bb-%03d.png", i) + err = vision.Save(out, fname) + if err != nil { + panic(err) + } + imgTs.MustDrop() + out.MustDrop() + + fmt.Printf("%03d/%v completed.\n", i, n) + } +} diff --git a/example/augmentation/output/.gitignore b/example/augmentation/output/.gitignore new file mode 100644 index 0000000..7c9d611 --- /dev/null +++ b/example/augmentation/output/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!README.md diff --git a/example/augmentation/output/README.md b/example/augmentation/output/README.md new file mode 100644 index 0000000..b11fa64 --- /dev/null +++ b/example/augmentation/output/README.md @@ -0,0 +1 @@ +Output images will be here. diff --git a/tensor/patch.go b/tensor/patch.go index 13db623..855e372 100644 --- a/tensor/patch.go +++ b/tensor/patch.go @@ -581,7 +581,7 @@ func (ts *Tensor) Lstsq(a *Tensor, del bool) (retVal *Tensor, err error) { } func (ts *Tensor) MustLstsq(a *Tensor, del bool) (retVal *Tensor) { - retVal, err := ts.Lstsq(del) + retVal, err := ts.Lstsq(a, del) if err != nil { log.Fatal(err) } diff --git a/vision/aug/affine.go b/vision/aug/affine.go new file mode 100644 index 0000000..c8944c7 --- /dev/null +++ b/vision/aug/affine.go @@ -0,0 +1,185 @@ +package aug + +import ( + "github.com/sugarme/gotch" + ts "github.com/sugarme/gotch/tensor" +) + +// RandomAffine is transformation of the image keeping center invariant. +// If the image is torch Tensor, it is expected +// to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. +// Args: +// - degrees (sequence or number): Range of degrees to select from. +// If degrees is a number instead of sequence like (min, max), the range of degrees +// will be (-degrees, +degrees). 
Set to 0 to deactivate rotations. +// - translate (tuple, optional): tuple of maximum absolute fraction for horizontal +// and vertical translations. For example translate=(a, b), then horizontal shift +// is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is +// randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default. +// - scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is +// randomly sampled from the range a <= scale <= b. Will keep original scale by default. +// - shear (sequence or number, optional): Range of degrees to select from. +// If shear is a number, a shear parallel to the x axis in the range (-shear, +shear) +// will be applied. Else if shear is a sequence of 2 values a shear parallel to the x axis in the +// range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values, +// a x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. +// Will not apply shear by default. +// - interpolation (InterpolationMode): Desired interpolation enum defined by +// :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. +// If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. +// For backward compatibility integer values (e.g. ``PIL.Image.NEAREST``) are still acceptable. +// - fill (sequence or number): Pixel fill value for the area outside the transformed +// image. Default is ``0``. If given a number, the value is used for all bands respectively. +// Please use the ``interpolation`` parameter instead. +// .. 
_filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters +type RandomAffine struct { + degree []int64 // degree range + translate []float64 + scale []float64 // scale range + shear []float64 + interpolationMode string + fillValue []float64 +} + +func (ra *RandomAffine) getParams(imageSize []int64) (float64, []int64, float64, []float64) { + angleTs := ts.MustEmpty([]int64{1}, gotch.Float, gotch.CPU) + angleTs.MustUniform_(float64(ra.degree[0]), float64(ra.degree[1])) + angle := angleTs.Float64Values()[0] + angleTs.MustDrop() + + var translations []int64 = []int64{0, 0} + if ra.translate != nil { + maxDX := ra.translate[0] * float64(imageSize[0]) + maxDY := ra.translate[1] * float64(imageSize[1]) + dx := ts.MustEmpty([]int64{1}, gotch.Float, gotch.CPU) + dx.MustUniform_(-maxDX, maxDX) + tx := dx.Float64Values()[0] + dx.MustDrop() + + dy := ts.MustEmpty([]int64{1}, gotch.Float, gotch.CPU) + dy.MustUniform_(-maxDY, maxDY) + ty := dx.Float64Values()[0] + dy.MustDrop() + + translations = []int64{int64(tx), int64(ty)} // should we use math.Round here??? 
+ } + + scale := 1.0 + if ra.scale != nil { + scaleTs := ts.MustEmpty([]int64{1}, gotch.Float, gotch.CPU) + scaleTs.MustUniform_(ra.scale[0], ra.scale[1]) + scale = scaleTs.Float64Values()[0] + scaleTs.MustDrop() + } + + var ( + shearX, shearY float64 = 0.0, 0.0 + ) + if ra.shear != nil { + shearXTs := ts.MustEmpty([]int64{1}, gotch.Float, gotch.CPU) + shearXTs.MustUniform_(ra.shear[0], ra.shear[1]) + shearX = shearXTs.Float64Values()[0] + shearXTs.MustDrop() + + if len(ra.shear) == 4 { + shearYTs := ts.MustEmpty([]int64{1}, gotch.Float, gotch.CPU) + shearYTs.MustUniform_(ra.shear[2], ra.shear[3]) + shearY = shearYTs.Float64Values()[0] + shearYTs.MustDrop() + } + } + + var shear []float64 = []float64{shearX, shearY} + + return angle, translations, scale, shear +} + +func (ra *RandomAffine) Forward(x *ts.Tensor) *ts.Tensor { + w, h := getImageSize(x) + angle, translations, scale, shear := ra.getParams([]int64{w, h}) + + out := affine(x, angle, translations, scale, shear, ra.interpolationMode, ra.fillValue) + + return out +} + +func newRandomAffine(opts ...affineOption) *RandomAffine { + p := defaultAffineOptions() + for _, o := range opts { + o(p) + } + + return &RandomAffine{ + degree: p.degree, + translate: p.translate, + scale: p.scale, + shear: p.shear, + interpolationMode: p.interpolationMode, + fillValue: p.fillValue, + } +} + +type affineOptions struct { + degree []int64 + translate []float64 + scale []float64 + shear []float64 + interpolationMode string + fillValue []float64 +} + +type affineOption func(*affineOptions) + +func defaultAffineOptions() *affineOptions { + return &affineOptions{ + degree: []int64{-180, 180}, + translate: nil, + scale: nil, + shear: []float64{-180.0, 180.0}, + interpolationMode: "bilinear", + fillValue: []float64{0.0, 0.0, 0.0}, + } +} + +func WithAffineDegree(degree []int64) affineOption { + return func(o *affineOptions) { + o.degree = degree + } +} + +func WithAffineTranslate(translate []float64) affineOption { + return func(o 
*affineOptions) { + o.translate = translate + } +} + +func WithAffineScale(scale []float64) affineOption { + return func(o *affineOptions) { + o.scale = scale + } +} + +func WithAffineShear(shear []float64) affineOption { + return func(o *affineOptions) { + o.shear = shear + } +} + +func WithAffineMode(mode string) affineOption { + return func(o *affineOptions) { + o.interpolationMode = mode + } +} + +func WithAffineFillValue(fillValue []float64) affineOption { + return func(o *affineOptions) { + o.fillValue = fillValue + } +} + +func WithRandomAffine(opts ...affineOption) Option { + ra := newRandomAffine(opts...) + return func(o *Options) { + o.randomAffine = ra + } +} diff --git a/vision/aug/blur.go b/vision/aug/blur.go new file mode 100644 index 0000000..ea78524 --- /dev/null +++ b/vision/aug/blur.go @@ -0,0 +1,89 @@ +package aug + +import ( + "fmt" + "log" + + "github.com/sugarme/gotch" + ts "github.com/sugarme/gotch/tensor" +) + +type GaussianBlur struct { + kernelSize []int64 // >= 0 && ks%2 != 0 + sigma []float64 // [0.1, 2.0] range(min, max) +} + +// ks : kernal size. Can be 1-2 element slice +// sigma: minimal and maximal standard deviation that can be chosen for blurring kernel +// range (min, max). Can be 1-2 element slice +func newGaussianBlur(ks []int64, sig []float64) *GaussianBlur { + if len(ks) == 0 || len(ks) > 2 { + err := fmt.Errorf("Kernel size should have 1-2 elements. Got %v\n", len(ks)) + log.Fatal(err) + } + for _, size := range ks { + if size <= 0 || size%2 == 0 { + err := fmt.Errorf("Kernel size should be an odd and positive number.") + log.Fatal(err) + } + } + + if len(sig) == 0 || len(sig) > 2 { + err := fmt.Errorf("Sigma should have 1-2 elements. 
Got %v\n", len(sig)) + log.Fatal(err) + } + + for _, s := range sig { + if s <= 0 { + err := fmt.Errorf("Sigma should be a positive number.") + log.Fatal(err) + } + } + + var kernelSize []int64 + switch len(ks) { + case 1: + kernelSize = []int64{ks[0], ks[0]} + case 2: + kernelSize = ks + default: + panic("Shouldn't reach here.") + } + + var sigma []float64 + switch len(sig) { + case 1: + sigma = []float64{sig[0], sig[0]} + case 2: + min := sig[0] + max := sig[1] + if min > max { + min = sig[1] + max = sig[0] + } + sigma = []float64{min, max} + default: + panic("Shouldn't reach here.") + } + + return &GaussianBlur{ + kernelSize: kernelSize, + sigma: sigma, + } +} + +func (b *GaussianBlur) Forward(x *ts.Tensor) *ts.Tensor { + sigmaTs := ts.MustEmpty([]int64{1}, gotch.Float, gotch.CPU) + sigmaTs.MustUniform_(b.sigma[0], b.sigma[1]) + sigmaVal := sigmaTs.Float64Values()[0] + sigmaTs.MustDrop() + + return gaussianBlur(x, b.kernelSize, []float64{sigmaVal, sigmaVal}) +} + +func WithGaussianBlur(ks []int64, sig []float64) Option { + return func(o *Options) { + gb := newGaussianBlur(ks, sig) + o.gaussianBlur = gb + } +} diff --git a/vision/aug/color.go b/vision/aug/color.go new file mode 100644 index 0000000..18b5b0a --- /dev/null +++ b/vision/aug/color.go @@ -0,0 +1,77 @@ +package aug + +import ( + "math/rand" + "time" + + ts "github.com/sugarme/gotch/tensor" +) + +// Ref. 
https://github.com/pytorch/vision/blob/f1d734213af65dc06e777877d315973ba8386080/torchvision/transforms/functional_tensor.py + +type ColorJitter struct { + brightness float64 + contrast float64 + saturation float64 + hue float64 +} + +func defaultColorJitter() *ColorJitter { + return &ColorJitter{ + brightness: 1.0, + contrast: 1.0, + saturation: 1.0, + hue: 0.0, + } +} + +func (c *ColorJitter) setBrightness(brightness float64) { + c.brightness = brightness +} + +func (c *ColorJitter) setContrast(contrast float64) { + c.contrast = contrast +} + +func (c *ColorJitter) setSaturation(sat float64) { + c.saturation = sat +} + +func (c *ColorJitter) setHue(hue float64) { + c.hue = hue +} + +// Forward implement ts.Module by randomly picking one of brightness, contrast, +// staturation or hue function to transform input image tensor. +func (c *ColorJitter) Forward(x *ts.Tensor) *ts.Tensor { + rand.Seed(time.Now().UnixNano()) + idx := rand.Intn(4) + switch idx { + case 0: + v := randVal(getMinMax(c.brightness)) + return adjustBrightness(x, v) + case 1: + v := randVal(getMinMax(c.contrast)) + return adjustContrast(x, v) + case 2: + v := randVal(getMinMax(c.saturation)) + return adjustSaturation(x, v) + case 3: + v := randVal(0, c.hue) + return adjustHue(x, v) + default: + panic("Shouldn't reach here.") + } +} + +func WithColorJitter(brightness, contrast, sat, hue float64) Option { + c := defaultColorJitter() + c.setBrightness(brightness) + c.setContrast(contrast) + c.setSaturation(sat) + c.setHue(hue) + + return func(o *Options) { + o.colorJitter = c + } +} diff --git a/vision/aug/contrast.go b/vision/aug/contrast.go new file mode 100644 index 0000000..7b30d6c --- /dev/null +++ b/vision/aug/contrast.go @@ -0,0 +1,43 @@ +package aug + +import ( + ts "github.com/sugarme/gotch/tensor" +) + +// RandomAutocontrast autocontrasts the pixels of the given image randomly with a given probability. 
+// If the image is torch Tensor, it is expected +// to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. +// Args: +// - p (float): probability of the image being autocontrasted. Default value is 0.5 +type RandomAutocontrast struct { + pvalue float64 +} + +func newRandomAutocontrast(pOpt ...float64) *RandomAutocontrast { + p := 0.5 + if len(pOpt) > 0 { + p = pOpt[0] + } + + return &RandomAutocontrast{p} +} + +func (rac *RandomAutocontrast) Forward(x *ts.Tensor) *ts.Tensor { + r := randPvalue() + var out *ts.Tensor + switch { + case r < rac.pvalue: + out = autocontrast(x) + default: + out = x.MustShallowClone() + } + + return out +} + +func WithRandomAutocontrast(p ...float64) Option { + rac := newRandomAutocontrast(p...) + return func(o *Options) { + o.randomAutocontrast = rac + } +} diff --git a/vision/aug/crop.go b/vision/aug/crop.go new file mode 100644 index 0000000..886f040 --- /dev/null +++ b/vision/aug/crop.go @@ -0,0 +1,124 @@ +package aug + +import ( + "fmt" + "log" + // "math" + + "github.com/sugarme/gotch" + ts "github.com/sugarme/gotch/tensor" +) + +type RandomCrop struct { + size []int64 + padding []int64 + paddingIfNeeded bool + paddingMode string +} + +func newRandomCrop(size, padding []int64, paddingIfNeeded bool, paddingMode string) *RandomCrop { + return &RandomCrop{ + size: size, + padding: padding, + paddingIfNeeded: paddingIfNeeded, + paddingMode: paddingMode, + } +} + +// get parameters for crop +func (c *RandomCrop) params(x *ts.Tensor) (int64, int64, int64, int64) { + w, h := getImageSize(x) + th, tw := c.size[0], c.size[1] + if h+1 < th || w+1 < tw { + err := fmt.Errorf("Required crop size %v is larger then input image size %v", c.size, []int64{h, w}) + log.Fatal(err) + } + + if w == tw && h == th { + return 0, 0, h, w + } + + iTs := ts.MustRandint1(0, h-th+1, []int64{1}, gotch.Int64, gotch.CPU) + i := iTs.Int64Values()[0] + iTs.MustDrop() + + jTs := ts.MustRandint1(0, w-tw+1, []int64{1}, 
gotch.Int64, gotch.CPU) + j := jTs.Int64Values()[0] + jTs.MustDrop() + + return i, j, th, tw +} + +func (c *RandomCrop) Forward(x *ts.Tensor) *ts.Tensor { + var img *ts.Tensor + if c.padding != nil { + img = pad(x, c.padding, c.paddingMode) + } else { + img = x.MustShallowClone() + } + + w, h := getImageSize(x) + + var ( + paddedW *ts.Tensor + paddedWH *ts.Tensor + ) + // pad width if needed + if c.paddingIfNeeded && w < c.size[1] { + padding := []int64{c.size[1] - w, 0} + paddedW = pad(img, padding, c.paddingMode) + } else { + paddedW = img.MustShallowClone() + } + img.MustDrop() + + // pad height if needed + if c.paddingIfNeeded && h < c.size[0] { + padding := []int64{0, c.size[0] - h} + paddedWH = pad(paddedW, padding, c.paddingMode) + } else { + paddedWH = paddedW.MustShallowClone() + } + + paddedW.MustDrop() + + // i, j, h, w = self.get_params(img, self.size) + i, j, h, w := c.params(x) + out := crop(paddedWH, i, j, h, w) + paddedWH.MustDrop() + return out +} + +func WithRandomCrop(size []int64, padding []int64, paddingIfNeeded bool, paddingMode string) Option { + return func(o *Options) { + c := newRandomCrop(size, padding, paddingIfNeeded, paddingMode) + o.randomCrop = c + } +} + +// CenterCrop crops the given image at the center. +// If the image is torch Tensor, it is expected +// to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. +// If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. +type CenterCrop struct { + size []int64 +} + +func newCenterCrop(size []int64) *CenterCrop { + if len(size) != 2 { + err := fmt.Errorf("Expected size of 2 elements. 
Got %v\n", len(size)) + log.Fatal(err) + } + return &CenterCrop{size} +} + +func (cc *CenterCrop) Forward(x *ts.Tensor) *ts.Tensor { + return centerCrop(x, cc.size) +} + +func WithCenterCrop(size []int64) Option { + return func(o *Options) { + cc := newCenterCrop(size) + o.centerCrop = cc + } +} diff --git a/vision/aug/cutout.go b/vision/aug/cutout.go new file mode 100644 index 0000000..c0446b2 --- /dev/null +++ b/vision/aug/cutout.go @@ -0,0 +1,177 @@ +package aug + +import ( + "fmt" + "log" + "math" + + "github.com/sugarme/gotch" + ts "github.com/sugarme/gotch/tensor" +) + +// Randomly selects a rectangle region in an torch Tensor image and erases its pixels. +// This transform does not support PIL Image. +// 'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896 +// +// Args: +// p: probability that the random erasing operation will be performed. +// scale: range of proportion of erased area against input image. +// ratio: range of aspect ratio of erased area. +// value: erasing value. Default is 0. If a single int, it is used to +// erase all pixels. If a tuple of length 3, it is used to erase +// R, G, B channels respectively. +// If a str of 'random', erasing each pixel with random values. 
+type RandomCutout struct { + pvalue float64 + scale []float64 + ratio []float64 + rgbVal []int64 // RGB value +} + +type cutoutOptions struct { + pvalue float64 + scale []float64 + ratio []float64 + rgbVal []int64 // RGB value +} + +type cutoutOption func(o *cutoutOptions) + +func defaultCutoutOptions() *cutoutOptions { + return &cutoutOptions{ + pvalue: 0.5, + scale: []float64{0.02, 0.33}, + ratio: []float64{0.3, 3.3}, + rgbVal: []int64{0, 0, 0}, + } +} + +func newRandomCutout(pvalue float64, scale, ratio []float64, rgbVal []int64) *RandomCutout { + return &RandomCutout{ + pvalue: pvalue, + scale: scale, + ratio: ratio, + rgbVal: rgbVal, + } +} + +func WithCutoutPvalue(p float64) cutoutOption { + if p < 0 || p > 1 { + log.Fatalf("Cutout p-value must be in range from 0 to 1. Got %v\n", p) + } + return func(o *cutoutOptions) { + o.pvalue = p + } +} + +func WithCutoutScale(scale []float64) cutoutOption { + if len(scale) != 2 { + log.Fatalf("Cutout scale should be in a range of 2 elments. Got %v elements\n", len(scale)) + } + return func(o *cutoutOptions) { + o.scale = scale + } +} + +func WithCutoutRatio(ratio []float64) cutoutOption { + if len(ratio) != 2 { + log.Fatalf("Cutout ratio should be in a range of 2 elments. Got %v elements\n", len(ratio)) + } + return func(o *cutoutOptions) { + o.ratio = ratio + } +} + +func WithCutoutValue(rgb []int64) cutoutOption { + var rgbVal []int64 + switch len(rgb) { + case 1: + rgbVal = []int64{rgb[0], rgb[0], rgb[0]} + case 3: + rgbVal = rgb + default: + err := fmt.Errorf("Cutout values can be single value or 3-element (RGB) value. 
Got %v values.", len(rgb)) + log.Fatal(err) + } + return func(o *cutoutOptions) { + o.rgbVal = rgbVal + } +} + +func (rc *RandomCutout) cutoutParams(x *ts.Tensor) (int64, int64, int64, int64, *ts.Tensor) { + dim := x.MustSize() + + imgH, imgW := dim[len(dim)-2], dim[len(dim)-1] + area := float64(imgH * imgW) + logRatio := ts.MustOfSlice(rc.ratio).MustLog(true).Float64Values() + + for i := 0; i < 10; i++ { + scaleTs := ts.MustEmpty([]int64{1}, gotch.Float, gotch.CPU) + scaleTs.MustUniform_(rc.scale[0], rc.scale[1]) + scaleVal := scaleTs.Float64Values()[0] + scaleTs.MustDrop() + eraseArea := area * scaleVal + + ratioTs := ts.MustEmpty([]int64{1}, gotch.Float, gotch.CPU) + ratioTs.MustUniform_(logRatio[0], logRatio[1]) + asTs := ratioTs.MustExp(true) + asVal := asTs.Float64Values()[0] // aspect ratio + asTs.MustDrop() + + // h = int(round(math.sqrt(erase_area * aspect_ratio))) + // w = int(round(math.sqrt(erase_area / aspect_ratio))) + h := int64(math.Round(math.Sqrt(eraseArea * asVal))) + w := int64(math.Round(math.Sqrt(eraseArea / asVal))) + if !(h < imgH && w < imgW) { + continue + } + + // v = torch.tensor(value)[:, None, None] + v := ts.MustOfSlice(rc.rgbVal).MustUnsqueeze(1, true).MustUnsqueeze(1, true) + + // i = torch.randint(0, img_h - h + 1, size=(1, )).item() + iTs := ts.MustRandint1(0, imgH-h+1, []int64{1}, gotch.Int64, gotch.CPU) + i := iTs.Int64Values()[0] + iTs.MustDrop() + // j = torch.randint(0, img_w - w + 1, size=(1, )).item() + jTs := ts.MustRandint1(0, imgW-w+1, []int64{1}, gotch.Int64, gotch.CPU) + j := jTs.Int64Values()[0] + jTs.MustDrop() + return i, j, h, w, v + } + + // return original image + img := x.MustShallowClone() + return 0, 0, imgH, imgW, img +} + +func (rc *RandomCutout) Forward(img *ts.Tensor) *ts.Tensor { + randTs := ts.MustRandn([]int64{1}, gotch.Float, gotch.CPU) + randVal := randTs.Float64Values()[0] + randTs.MustDrop() + + switch randVal < rc.pvalue { + case true: + x, y, h, w, v := rc.cutoutParams(img) + out := cutout(img, x, 
y, h, w, rc.rgbVal) + v.MustDrop() + return out + case false: + out := img.MustShallowClone() + return out + } + + panic("Shouldn't reach here") +} + +func WithRandomCutout(opts ...cutoutOption) Option { + params := defaultCutoutOptions() + for _, o := range opts { + o(params) + } + + return func(o *Options) { + rc := newRandomCutout(params.pvalue, params.scale, params.ratio, params.rgbVal) + o.randomCutout = rc + } +} diff --git a/vision/aug/equalize.go b/vision/aug/equalize.go new file mode 100644 index 0000000..10d65a2 --- /dev/null +++ b/vision/aug/equalize.go @@ -0,0 +1,46 @@ +package aug + +import ( + ts "github.com/sugarme/gotch/tensor" +) + +// RandomEqualize equalizes the histogram of the given image randomly with a given probability. +// If the image is torch Tensor, it is expected +// to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. +// Args: +// - p (float): probability of the image being equalized. Default value is 0.5 +// Histogram equalization +// Ref. https://en.wikipedia.org/wiki/Histogram_equalization +type RandomEqualize struct { + pvalue float64 +} + +func newRandomEqualize(pOpt ...float64) *RandomEqualize { + p := 0.5 + if len(pOpt) > 0 { + p = pOpt[0] + } + + return &RandomEqualize{p} +} + +func (re *RandomEqualize) Forward(x *ts.Tensor) *ts.Tensor { + r := randPvalue() + + var out *ts.Tensor + switch { + case r < re.pvalue: + out = equalize(x) + default: + out = x.MustShallowClone() + } + + return out +} + +func WithRandomEqualize(p ...float64) Option { + re := newRandomEqualize(p...) + return func(o *Options) { + o.randomEqualize = re + } +} diff --git a/vision/aug/flip.go b/vision/aug/flip.go new file mode 100644 index 0000000..503b5cf --- /dev/null +++ b/vision/aug/flip.go @@ -0,0 +1,78 @@ +package aug + +import ( + "github.com/sugarme/gotch" + ts "github.com/sugarme/gotch/tensor" +) + +// RandomHorizontalFlip horizontally flips the given image randomly with a given probability. 
+// +// If the image is torch Tensor, it is expected to have [..., H, W] shape, +// where ... means an arbitrary number of leading dimensions +// Args: +// p (float): probability of the image being flipped. Default value is 0.5 +type RandomHorizontalFlip struct { + pvalue float64 +} + +func newRandomHorizontalFlip(pvalue float64) *RandomHorizontalFlip { + return &RandomHorizontalFlip{ + pvalue: pvalue, + } +} + +func (hf *RandomHorizontalFlip) Forward(x *ts.Tensor) *ts.Tensor { + randTs := ts.MustRandn([]int64{1}, gotch.Float, gotch.CPU) + randVal := randTs.Float64Values()[0] + randTs.MustDrop() + switch { + case randVal < hf.pvalue: + return hflip(x) + default: + out := x.MustShallowClone() + return out + } +} + +func WithRandomHFlip(pvalue float64) Option { + return func(o *Options) { + hf := newRandomHorizontalFlip(pvalue) + o.randomHFlip = hf + } +} + +// RandomVerticalFlip vertically flips the given image randomly with a given probability. +// +// If the image is torch Tensor, it is expected to have [..., H, W] shape, +// where ... means an arbitrary number of leading dimensions +// Args: +// p (float): probability of the image being flipped. 
Default value is 0.5 +type RandomVerticalFlip struct { + pvalue float64 +} + +func newRandomVerticalFlip(pvalue float64) *RandomVerticalFlip { + return &RandomVerticalFlip{ + pvalue: pvalue, + } +} + +func (vf *RandomVerticalFlip) Forward(x *ts.Tensor) *ts.Tensor { + randTs := ts.MustRandn([]int64{1}, gotch.Float, gotch.CPU) + randVal := randTs.Float64Values()[0] + randTs.MustDrop() + switch { + case randVal < vf.pvalue: + return vflip(x) + default: + out := x.MustShallowClone() + return out + } +} + +func WithRandomVFlip(pvalue float64) Option { + return func(o *Options) { + vf := newRandomVerticalFlip(pvalue) + o.randomVFlip = vf + } +} diff --git a/vision/aug/function.go b/vision/aug/function.go new file mode 100644 index 0000000..8464bd9 --- /dev/null +++ b/vision/aug/function.go @@ -0,0 +1,1514 @@ +package aug + +import ( + "fmt" + "log" + "math" + "math/rand" + "time" + + "github.com/sugarme/gotch" + ts "github.com/sugarme/gotch/tensor" +) + +func gaussianKernel1D(ks int64, sigma float64, dtype gotch.DType, device gotch.Device) *ts.Tensor { + ksHalf := (ks - 1) / 2 + x := ts.MustLinspace(ts.IntScalar(-ksHalf), ts.IntScalar(ksHalf), []int64{ks}, dtype, device) + + // pdf = torch.exp(-0.5 * (x / sigma).pow(2)) + pdf := x.MustDiv1(ts.FloatScalar(sigma), true).MustPow(ts.IntScalar(2), true).MustMul1(ts.FloatScalar(0.5), true).MustExp(true) + // kernel1d = pdf / pdf.sum() + pdfSum := pdf.MustSum(dtype, false) + kernel1d := pdf.MustDiv(pdfSum, true) + pdfSum.MustDrop() + + return kernel1d +} + +func gaussianKernel2D(ks []int64, sigma []float64, dtype gotch.DType, device gotch.Device) *ts.Tensor { + kernel1dX := gaussianKernel1D(ks[0], sigma[0], dtype, device) + kernel1dY := gaussianKernel1D(ks[1], sigma[1], dtype, device) + + // dimX := kernel1dX.MustSize() + kernel1dX.MustUnsqueeze_(0) // kernel1d_x[None, :] + dimY := kernel1dY.MustSize() + kernel1dY.MustUnsqueeze_(int64(len(dimY))) // kernel1d_y[:, None] + + kernel2d := kernel1dY.MustMm(kernel1dX, true) + 
kernel1dX.MustDrop() + return kernel2d +} + +func containsDType(dtype gotch.DType, dtypes []gotch.DType) bool { + for _, dt := range dtypes { + if dtype == dt { + return true + } + } + + return false +} + +func castSqueezeIn(x *ts.Tensor, reqDtypes []gotch.DType) (*ts.Tensor, bool, bool, gotch.DType) { + needSqueeze := false + xdim := x.MustSize() + var img *ts.Tensor + if len(xdim) < 4 { + img = x.MustUnsqueeze(0, false) + needSqueeze = true + } else { + img = x.MustShallowClone() + } + outDtype := x.DType() + needCast := false + if !containsDType(outDtype, reqDtypes) { + needCast = true + reqDType := reqDtypes[0] + img1 := img.MustTotype(reqDType, true) + return img1, needCast, needSqueeze, outDtype + } + return img, needCast, needSqueeze, outDtype +} + +func castSqueezeOut(x *ts.Tensor, needCast, needSqueeze bool, outDType gotch.DType) *ts.Tensor { + var ( + squeezeTs, castTs *ts.Tensor + ) + switch needSqueeze { + case true: + squeezeTs = x.MustSqueeze1(0, false) + case false: + squeezeTs = x.MustShallowClone() + } + + switch needCast { + case true: + // it is better to round before cast + if containsDType(outDType, []gotch.DType{gotch.Uint8, gotch.Int8, gotch.Int16, gotch.Int, gotch.Int64}) { + roundTs := squeezeTs.MustRound(true) + castTs = roundTs.MustTotype(outDType, true) + } else { + castTs = squeezeTs.MustTotype(outDType, true) + } + case false: + castTs = squeezeTs.MustShallowClone() + squeezeTs.MustDrop() + } + + return castTs +} + +func gaussianBlur(x *ts.Tensor, ks []int64, sigma []float64) *ts.Tensor { + dtype := gotch.Float + if x.DType() == gotch.Float || x.DType() == gotch.Double { + dtype = x.DType() + } + device := x.MustDevice() + + assertImageTensor(x) + + kernel := gaussianKernel2D(ks, sigma, dtype, device) + xdim := x.MustSize() + kdim := kernel.MustSize() + + // kernel = kernel.expand(img.shape[-3], 1, kernel.shape[0], kernel.shape[1]) + kexpand := kernel.MustExpand([]int64{xdim[len(xdim)-3], 1, kdim[0], kdim[1]}, true, true) + kdtype := 
kexpand.DType() + img, needCast, needSqueeze, outDType := castSqueezeIn(x, []gotch.DType{kdtype}) + + // padding = (left, right, top, bottom) + // padding = [kernel_size[0] // 2, kernel_size[0] // 2, kernel_size[1] // 2, kernel_size[1] // 2] + left := ks[0] / 2 + right := ks[0] / 2 + top := ks[1] / 2 + bottom := ks[1] / 2 + padding := []int64{left, right, top, bottom} + + // F.pad() + // https://github.com/pytorch/pytorch/blob/71f4c5c1f436258adc303b710efb3f41b2d50c4e/torch/nn/functional.py#L4070 + // img = torch_pad(img, padding, mode="reflect") + imgPad := img.MustReflectionPad2d(padding, true) // deleted img + + imgPadDim := imgPad.MustSize() + // img = conv2d(img, kernel, groups=img.shape[-3]) + // ref. https://github.com/pytorch/pytorch/blob/6060684609ebf66120db5af004b4cdafc5cccbdb/torch/nn/functional.py#L71 + imgConv2d := ts.MustConv2d(imgPad, kexpand, ts.NewTensor(), []int64{1}, []int64{0}, []int64{1}, imgPadDim[len(imgPadDim)-3]) + imgPad.MustDrop() + + // img = _cast_squeeze_out(img, need_cast, need_squeeze, out_dtype) + out := castSqueezeOut(imgConv2d, needCast, needSqueeze, outDType) + imgConv2d.MustDrop() + + return out +} + +func isTorchImage(x *ts.Tensor) bool { + return x.Dim() >= 2 +} + +func assertImageTensor(x *ts.Tensor) { + if !isTorchImage(x) { + err := fmt.Errorf("Input tensor is not a torch image.") + log.Fatal(err) + } +} + +func imageChanNum(x *ts.Tensor) int64 { + ndim := x.Dim() + + switch { + case ndim == 2: + return 1 + case ndim > 2: + return x.MustSize()[0] + default: + err := fmt.Errorf("imageChanNum - Input should be 2 or more. 
Got %v", ndim) + log.Fatal(err) + return 0 + } +} + +func contains(item int64, list []int64) bool { + for _, i := range list { + if item == i { + return true + } + } + + return false +} + +func assertChannels(x *ts.Tensor, permitted []int64) { + c := imageChanNum(x) + if !contains(c, permitted) { + err := fmt.Errorf("Input image tensor permitted channels are %+v, but found %v", permitted, c) + log.Fatal(err) + } +} + +func blend(img1, img2 *ts.Tensor, ratio float64) *ts.Tensor { + dtype := img1.DType() + // bound := 1.0 + // if dtype == gotch.Double || dtype == gotch.Float { + // bound = 255.0 + // } + bound := 255.0 + + // (ratio * img1 + (1.0 - ratio) * img2).clamp(0, bound).to(img1.dtype) + i1 := img1.MustMul1(ts.FloatScalar(ratio), false) + i2 := img2.MustMul1(ts.FloatScalar(1.0-ratio), false) + sumTs := i1.MustAdd(i2, true) + i2.MustDrop() + out := sumTs.MustClamp(ts.FloatScalar(0), ts.FloatScalar(bound), true).MustTotype(dtype, true) + return out +} + +// brightness should be in range 0.25 - 1.25 for visible view +func adjustBrightness(x *ts.Tensor, brightness float64) *ts.Tensor { + if brightness < 0 { + err := fmt.Errorf("adjustBrightness - brightness factor (%v) is not non-negative.", brightness) + log.Fatal(err) + } + + assertImageTensor(x) + assertChannels(x, []int64{1, 3}) + + zeros := x.MustZerosLike(false) + out := blend(x, zeros, brightness) + zeros.MustDrop() + + return out +} + +// randVal generates a value from uniform values from 0 to x +func randVal(from, to float64) float64 { + v := ts.MustEmpty([]int64{1}, gotch.Float, gotch.CPU) + v.MustUniform_(from, to) + randVal := v.Float64Values()[0] + v.MustDrop() + return randVal +} + +func getMinMax(x float64) (float64, float64) { + from := 0.0 + if 1-x > 0 { + from = 1 - x + } + to := 1 + x + + return from, to +} + +func rgb2Gray(x *ts.Tensor, outChanOpt ...int64) *ts.Tensor { + var outChannels int64 = 1 + if len(outChanOpt) > 0 { + outChannels = outChanOpt[0] + } + + ndim := x.Dim() + if ndim < 3 { 
+ err := fmt.Errorf("Input image tensor should have at least 3 dimensions, but found %v", ndim) + log.Fatal(err) + } + + assertChannels(x, []int64{3}) + if !contains(outChannels, []int64{1, 3}) { + err := fmt.Errorf("Number of output channels should be either 1 or 3") + log.Fatal(err) + } + + rgbTs := x.MustUnbind(-3, false) + r := &rgbTs[0] + g := &rgbTs[1] + b := &rgbTs[2] + + // This implementation closely follows the TF one: + // https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/ops/image_ops_impl.py#L2105-L2138 + // l_img = (0.2989 * r + 0.587 * g + 0.114 * b).to(img.dtype) + rmul := r.MustMul1(ts.FloatScalar(0.2989), true) + gmul := g.MustMul1(ts.FloatScalar(0.587), true) + bmul := b.MustMul1(ts.FloatScalar(0.114), true) + addTs := rmul.MustAdd(gmul, true).MustAdd(bmul, true) + gmul.MustDrop() + bmul.MustDrop() + lImg := addTs.MustTotype(x.DType(), true).MustUnsqueeze(-3, true) + + if outChannels == 3 { + return lImg.MustExpand(x.MustSize(), true, true) + } + + return lImg +} + +func adjustContrast(x *ts.Tensor, contrast float64) *ts.Tensor { + if contrast < 0 { + err := fmt.Errorf("adjustContrast - contrast factor (%v) is not non-negative.", contrast) + log.Fatal(err) + } + + assertImageTensor(x) + assertChannels(x, []int64{3}) + + grayTs := rgb2Gray(x).MustTotype(x.DType(), true) + + mean := grayTs.MustMean1([]int64{-3, -2, -1}, true, gotch.Float, true).MustTotype(x.DType(), true) + out := blend(x, mean, contrast) + mean.MustDrop() + + return out +} + +func adjustSaturation(x *ts.Tensor, sat float64) *ts.Tensor { + if sat < 0 { + err := fmt.Errorf("adjustSaturation - saturation factor (%v) is not non-negative.", sat) + log.Fatal(err) + } + assertImageTensor(x) + assertChannels(x, []int64{3}) + grayTs := rgb2Gray(x).MustTotype(x.DType(), true) + out := blend(x, grayTs, sat) + grayTs.MustDrop() + + return out +} + +func rgb2HSV(x *ts.Tensor) *ts.Tensor { + rgbTs := x.MustUnbind(-3, false) + r := &rgbTs[0] + g := &rgbTs[1] + b := &rgbTs[2] 
+ + // # Implementation is based on https://github.com/python-pillow/Pillow/blob/4174d4267616897df3746d315d5a2d0f82c656ee/ + // # src/libImaging/Convert.c#L330 + // maxc = torch.max(img, dim=-3).values + // minc = torch.min(img, dim=-3).values + maxC := x.MustAmax([]int64{-3}, false, false) + minC := x.MustAmin([]int64{-3}, false, false) + + // # The algorithm erases S and H channel where `maxc = minc`. This avoids NaN + // # from happening in the results, because + // # + S channel has division by `maxc`, which is zero only if `maxc = minc` + // # + H channel has division by `(maxc - minc)`. + // # + // # Instead of overwriting NaN afterwards, we just prevent it from occuring so + // # we don't need to deal with it in case we save the NaN in a buffer in + // # backprop, if it is ever supported, but it doesn't hurt to do so. + // eqc = maxc == minc + eqC := maxC.MustEq1(minC, false) + + // cr = maxc - minc + cr := maxC.MustSub(minC, false) + + // # Since `eqc => cr = 0`, replacing denominator with 1 when `eqc` is fine. + ones := maxC.MustOnesLike(false) + + // s = cr / torch.where(eqc, ones, maxc) + condMaxC := ones.MustWhere1(eqC, maxC, false) + s := cr.MustDiv(condMaxC, false) + + // # Note that `eqc => maxc = minc = r = g = b`. So the following calculation + // # of `h` would reduce to `bc - gc + 2 + rc - bc + 4 + rc - bc = 6` so it + // # would not matter what values `rc`, `gc`, and `bc` have here, and thus + // # replacing denominator with 1 when `eqc` is fine. 
+ // cr_divisor = torch.where(eqc, ones, cr) + // rc = (maxc - r) / cr_divisor + // gc = (maxc - g) / cr_divisor + // bc = (maxc - b) / cr_divisor + crDivisor := ones.MustWhere1(eqC, cr, true) // delete ones + rc := maxC.MustSub(r, false).MustDiv(crDivisor, true) + gc := maxC.MustSub(g, false).MustDiv(crDivisor, true) + bc := maxC.MustSub(b, false).MustDiv(crDivisor, true) + + // hr = (maxc == r) * (bc - gc) + rSub := bc.MustSub(gc, false) + hr := maxC.MustEq1(r, false).MustMul(rSub, true) + rSub.MustDrop() + + // hg = ((maxc == g) & (maxc != r)) * (2.0 + rc - bc) + maxcCond1 := maxC.MustNotEqual1(r, false) + hgMul := rc.MustSub(bc, false).MustAdd1(ts.FloatScalar(2.0), true) + hg := maxC.MustEq1(g, false).MustLogicalAnd(maxcCond1, true).MustMul(hgMul, true) + maxcCond1.MustDrop() + hgMul.MustDrop() + + // hb = ((maxc != g) & (maxc != r)) * (4.0 + gc - rc) + maxcCond2 := maxC.MustNotEqual1(r, false) + hbMul := gc.MustSub(rc, false).MustAdd1(ts.FloatScalar(4.0), true) + hb := maxC.MustNotEqual1(g, false).MustLogicalAnd(maxcCond2, true).MustMul(hbMul, true) + maxcCond2.MustDrop() + hbMul.MustDrop() + + // h = (hr + hg + hb) + h1 := hr.MustAdd(hg, false).MustAdd(hb, true) + + // h = torch.fmod((h / 6.0 + 1.0), 1.0) + h2 := h1.MustDiv1(ts.FloatScalar(6.0), true).MustAdd1(ts.FloatScalar(1.0), true) // delete h1 + h3 := h2.MustFmod(ts.FloatScalar(1.0), true) // delete h2 + + // torch.stack((h, s, maxc), dim=-3) + out := ts.MustStack([]ts.Tensor{*h3, *s, *maxC}, -3) + + // Delete intermediate tensors + r.MustDrop() + g.MustDrop() + b.MustDrop() + h3.MustDrop() + maxC.MustDrop() + minC.MustDrop() + eqC.MustDrop() + s.MustDrop() + condMaxC.MustDrop() + cr.MustDrop() + crDivisor.MustDrop() + rc.MustDrop() + gc.MustDrop() + bc.MustDrop() + hr.MustDrop() + hg.MustDrop() + hb.MustDrop() + + return out +} + +func hsv2RGB(x *ts.Tensor) *ts.Tensor { + hsvTs := x.MustUnbind(-3, false) + h := &hsvTs[0] + s := &hsvTs[1] + v := &hsvTs[2] + + i := h.MustMul1(ts.FloatScalar(6.0), 
false).MustFloor(true)
	// f = (h * 6.0) - i
	// NOTE: fixed typo — the scalar was 0.6, which breaks HSV->RGB conversion;
	// the fractional part must come from h*6.0 to match i = floor(h*6.0) above.
	f := h.MustMul1(ts.FloatScalar(6.0), false).MustSub(i, true)

	// p = torch.clamp((v * (1.0 - s)), 0.0, 1.0)
	x1 := s.MustMul1(ts.FloatScalar(-1), false).MustAdd1(ts.FloatScalar(1.0), true)
	p := v.MustMul(x1, false).MustClamp(ts.FloatScalar(0.0), ts.FloatScalar(1.0), true)
	x1.MustDrop()

	// q = torch.clamp((v * (1.0 - s * f)), 0.0, 1.0)
	x2 := s.MustMul(f, false).MustMul1(ts.FloatScalar(-1), true).MustAdd1(ts.FloatScalar(1.0), true)
	q := v.MustMul(x2, false).MustClamp(ts.FloatScalar(0.0), ts.FloatScalar(1.0), true)
	x2.MustDrop()

	//t = torch.clamp((v * (1.0 - s * (1.0 - f))), 0.0, 1.0)
	// step1. s * (1.0 - f)
	sub1 := f.MustMul1(ts.FloatScalar(-1), false).MustAdd1(ts.FloatScalar(1.0), true).MustMul(s, true)
	// step 2: v *(1.0 - step1)
	x3 := sub1.MustMul1(ts.FloatScalar(-1), true).MustAdd1(ts.FloatScalar(1.0), true).MustMul(v, true) // deleted sub1
	t := x3.MustClamp(ts.FloatScalar(0.0), ts.FloatScalar(1.0), true) // deleted x3

	//i = i % 6
	iremainder := i.MustRemainder(ts.FloatScalar(6), true) // delete i
	// torch.arange(6, device=i.device).view(-1, 1, 1)
	x4 := ts.MustArange(ts.FloatScalar(6), gotch.Float, iremainder.MustDevice()).MustView([]int64{-1, 1, 1}, true)
	mask := iremainder.MustUnsqueeze(-3, true).MustEq1(x4, true).MustTotype(x.DType(), true) // delete iremainder
	x4.MustDrop()

	// a1 = torch.stack((v, q, p, p, t, v), dim=-3)
	// a2 = torch.stack((t, v, v, q, p, p), dim=-3)
	// a3 = torch.stack((p, p, t, v, v, q), dim=-3)
	// a4 = torch.stack((a1, a2, a3), dim=-4)
	a1 := ts.MustStack([]ts.Tensor{*v, *q, *p, *p, *t, *v}, -3)
	a2 := ts.MustStack([]ts.Tensor{*t, *v, *v, *q, *p, *p}, -3)
	a3 := ts.MustStack([]ts.Tensor{*p, *p, *t, *v, *v, *q}, -3)
	a4 := ts.MustStack([]ts.Tensor{*a1, *a2, *a3}, -4)

	out := ts.MustEinsum("...ijk, ...xijk -> ...xjk", []ts.Tensor{*mask, *a4})

	// Delete intermediate tensors
	h.MustDrop()
	s.MustDrop()
	v.MustDrop()
	f.MustDrop()
	p.MustDrop()
	
q.MustDrop() + t.MustDrop() + + a1.MustDrop() + a2.MustDrop() + a3.MustDrop() + a4.MustDrop() + mask.MustDrop() + + return out +} + +// ref. https://en.wikipedia.org/wiki/HSL_and_HSV +func adjustHue(x *ts.Tensor, hue float64) *ts.Tensor { + if hue < -0.5 || hue > 0.5 { + err := fmt.Errorf("hue factor (%v) is not in [-0.5, 0.5]", hue) + log.Fatal(err) + } + assertImageTensor(x) + assertChannels(x, []int64{1, 3}) + + if c := imageChanNum(x); c == 1 { + out := x.MustShallowClone() + return out + } + + imgFl := x.MustTotype(gotch.Float, false).MustDiv1(ts.FloatScalar(255.0), true) + hsvImg := rgb2HSV(imgFl) + + hsvTs := hsvImg.MustUnbind(-3, true) + h := &hsvTs[0] + s := &hsvTs[1] + v := &hsvTs[2] + hAdj := h.MustAdd1(ts.FloatScalar(hue), false).MustRemainder(ts.FloatScalar(1.0), true) + + hsvAdj := ts.MustStack([]ts.Tensor{*hAdj, *s, *v}, -3) + + imgHueAdj := hsv2RGB(hsvAdj) + + out := imgHueAdj.MustMul1(ts.FloatScalar(255.0), true) + + imgFl.MustDrop() + h.MustDrop() + s.MustDrop() + v.MustDrop() + hAdj.MustDrop() + hsvAdj.MustDrop() + + return out +} + +func adjustGamma(x *ts.Tensor, gamma float64, gainOpt ...int64) *ts.Tensor { + // var gain int64 = 1 + // if len(gainOpt) > 0 { + // gain = gainOpt[0] + // } + // TODO + return x +} + +func RGB2HSV(x *ts.Tensor) *ts.Tensor { + return rgb2HSV(x) +} + +func HSV2RGB(x *ts.Tensor) *ts.Tensor { + return hsv2RGB(x) +} + +func pad(x *ts.Tensor, padding []int64, paddingMode string) *ts.Tensor { + switch paddingMode { + case "reflection": + return x.MustReflectionPad2d(padding, false) + case "constant": + return x.MustConstantPadNd(padding, false) + case "replicate": + return x.MustReplicationPad2d(padding, false) + case "circular": + // TODO: + // ref: https://github.com/pytorch/pytorch/blob/71f4c5c1f436258adc303b710efb3f41b2d50c4e/torch/nn/functional.py#L4493 + log.Fatal("Unsupported circular padding.") + default: + log.Fatalf("Unrecognized padding mode %q\n", paddingMode) + } + return nil +} + +func getImageSize(x 
*ts.Tensor) (width, height int64) { + assertImageTensor(x) + dim := x.MustSize() + return dim[len(dim)-1], dim[len(dim)-2] +} + +func makeSlice(from, to int64) []int64 { + n := from - to + var out []int64 = make([]int64, n) + for i := 0; i < int(n); i++ { + out[i] = from + int64(i) + } + return out +} + +func crop(x *ts.Tensor, top, left, height, width int64) *ts.Tensor { + // return img[..., top:top + height, left:left + width] + dim := x.MustSize() + c := dim[0] + + var chans []ts.Tensor = make([]ts.Tensor, c) + hNar := ts.NewNarrow(top, top+height) + wNar := ts.NewNarrow(left, left+width) + for i := 0; i < int(c); i++ { + cx := x.Idx(ts.NewSelect(int64(i))) + x1 := cx.Idx(hNar) + cx.MustDrop() + x1T := x1.MustT(true) + x2 := x1T.Idx(wNar) + x1T.MustDrop() + out := x2.MustT(true) + chans[i] = *out + } + + cropTs := ts.MustStack(chans, 0) + for i := range chans { + chans[i].MustDrop() + } + return cropTs +} + +// Crops the given image at the center. +// If the image is torch Tensor, it is expected +// to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. +// If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. 
func centerCrop(x *ts.Tensor, size []int64) *ts.Tensor {
	// size is (height, width) of the output crop.
	imgW, imgH := getImageSize(x)
	cropH, cropW := size[0], size[1]

	var paddedImg *ts.Tensor

	// If the requested crop is larger than the image on any edge, zero-pad
	// first so the crop window fits (mirrors torchvision's center_crop).
	if cropW > imgW || cropH > imgH {
		// (crop_width - image_width) // 2 if crop_width > image_width else 0,
		// (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
		var left, top, right, bottom int64 = 0, 0, 0, 0
		if cropW > imgW {
			left = (cropW - imgW) / 2
			right = (cropW - imgW + 1) / 2
		}
		// (crop_height - image_height) // 2 if crop_height > image_height else 0,
		// (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
		if cropH > imgH {
			top = (cropH - imgH) / 2
			bottom = (cropH - imgH + 1) / 2
		}

		// floatX := x.MustTotype(gotch.Float, false)
		// paddedImg = pad(floatX, []int64{left, right, top, bottom}, "reflection")
		// floatX.MustDrop()

		paddedImg = pad(x, []int64{left, right, top, bottom}, "constant")
		imgW, imgH = getImageSize(paddedImg)
		// Padded image already has exactly the target size: nothing to crop.
		if cropW == imgW && cropH == imgH {
			return paddedImg
		}
	} else {
		// No padding needed; clone so the unified MustDrop below is safe.
		paddedImg = x.MustShallowClone()
	}

	// cropTop := int64(math.Floor(float64(imgH-cropH) / 2.0))
	// cropLeft := int64(math.Floor(float64(imgW-cropW) / 2.0))
	cropTop := (imgH - cropH) / 2
	cropLeft := (imgW - cropW) / 2

	out := crop(paddedImg, cropTop, cropLeft, cropH, cropW)
	paddedImg.MustDrop()

	return out
}

// cutout erases the input Tensor Image with given value
//
// Args:
// img (Tensor Image): Tensor image of size (C, H, W) to be erased
// i (int): i in (i,j) i.e coordinates of the upper left corner.
// j (int): j in (i,j) i.e coordinates of the upper left corner.
// h (int): Height of the erased region.
// w (int): Width of the erased region.
// v: Erasing value.
+func cutout(x *ts.Tensor, top, left, height, width int64, rgbVal []int64) *ts.Tensor { + output := x.MustZerosLike(false) + output.Copy_(x) + dim := output.MustSize() + for i := 0; i < int(dim[0]); i++ { + cIdx := ts.NewSelect(int64(i)) + hNar := ts.NewNarrow(top, top+height) + wNar := ts.NewNarrow(left, left+width) + srcIdx := []ts.TensorIndexer{cIdx, hNar, wNar} + view := output.Idx(srcIdx) + oneTs := view.MustOnesLike(false) + vTs := oneTs.MustMul1(ts.IntScalar(rgbVal[i]), true) + view.Copy_(vTs) + vTs.MustDrop() + view.MustDrop() + } + + // output.Print() + return output +} + +func hflip(x *ts.Tensor) *ts.Tensor { + assertImageTensor(x) + return x.MustFlip([]int64{-1}, false) +} + +func vflip(x *ts.Tensor) *ts.Tensor { + assertImageTensor(x) + return x.MustFlip([]int64{-2}, false) +} + +// Ref. https://stackoverflow.com/questions/64197754 +// Ref. https://pytorch.org/tutorials/intermediate/spatial_transformer_tutorial.html +func getRotMat(theta float64) (*ts.Tensor, error) { + grid := []float64{math.Cos(theta), -1 * (math.Sin(theta)), 0, math.Sin(theta), math.Cos(theta), 0} + t, err := ts.NewTensorFromData(grid, []int64{2, 3}) + if err != nil { + return nil, err + } + + return t, nil +} + +func rotImg(x *ts.Tensor, theta float64, dtype gotch.DType) (*ts.Tensor, error) { + rotMat, err := getRotMat(theta) + if err != nil { + return nil, err + } + + size := x.MustSize() + mat := rotMat.MustUnsqueeze(0, true).MustTotype(dtype, true).MustRepeat([]int64{size[0], 1, 1}, true) + grid := ts.MustAffineGridGenerator(mat, size, true).MustTo(x.MustDevice(), true) + mat.MustDrop() + + out, err := ts.GridSampler(x, grid, 1, 1, true) + if err != nil { + return nil, err + } + grid.MustDrop() + return out, nil +} + +func applyGridTransform(x, gridInput *ts.Tensor, mode string, fillValue []float64) *ts.Tensor { + dtype := gridInput.DType() + img, needCast, needSqueeze, outDtype := castSqueezeIn(x, []gotch.DType{dtype}) + + imgDim := img.MustSize() + gridDim := 
gridInput.MustSize() + var grid *ts.Tensor + if imgDim[0] > 1 { + // Apply same grid to a batch of images + // grid = grid.expand(img.shape[0], grid.shape[1], grid.shape[2], grid.shape[3]) + grid = gridInput.MustExpand([]int64{imgDim[0], gridDim[1], gridDim[2], gridDim[3]}, true, false) + } else { + grid = gridInput.MustShallowClone() + } + + // Append a dummy mask for customized fill colors, should be faster than grid_sample() twice + // dummy = torch.ones((img.shape[0], 1, img.shape[2], img.shape[3]), dtype=img.dtype, device=img.device) + // img = torch.cat((img, dummy), dim=1) + dummy := ts.MustOnes([]int64{img.MustSize()[0], 1, img.MustSize()[2], img.MustSize()[3]}, img.DType(), img.MustDevice()) + imgCat := ts.MustCat([]ts.Tensor{*img, *dummy}, 1) + dummy.MustDrop() + img.MustDrop() + + // imgSample := gridSample(imgCat, grid, mode, "zeros", false) + var ( + modeInt int64 = 0 + paddingMode int64 = 0 + ) + + imgSample := ts.MustGridSampler(imgCat, grid, modeInt, paddingMode, false) + imgCat.MustDrop() + grid.MustDrop() + + // TODO. 
+ // Fill with required color + // mask = img[:, -1:, :, :] # N * 1 * H * W + // img = img[:, :-1, :, :] # N * C * H * W + // mask = mask.expand_as(img) + // len_fill = len(fill) if isinstance(fill, (tuple, list)) else 1 + // fill_img = torch.tensor(fill, dtype=img.dtype, device=img.device).view(1, len_fill, 1, 1).expand_as(img) + // if mode == 'nearest': + // mask = mask < 0.5 + // img[mask] = fill_img[mask] + // else: # 'bilinear' + // img = img * mask + (1.0 - mask) * fill_img + image := imgSample.MustNarrow(0, 0, 1, false).MustNarrow(1, 0, 3, true) + mask := imgSample.MustNarrow(0, 0, 1, false).MustNarrow(1, -1, 1, true).MustExpandAs(image, true) + imgSample.MustDrop() + fillImg := ts.MustOfSlice(fillValue).MustTotype(image.DType(), true).MustTo(image.MustDevice(), true).MustView([]int64{1, 3, 1, 1}, true).MustExpandAs(image, true) + + // img = img * mask + (1.0 - mask) * fill_img + addTs := mask.MustMul1(ts.FloatScalar(-1), false).MustAdd1(ts.FloatScalar(1.0), true).MustMul(fillImg, true) + imgOut := image.MustMul(mask, true).MustAdd(addTs, true) + addTs.MustDrop() + image.MustDrop() + mask.MustDrop() + fillImg.MustDrop() + + // out := castSqueezeOut(imgSample, needCast, needSqueeze, outDtype) + out := castSqueezeOut(imgOut, needCast, needSqueeze, outDtype) + imgOut.MustDrop() + + return out +} + +// Helper function to get the coefficients (a, b, c, d, e, f, g, h) for the perspective transforms. +// +// In Perspective Transform each pixel (x, y) in the original image gets transformed as, +// (x, y) -> ( (ax + by + c) / (gx + hy + 1), (dx + ey + f) / (gx + hy + 1) ) +// Args: +// - startpoints (list of list of ints): List containing four lists of two integers corresponding to four corners +// ``[top-left, top-right, bottom-right, bottom-left]`` of the original image. +// - endpoints (list of list of ints): List containing four lists of two integers corresponding to four corners +// ``[top-left, top-right, bottom-right, bottom-left]`` of the transformed image. 
+// Returns: +// - octuple (a, b, c, d, e, f, g, h) for transforming each pixel. +func perspectiveCoeff(startPoints, endPoints [][]int64) []float64 { + size := int64(2 * len(startPoints)) + aMat := ts.MustZeros([]int64{size, 8}, gotch.Float, gotch.CPU) + for i := 0; i < len(startPoints); i++ { + p1 := endPoints[i] + p2 := startPoints[i] + // a_matrix[2 * i, :] = torch.tensor([p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1]]) + val1 := ts.MustOfSlice([]int64{p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1]}) + // a_matrix[2 * i + 1, :] = torch.tensor([0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1]]) + val2 := ts.MustOfSlice([]int64{0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1]}) + + idx1 := ts.NewSelect(int64(2 * i)) + aMatView1 := aMat.Idx(idx1) + aMatView1.Copy_(val1) + val1.MustDrop() + + idx2 := ts.NewSelect(int64(2*i + 1)) + aMatView2 := aMat.Idx(idx2) + aMatView2.Copy_(val2) + val2.MustDrop() + } + + var startData []int64 + for _, p := range startPoints { + startData = append(startData, p[0], p[1]) + } + + // bMat := ts.MustOfSlice(startPoints).MustTotype(gotch.Float, true).MustView([]int64{8}, true) + bMat := ts.MustOfSlice(startData).MustTotype(gotch.Float, true).MustView([]int64{8}, true) + + res := bMat.MustLstsq(aMat, true) + + aMat.MustDrop() + outputTs := res.MustSqueeze1(1, true) + output := outputTs.Float64Values() + outputTs.MustDrop() + + return output +} + +func perspectiveGrid(coef []float64, ow, oh int64, dtype gotch.DType, device gotch.Device) *ts.Tensor { + // https://github.com/python-pillow/Pillow/blob/4634eafe3c695a014267eefdce830b4a825beed7/ + // src/libImaging/Geometry.c#L394 + // x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1) + // y_out = (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / (coeffs[6] * x + coeffs[7] * y + 1) + + theta1 := ts.MustOfSlice([]float64{ + coef[0], + coef[1], + coef[2], + coef[3], + coef[4], + coef[5], + }).MustTotype(dtype, 
true).MustTo(device, true).MustView([]int64{1, 2, 3}, true)

	// Denominator row of the projective transform is (g, h, 1).
	// NOTE: fixed bug — this used `coef[1.0]` (i.e. coef[1]) where the
	// constant 1.0 is required.
	theta2 := ts.MustOfSlice([]float64{
		coef[6],
		coef[7],
		1.0,
		coef[6],
		coef[7],
		1.0,
	}).MustTotype(dtype, true).MustTo(device, true).MustView([]int64{1, 2, 3}, true)

	d := 0.5

	baseGrid := ts.MustEmpty([]int64{1, oh, ow, 3}, dtype, device)

	// x_grid = torch.linspace(d, ow * 1.0 + d - 1.0, steps=ow, device=device)
	endX := float64(ow) + d - 1.0
	xGrid := ts.MustLinspace(ts.FloatScalar(d), ts.FloatScalar(endX), []int64{ow}, dtype, device)

	// y_grid = torch.linspace(d, oh * 1.0 + d - 1.0, steps=oh, device=device).unsqueeze_(-1)
	endY := float64(oh) + d - 1.0
	yGrid := ts.MustLinspace(ts.FloatScalar(d), ts.FloatScalar(endY), []int64{oh}, dtype, device)

	// base_grid[..., 0].copy_(x_grid)
	// base_grid[..., 1].copy_(y_grid)
	// base_grid[..., 2].fill_(1)
	baseDim := baseGrid.MustSize()
	for i := 0; i < int(baseDim[1]); i++ {
		view := baseGrid.MustSelect(0, 0, false).MustSelect(0, int64(i), true).MustSelect(1, 0, true)
		view.Copy_(xGrid)
		view.MustDrop()
	}
	for i := 0; i < int(baseDim[2]); i++ {
		view := baseGrid.MustSelect(0, 0, false).MustSelect(1, int64(i), true).MustSelect(1, 1, true)
		view.Copy_(yGrid)
		view.MustDrop()
	}

	// NOTE: fixed leak — xGrid/yGrid were never dropped anywhere in this
	// function; their values have been copied into baseGrid above, so they
	// can be released now.
	xGrid.MustDrop()
	yGrid.MustDrop()

	for i := 0; i < int(baseDim[2]); i++ {
		view := baseGrid.MustSelect(0, 0, false).MustSelect(1, int64(i), true).MustSelect(1, 2, true)
		// view.Fill_(ts.FloatScalar(1.0)) // NOTE. THIS CAUSES MEMORY LEAK!!!
+ oneTs := view.MustOnesLike(false) + view.Copy_(oneTs) + oneTs.MustDrop() + view.MustDrop() + } + + // rescaled_theta1 = theta1.transpose(1, 2) / torch.tensor([0.5 * ow, 0.5 * oh], dtype=dtype, device=device) + divTs := ts.MustOfSlice([]float64{0.5 * float64(ow), 0.5 * float64(oh)}).MustTotype(dtype, true).MustTo(device, true) + rescaledTheta1 := theta1.MustTranspose(1, 2, true).MustDiv(divTs, true) + divTs.MustDrop() + outputGrid1 := baseGrid.MustView([]int64{1, oh * ow, 3}, false).MustBmm(rescaledTheta1, true) + + // output_grid2 = base_grid.view(1, oh * ow, 3).bmm(theta2.transpose(1, 2)) + rescaledTheta2 := theta2.MustTranspose(1, 2, true) + outputGrid2 := baseGrid.MustView([]int64{1, oh * ow, 3}, false).MustBmm(rescaledTheta2, true) + + rescaledTheta1.MustDrop() + rescaledTheta2.MustDrop() + + outputGrid := outputGrid1.MustDiv(outputGrid2, true).MustSub1(ts.FloatScalar(1.0), true).MustView([]int64{1, oh, ow, 2}, true) + outputGrid2.MustDrop() + + baseGrid.MustDrop() + + return outputGrid +} + +func perspective(x *ts.Tensor, startPoints, endPoints [][]int64, mode string, fillValue []float64) *ts.Tensor { + coef := perspectiveCoeff(startPoints, endPoints) + + assertImageTensor(x) + // assertGridTransformInputs(x, nil, mode, fillValue, []string{"nearest", "bilinear"}, coef) + + dim := x.MustSize() + ow, oh := dim[len(dim)-1], dim[len(dim)-2] + device := x.MustDevice() + grid := perspectiveGrid(coef, ow, oh, gotch.Float, device) + + output := applyGridTransform(x, grid, mode, fillValue) + grid.MustDrop() + + return output +} + +// Apply affine transformation on the image keeping image center invariant. +// +//If the image is torch Tensor, it is expected +// to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. +// Args: +// - img (Tensor): image to transform. +// - angle (number): rotation angle in degrees between -180 and 180, clockwise direction. 
+// - translate (sequence of integers): horizontal and vertical translations (post-rotation translation) +// - scale (float): overall scale +// - shear (float or sequence): shear angle value in degrees between -180 to 180, clockwise direction. +// If a sequence is specified, the first value corresponds to a shear parallel to the x axis, while +// the second value corresponds to a shear parallel to the y axis. +// - interpolation (InterpolationMode): Desired interpolation enum defined by +// :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. +// If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. +// - fill (sequence or number, optional): Pixel fill value for the area outside the transformed +// image. If given a number, the value is used for all bands respectively. +func affine(img *ts.Tensor, angle float64, translations []int64, scale float64, shear []float64, interpolationMode string, fillValue []float64) *ts.Tensor { + + var translateF []float64 + for _, v := range translations { + translateF = append(translateF, float64(v)) + } + + matrix := getInverseAffineMatrix([]float64{0.0, 0.0}, angle, translateF, scale, shear) + + dtype := gotch.Float + device := img.MustDevice() + dim := img.MustSize() + theta := ts.MustOfSlice(matrix).MustTotype(dtype, true).MustTo(device, true).MustView([]int64{1, 2, 3}, true) + + // grid will be generated on the same device as theta and img + w := dim[len(dim)-1] + h := dim[len(dim)-2] + ow := w + oh := h + + // grid = _gen_affine_grid(theta, w=shape[-1], h=shape[-2], ow=shape[-1], oh=shape[-2]) + grid := genAffineGrid(theta, w, h, ow, oh) + // grid := ts.MustEmpty([]int64{1, 512, 512, 2}, dtype, device) + + out := applyGridTransform(img, grid, interpolationMode, fillValue) + + grid.MustDrop() + theta.MustDrop() + + return out +} + +// Helper method to compute inverse matrix for affine transformation +// +// As it is explained in 
PIL.Image.rotate +// We need compute INVERSE of affine transformation matrix: M = T * C * RSS * C^-1 +// where T is translation matrix: [1, 0, tx | 0, 1, ty | 0, 0, 1] +// C is translation matrix to keep center: [1, 0, cx | 0, 1, cy | 0, 0, 1] +// RSS is rotation with scale and shear matrix +// RSS(a, s, (sx, sy)) = +// = R(a) * S(s) * SHy(sy) * SHx(sx) +// = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(x)/cos(y) - sin(a)), 0 ] +// [ s*sin(a + sy)/cos(sy), s*(-sin(a - sy)*tan(x)/cos(y) + cos(a)), 0 ] +// [ 0 , 0 , 1 ] +// +// where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears: +// SHx(s) = [1, -tan(s)] and SHy(s) = [1 , 0] +// [0, 1 ] [-tan(s), 1] +// +// Thus, the inverse is M^-1 = C * RSS^-1 * C^-1 * T^-1 +func getInverseAffineMatrix(center []float64, angle float64, translate []float64, scale float64, shear []float64) []float64 { + + // convert to randiants + rot := angle * math.Pi / 180 + sx := shear[0] * math.Pi / 180 + sy := shear[1] * math.Pi / 180 + + cx, cy := center[0], center[1] + tx, ty := translate[0], translate[1] + + // RSS without scaling + // a = math.cos(rot - sy) / math.cos(sy) + a := math.Cos(rot-sy) / math.Cos(sy) + // b = -math.cos(rot - sy) * math.tan(sx) / math.cos(sy) - math.sin(rot) + b := -math.Cos(rot-sy)*math.Tan(sx)/math.Cos(sy) - math.Sin(rot) + // c = math.sin(rot - sy) / math.cos(sy) + c := math.Sin(rot-sy) / math.Cos(sy) + // d = -math.sin(rot - sy) * math.tan(sx) / math.cos(sy) + math.cos(rot) + d := -math.Sin(rot-sy)*math.Tan(sx)/math.Cos(sy) + math.Cos(rot) + + // Inverted rotation matrix with scale and shear + // det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1 + // matrix = [d, -b, 0.0, -c, a, 0.0] + var matrix []float64 = []float64{d, -b, 0.0, -c, a, 0.0} + // matrix = [x / scale for x in matrix] + var mat []float64 + for _, v := range matrix { + mat = append(mat, v/scale) + } + + // Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1 + // 
matrix[2] += matrix[0] * (-cx - tx) + matrix[1] * (-cy - ty) + mat[2] += mat[0]*(-cx-tx) + mat[1]*(-cy-ty) + // matrix[5] += matrix[3] * (-cx - tx) + matrix[4] * (-cy - ty) + mat[5] += mat[3]*(-cx-tx) + mat[4]*(-cy-ty) + + // Apply center translation: C * RSS^-1 * C^-1 * T^-1 + // matrix[2] += cx + mat[2] += cx + // matrix[5] += cy + mat[5] += cy + + return mat +} + +// https://github.com/pytorch/pytorch/blob/74b65c32be68b15dc7c9e8bb62459efbfbde33d8/aten/src/ATen/native/ +// AffineGridGenerator.cpp#L18 +// Difference with AffineGridGenerator is that: +// 1) we normalize grid values after applying theta +// 2) we can normalize by other image size, such that it covers "extend" option like in PIL.Image.rotate +func genAffineGrid(theta *ts.Tensor, w, h, ow, oh int64) *ts.Tensor { + d := 0.5 + dtype := theta.DType() + device := theta.MustDevice() + // base_grid = torch.empty(1, oh, ow, 3, dtype=theta.dtype, device=theta.device) + baseGrid := ts.MustEmpty([]int64{1, oh, ow, 3}, dtype, device) + + // x_grid = torch.linspace(-ow * 0.5 + d, ow * 0.5 + d - 1, steps=ow, device=theta.device) + startX := float64(-ow)*0.5 + d + endX := float64(ow)*0.5 + d - 1.0 + xGrid := ts.MustLinspace(ts.FloatScalar(startX), ts.FloatScalar(endX), []int64{ow}, dtype, device) + + // y_grid = torch.linspace(-oh * 0.5 + d, oh * 0.5 + d - 1, steps=oh, device=theta.device).unsqueeze_(-1) + startY := float64(-oh)*0.5 + d + endY := float64(oh)*0.5 + d - 1.0 + yGrid := ts.MustLinspace(ts.FloatScalar(startY), ts.FloatScalar(endY), []int64{oh}, dtype, device) + + // base_grid[..., 0].copy_(x_grid) + // base_grid[..., 1].copy_(y_grid) + // base_grid[..., 2].fill_(1) + baseDim := baseGrid.MustSize() + for i := 0; i < int(baseDim[1]); i++ { + view := baseGrid.MustSelect(0, 0, false).MustSelect(0, int64(i), true).MustSelect(1, 0, true) + view.Copy_(xGrid) + view.MustDrop() + } + for i := 0; i < int(baseDim[2]); i++ { + view := baseGrid.MustSelect(0, 0, false).MustSelect(1, int64(i), true).MustSelect(1, 1, 
true) + view.Copy_(yGrid) + view.MustDrop() + } + + for i := 0; i < int(baseDim[2]); i++ { + view := baseGrid.MustSelect(0, 0, false).MustSelect(1, int64(i), true).MustSelect(1, 2, true) + // view.Fill_(ts.FloatScalar(1.0)) // NOTE. THIS CAUSES MEMORY LEAK!!!! + oneTs := view.MustOnesLike(false) + view.Copy_(oneTs) + oneTs.MustDrop() + view.MustDrop() + } + + // rescaled_theta = theta.transpose(1, 2) / torch.tensor([0.5 * w, 0.5 * h], dtype=theta.dtype, device=theta.device) + + // rescaled_theta1 = theta1.transpose(1, 2) / torch.tensor([0.5 * ow, 0.5 * oh], dtype=dtype, device=device) + divTs := ts.MustOfSlice([]float64{0.5 * float64(w), 0.5 * float64(h)}).MustTotype(dtype, true).MustTo(device, true) + rescaledTheta := theta.MustTranspose(1, 2, false).MustDiv(divTs, true) + divTs.MustDrop() + + // output_grid = base_grid.view(1, oh * ow, 3).bmm(rescaled_theta) + outputGrid := baseGrid.MustView([]int64{1, oh * ow, 3}, false).MustBmm(rescaledTheta, true).MustView([]int64{1, oh, ow, 2}, true) + + baseGrid.MustDrop() + xGrid.MustDrop() + yGrid.MustDrop() + rescaledTheta.MustDrop() + + return outputGrid +} + +// randPvalue generates a random propability value [0, 1] +func randPvalue() float64 { + rand.Seed(time.Now().UnixNano()) + var min, max float64 = 0.0, 1.0 + + r := min + rand.Float64()*(max-min) + return r +} + +func getImageChanNum(x *ts.Tensor) int64 { + dim := x.MustSize() + switch { + case len(dim) == 2: + return 1 + case len(dim) > 2: + return dim[len(dim)-3] + default: + log.Fatalf("Input image tensor should have dim of 2 or more. Got %v\n", len(dim)) + } + + log.Fatalf("Input image tensor should have dim of 2 or more. Got %v\n", len(dim)) + return -1 +} + +// solarize solarizes an RGB/grayscale image by inverting all pixel values above a threshold. +// Args: +// - img (Tensor): Image to have its colors inverted. +// If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format, +// where ... 
means it can have an arbitrary number of leading dimensions. +// - threshold (float): All pixels equal or above this value are inverted. +func solarize(img *ts.Tensor, threshold float64) *ts.Tensor { + assertImageTensor(img) + + dim := img.MustSize() + if len(dim) < 3 { + log.Fatalf("Input image tensor should have at least 3 dimensions. Got %v", len(dim)) + } + + assertChannels(img, []int64{1, 3}) + + invertedImg := invert(img) + + // return torch.where(img >= threshold, inverted_img, img) + conditionTs := img.MustGe(ts.FloatScalar(threshold), false) + + out := img.MustWhere1(conditionTs, invertedImg, false) + + invertedImg.MustDrop() + conditionTs.MustDrop() + + return out +} + +// invert inverts image tensor. +func invert(img *ts.Tensor) *ts.Tensor { + assertImageTensor(img) + + dim := img.MustSize() + if len(dim) < 3 { + log.Fatalf("Input image tensor should have at least 3 dimensions. Got %v", len(dim)) + } + + assertChannels(img, []int64{1, 3}) + + var bound int64 = 255 + // return bound - img + out := img.MustMul1(ts.IntScalar(-1), false).MustAdd1(ts.IntScalar(bound), true) + return out +} + +func posterize(img *ts.Tensor, bits uint8) *ts.Tensor { + assertImageTensor(img) + + dim := img.MustSize() + + if len(dim) < 3 { + log.Fatalf("Input image tensor should have at least 3 dimensions. Got %v\n", len(dim)) + } + + dtype := img.DType() + if dtype != gotch.Uint8 { + log.Fatalf("Only dtype uint8 image tensors are supported. Got %v", dtype) + } + + assertChannels(img, []int64{1, 3}) + + // mask = -int(2**(8 - bits)) # JIT-friendly for: ~(2 ** (8 - bits) - 1) + // or mask := -int64(1<<(uint8(8) - bits)) + mask := -int64(math.Exp2(float64(uint8(8) - bits))) + + out := img.MustBitwiseAnd(ts.IntScalar(mask), false) + return out +} + +func autocontrast(img *ts.Tensor) *ts.Tensor { + assertImageTensor(img) + + dim := img.MustSize() + + if len(dim) < 3 { + log.Fatalf("Input image tensor should have at least 3 dimensions. 
Got %v\n", len(dim)) + } + + var bound int64 = 255 + dtype := gotch.Float + + // minimum = img.amin(dim=(-2, -1), keepdim=True).to(dtype) + minTs := img.MustAmin([]int64{-2, -1}, true, false).MustTotype(dtype, true) + // maximum = img.amax(dim=(-2, -1), keepdim=True).to(dtype) + maxTs := img.MustAmax([]int64{-2, -1}, true, false).MustTotype(dtype, true) + + // eq_idxs = torch.where(minimum == maximum)[0] + // NOTE. Eq(minTs, maxTs) give [n, c, 1, 1] or [channels, 1, 1] + eqIdx := minTs.MustEq1(maxTs, false).MustSqueeze1(-1, true).MustSqueeze1(-1, true).MustTotype(gotch.Int64, true) + + // minimum[eq_idxs] = 0 + minTsView := minTs.MustIndexSelect(0, eqIdx, false) + zerosTs := minTsView.MustZerosLike(false) + minTsView.Copy_(zerosTs) + zerosTs.MustDrop() + minTsView.MustDrop() + + // maximum[eq_idxs] = bound + maxTsView := maxTs.MustIndexSelect(0, eqIdx, false) + boundTs := maxTsView.MustOnesLike(false).MustMul1(ts.IntScalar(bound), true) + maxTsView.Copy_(boundTs) + boundTs.MustDrop() + maxTsView.MustDrop() + + // scale = bound / (maximum - minimum) + scale := maxTs.MustSub(minTs, false).MustPow(ts.IntScalar(-1), true).MustMul1(ts.IntScalar(bound), true) + // + // return ((img - minimum) * scale).clamp(0, bound).to(img.dtype) + out := img.MustSub(minTs, false).MustMul(scale, true).MustClamp(ts.IntScalar(0), ts.IntScalar(bound), true).MustTotype(dtype, true) + + minTs.MustDrop() + maxTs.MustDrop() + eqIdx.MustDrop() + scale.MustDrop() + + return out +} + +func adjustSharpness(img *ts.Tensor, factor float64) *ts.Tensor { + if factor < 0 { + log.Fatalf("Sharpness factor should not be negative. 
Got %v", factor)
+	}
+
+	assertImageTensor(img)
+	assertChannels(img, []int64{1, 3})
+
+	dim := img.MustSize()
+
+	var out *ts.Tensor
+	if (dim[len(dim)-1]) <= 2 || (dim[len(dim)-2] <= 2) {
+		out = img.MustShallowClone()
+		return out
+	}
+
+	// return _blend(img, _blurred_degenerate_image(img), sharpness_factor)
+	img1 := blurredDegenerateImage(img)
+	out = blend(img, img1, factor)
+
+	img1.MustDrop()
+	return out
+}
+
+// blurredDegenerateImage applies a fixed 3x3 smoothing kernel (center weight 5,
+// neighbours 1, normalized) via depthwise conv2d; border pixels keep the input values.
+func blurredDegenerateImage(img *ts.Tensor) *ts.Tensor {
+	dtype := gotch.Float
+	device := img.MustDevice()
+	dim := img.MustSize()
+
+	// kernel = torch.ones((3, 3), dtype=dtype, device=img.device)
+	kernel := ts.MustOnes([]int64{3, 3}, dtype, device)
+
+	// kernel[1, 1] = 5.0 - Center kernel value
+	kernelView := kernel.MustNarrow(0, 1, 1, false).MustNarrow(1, 1, 1, true) // kernel[1:2, 1:2]: the single center element (was Narrow(1,1,2): a 3x2 block)
+	centerVal := kernelView.MustOnesLike(false).MustMul1(ts.FloatScalar(5.0), true)
+	kernelView.Copy_(centerVal) // center kernel value
+	centerVal.MustDrop()
+	kernelView.MustDrop()
+
+	// kernel /= kernel.sum()
+	kernelSum := kernel.MustSum(dtype, false)
+	kernelS := kernel.MustDiv(kernelSum, true) // del=true frees kernel (was never dropped: leak)
+	kernelSum.MustDrop()
+	// kernel = kernel.expand(img.shape[-3], 1, kernel.shape[0], kernel.shape[1])
+	kdim := kernelS.MustSize()
+	kdtype := kernelS.DType()
+	kernelExp := kernelS.MustExpand([]int64{dim[len(dim)-3], 1, kdim[0], kdim[1]}, true, false)
+
+	// result_tmp, need_cast, need_squeeze, out_dtype = _cast_squeeze_in(img, [kernel.dtype, ])
+	resTmp, needCast, needSqueeze, outDtype := castSqueezeIn(img, []gotch.DType{kdtype})
+
+	// result_tmp = conv2d(result_tmp, kernel, groups=result_tmp.shape[-3])
+	stride := []int64{1, 1}
+	padding := []int64{0, 0}
+	dilation := []int64{1, 1}
+	resTmpDim := resTmp.MustSize()
+	group := resTmpDim[len(resTmpDim)-3]
+	resTmp1 := ts.MustConv2d(resTmp, kernelExp, ts.NewTensor(), stride, padding, dilation, group)
+
+	// result_tmp = _cast_squeeze_out(result_tmp, need_cast, need_squeeze, out_dtype)
+	resTmp2 := castSqueezeOut(resTmp1, needCast, needSqueeze, outDtype)
+
+	// result = img.clone()
+	out := img.MustShallowClone() // NOTE(review): Python uses img.clone(); if ShallowClone shares storage, the Copy_ below also mutates img - TODO confirm/deep-copy
+
+	// result[..., 1:-1, 1:-1] = result_tmp
+	hDim := int64(len(dim) - 2) // second last dim
+	wDim := int64(len(dim) - 1) // last dim
+	outView := out.MustNarrow(hDim, 1, dim[len(dim)-2]-2, false).MustNarrow(wDim, 1, dim[len(dim)-1]-2, true)
+
+	outView.Copy_(resTmp2)
+
+	outView.MustDrop()
+	kernelS.MustDrop()
+	kernelExp.MustDrop()
+	resTmp.MustDrop()
+	resTmp1.MustDrop()
+	resTmp2.MustDrop()
+
+	return out
+}
+
+// equalize equalizes the histogram of a uint8 image tensor ([C,H,W] or [B,C,H,W]).
+func equalize(img *ts.Tensor) *ts.Tensor {
+	assertImageTensor(img)
+
+	shape := img.MustSize()
+	ndim := len(shape)
+	dtype := img.DType()
+
+	if ndim < 3 || ndim > 4 {
+		log.Fatalf("Input image should have 3 or 4 dimensions. Got %v", ndim)
+	}
+
+	if dtype != gotch.Uint8 {
+		log.Fatalf("Only dtype uint8 image tensors are supported. Got %v", dtype)
+	}
+
+	assertChannels(img, []int64{1, 3})
+
+	// single image
+	if ndim == 3 {
+		out := equalizeSingleImage(img)
+		return out
+	}
+
+	// batched images
+	var images []ts.Tensor
+	for i := 0; i < int(shape[0]); i++ {
+		x := img.MustSelect(0, int64(i), false)
+		o := equalizeSingleImage(x)
+		images = append(images, *o)
+		x.MustDrop()
+	}
+
+	out := ts.MustStack(images, 0)
+
+	for _, x := range images {
+		x.MustDrop()
+	}
+
+	return out
+}
+
+// equalizeSingleImage equalizes one [C,H,W] image channel-by-channel.
+func equalizeSingleImage(img *ts.Tensor) *ts.Tensor {
+	dim := img.MustSize()
+	var scaledChans []ts.Tensor
+	for i := 0; i < int(dim[0]); i++ {
+		cTs := img.MustSelect(0, int64(i), false)
+		scaledChan := scaleChannel(cTs)
+		cTs.MustDrop()
+		scaledChans = append(scaledChans, *scaledChan)
+	}
+
+	out := ts.MustStack(scaledChans, 0)
+
+	for _, x := range scaledChans {
+		x.MustDrop()
+	}
+
+	return out
+}
+
+func scaleChannel(imgChan *ts.Tensor) *ts.Tensor {
+	// # TODO: we should expect bincount to always be faster than histc, but this
+	// # isn't always the case. Once
+	// # https://github.com/pytorch/pytorch/issues/53194 is fixed, remove the if
+	// # block and only use bincount.
+	// if img_chan.is_cuda:
+	//     hist = torch.histc(img_chan.to(torch.float32), bins=256, min=0, max=255)
+	// else:
+	//     hist = torch.bincount(img_chan.view(-1), minlength=256)
+
+	// hist = torch.histc(img_chan.to(torch.float32), bins=256, min=0, max=255)
+	fImg := imgChan.MustTotype(gotch.Float, false) // histc is float-only; convert as the Python reference does
+	hist := fImg.MustHistc(256, true)              // NOTE(review): no min/max args here - defaults to data range, not [0,255]; TODO confirm vs torchvision
+	// nonzero_hist = hist[hist != 0] (values of the non-empty bins, not their indices)
+	nonZeroIdx := hist.MustNonzero(false).MustView([]int64{-1}, true)
+	nonZeroHist := hist.MustIndexSelect(0, nonZeroIdx, false) // was: MustNonzero alone, which summed bin *indices* instead of counts
+	nonZeroIdx.MustDrop()
+	// step = torch.div(nonzero_hist[:-1].sum(), 255, rounding_mode='floor')
+	nonZeroHistSum := nonZeroHist.MustNarrow(0, 0, nonZeroHist.MustSize()[0]-1, true).MustSum(gotch.Int64, true)
+	step := nonZeroHistSum.MustDiv1(ts.IntScalar(255), true)
+	stepVal := step.Int64Values()[0]
+	nonZeroHistSum.MustDrop()
+
+	// if step == 0:
+	//   return img_chan
+	if stepVal == 0 {
+		hist.MustDrop() // free locals before the early return (avoid leaking on constant channels)
+		step.MustDrop()
+		return imgChan.MustShallowClone()
+	}
+
+	// lut = torch.div(torch.cumsum(hist, 0) + torch.div(step, 2, rounding_mode='floor'),step, rounding_mode='floor')
+	dtype := gotch.Float
+	halfStep := step.MustDiv1(ts.IntScalar(2), false)
+	histCumSum := hist.MustCumsum(0, dtype, false)
+	histStep := histCumSum.MustAdd(halfStep, false)
+	halfStep.MustDrop()
+	lut := histStep.MustDiv(step, true) // deleted histStep
+
+	// lut = torch.nn.functional.pad(lut, [1, 0])[:-1].clamp(0, 255)
+	lut1 := lut.MustConstantPadNd([]int64{1, 0}, true) // deleted lut
+	lut1Dim := lut1.MustSize()
+
+	lut2 := lut1.MustNarrow(0, 0, lut1Dim[0]-1, true).MustClamp(ts.IntScalar(0), ts.IntScalar(255), true) // deleted lut1
+	// return lut[img_chan.to(torch.int64)].to(torch.uint8)
+	// NOTE: multi-dimensional tensor indexing isn't supported yet, so we do it row by row
+	// channel[h, w]
+	h := imgChan.MustSize()[0]
+	// w := imgChan.MustSize()[1]
+	var xs []ts.Tensor
+	for i := 0; i < int(h); i++ {
+		idx := imgChan.MustSelect(0, int64(i), false).MustTotype(gotch.Int64, true)
+		x := lut2.MustIndexSelect(0, idx, false).MustTotype(gotch.Uint8, true)
+		xs = append(xs, *x)
+		idx.MustDrop()
+	}
+	out := ts.MustStack(xs, 0)
+
+	// delete intermediate tensors
+	for _, x := range xs {
+		x.MustDrop()
+	}
+	histCumSum.MustDrop() // was halfStep.MustDrop(): halfStep is already dropped above (double-free); histCumSum was the one leaking
+	lut2.MustDrop()
+	hist.MustDrop()
+	step.MustDrop()
+
+	return out
+}
+
+// Normalize a float tensor image with mean and standard deviation.
+//
+// Args:
+// - tensor (Tensor): Float tensor image of size (C, H, W) or (B, C, H, W) to be normalized.
+// - mean (sequence): Sequence of means for each channel.
+// - std (sequence): Sequence of standard deviations for each channel.
+// Returns:
+// - Tensor: Normalized Tensor image.
+func normalize(img *ts.Tensor, mean, std []float64) *ts.Tensor {
+	for _, v := range std {
+		if v == 0 {
+			log.Fatalf("One of std (%v) is zero. This is invalid as it leads to division by zero.", std)
+		}
+	}
+
+	assertImageTensor(img)
+
+	dim := img.MustSize()
+	// dtype := img.DType()
+	device := img.MustDevice()
+	if len(dim) < 3 {
+		log.Fatalf("Expected tensor to be a tensor image of size (..., C, H, W). Got tensor.size() =%v", dim)
+	}
+
+	meanTs := ts.MustOfSlice(mean).MustTotype(gotch.Float, true).MustTo(device, true)
+	stdTs := ts.MustOfSlice(std).MustTotype(gotch.Float, true).MustTo(device, true)
+
+	var mTs, sTs *ts.Tensor
+	meanSize := meanTs.MustSize()
+	stdSize := stdTs.MustSize()
+	switch len(meanSize) {
+	case 1:
+		mTs = meanTs.MustView([]int64{-1, 1, 1}, true)
+	case 3:
+		mTs = meanTs.MustShallowClone()
+		meanTs.MustDrop()
+	default:
+		log.Fatalf("mean must be 1 or 3 elements. Got %v\n", len(mean))
+	}
+
+	switch len(stdSize) {
+	case 1:
+		sTs = stdTs.MustView([]int64{-1, 1, 1}, true)
+	case 3:
+		sTs = stdTs.MustShallowClone()
+		stdTs.MustDrop()
+	default:
+		log.Fatalf("std must be 1 or 3 elements. Got %v\n", len(std))
+	}
+
+	// out := img.MustSub(mTs, false).MustDiv(sTs, true)
+	x := img.MustDiv1(ts.FloatScalar(255.0), false)
+	out := x.MustSub(mTs, false).MustDiv(sTs, true).MustMul1(ts.IntScalar(255), true)
+	x.MustDrop()
+
+	mTs.MustDrop()
+	sTs.MustDrop()
+
+	return out
+}
diff --git a/vision/aug/grayscale.go b/vision/aug/grayscale.go
new file mode 100644
index 0000000..9d0ef2c
--- /dev/null
+++ b/vision/aug/grayscale.go
@@ -0,0 +1,81 @@
+package aug
+
+import (
+	"log"
+
+	ts "github.com/sugarme/gotch/tensor"
+	// "github.com/sugarme/gotch/tensor"
+)
+
+// GrayScale converts image to grayscale.
+// If the image is torch Tensor, it is expected
+// to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions
+// Args:
+// - num_output_channels (int): (1 or 3) number of channels desired for output image
+type Grayscale struct {
+	outChan int64
+}
+
+func (gs *Grayscale) Forward(x *ts.Tensor) *ts.Tensor {
+	out := rgb2Gray(x, gs.outChan)
+	return out
+}
+
+func newGrayscale(outChanOpt ...int64) *Grayscale {
+	var outChan int64 = 3
+	if len(outChanOpt) > 0 {
+		c := outChanOpt[0]
+		switch c {
+		case 1:
+			outChan = 1
+		case 3:
+			outChan = 3
+		default:
+			log.Fatalf("Out channels should be either 1 or 3. Got %v\n", c)
+		}
+	}
+	return &Grayscale{outChan}
+}
+
+// RandomGrayscale randomly converts image to grayscale with a probability of p (default 0.1).
+// If the image is torch Tensor, it is expected
+// to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions
+// Args:
+// - p (float): probability that image should be converted to grayscale.
+type RandomGrayscale struct { + pvalue float64 +} + +func newRandomGrayscale(pvalueOpt ...float64) *RandomGrayscale { + pvalue := 0.1 + if len(pvalueOpt) > 0 { + pvalue = pvalueOpt[0] + } + return &RandomGrayscale{pvalue} +} + +func (rgs *RandomGrayscale) Forward(x *ts.Tensor) *ts.Tensor { + c := getImageChanNum(x) + r := randPvalue() + var out *ts.Tensor + switch { + case r < rgs.pvalue: + out = rgb2Gray(x, c) + default: + out = x.MustShallowClone() + } + + return out +} + +func WithRandomGrayscale(pvalueOpt ...float64) Option { + var p float64 = 0.1 + if len(pvalueOpt) > 0 { + p = pvalueOpt[0] + } + + rgs := newRandomGrayscale(p) + return func(o *Options) { + o.randomGrayscale = rgs + } +} diff --git a/vision/aug/invert.go b/vision/aug/invert.go new file mode 100644 index 0000000..ae23584 --- /dev/null +++ b/vision/aug/invert.go @@ -0,0 +1,39 @@ +package aug + +import ( + ts "github.com/sugarme/gotch/tensor" +) + +type RandomInvert struct { + pvalue float64 +} + +func newRandomInvert(pOpt ...float64) *RandomInvert { + p := 0.5 + if len(pOpt) > 0 { + p = pOpt[0] + } + return &RandomInvert{p} +} + +func (ri *RandomInvert) Forward(x *ts.Tensor) *ts.Tensor { + r := randPvalue() + + var out *ts.Tensor + switch { + case r < ri.pvalue: + out = invert(x) + default: + out = x.MustShallowClone() + } + + return out +} + +func WithRandomInvert(pvalueOpt ...float64) Option { + ri := newRandomInvert(pvalueOpt...) + + return func(o *Options) { + o.randomInvert = ri + } +} diff --git a/vision/aug/normalize.go b/vision/aug/normalize.go new file mode 100644 index 0000000..eb1f513 --- /dev/null +++ b/vision/aug/normalize.go @@ -0,0 +1,91 @@ +package aug + +import ( + ts "github.com/sugarme/gotch/tensor" +) + +// Normalize normalizes a tensor image with mean and standard deviation. 
+// Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n`` +// channels, this transform will normalize each channel of the input +// ``torch.*Tensor`` i.e., +// ``output[channel] = (input[channel] - mean[channel]) / std[channel]`` +// .. note:: +// This transform acts out of place, i.e., it does not mutate the input tensor. +// Args: +// - mean (sequence): Sequence of means for each channel. +// - std (sequence): Sequence of standard deviations for each channel. +type Normalize struct { + mean []float64 // should be from 0 to 1 + std []float64 // should be > 0 and <= 1 +} + +type normalizeOptions struct { + mean []float64 + std []float64 +} + +type normalizeOption func(*normalizeOptions) + +// Mean and SD can be calculated for specific dataset as follow: +/* + mean = 0.0 + meansq = 0.0 + count = 0 + + for index, data in enumerate(train_loader): + mean = data.sum() + meansq = meansq + (data**2).sum() + count += np.prod(data.shape) + + total_mean = mean/count + total_var = (meansq/count) - (total_mean**2) + total_std = torch.sqrt(total_var) + print("mean: " + str(total_mean)) + print("std: " + str(total_std)) +*/ + +// For example. 
ImageNet dataset has RGB mean and standard error: +// meanVals := []float64{0.485, 0.456, 0.406} +// sdVals := []float64{0.229, 0.224, 0.225} +func defaultNormalizeOptions() *normalizeOptions { + return &normalizeOptions{ + mean: []float64{0, 0, 0}, + std: []float64{1, 1, 1}, + } +} + +func WithNormalizeStd(std []float64) normalizeOption { + return func(o *normalizeOptions) { + o.std = std + } +} + +func WithNormalizeMean(mean []float64) normalizeOption { + return func(o *normalizeOptions) { + o.mean = mean + } +} + +func newNormalize(opts ...normalizeOption) *Normalize { + p := defaultNormalizeOptions() + for _, o := range opts { + o(p) + } + + return &Normalize{ + mean: p.mean, + std: p.std, + } +} + +func (n *Normalize) Forward(x *ts.Tensor) *ts.Tensor { + out := normalize(x, n.mean, n.std) + return out +} + +func WithNormalize(opts ...normalizeOption) Option { + n := newNormalize(opts...) + return func(o *Options) { + o.normalize = n + } +} diff --git a/vision/aug/pad.go b/vision/aug/pad.go new file mode 100644 index 0000000..6674395 --- /dev/null +++ b/vision/aug/pad.go @@ -0,0 +1 @@ +package aug diff --git a/vision/aug/perspective.go b/vision/aug/perspective.go new file mode 100644 index 0000000..69ab194 --- /dev/null +++ b/vision/aug/perspective.go @@ -0,0 +1,190 @@ +package aug + +import ( + // "fmt" + + "github.com/sugarme/gotch" + ts "github.com/sugarme/gotch/tensor" +) + +// RandomPerspective performs a random perspective transformation of the given image with a given probability. +// If the image is torch Tensor, it is expected +// to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. +// Args: +// distortion_scale (float): argument to control the degree of distortion and ranges from 0 to 1. +// Default is 0.5. +// p (float): probability of the image being transformed. Default is 0.5. +// interpolation (InterpolationMode): Desired interpolation enum defined by +// :class:`torchvision.transforms.InterpolationMode`. 
Default is ``InterpolationMode.BILINEAR``. +// If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. +// For backward compatibility integer values (e.g. ``PIL.Image.NEAREST``) are still acceptable. +// fill (sequence or number): Pixel fill value for the area outside the transformed +// image. Default is ``0``. If given a number, the value is used for all bands respectively. +type RandomPerspective struct { + distortionScale float64 // range [0, 1] + pvalue float64 // range [0, 1] + interpolationMode string + fillValue []float64 +} + +type perspectiveOptions struct { + distortionScale float64 // range [0, 1] + pvalue float64 // range [0, 1] + interpolationMode string + fillValue []float64 +} + +func defaultPerspectiveOptions() *perspectiveOptions { + return &perspectiveOptions{ + distortionScale: 0.5, + pvalue: 0.5, + interpolationMode: "bilinear", + fillValue: []float64{0.0, 0.0, 0.0}, + } +} + +type perspectiveOption func(*perspectiveOptions) + +func WithPerspectivePvalue(p float64) perspectiveOption { + return func(o *perspectiveOptions) { + o.pvalue = p + } +} + +func WithPerspectiveScale(s float64) perspectiveOption { + return func(o *perspectiveOptions) { + o.distortionScale = s + } +} + +func WithPerspectiveMode(m string) perspectiveOption { + return func(o *perspectiveOptions) { + o.interpolationMode = m + } +} + +func WithPerspectiveValue(v []float64) perspectiveOption { + return func(o *perspectiveOptions) { + o.fillValue = v + } +} + +func newRandomPerspective(opts ...perspectiveOption) *RandomPerspective { + params := defaultPerspectiveOptions() + for _, opt := range opts { + opt(params) + } + + return &RandomPerspective{ + distortionScale: params.distortionScale, + pvalue: params.pvalue, + interpolationMode: params.interpolationMode, + fillValue: params.fillValue, + } +} + +// Get parameters for ``perspective`` for a random perspective transform. +// +// Args: +// - width (int): width of the image. 
+// - height (int): height of the image. +// Returns: +// - List containing [top-left, top-right, bottom-right, bottom-left] of the original image, +// - List containing [top-left, top-right, bottom-right, bottom-left] of the transformed image. +func (rp *RandomPerspective) getParams(w, h int64) ([][]int64, [][]int64) { + halfH := h / 2 + halfW := w / 2 + + var ( + topLeft []int64 + topRight []int64 + bottomRight []int64 + bottomLeft []int64 + ) + + // topleft = [ + // int(torch.randint(0, int(distortion_scale * half_width) + 1, size=(1, )).item()), + // int(torch.randint(0, int(distortion_scale * half_height) + 1, size=(1, )).item()) + // ] + tlVal1 := int64(rp.distortionScale*float64(halfW)) + 1 + tlTs1 := ts.MustRandint1(0, tlVal1, []int64{1}, gotch.Int64, gotch.CPU) + tl1 := tlTs1.Int64Values()[0] + tlTs1.MustDrop() + tlVal2 := int64(rp.distortionScale*float64(halfH)) + 1 + tlTs2 := ts.MustRandint1(0, tlVal2, []int64{1}, gotch.Int64, gotch.CPU) + tl2 := tlTs2.Int64Values()[0] + tlTs2.MustDrop() + topLeft = []int64{tl1, tl2} + + // topright = [ + // int(torch.randint(width - int(distortion_scale * half_width) - 1, width, size=(1, )).item()), + // int(torch.randint(0, int(distortion_scale * half_height) + 1, size=(1, )).item()) + // ] + trVal1 := w - int64(rp.distortionScale*float64(halfW)) - 1 + trTs1 := ts.MustRandint1(trVal1, w, []int64{1}, gotch.Int64, gotch.CPU) + tr1 := trTs1.Int64Values()[0] + trTs1.MustDrop() + trVal2 := int64(rp.distortionScale*float64(halfH)) + 1 + trTs2 := ts.MustRandint1(0, trVal2, []int64{1}, gotch.Int64, gotch.CPU) + tr2 := trTs2.Int64Values()[0] + trTs2.MustDrop() + topRight = []int64{tr1, tr2} + + // botright = [ + // int(torch.randint(width - int(distortion_scale * half_width) - 1, width, size=(1, )).item()), + // int(torch.randint(height - int(distortion_scale * half_height) - 1, height, size=(1, )).item()) + // ] + brVal1 := w - int64(rp.distortionScale*float64(halfW)) - 1 + brTs1 := ts.MustRandint1(brVal1, w, []int64{1}, 
gotch.Int64, gotch.CPU) + br1 := brTs1.Int64Values()[0] + brTs1.MustDrop() + brVal2 := h - int64(rp.distortionScale*float64(halfH)) - 1 + brTs2 := ts.MustRandint1(brVal2, h, []int64{1}, gotch.Int64, gotch.CPU) + br2 := brTs2.Int64Values()[0] + brTs2.MustDrop() + bottomRight = []int64{br1, br2} + + // botleft = [ + // int(torch.randint(0, int(distortion_scale * half_width) + 1, size=(1, )).item()), + // int(torch.randint(height - int(distortion_scale * half_height) - 1, height, size=(1, )).item()) + // ] + blVal1 := int64(rp.distortionScale*float64(halfW)) + 1 + blTs1 := ts.MustRandint1(0, blVal1, []int64{1}, gotch.Int64, gotch.CPU) + bl1 := blTs1.Int64Values()[0] + blTs1.MustDrop() + blVal2 := h - int64(rp.distortionScale*float64(halfH)) - 1 + blTs2 := ts.MustRandint1(blVal2, h, []int64{1}, gotch.Int64, gotch.CPU) + bl2 := blTs2.Int64Values()[0] + blTs2.MustDrop() + bottomLeft = []int64{bl1, bl2} + + startPoints := [][]int64{ + {0, 0}, + {w - 1, 0}, + {w - 1, h - 1}, + {0, h - 1}, + } + + endPoints := [][]int64{ + topLeft, + topRight, + bottomRight, + bottomLeft, + } + + return startPoints, endPoints +} + +func (rp *RandomPerspective) Forward(x *ts.Tensor) *ts.Tensor { + height, width := getImageSize(x) + startPoints, endPoints := rp.getParams(height, width) + out := perspective(x, startPoints, endPoints, rp.interpolationMode, rp.fillValue) + return out +} + +func WithRandomPerspective(opts ...perspectiveOption) Option { + rp := newRandomPerspective(opts...) + return func(o *Options) { + o.randomPerspective = rp + } +} diff --git a/vision/aug/posterize.go b/vision/aug/posterize.go new file mode 100644 index 0000000..9a0b24a --- /dev/null +++ b/vision/aug/posterize.go @@ -0,0 +1,77 @@ +package aug + +import ( + ts "github.com/sugarme/gotch/tensor" +) + +// RandomPosterize posterizes the image randomly with a given probability by reducing the +// number of bits for each color channel. 
If the image is torch Tensor, it should be of type torch.uint8, +// and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. +// Args: +// - bits (int): number of bits to keep for each channel (0-8) +// - p (float): probability of the image being color inverted. Default value is 0.5 +// Ref. https://en.wikipedia.org/wiki/Posterization +type RandomPosterize struct { + pvalue float64 + bits uint8 +} + +type posterizeOptions struct { + pvalue float64 + bits uint8 +} + +type posterizeOption func(*posterizeOptions) + +func defaultPosterizeOptions() *posterizeOptions { + return &posterizeOptions{ + pvalue: 0.5, + bits: 4, + } +} + +func WithPosterizePvalue(p float64) posterizeOption { + return func(o *posterizeOptions) { + o.pvalue = p + } +} + +func WithPosterizeBits(bits uint8) posterizeOption { + return func(o *posterizeOptions) { + o.bits = bits + } +} + +func newRandomPosterize(opts ...posterizeOption) *RandomPosterize { + p := defaultPosterizeOptions() + for _, o := range opts { + o(p) + } + + return &RandomPosterize{ + pvalue: p.pvalue, + bits: p.bits, + } +} + +func (rp *RandomPosterize) Forward(x *ts.Tensor) *ts.Tensor { + + r := randPvalue() + var out *ts.Tensor + switch { + case r < rp.pvalue: + out = posterize(x, rp.bits) + default: + out = x.MustShallowClone() + } + + return out +} + +func WithRandomPosterize(opts ...posterizeOption) Option { + rp := newRandomPosterize(opts...) 
+ + return func(o *Options) { + o.randomPosterize = rp + } +} diff --git a/vision/aug/resize.go b/vision/aug/resize.go new file mode 100644 index 0000000..84af126 --- /dev/null +++ b/vision/aug/resize.go @@ -0,0 +1,39 @@ +package aug + +import ( + "log" + + "github.com/sugarme/gotch" + ts "github.com/sugarme/gotch/tensor" + "github.com/sugarme/gotch/vision" +) + +type ResizeModule struct { + height int64 + width int64 +} + +func newResizeModule(h, w int64) *ResizeModule { + return &ResizeModule{h, w} +} + +// Forward implements ts.Module for RandRotateModule +func (rs *ResizeModule) Forward(x *ts.Tensor) *ts.Tensor { + imgTs := x.MustTotype(gotch.Uint8, false) + out, err := vision.Resize(imgTs, rs.width, rs.height) + if err != nil { + log.Fatal(err) + } + imgTs.MustDrop() + return out +} + +func WithResize(h, w int64) Option { + return func(o *Options) { + rs := newResizeModule(h, w) + o.resize = rs + } +} + +// TODO. +type RandomResizedCrop struct{} diff --git a/vision/aug/rotate.go b/vision/aug/rotate.go new file mode 100644 index 0000000..a5afe8b --- /dev/null +++ b/vision/aug/rotate.go @@ -0,0 +1,109 @@ +package aug + +import ( + "fmt" + "log" + "math" + "math/rand" + "time" + + "github.com/sugarme/gotch" + ts "github.com/sugarme/gotch/tensor" +) + +// RandomRotate randomly rotates a tensor image within a specifed angle range (degree). +func RandomRotate(img *ts.Tensor, min, max float64) (*ts.Tensor, error) { + if min > max { + tmp := min + min = max + max = tmp + } + if min < -360 || min > 360 || max < -360 || max > 360 { + err := fmt.Errorf("min and max should be in range from -360 to 360. 
Got %v and %v\n", min, max) + return nil, err + } + // device := img.MustDevice() + dtype := gotch.Double + rand.Seed(time.Now().UnixNano()) + angle := min + rand.Float64()*(max-min) + + theta := float64(angle) * (math.Pi / 180) + input := img.MustUnsqueeze(0, false).MustTotype(dtype, true) + r, err := rotImg(input, theta, dtype) + if err != nil { + return nil, err + } + input.MustDrop() + rotatedImg := r.MustSqueeze(true) + return rotatedImg, nil +} + +func Rotate(img *ts.Tensor, angle float64) (*ts.Tensor, error) { + if angle < -360 || angle > 360 { + err := fmt.Errorf("angle must be in range (-360, 360)") + return nil, err + } + dtype := gotch.Double + theta := float64(angle) * (math.Pi / 180) + input := img.MustUnsqueeze(0, false).MustTotype(dtype, true) + r, err := rotImg(input, theta, dtype) + if err != nil { + return nil, err + } + input.MustDrop() + rotatedImg := r.MustSqueeze(true) + return rotatedImg, nil +} + +// RotateModule +type RotateModule struct { + angle float64 +} + +func newRotate(angle float64) *RotateModule { + return &RotateModule{angle} +} + +// Forward implements ts.Module for RotateModule +func (r *RotateModule) Forward(x *ts.Tensor) *ts.Tensor { + out, err := Rotate(x, r.angle) + if err != nil { + log.Fatal(err) + } + + return out +} + +func WithRotate(angle float64) Option { + return func(o *Options) { + r := newRotate(angle) + o.rotate = r + } +} + +// RandomRotateModule +type RandRotateModule struct { + minAngle float64 + maxAngle float64 +} + +func newRandRotate(min, max float64) *RandRotateModule { + return &RandRotateModule{min, max} +} + +// Forward implements ts.Module for RandRotateModule +func (rr *RandRotateModule) Forward(x *ts.Tensor) *ts.Tensor { + out, err := RandomRotate(x, rr.minAngle, rr.maxAngle) + if err != nil { + log.Fatal(err) + } + + return out +} + +func WithRandRotate(minAngle, maxAngle float64) Option { + return func(o *Options) { + r := newRandRotate(minAngle, maxAngle) + o.randRotate = r + } +} diff --git 
a/vision/aug/sharpness.go b/vision/aug/sharpness.go new file mode 100644 index 0000000..d38fad5 --- /dev/null +++ b/vision/aug/sharpness.go @@ -0,0 +1,74 @@ +package aug + +import ( + ts "github.com/sugarme/gotch/tensor" +) + +// Adjust the sharpness of the image randomly with a given probability. If the image is torch Tensor, +// it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. +// Args: +// sharpness_factor (float): How much to adjust the sharpness. Can be +// any non negative number. 0 gives a blurred image, 1 gives the +// original image while 2 increases the sharpness by a factor of 2. +// p (float): probability of the image being color inverted. Default value is 0.5 +type RandomAdjustSharpness struct { + sharpnessFactor float64 + pvalue float64 +} + +type sharpnessOptions struct { + sharpnessFactor float64 + pvalue float64 +} + +type sharpnessOption func(*sharpnessOptions) + +func defaultSharpnessOptions() *sharpnessOptions { + return &sharpnessOptions{ + sharpnessFactor: 1.0, + pvalue: 0.5, + } +} + +func WithSharpnessPvalue(p float64) sharpnessOption { + return func(o *sharpnessOptions) { + o.pvalue = p + } +} + +func WithSharpnessFactor(f float64) sharpnessOption { + return func(o *sharpnessOptions) { + o.sharpnessFactor = f + } +} + +func newRandomAdjustSharpness(opts ...sharpnessOption) *RandomAdjustSharpness { + p := defaultSharpnessOptions() + for _, o := range opts { + o(p) + } + return &RandomAdjustSharpness{ + sharpnessFactor: p.sharpnessFactor, + pvalue: p.pvalue, + } +} + +func (ras *RandomAdjustSharpness) Forward(x *ts.Tensor) *ts.Tensor { + r := randPvalue() + var out *ts.Tensor + switch { + case r < ras.pvalue: + out = adjustSharpness(x, ras.sharpnessFactor) + default: + out = x.MustShallowClone() + } + + return out +} + +func WithRandomAdjustSharpness(opts ...sharpnessOption) Option { + ras := newRandomAdjustSharpness(opts...) 
+ return func(o *Options) { + o.randomAdjustSharpness = ras + } +} diff --git a/vision/aug/solarize.go b/vision/aug/solarize.go new file mode 100644 index 0000000..729ea6f --- /dev/null +++ b/vision/aug/solarize.go @@ -0,0 +1,79 @@ +package aug + +import ( + ts "github.com/sugarme/gotch/tensor" +) + +// RandomSolarize solarizes the image randomly with a given probability by inverting all pixel +// values above a threshold. If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, +// where ... means it can have an arbitrary number of leading dimensions. +// If img is PIL Image, it is expected to be in mode "L" or "RGB". +// Args: +// - threshold (float): all pixels equal or above this value are inverted. +// - p (float): probability of the image being color inverted. Default value is 0.5 +// Ref. https://en.wikipedia.org/wiki/Solarization_(photography) +type RandomSolarize struct { + threshold float64 + pvalue float64 +} + +type solarizeOptions struct { + threshold float64 + pvalue float64 +} + +type solarizeOption func(*solarizeOptions) + +func defaultSolarizeOptions() *solarizeOptions { + return &solarizeOptions{ + threshold: 128, + pvalue: 0.5, + } +} + +func WithSolarizePvalue(p float64) solarizeOption { + return func(o *solarizeOptions) { + o.pvalue = p + } +} + +func WithSolarizeThreshold(th float64) solarizeOption { + return func(o *solarizeOptions) { + o.threshold = th + } +} + +func newRandomSolarize(opts ...solarizeOption) *RandomSolarize { + params := defaultSolarizeOptions() + + for _, o := range opts { + o(params) + } + + return &RandomSolarize{ + threshold: params.threshold, + pvalue: params.pvalue, + } +} + +func (rs *RandomSolarize) Forward(x *ts.Tensor) *ts.Tensor { + r := randPvalue() + + var out *ts.Tensor + switch { + case r < rs.pvalue: + out = solarize(x, rs.threshold) + default: + out = x.MustShallowClone() + } + + return out +} + +func WithRandomSolarize(opts ...solarizeOption) Option { + rs := newRandomSolarize(opts...) 
+ + return func(o *Options) { + o.randomSolarize = rs + } +} diff --git a/vision/aug/transform.go b/vision/aug/transform.go new file mode 100644 index 0000000..2767d60 --- /dev/null +++ b/vision/aug/transform.go @@ -0,0 +1,188 @@ +package aug + +import ( + "math/rand" + "time" + + "github.com/sugarme/gotch/nn" + ts "github.com/sugarme/gotch/tensor" +) + +// Transformer is an interface that can transform an image tensor. +type Transformer interface { + Transform(x *ts.Tensor) *ts.Tensor +} + +// Augment is a struct composes of augmentation functions to implement Transformer interface. +type Augment struct { + augments *nn.Sequential +} + +// Transform implements Transformer interface for Augment struct. +func (a *Augment) Transform(image *ts.Tensor) *ts.Tensor { + out := a.augments.Forward(image) + return out +} + +type Options struct { + rotate *RotateModule + randRotate *RandRotateModule + resize *ResizeModule + colorJitter *ColorJitter + gaussianBlur *GaussianBlur + randomHFlip *RandomHorizontalFlip + randomVFlip *RandomVerticalFlip + randomCrop *RandomCrop + centerCrop *CenterCrop + randomCutout *RandomCutout + randomPerspective *RandomPerspective + randomAffine *RandomAffine + randomGrayscale *RandomGrayscale + randomSolarize *RandomSolarize + randomPosterize *RandomPosterize + randomInvert *RandomInvert + randomAutocontrast *RandomAutocontrast + randomAdjustSharpness *RandomAdjustSharpness + randomEqualize *RandomEqualize + normalize *Normalize +} + +func defaultOption() *Options { + return &Options{ + rotate: nil, + randRotate: nil, + resize: nil, + colorJitter: nil, + gaussianBlur: nil, + randomHFlip: nil, + randomVFlip: nil, + randomCrop: nil, + centerCrop: nil, + randomCutout: nil, + randomPerspective: nil, + randomAffine: nil, + randomGrayscale: nil, + randomSolarize: nil, + randomPosterize: nil, + randomInvert: nil, + randomAutocontrast: nil, + randomAdjustSharpness: nil, + randomEqualize: nil, + normalize: nil, + } +} + +type Option func(o *Options) + 
+// Compose creates a new Augment struct by adding augmentation methods. +func Compose(opts ...Option) (Transformer, error) { + augOpts := defaultOption() + for _, opt := range opts { + if opt != nil { + opt(augOpts) + } + } + + var augs *nn.Sequential = nn.Seq() + + if augOpts.rotate != nil { + augs.Add(augOpts.rotate) + } + + if augOpts.randRotate != nil { + augs.Add(augOpts.randRotate) + } + + if augOpts.resize != nil { + augs.Add(augOpts.resize) + } + + if augOpts.colorJitter != nil { + augs.Add(augOpts.colorJitter) + } + + if augOpts.gaussianBlur != nil { + augs.Add(augOpts.gaussianBlur) + } + + if augOpts.randomHFlip != nil { + augs.Add(augOpts.randomHFlip) + } + + if augOpts.randomVFlip != nil { + augs.Add(augOpts.randomVFlip) + } + + if augOpts.randomCrop != nil { + augs.Add(augOpts.randomCrop) + } + + if augOpts.centerCrop != nil { + augs.Add(augOpts.centerCrop) + } + + if augOpts.randomCutout != nil { + augs.Add(augOpts.randomCutout) + } + + if augOpts.randomPerspective != nil { + augs.Add(augOpts.randomPerspective) + } + + if augOpts.randomAffine != nil { + augs.Add(augOpts.randomAffine) + } + + if augOpts.randomGrayscale != nil { + augs.Add(augOpts.randomGrayscale) + } + + if augOpts.randomSolarize != nil { + augs.Add(augOpts.randomSolarize) + } + + if augOpts.randomPosterize != nil { + augs.Add(augOpts.randomPosterize) + } + + if augOpts.randomInvert != nil { + augs.Add(augOpts.randomInvert) + } + + if augOpts.randomAutocontrast != nil { + augs.Add(augOpts.randomAutocontrast) + } + + if augOpts.randomAdjustSharpness != nil { + augs.Add(augOpts.randomAdjustSharpness) + } + + if augOpts.randomEqualize != nil { + augs.Add(augOpts.randomEqualize) + } + + if augOpts.normalize != nil { + augs.Add(augOpts.normalize) + } + + return &Augment{augs}, nil +} + +// OneOf randomly return one transformer from list of transformers +// with a specific p value. 
+func OneOf(pvalue float64, tfOpts ...Option) Option { + tfsNum := len(tfOpts) + if tfsNum < 1 { + return nil + } + + randP := randPvalue() + if randP >= pvalue { + return nil + } + + rand.Seed(time.Now().UnixNano()) + idx := rand.Intn(tfsNum) + + return tfOpts[idx] +}