diff --git a/.gitignore b/.gitignore index 42ef99b..20d3f2e 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,6 @@ *.json *.pt *.ot -*.jpg target/ _build/ diff --git a/example/jit/emu.jpg b/example/jit/emu.jpg new file mode 100644 index 0000000..ec41b6c Binary files /dev/null and b/example/jit/emu.jpg differ diff --git a/example/jit/image.jpg b/example/jit/image.jpg new file mode 100644 index 0000000..157b994 Binary files /dev/null and b/example/jit/image.jpg differ diff --git a/example/jit/kangaroo.jpg b/example/jit/kangaroo.jpg new file mode 100644 index 0000000..385234b Binary files /dev/null and b/example/jit/kangaroo.jpg differ diff --git a/example/jit/koala.jpg b/example/jit/koala.jpg new file mode 100644 index 0000000..c8cfe0a Binary files /dev/null and b/example/jit/koala.jpg differ diff --git a/example/jit/pig.jpg b/example/jit/pig.jpg new file mode 100644 index 0000000..9f19ee4 Binary files /dev/null and b/example/jit/pig.jpg differ diff --git a/example/jit/wombat.jpg b/example/jit/wombat.jpg new file mode 100644 index 0000000..53fe58a Binary files /dev/null and b/example/jit/wombat.jpg differ diff --git a/example/yolo/README.md b/example/yolo/README.md new file mode 100644 index 0000000..342f5c0 --- /dev/null +++ b/example/yolo/README.md @@ -0,0 +1,16 @@ +# YOLO model + +This is an example of implementing the YOLO v3 model. + +The model weights can be [downloaded here](https://drive.google.com/file/d/16eO9o4rclD929LHweCPW_-7HjKfNKVnA/view?usp=sharing). + +Here is an example of image inference using the YOLO v3 model. 
+ +## Original Image + +![Bondi Beach - Original](bondi.jpg "Bondi Beach") + +## Yolo v3 inference + +![Bondi Beach - Yolo inference](yolo_bondi.jpg "Bondi Beach - YOLO v3") + diff --git a/example/yolo/bondi.jpg b/example/yolo/bondi.jpg new file mode 100644 index 0000000..d2dea7d Binary files /dev/null and b/example/yolo/bondi.jpg differ diff --git a/example/yolo/draw.go b/example/yolo/draw.go new file mode 100644 index 0000000..f59977d --- /dev/null +++ b/example/yolo/draw.go @@ -0,0 +1,147 @@ +package main + +import ( + "image" + "image/color" + // "image/jpeg" + "io/ioutil" + + "flag" + "log" + "os" + "path/filepath" + + "golang.org/x/image/draw" + "golang.org/x/image/font" + + "github.com/sugarme/gotch/example/yolo/freetype" + ts "github.com/sugarme/gotch/tensor" +) + +var ( + dpi = flag.Float64("dpi", 72, "screen resolution in Dots Per Inch") + fontfile = flag.String("fontfile", "luxisr.ttf", "filename of the ttf font") + hinting = flag.String("hinting", "none", "none | full") + size = flag.Float64("size", 12, "font size in points") + spacing = flag.Float64("spacing", 1.2, "line spacing (e.g. 2 means double spaced)") + wonb = flag.Bool("whiteonblack", false, "white text on a black background") + bound = flag.Bool("bound", true, "generates image with minimum size for the text") +) + +func loadImage(file string) (retVal image.Image, err error) { + imagePath, err := filepath.Abs(file) + if err != nil { + return nil, err + } + f, err := os.Open(imagePath) + if err != nil { + return nil, err + } + + img, _, err := image.Decode(f) + return img, err +} + +func textToImageTs(text []string) *ts.Tensor { + offset := 0 + + flag.Parse() + + // Read font data + fontBytes, err := ioutil.ReadFile(*fontfile) + if err != nil { + log.Println(err) + return nil + } + + f, err := freetype.ParseFont(fontBytes) + if err != nil { + log.Println(err) + return nil + } + + var width, height int + // Initialize the context. 
+ c := freetype.NewContext() + c.SetDPI(*dpi) + c.SetFont(f) + c.SetFontSize(*size) + + switch *hinting { + default: + c.SetHinting(font.HintingNone) + case "full": + c.SetHinting(font.HintingFull) + } + + // Measure the text to calculate the minimum size of the image + if *bound { + pt := freetype.Pt(offset, offset+int(c.PointToFixed(*size)>>6)) + for _, s := range text { + ptr, err := c.MeasureString(s, pt) + if err != nil { + log.Println(err) + return nil + } + pt.Y += c.PointToFixed(*size * *spacing) + x := int(ptr.X >> 6) + if x > width { + width = x + } + } + width += offset + height = int(pt.Y)>>6 - int(c.PointToFixed(*size)>>6) + // Use default size for the image + } else { + width = 640 + height = 480 + } + + // Creates image with the specified size + fg, bg := image.Black, image.White + ruler := color.RGBA{0xdd, 0xdd, 0xdd, 0xff} + if *wonb { + fg, bg = image.White, image.Black + ruler = color.RGBA{0x22, 0x22, 0x22, 0xff} + } + rgba := image.NewRGBA(image.Rect(0, 0, width, height)) + draw.Draw(rgba, rgba.Bounds(), bg, image.ZP, draw.Src) + c.SetClip(rgba.Bounds()) + c.SetDst(rgba) + c.SetSrc(fg) + + // Draw the guidelines + for i := 0; i < 200; i++ { + rgba.Set(offset, offset+i, ruler) + rgba.Set(offset+i, offset, ruler) + } + + // Draw the text. + pt := freetype.Pt(offset, offset+int(c.PointToFixed(*size)>>6)) + for _, s := range text { + _, err = c.DrawString(s, pt) + if err != nil { + log.Println(err) + return nil + } + pt.Y += c.PointToFixed(*size * *spacing) + } + + var rgb []float64 + var r, g, b []float64 + for i := 0; i < len(rgba.Pix); i += 4 { + start := i + r = append(r, float64(rgba.Pix[start])/255.0) + g = append(g, float64(rgba.Pix[start+1])/255.0) + b = append(b, float64(rgba.Pix[start+2])/255.0) + } + + rgb = append(rgb, r...) + rgb = append(rgb, g...) + rgb = append(rgb, b...) 
+ + w := int64(rgba.Rect.Dx()) + h := int64(rgba.Rect.Dy()) + + return ts.MustOfSlice(rgb).MustView([]int64{3, h, w}, false) +} diff --git a/example/yolo/freetype/freetype.go b/example/yolo/freetype/freetype.go new file mode 100644 index 0000000..5e3255b --- /dev/null +++ b/example/yolo/freetype/freetype.go @@ -0,0 +1,366 @@ +// Copyright 2010 The Freetype-Go Authors. All rights reserved. +// Use of this source code is governed by your choice of either the +// FreeType License or the GNU General Public License version 2 (or +// any later version), both of which can be found in the LICENSE file. + +// The freetype package provides a convenient API to draw text onto an image. +// Use the freetype/raster and freetype/truetype packages for lower level +// control over rasterization and TrueType parsing. +package freetype // import "github.com/golang/freetype" + +import ( + "errors" + "image" + "image/draw" + + "github.com/golang/freetype/raster" + "github.com/golang/freetype/truetype" + "golang.org/x/image/font" + "golang.org/x/image/math/fixed" +) + +// These constants determine the size of the glyph cache. The cache is keyed +// primarily by the glyph index modulo nGlyphs, and secondarily by sub-pixel +// position for the mask image. Sub-pixel positions are quantized to +// nXFractions possible values in both the x and y directions. +const ( + nGlyphs = 256 + nXFractions = 4 + nYFractions = 1 +) + +// An entry in the glyph cache is keyed explicitly by the glyph index and +// implicitly by the quantized x and y fractional offset. It maps to a mask +// image and an offset. +type cacheEntry struct { + valid bool + glyph truetype.Index + advanceWidth fixed.Int26_6 + mask *image.Alpha + offset image.Point +} + +// ParseFont just calls the Parse function from the freetype/truetype package. +// It is provided here so that code that imports this package doesn't need +// to also include the freetype/truetype package. 
+func ParseFont(b []byte) (*truetype.Font, error) { + return truetype.Parse(b) +} + +// Pt converts from a co-ordinate pair measured in pixels to a fixed.Point26_6 +// co-ordinate pair measured in fixed.Int26_6 units. +func Pt(x, y int) fixed.Point26_6 { + return fixed.Point26_6{ + X: fixed.Int26_6(x << 6), + Y: fixed.Int26_6(y << 6), + } +} + +// A Context holds the state for drawing text in a given font and size. +type Context struct { + r *raster.Rasterizer + f *truetype.Font + glyphBuf truetype.GlyphBuf + // clip is the clip rectangle for drawing. + clip image.Rectangle + // dst and src are the destination and source images for drawing. + dst draw.Image + src image.Image + // fontSize and dpi are used to calculate scale. scale is the number of + // 26.6 fixed point units in 1 em. hinting is the hinting policy. + fontSize, dpi float64 + scale fixed.Int26_6 + hinting font.Hinting + // cache is the glyph cache. + cache [nGlyphs * nXFractions * nYFractions]cacheEntry +} + +// PointToFixed converts the given number of points (as in "a 12 point font") +// into a 26.6 fixed point number of pixels. +func (c *Context) PointToFixed(x float64) fixed.Int26_6 { + return fixed.Int26_6(x * float64(c.dpi) * (64.0 / 72.0)) +} + +// drawContour draws the given closed contour with the given offset. +func (c *Context) drawContour(ps []truetype.Point, dx, dy fixed.Int26_6) { + if len(ps) == 0 { + return + } + + // The low bit of each point's Flags value is whether the point is on the + // curve. Truetype fonts only have quadratic Bézier curves, not cubics. + // Thus, two consecutive off-curve points imply an on-curve point in the + // middle of those two. + // + // See http://chanae.walon.org/pub/ttf/ttf_glyphs.htm for more details. + + // ps[0] is a truetype.Point measured in FUnits and positive Y going + // upwards. start is the same thing measured in fixed point units and + // positive Y going downwards, and offset by (dx, dy). 
+ start := fixed.Point26_6{ + X: dx + ps[0].X, + Y: dy - ps[0].Y, + } + others := []truetype.Point(nil) + if ps[0].Flags&0x01 != 0 { + others = ps[1:] + } else { + last := fixed.Point26_6{ + X: dx + ps[len(ps)-1].X, + Y: dy - ps[len(ps)-1].Y, + } + if ps[len(ps)-1].Flags&0x01 != 0 { + start = last + others = ps[:len(ps)-1] + } else { + start = fixed.Point26_6{ + X: (start.X + last.X) / 2, + Y: (start.Y + last.Y) / 2, + } + others = ps + } + } + c.r.Start(start) + q0, on0 := start, true + for _, p := range others { + q := fixed.Point26_6{ + X: dx + p.X, + Y: dy - p.Y, + } + on := p.Flags&0x01 != 0 + if on { + if on0 { + c.r.Add1(q) + } else { + c.r.Add2(q0, q) + } + } else { + if on0 { + // No-op. + } else { + mid := fixed.Point26_6{ + X: (q0.X + q.X) / 2, + Y: (q0.Y + q.Y) / 2, + } + c.r.Add2(q0, mid) + } + } + q0, on0 = q, on + } + // Close the curve. + if on0 { + c.r.Add1(start) + } else { + c.r.Add2(q0, start) + } +} + +// rasterize returns the advance width, glyph mask and integer-pixel offset +// to render the given glyph at the given sub-pixel offsets. +// The 26.6 fixed point arguments fx and fy must be in the range [0, 1). +func (c *Context) rasterize(glyph truetype.Index, fx, fy fixed.Int26_6) ( + fixed.Int26_6, *image.Alpha, image.Point, error) { + + if err := c.glyphBuf.Load(c.f, c.scale, glyph, c.hinting); err != nil { + return 0, nil, image.Point{}, err + } + // Calculate the integer-pixel bounds for the glyph. + xmin := int(fx+c.glyphBuf.Bounds.Min.X) >> 6 + ymin := int(fy-c.glyphBuf.Bounds.Max.Y) >> 6 + xmax := int(fx+c.glyphBuf.Bounds.Max.X+0x3f) >> 6 + ymax := int(fy-c.glyphBuf.Bounds.Min.Y+0x3f) >> 6 + if xmin > xmax || ymin > ymax { + return 0, nil, image.Point{}, errors.New("freetype: negative sized glyph") + } + // A TrueType's glyph's nodes can have negative co-ordinates, but the + // rasterizer clips anything left of x=0 or above y=0. 
xmin and ymin are + // the pixel offsets, based on the font's FUnit metrics, that let a + // negative co-ordinate in TrueType space be non-negative in rasterizer + // space. xmin and ymin are typically <= 0. + fx -= fixed.Int26_6(xmin << 6) + fy -= fixed.Int26_6(ymin << 6) + // Rasterize the glyph's vectors. + c.r.Clear() + e0 := 0 + for _, e1 := range c.glyphBuf.Ends { + c.drawContour(c.glyphBuf.Points[e0:e1], fx, fy) + e0 = e1 + } + a := image.NewAlpha(image.Rect(0, 0, xmax-xmin, ymax-ymin)) + c.r.Rasterize(raster.NewAlphaSrcPainter(a)) + return c.glyphBuf.AdvanceWidth, a, image.Point{xmin, ymin}, nil +} + +// glyph returns the advance width, glyph mask and integer-pixel offset to +// render the given glyph at the given sub-pixel point. It is a cache for the +// rasterize method. Unlike rasterize, p's co-ordinates do not have to be in +// the range [0, 1). +func (c *Context) glyph(glyph truetype.Index, p fixed.Point26_6) ( + fixed.Int26_6, *image.Alpha, image.Point, error) { + + // Split p.X and p.Y into their integer and fractional parts. + ix, fx := int(p.X>>6), p.X&0x3f + iy, fy := int(p.Y>>6), p.Y&0x3f + // Calculate the index t into the cache array. + tg := int(glyph) % nGlyphs + tx := int(fx) / (64 / nXFractions) + ty := int(fy) / (64 / nYFractions) + t := ((tg*nXFractions)+tx)*nYFractions + ty + // Check for a cache hit. + if e := c.cache[t]; e.valid && e.glyph == glyph { + return e.advanceWidth, e.mask, e.offset.Add(image.Point{ix, iy}), nil + } + // Rasterize the glyph and put the result into the cache. + advanceWidth, mask, offset, err := c.rasterize(glyph, fx, fy) + if err != nil { + return 0, nil, image.Point{}, err + } + c.cache[t] = cacheEntry{true, glyph, advanceWidth, mask, offset} + return advanceWidth, mask, offset.Add(image.Point{ix, iy}), nil +} + +// DrawString draws s at p and returns p advanced by the text extent. The text +// is placed so that the left edge of the em square of the first character of s +// and the baseline intersect at p. 
The majority of the affected pixels will be +// above and to the right of the point, but some may be below or to the left. +// For example, drawing a string that starts with a 'J' in an italic font may +// affect pixels below and left of the point. +// +// p is a fixed.Point26_6 and can therefore represent sub-pixel positions. +func (c *Context) DrawString(s string, p fixed.Point26_6) (fixed.Point26_6, error) { + if c.f == nil { + return fixed.Point26_6{}, errors.New("freetype: DrawString called with a nil font") + } + prev, hasPrev := truetype.Index(0), false + for _, rune := range s { + index := c.f.Index(rune) + if hasPrev { + kern := c.f.Kern(c.scale, prev, index) + if c.hinting != font.HintingNone { + kern = (kern + 32) &^ 63 + } + p.X += kern + } + advanceWidth, mask, offset, err := c.glyph(index, p) + if err != nil { + return fixed.Point26_6{}, err + } + p.X += advanceWidth + glyphRect := mask.Bounds().Add(offset) + dr := c.clip.Intersect(glyphRect) + if !dr.Empty() { + mp := image.Point{0, dr.Min.Y - glyphRect.Min.Y} + draw.DrawMask(c.dst, dr, c.src, image.ZP, mask, mp, draw.Over) + } + prev, hasPrev = index, true + } + return p, nil +} + +// MeasureString is identical to DrawString but only measure the text. +func (c *Context) MeasureString(s string, p fixed.Point26_6) (fixed.Point26_6, error) { + if c.f == nil { + return fixed.Point26_6{}, errors.New("freetype: MeasureString called with a nil font") + } + prev, hasPrev := truetype.Index(0), false + for _, rune := range s { + index := c.f.Index(rune) + if hasPrev { + kern := c.f.Kern(c.scale, prev, index) + if c.hinting != font.HintingNone { + kern = (kern + 32) &^ 63 + } + p.X += kern + } + advanceWidth, _, _, err := c.glyph(index, p) + if err != nil { + return fixed.Point26_6{}, err + } + p.X += advanceWidth + prev, hasPrev = index, true + } + return p, nil +} + +// recalc recalculates scale and bounds values from the font size, screen +// resolution and font metrics, and invalidates the glyph cache. 
+func (c *Context) recalc() { + c.scale = fixed.Int26_6(c.fontSize * c.dpi * (64.0 / 72.0)) + if c.f == nil { + c.r.SetBounds(0, 0) + } else { + // Set the rasterizer's bounds to be big enough to handle the largest glyph. + b := c.f.Bounds(c.scale) + xmin := +int(b.Min.X) >> 6 + ymin := -int(b.Max.Y) >> 6 + xmax := +int(b.Max.X+63) >> 6 + ymax := -int(b.Min.Y-63) >> 6 + c.r.SetBounds(xmax-xmin, ymax-ymin) + } + for i := range c.cache { + c.cache[i] = cacheEntry{} + } +} + +// SetDPI sets the screen resolution in dots per inch. +func (c *Context) SetDPI(dpi float64) { + if c.dpi == dpi { + return + } + c.dpi = dpi + c.recalc() +} + +// SetFont sets the font used to draw text. +func (c *Context) SetFont(f *truetype.Font) { + if c.f == f { + return + } + c.f = f + c.recalc() +} + +// SetFontSize sets the font size in points (as in "a 12 point font"). +func (c *Context) SetFontSize(fontSize float64) { + if c.fontSize == fontSize { + return + } + c.fontSize = fontSize + c.recalc() +} + +// SetHinting sets the hinting policy. +func (c *Context) SetHinting(hinting font.Hinting) { + c.hinting = hinting + for i := range c.cache { + c.cache[i] = cacheEntry{} + } +} + +// SetDst sets the destination image for draw operations. +func (c *Context) SetDst(dst draw.Image) { + c.dst = dst +} + +// SetSrc sets the source image for draw operations. This is typically an +// image.Uniform. +func (c *Context) SetSrc(src image.Image) { + c.src = src +} + +// SetClip sets the clip rectangle for drawing. +func (c *Context) SetClip(clip image.Rectangle) { + c.clip = clip +} + +// TODO(nigeltao): implement Context.SetGamma. + +// NewContext creates a new Context. 
+func NewContext() *Context { + return &Context{ + r: raster.NewRasterizer(0, 0), + fontSize: 12, + dpi: 72, + scale: 12 << 6, + } +} diff --git a/example/yolo/luxisr.ttf b/example/yolo/luxisr.ttf new file mode 100644 index 0000000..c47fd20 Binary files /dev/null and b/example/yolo/luxisr.ttf differ diff --git a/example/yolo/main.go b/example/yolo/main.go index 2a8d949..93afd3c 100644 --- a/example/yolo/main.go +++ b/example/yolo/main.go @@ -21,8 +21,8 @@ const ( ) var ( - model string - image string + model string + imageFile string ) type Bbox struct { @@ -71,6 +71,38 @@ func drawRect(t *ts.Tensor, x1, x2, y1, y2 int64) { color.MustDrop() } +func drawLabel(t *ts.Tensor, text []string, x, y int64) { + device, err := t.Device() + if err != nil { + log.Fatal(err) + } + label := textToImageTs(text).MustTo(device, true) + + labelSize := label.MustSize() + height := labelSize[1] + width := labelSize[2] + + imageSize := t.MustSize() + lenY := height + if lenY > imageSize[1] { + lenY = imageSize[1] - y + } + + lenX := width + if lenX > imageSize[2] { + lenX = imageSize[2] - x + } + + // NOTE: `narrow` will create a tensor (view) that share same storage with + // original one. 
+ + tmp1 := t.MustNarrow(2, x, lenX, false) + tmp2 := tmp1.MustNarrow(1, y, lenY, true) + tmp2.Copy_(label) + tmp2.MustDrop() + label.MustDrop() +} + func report(pred *ts.Tensor, img *ts.Tensor, w int64, h int64) *ts.Tensor { size2, err := pred.Size2() if err != nil { @@ -176,6 +208,9 @@ func report(pred *ts.Tensor, img *ts.Tensor, w int64, h int64) *ts.Tensor { drawRect(image, xmin, xmax, max(ymin, ymax-2), ymax) drawRect(image, xmin, min(xmax, xmin+2), ymin, ymax) drawRect(image, max(xmin, xmax-2), xmax, ymin, ymax) + + label := fmt.Sprintf("%v; %.3f\n", CocoClasses[classIndex], b.confidence) + drawLabel(image, []string{label}, xmin, ymin-15) } } @@ -187,7 +222,7 @@ func report(pred *ts.Tensor, img *ts.Tensor, w int64, h int64) *ts.Tensor { func init() { flag.StringVar(&model, "model", "../../data/yolo/yolo-v3.pt", "Yolo model weights file") - flag.StringVar(&image, "image", "../../data/yolo/bondi.jpg", "image file to infer") + flag.StringVar(&imageFile, "image", "../../data/yolo/bondi.jpg", "image file to infer") } func main() { @@ -203,7 +238,7 @@ func main() { log.Fatal(err) } - imagePath, err := filepath.Abs(image) + imagePath, err := filepath.Abs(imageFile) if err != nil { log.Fatal(err) } @@ -256,10 +291,6 @@ func main() { if err != nil { log.Fatal(err) } - - // TODO: write label/confidence val next to bouding boxes. - // Naive way is write 'write text on image' rather than on tensor. 
- // See this: https://stackoverflow.com/questions/38299930 } func max(v1, v2 int64) (retVal int64) { diff --git a/example/yolo/yolo_bondi.jpg b/example/yolo/yolo_bondi.jpg new file mode 100644 index 0000000..f9fae21 Binary files /dev/null and b/example/yolo/yolo_bondi.jpg differ diff --git a/go.mod b/go.mod index 3f67ae4..07e600e 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,8 @@ module github.com/sugarme/gotch go 1.14 + +require ( + github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 + golang.org/x/image v0.0.0-20200927104501-e162460cd6b5 +) diff --git a/go.sum b/go.sum index e69de29..75cd14a 100644 --- a/go.sum +++ b/go.sum @@ -0,0 +1,6 @@ +github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g= +github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= +github.com/sugarme/playgo v0.0.0-20200730185408-03b868cebe81 h1:s43waOvGVYyjw8i+Ll2Qb/ASt+etXG7LhWetEGTLjbc= +golang.org/x/image v0.0.0-20200927104501-e162460cd6b5 h1:QelT11PB4FXiDEXucrfNckHoFxwt8USGY1ajP1ZF5lM= +golang.org/x/image v0.0.0-20200927104501-e162460cd6b5/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=