From 500caaeda74dd9c660279036293f4b2997cf0b03 Mon Sep 17 00:00:00 2001 From: Dimitri Sokolyuk Date: Sat, 9 Sep 2017 09:42:37 +0200 Subject: Add vendor --- vendor/golang.org/x/image/vector/acc_amd64.go | 34 + vendor/golang.org/x/image/vector/acc_amd64.s | 1083 ++++++++++++++++++++ vendor/golang.org/x/image/vector/acc_other.go | 17 + vendor/golang.org/x/image/vector/acc_test.go | 651 ++++++++++++ vendor/golang.org/x/image/vector/gen.go | 447 ++++++++ .../golang.org/x/image/vector/gen_acc_amd64.s.tmpl | 171 ++++ vendor/golang.org/x/image/vector/raster_fixed.go | 327 ++++++ .../golang.org/x/image/vector/raster_floating.go | 220 ++++ vendor/golang.org/x/image/vector/vector.go | 472 +++++++++ vendor/golang.org/x/image/vector/vector_test.go | 519 ++++++++++ 10 files changed, 3941 insertions(+) create mode 100644 vendor/golang.org/x/image/vector/acc_amd64.go create mode 100644 vendor/golang.org/x/image/vector/acc_amd64.s create mode 100644 vendor/golang.org/x/image/vector/acc_other.go create mode 100644 vendor/golang.org/x/image/vector/acc_test.go create mode 100644 vendor/golang.org/x/image/vector/gen.go create mode 100644 vendor/golang.org/x/image/vector/gen_acc_amd64.s.tmpl create mode 100644 vendor/golang.org/x/image/vector/raster_fixed.go create mode 100644 vendor/golang.org/x/image/vector/raster_floating.go create mode 100644 vendor/golang.org/x/image/vector/vector.go create mode 100644 vendor/golang.org/x/image/vector/vector_test.go (limited to 'vendor/golang.org/x/image/vector') diff --git a/vendor/golang.org/x/image/vector/acc_amd64.go b/vendor/golang.org/x/image/vector/acc_amd64.go new file mode 100644 index 0000000..68f6e03 --- /dev/null +++ b/vendor/golang.org/x/image/vector/acc_amd64.go @@ -0,0 +1,34 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build go1.6 +// +build !noasm + +package vector + +func haveSSE4_1() bool + +var haveFixedAccumulateSIMD = haveSSE4_1() + +const haveFloatingAccumulateSIMD = true + +//go:noescape +func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) + +//go:noescape +func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) + +//go:noescape +func fixedAccumulateMaskSIMD(buf []uint32) + +//go:noescape +func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) + +//go:noescape +func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) + +//go:noescape +func floatingAccumulateMaskSIMD(dst []uint32, src []float32) diff --git a/vendor/golang.org/x/image/vector/acc_amd64.s b/vendor/golang.org/x/image/vector/acc_amd64.s new file mode 100644 index 0000000..6a424bc --- /dev/null +++ b/vendor/golang.org/x/image/vector/acc_amd64.s @@ -0,0 +1,1083 @@ +// generated by go run gen.go; DO NOT EDIT + +// +build !appengine +// +build gc +// +build go1.6 +// +build !noasm + +#include "textflag.h" + +// fl is short for floating point math. fx is short for fixed point math. + +DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff +DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff +DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000 +DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000 +DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff +DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff + +// scatterAndMulBy0x101 is a PSHUFB mask that brings the low four bytes of an +// XMM register to the low byte of that register's four uint32 values. It +// duplicates those bytes, effectively multiplying each uint32 by 0x101. 
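+// For example, a dst byte of 0x7f becomes the uint32 value 0x00007f7f, i.e.
+// 0x7f * 0x101, the 16-bit alpha scale that the blend arithmetic below works in.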
+// +// It transforms a little-endian 16-byte XMM value from +// ijkl???????????? +// to +// ii00jj00kk00ll00 +DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000 +DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202 + +// gather is a PSHUFB mask that brings the second-lowest byte of the XMM +// register's four uint32 values to the low four bytes of that register. +// +// It transforms a little-endian 16-byte XMM value from +// ?i???j???k???l?? +// to +// ijkl000000000000 +DATA gather<>+0x00(SB)/8, $0x808080800d090501 +DATA gather<>+0x08(SB)/8, $0x8080808080808080 + +DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff +DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff +DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001 +DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001 + +GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16 +GLOBL flOne<>(SB), (NOPTR+RODATA), $16 +GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16 +GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16 +GLOBL gather<>(SB), (NOPTR+RODATA), $16 +GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16 +GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16 + +// func haveSSE4_1() bool +TEXT ·haveSSE4_1(SB), NOSPLIT, $0 + MOVQ $1, AX + CPUID + SHRQ $19, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + +// ---------------------------------------------------------------------------- + +// func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 - +// xmm4 - +// xmm5 fxAlmost65536 +// xmm6 gather +// xmm7 offset +// xmm8 scatterAndMulBy0x101 +// xmm9 fxAlmost65536 +// xmm10 inverseFFFF +TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48 + + MOVQ dst_base+0(FP), DI + MOVQ dst_len+8(FP), BX + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), R10 + + // Sanity check that len(dst) >= len(src). + CMPQ BX, R10 + JLT fxAccOpOverEnd + + // R10 = len(src) &^ 3 + // R11 = len(src) + MOVQ R10, R11 + ANDQ $-4, R10 + + // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16. + MOVOU fxAlmost65536<>(SB), X5 + + // gather := XMM(see above) // PSHUFB shuffle mask. + // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. + // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff. + // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff. + MOVOU gather<>(SB), X6 + MOVOU scatterAndMulBy0x101<>(SB), X8 + MOVOU fxAlmost65536<>(SB), X9 + MOVOU inverseFFFF<>(SB), X10 + + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. + XORPS X7, X7 + + // i := 0 + MOVQ $0, R9 + +fxAccOpOverLoop4: + // for i < (len(src) &^ 3) + CMPQ R9, R10 + JAE fxAccOpOverLoop1 + + // x = XMM(s0, s1, s2, s3) + // + // Where s0 is src[i+0], s1 is src[i+1], etc. + MOVOU (SI), X1 + + // scratch = XMM(0, s0, s1, s2) + // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) + MOVOU X1, X0 + PSLLO $4, X0 + PADDD X0, X1 + + // scratch = XMM(0, 0, 0, 0) + // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) + // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) + XORPS X0, X0 + SHUFPS $0x40, X1, X0 + PADDD X0, X1 + + // x += offset + PADDD X7, X1 + + // y = abs(x) + // y >>= 2 // Shift by 2*ϕ - 16. 
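+	// (ϕ is 9 in this package's fixed point format, so 2*ϕ - 16 = 2: this
+	// rescales the accumulated area, in units of 1<<(2*ϕ), to 16-bit coverage.)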
+ // y = min(y, fxAlmost65536) + // + // pabsd %xmm1,%xmm2 + // psrld $0x2,%xmm2 + // pminud %xmm5,%xmm2 + // + // Hopefully we'll get these opcode mnemonics into the assembler for Go + // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but + // it's similar. + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + + // z = convertToInt32(y) + // No-op. + + // Blend over the dst's prior value. SIMD for i in 0..3: + // + // dstA := uint32(dst[i]) * 0x101 + // maskA := z@i + // outA := dstA*(0xffff-maskA)/0xffff + maskA + // dst[i] = uint8(outA >> 8) + // + // First, set X0 to dstA*(0xfff-maskA). + MOVL (DI), X0 + PSHUFB X8, X0 + MOVOU X9, X11 + PSUBL X2, X11 + PMULLD X11, X0 + + // We implement uint32 division by 0xffff as multiplication by a magic + // constant (0x800080001) and then a shift by a magic constant (47). + // See TestDivideByFFFF for a justification. + // + // That multiplication widens from uint32 to uint64, so we have to + // duplicate and shift our four uint32s from one XMM register (X0) to + // two XMM registers (X0 and X11). + // + // Move the second and fourth uint32s in X0 to be the first and third + // uint32s in X11. + MOVOU X0, X11 + PSRLQ $32, X11 + + // Multiply by magic, shift by magic. + // + // pmuludq %xmm10,%xmm0 + // pmuludq %xmm10,%xmm11 + BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2 + BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda + PSRLQ $47, X0 + PSRLQ $47, X11 + + // Merge the two registers back to one, X11, and add maskA. + PSLLQ $32, X11 + XORPS X0, X11 + PADDD X11, X2 + + // As per opSrcStore4, shuffle and copy the 4 second-lowest bytes. + PSHUFB X6, X2 + MOVL X2, (DI) + + // offset = XMM(x@3, x@3, x@3, x@3) + MOVOU X1, X7 + SHUFPS $0xff, X1, X7 + + // i += 4 + // dst = dst[4:] + // src = src[4:] + ADDQ $4, R9 + ADDQ $4, DI + ADDQ $16, SI + JMP fxAccOpOverLoop4 + +fxAccOpOverLoop1: + // for i < len(src) + CMPQ R9, R11 + JAE fxAccOpOverEnd + + // x = src[i] + offset + MOVL (SI), X1 + PADDD X7, X1 + + // y = abs(x) + // y >>= 2 // Shift by 2*ϕ - 16. + // y = min(y, fxAlmost65536) + // + // pabsd %xmm1,%xmm2 + // psrld $0x2,%xmm2 + // pminud %xmm5,%xmm2 + // + // Hopefully we'll get these opcode mnemonics into the assembler for Go + // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but + // it's similar. + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + + // z = convertToInt32(y) + // No-op. + + // Blend over the dst's prior value. + // + // dstA := uint32(dst[0]) * 0x101 + // maskA := z + // outA := dstA*(0xffff-maskA)/0xffff + maskA + // dst[0] = uint8(outA >> 8) + MOVBLZX (DI), R12 + IMULL $0x101, R12 + MOVL X2, R13 + MOVL $0xffff, AX + SUBL R13, AX + MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX. + MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant... + MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX. + SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15). 
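+	// DX now holds dstA*(0xffff-maskA)/0xffff; the ADDL below adds maskA to
+	// complete outA.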
+ ADDL DX, R13 + SHRL $8, R13 + MOVB R13, (DI) + + // offset = x + MOVOU X1, X7 + + // i += 1 + // dst = dst[1:] + // src = src[1:] + ADDQ $1, R9 + ADDQ $1, DI + ADDQ $4, SI + JMP fxAccOpOverLoop1 + +fxAccOpOverEnd: + RET + +// ---------------------------------------------------------------------------- + +// func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 - +// xmm4 - +// xmm5 fxAlmost65536 +// xmm6 gather +// xmm7 offset +// xmm8 - +// xmm9 - +// xmm10 - +TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48 + + MOVQ dst_base+0(FP), DI + MOVQ dst_len+8(FP), BX + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), R10 + + // Sanity check that len(dst) >= len(src). + CMPQ BX, R10 + JLT fxAccOpSrcEnd + + // R10 = len(src) &^ 3 + // R11 = len(src) + MOVQ R10, R11 + ANDQ $-4, R10 + + // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16. + MOVOU fxAlmost65536<>(SB), X5 + + // gather := XMM(see above) // PSHUFB shuffle mask. + MOVOU gather<>(SB), X6 + + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. + XORPS X7, X7 + + // i := 0 + MOVQ $0, R9 + +fxAccOpSrcLoop4: + // for i < (len(src) &^ 3) + CMPQ R9, R10 + JAE fxAccOpSrcLoop1 + + // x = XMM(s0, s1, s2, s3) + // + // Where s0 is src[i+0], s1 is src[i+1], etc. + MOVOU (SI), X1 + + // scratch = XMM(0, s0, s1, s2) + // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) + MOVOU X1, X0 + PSLLO $4, X0 + PADDD X0, X1 + + // scratch = XMM(0, 0, 0, 0) + // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) + // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) + XORPS X0, X0 + SHUFPS $0x40, X1, X0 + PADDD X0, X1 + + // x += offset + PADDD X7, X1 + + // y = abs(x) + // y >>= 2 // Shift by 2*ϕ - 16. + // y = min(y, fxAlmost65536) + // + // pabsd %xmm1,%xmm2 + // psrld $0x2,%xmm2 + // pminud %xmm5,%xmm2 + // + // Hopefully we'll get these opcode mnemonics into the assembler for Go + // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but + // it's similar. + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + + // z = convertToInt32(y) + // No-op. + + // z = shuffleTheSecondLowestBytesOfEach4ByteElement(z) + // copy(dst[:4], low4BytesOf(z)) + PSHUFB X6, X2 + MOVL X2, (DI) + + // offset = XMM(x@3, x@3, x@3, x@3) + MOVOU X1, X7 + SHUFPS $0xff, X1, X7 + + // i += 4 + // dst = dst[4:] + // src = src[4:] + ADDQ $4, R9 + ADDQ $4, DI + ADDQ $16, SI + JMP fxAccOpSrcLoop4 + +fxAccOpSrcLoop1: + // for i < len(src) + CMPQ R9, R11 + JAE fxAccOpSrcEnd + + // x = src[i] + offset + MOVL (SI), X1 + PADDD X7, X1 + + // y = abs(x) + // y >>= 2 // Shift by 2*ϕ - 16. + // y = min(y, fxAlmost65536) + // + // pabsd %xmm1,%xmm2 + // psrld $0x2,%xmm2 + // pminud %xmm5,%xmm2 + // + // Hopefully we'll get these opcode mnemonics into the assembler for Go + // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but + // it's similar. + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + + // z = convertToInt32(y) + // No-op. 
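+	// (In the fixed point flavor, y is already a vector of integers, so there
+	// is nothing to convert.)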
+ + // dst[0] = uint8(z>>8) + MOVL X2, BX + SHRL $8, BX + MOVB BX, (DI) + + // offset = x + MOVOU X1, X7 + + // i += 1 + // dst = dst[1:] + // src = src[1:] + ADDQ $1, R9 + ADDQ $1, DI + ADDQ $4, SI + JMP fxAccOpSrcLoop1 + +fxAccOpSrcEnd: + RET + +// ---------------------------------------------------------------------------- + +// func fixedAccumulateMaskSIMD(buf []uint32) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 - +// xmm4 - +// xmm5 fxAlmost65536 +// xmm6 - +// xmm7 offset +// xmm8 - +// xmm9 - +// xmm10 - +TEXT ·fixedAccumulateMaskSIMD(SB), NOSPLIT, $0-24 + + MOVQ buf_base+0(FP), DI + MOVQ buf_len+8(FP), BX + MOVQ buf_base+0(FP), SI + MOVQ buf_len+8(FP), R10 + + // R10 = len(src) &^ 3 + // R11 = len(src) + MOVQ R10, R11 + ANDQ $-4, R10 + + // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16. + MOVOU fxAlmost65536<>(SB), X5 + + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. + XORPS X7, X7 + + // i := 0 + MOVQ $0, R9 + +fxAccMaskLoop4: + // for i < (len(src) &^ 3) + CMPQ R9, R10 + JAE fxAccMaskLoop1 + + // x = XMM(s0, s1, s2, s3) + // + // Where s0 is src[i+0], s1 is src[i+1], etc. + MOVOU (SI), X1 + + // scratch = XMM(0, s0, s1, s2) + // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) + MOVOU X1, X0 + PSLLO $4, X0 + PADDD X0, X1 + + // scratch = XMM(0, 0, 0, 0) + // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) + // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) + XORPS X0, X0 + SHUFPS $0x40, X1, X0 + PADDD X0, X1 + + // x += offset + PADDD X7, X1 + + // y = abs(x) + // y >>= 2 // Shift by 2*ϕ - 16. + // y = min(y, fxAlmost65536) + // + // pabsd %xmm1,%xmm2 + // psrld $0x2,%xmm2 + // pminud %xmm5,%xmm2 + // + // Hopefully we'll get these opcode mnemonics into the assembler for Go + // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but + // it's similar. + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + + // z = convertToInt32(y) + // No-op. + + // copy(dst[:4], z) + MOVOU X2, (DI) + + // offset = XMM(x@3, x@3, x@3, x@3) + MOVOU X1, X7 + SHUFPS $0xff, X1, X7 + + // i += 4 + // dst = dst[4:] + // src = src[4:] + ADDQ $4, R9 + ADDQ $16, DI + ADDQ $16, SI + JMP fxAccMaskLoop4 + +fxAccMaskLoop1: + // for i < len(src) + CMPQ R9, R11 + JAE fxAccMaskEnd + + // x = src[i] + offset + MOVL (SI), X1 + PADDD X7, X1 + + // y = abs(x) + // y >>= 2 // Shift by 2*ϕ - 16. + // y = min(y, fxAlmost65536) + // + // pabsd %xmm1,%xmm2 + // psrld $0x2,%xmm2 + // pminud %xmm5,%xmm2 + // + // Hopefully we'll get these opcode mnemonics into the assembler for Go + // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but + // it's similar. + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + + // z = convertToInt32(y) + // No-op. 
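+	// Unlike the Op variants, the mask flavor keeps the full 16-bit coverage
+	// value in a uint32 element instead of reducing it to an 8-bit alpha.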
+ + // dst[0] = uint32(z) + MOVL X2, (DI) + + // offset = x + MOVOU X1, X7 + + // i += 1 + // dst = dst[1:] + // src = src[1:] + ADDQ $1, R9 + ADDQ $4, DI + ADDQ $4, SI + JMP fxAccMaskLoop1 + +fxAccMaskEnd: + RET + +// ---------------------------------------------------------------------------- + +// func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 flSignMask +// xmm4 flOne +// xmm5 flAlmost65536 +// xmm6 gather +// xmm7 offset +// xmm8 scatterAndMulBy0x101 +// xmm9 fxAlmost65536 +// xmm10 inverseFFFF +TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48 + + MOVQ dst_base+0(FP), DI + MOVQ dst_len+8(FP), BX + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), R10 + + // Sanity check that len(dst) >= len(src). + CMPQ BX, R10 + JLT flAccOpOverEnd + + // R10 = len(src) &^ 3 + // R11 = len(src) + MOVQ R10, R11 + ANDQ $-4, R10 + + // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is + // "Round To Zero". + STMXCSR mxcsrOrig-8(SP) + MOVL mxcsrOrig-8(SP), AX + ORL $0x6000, AX + MOVL AX, mxcsrNew-4(SP) + + // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. + // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. + MOVOU flSignMask<>(SB), X3 + MOVOU flOne<>(SB), X4 + MOVOU flAlmost65536<>(SB), X5 + + // gather := XMM(see above) // PSHUFB shuffle mask. + // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. + // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff. + // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff. + MOVOU gather<>(SB), X6 + MOVOU scatterAndMulBy0x101<>(SB), X8 + MOVOU fxAlmost65536<>(SB), X9 + MOVOU inverseFFFF<>(SB), X10 + + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. + XORPS X7, X7 + + // i := 0 + MOVQ $0, R9 + +flAccOpOverLoop4: + // for i < (len(src) &^ 3) + CMPQ R9, R10 + JAE flAccOpOverLoop1 + + // x = XMM(s0, s1, s2, s3) + // + // Where s0 is src[i+0], s1 is src[i+1], etc. + MOVOU (SI), X1 + + // scratch = XMM(0, s0, s1, s2) + // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) + MOVOU X1, X0 + PSLLO $4, X0 + ADDPS X0, X1 + + // scratch = XMM(0, 0, 0, 0) + // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) + // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) + XORPS X0, X0 + SHUFPS $0x40, X1, X0 + ADDPS X0, X1 + + // x += offset + ADDPS X7, X1 + + // y = x & flSignMask + // y = min(y, flOne) + // y = mul(y, flAlmost65536) + MOVOU X3, X2 + ANDPS X1, X2 + MINPS X4, X2 + MULPS X5, X2 + + // z = convertToInt32(y) + LDMXCSR mxcsrNew-4(SP) + CVTPS2PL X2, X2 + LDMXCSR mxcsrOrig-8(SP) + + // Blend over the dst's prior value. SIMD for i in 0..3: + // + // dstA := uint32(dst[i]) * 0x101 + // maskA := z@i + // outA := dstA*(0xffff-maskA)/0xffff + maskA + // dst[i] = uint8(outA >> 8) + // + // First, set X0 to dstA*(0xfff-maskA). + MOVL (DI), X0 + PSHUFB X8, X0 + MOVOU X9, X11 + PSUBL X2, X11 + PMULLD X11, X0 + + // We implement uint32 division by 0xffff as multiplication by a magic + // constant (0x800080001) and then a shift by a magic constant (47). + // See TestDivideByFFFF for a justification. 
+ // + // That multiplication widens from uint32 to uint64, so we have to + // duplicate and shift our four uint32s from one XMM register (X0) to + // two XMM registers (X0 and X11). + // + // Move the second and fourth uint32s in X0 to be the first and third + // uint32s in X11. + MOVOU X0, X11 + PSRLQ $32, X11 + + // Multiply by magic, shift by magic. + // + // pmuludq %xmm10,%xmm0 + // pmuludq %xmm10,%xmm11 + BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2 + BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda + PSRLQ $47, X0 + PSRLQ $47, X11 + + // Merge the two registers back to one, X11, and add maskA. + PSLLQ $32, X11 + XORPS X0, X11 + PADDD X11, X2 + + // As per opSrcStore4, shuffle and copy the 4 second-lowest bytes. + PSHUFB X6, X2 + MOVL X2, (DI) + + // offset = XMM(x@3, x@3, x@3, x@3) + MOVOU X1, X7 + SHUFPS $0xff, X1, X7 + + // i += 4 + // dst = dst[4:] + // src = src[4:] + ADDQ $4, R9 + ADDQ $4, DI + ADDQ $16, SI + JMP flAccOpOverLoop4 + +flAccOpOverLoop1: + // for i < len(src) + CMPQ R9, R11 + JAE flAccOpOverEnd + + // x = src[i] + offset + MOVL (SI), X1 + ADDPS X7, X1 + + // y = x & flSignMask + // y = min(y, flOne) + // y = mul(y, flAlmost65536) + MOVOU X3, X2 + ANDPS X1, X2 + MINPS X4, X2 + MULPS X5, X2 + + // z = convertToInt32(y) + LDMXCSR mxcsrNew-4(SP) + CVTPS2PL X2, X2 + LDMXCSR mxcsrOrig-8(SP) + + // Blend over the dst's prior value. + // + // dstA := uint32(dst[0]) * 0x101 + // maskA := z + // outA := dstA*(0xffff-maskA)/0xffff + maskA + // dst[0] = uint8(outA >> 8) + MOVBLZX (DI), R12 + IMULL $0x101, R12 + MOVL X2, R13 + MOVL $0xffff, AX + SUBL R13, AX + MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX. + MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant... + MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX. + SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15). + ADDL DX, R13 + SHRL $8, R13 + MOVB R13, (DI) + + // offset = x + MOVOU X1, X7 + + // i += 1 + // dst = dst[1:] + // src = src[1:] + ADDQ $1, R9 + ADDQ $1, DI + ADDQ $4, SI + JMP flAccOpOverLoop1 + +flAccOpOverEnd: + RET + +// ---------------------------------------------------------------------------- + +// func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 flSignMask +// xmm4 flOne +// xmm5 flAlmost65536 +// xmm6 gather +// xmm7 offset +// xmm8 - +// xmm9 - +// xmm10 - +TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48 + + MOVQ dst_base+0(FP), DI + MOVQ dst_len+8(FP), BX + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), R10 + + // Sanity check that len(dst) >= len(src). + CMPQ BX, R10 + JLT flAccOpSrcEnd + + // R10 = len(src) &^ 3 + // R11 = len(src) + MOVQ R10, R11 + ANDQ $-4, R10 + + // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is + // "Round To Zero". + STMXCSR mxcsrOrig-8(SP) + MOVL mxcsrOrig-8(SP), AX + ORL $0x6000, AX + MOVL AX, mxcsrNew-4(SP) + + // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. + // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. + MOVOU flSignMask<>(SB), X3 + MOVOU flOne<>(SB), X4 + MOVOU flAlmost65536<>(SB), X5 + + // gather := XMM(see above) // PSHUFB shuffle mask. 
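+	// gather picks the second-lowest byte of each 32-bit lane, i.e. the high
+	// byte of the 16-bit coverage value: the SIMD equivalent of the scalar
+	// uint8(z>>8) store.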
+ MOVOU gather<>(SB), X6 + + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. + XORPS X7, X7 + + // i := 0 + MOVQ $0, R9 + +flAccOpSrcLoop4: + // for i < (len(src) &^ 3) + CMPQ R9, R10 + JAE flAccOpSrcLoop1 + + // x = XMM(s0, s1, s2, s3) + // + // Where s0 is src[i+0], s1 is src[i+1], etc. + MOVOU (SI), X1 + + // scratch = XMM(0, s0, s1, s2) + // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) + MOVOU X1, X0 + PSLLO $4, X0 + ADDPS X0, X1 + + // scratch = XMM(0, 0, 0, 0) + // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) + // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) + XORPS X0, X0 + SHUFPS $0x40, X1, X0 + ADDPS X0, X1 + + // x += offset + ADDPS X7, X1 + + // y = x & flSignMask + // y = min(y, flOne) + // y = mul(y, flAlmost65536) + MOVOU X3, X2 + ANDPS X1, X2 + MINPS X4, X2 + MULPS X5, X2 + + // z = convertToInt32(y) + LDMXCSR mxcsrNew-4(SP) + CVTPS2PL X2, X2 + LDMXCSR mxcsrOrig-8(SP) + + // z = shuffleTheSecondLowestBytesOfEach4ByteElement(z) + // copy(dst[:4], low4BytesOf(z)) + PSHUFB X6, X2 + MOVL X2, (DI) + + // offset = XMM(x@3, x@3, x@3, x@3) + MOVOU X1, X7 + SHUFPS $0xff, X1, X7 + + // i += 4 + // dst = dst[4:] + // src = src[4:] + ADDQ $4, R9 + ADDQ $4, DI + ADDQ $16, SI + JMP flAccOpSrcLoop4 + +flAccOpSrcLoop1: + // for i < len(src) + CMPQ R9, R11 + JAE flAccOpSrcEnd + + // x = src[i] + offset + MOVL (SI), X1 + ADDPS X7, X1 + + // y = x & flSignMask + // y = min(y, flOne) + // y = mul(y, flAlmost65536) + MOVOU X3, X2 + ANDPS X1, X2 + MINPS X4, X2 + MULPS X5, X2 + + // z = convertToInt32(y) + LDMXCSR mxcsrNew-4(SP) + CVTPS2PL X2, X2 + LDMXCSR mxcsrOrig-8(SP) + + // dst[0] = uint8(z>>8) + MOVL X2, BX + SHRL $8, BX + MOVB BX, (DI) + + // offset = x + MOVOU X1, X7 + + // i += 1 + // dst = dst[1:] + // src = src[1:] + ADDQ $1, R9 + ADDQ $1, DI + ADDQ $4, SI + JMP flAccOpSrcLoop1 + +flAccOpSrcEnd: + RET + +// ---------------------------------------------------------------------------- + +// func floatingAccumulateMaskSIMD(dst []uint32, src []float32) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 flSignMask +// xmm4 flOne +// xmm5 flAlmost65536 +// xmm6 - +// xmm7 offset +// xmm8 - +// xmm9 - +// xmm10 - +TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48 + + MOVQ dst_base+0(FP), DI + MOVQ dst_len+8(FP), BX + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), R10 + + // Sanity check that len(dst) >= len(src). + CMPQ BX, R10 + JLT flAccMaskEnd + + // R10 = len(src) &^ 3 + // R11 = len(src) + MOVQ R10, R11 + ANDQ $-4, R10 + + // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is + // "Round To Zero". + STMXCSR mxcsrOrig-8(SP) + MOVL mxcsrOrig-8(SP), AX + ORL $0x6000, AX + MOVL AX, mxcsrNew-4(SP) + + // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. + // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. + MOVOU flSignMask<>(SB), X3 + MOVOU flOne<>(SB), X4 + MOVOU flAlmost65536<>(SB), X5 + + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. + XORPS X7, X7 + + // i := 0 + MOVQ $0, R9 + +flAccMaskLoop4: + // for i < (len(src) &^ 3) + CMPQ R9, R10 + JAE flAccMaskLoop1 + + // x = XMM(s0, s1, s2, s3) + // + // Where s0 is src[i+0], s1 is src[i+1], etc. 
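+	// The shift-and-add steps below compute the inclusive prefix sum of the
+	// four lanes: the same running accumulation that the scalar tail loop
+	// performs one element at a time.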
+ MOVOU (SI), X1 + + // scratch = XMM(0, s0, s1, s2) + // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) + MOVOU X1, X0 + PSLLO $4, X0 + ADDPS X0, X1 + + // scratch = XMM(0, 0, 0, 0) + // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) + // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) + XORPS X0, X0 + SHUFPS $0x40, X1, X0 + ADDPS X0, X1 + + // x += offset + ADDPS X7, X1 + + // y = x & flSignMask + // y = min(y, flOne) + // y = mul(y, flAlmost65536) + MOVOU X3, X2 + ANDPS X1, X2 + MINPS X4, X2 + MULPS X5, X2 + + // z = convertToInt32(y) + LDMXCSR mxcsrNew-4(SP) + CVTPS2PL X2, X2 + LDMXCSR mxcsrOrig-8(SP) + + // copy(dst[:4], z) + MOVOU X2, (DI) + + // offset = XMM(x@3, x@3, x@3, x@3) + MOVOU X1, X7 + SHUFPS $0xff, X1, X7 + + // i += 4 + // dst = dst[4:] + // src = src[4:] + ADDQ $4, R9 + ADDQ $16, DI + ADDQ $16, SI + JMP flAccMaskLoop4 + +flAccMaskLoop1: + // for i < len(src) + CMPQ R9, R11 + JAE flAccMaskEnd + + // x = src[i] + offset + MOVL (SI), X1 + ADDPS X7, X1 + + // y = x & flSignMask + // y = min(y, flOne) + // y = mul(y, flAlmost65536) + MOVOU X3, X2 + ANDPS X1, X2 + MINPS X4, X2 + MULPS X5, X2 + + // z = convertToInt32(y) + LDMXCSR mxcsrNew-4(SP) + CVTPS2PL X2, X2 + LDMXCSR mxcsrOrig-8(SP) + + // dst[0] = uint32(z) + MOVL X2, (DI) + + // offset = x + MOVOU X1, X7 + + // i += 1 + // dst = dst[1:] + // src = src[1:] + ADDQ $1, R9 + ADDQ $4, DI + ADDQ $4, SI + JMP flAccMaskLoop1 + +flAccMaskEnd: + RET diff --git a/vendor/golang.org/x/image/vector/acc_other.go b/vendor/golang.org/x/image/vector/acc_other.go new file mode 100644 index 0000000..30425be --- /dev/null +++ b/vendor/golang.org/x/image/vector/acc_other.go @@ -0,0 +1,17 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 appengine !gc !go1.6 noasm + +package vector + +const haveFixedAccumulateSIMD = false +const haveFloatingAccumulateSIMD = false + +func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) {} +func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) {} +func fixedAccumulateMaskSIMD(buf []uint32) {} +func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) {} +func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) {} +func floatingAccumulateMaskSIMD(dst []uint32, src []float32) {} diff --git a/vendor/golang.org/x/image/vector/acc_test.go b/vendor/golang.org/x/image/vector/acc_test.go new file mode 100644 index 0000000..d80f765 --- /dev/null +++ b/vendor/golang.org/x/image/vector/acc_test.go @@ -0,0 +1,651 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package vector + +import ( + "bytes" + "fmt" + "math/rand" + "testing" +) + +// TestDivideByFFFF tests that dividing by 0xffff is equivalent to multiplying +// and then shifting by magic constants. The Go compiler itself issues this +// multiply-and-shift for a division by the constant value 0xffff. This trick +// is used in the asm code as the GOARCH=amd64 SIMD instructions have parallel +// multiply but not parallel divide. +// +// There's undoubtedly a justification somewhere in Hacker's Delight chapter 10 +// "Integer Division by Constants", but I don't have a more specific link. 
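+// Specifically, 0x80008001 equals ceil((1<<47) / 0xffff); the error that the
+// ceiling introduces works out to less than 1/0xffff even after multiplying by
+// the largest uint32, so the floor is unchanged and the identity holds for
+// every uint32 u, not just the random values sampled below.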
+// +// http://www.hackersdelight.org/divcMore.pdf and +// http://www.hackersdelight.org/magic.htm +func TestDivideByFFFF(t *testing.T) { + const mul, shift = 0x80008001, 47 + rng := rand.New(rand.NewSource(1)) + for i := 0; i < 20000; i++ { + u := rng.Uint32() + got := uint32((uint64(u) * mul) >> shift) + want := u / 0xffff + if got != want { + t.Fatalf("i=%d, u=%#08x: got %#08x, want %#08x", i, u, got, want) + } + } +} + +// TestXxxSIMDUnaligned tests that unaligned SIMD loads/stores don't crash. + +func TestFixedAccumulateSIMDUnaligned(t *testing.T) { + if !haveFixedAccumulateSIMD { + t.Skip("No SIMD implemention") + } + + dst := make([]uint8, 64) + src := make([]uint32, 64) + for d := 0; d < 16; d++ { + for s := 0; s < 16; s++ { + fixedAccumulateOpSrcSIMD(dst[d:d+32], src[s:s+32]) + } + } +} + +func TestFloatingAccumulateSIMDUnaligned(t *testing.T) { + if !haveFloatingAccumulateSIMD { + t.Skip("No SIMD implemention") + } + + dst := make([]uint8, 64) + src := make([]float32, 64) + for d := 0; d < 16; d++ { + for s := 0; s < 16; s++ { + floatingAccumulateOpSrcSIMD(dst[d:d+32], src[s:s+32]) + } + } +} + +// TestXxxSIMDShortDst tests that the SIMD implementations don't write past the +// end of the dst buffer. + +func TestFixedAccumulateSIMDShortDst(t *testing.T) { + if !haveFixedAccumulateSIMD { + t.Skip("No SIMD implemention") + } + + const oneQuarter = uint32(int2ϕ(fxOne*fxOne)) / 4 + src := []uint32{oneQuarter, oneQuarter, oneQuarter, oneQuarter} + for i := 0; i < 4; i++ { + dst := make([]uint8, 4) + fixedAccumulateOpSrcSIMD(dst[:i], src[:i]) + for j := range dst { + if j < i { + if got := dst[j]; got == 0 { + t.Errorf("i=%d, j=%d: got %#02x, want non-zero", i, j, got) + } + } else { + if got := dst[j]; got != 0 { + t.Errorf("i=%d, j=%d: got %#02x, want zero", i, j, got) + } + } + } + } +} + +func TestFloatingAccumulateSIMDShortDst(t *testing.T) { + if !haveFloatingAccumulateSIMD { + t.Skip("No SIMD implemention") + } + + const oneQuarter = 0.25 + src := []float32{oneQuarter, oneQuarter, oneQuarter, oneQuarter} + for i := 0; i < 4; i++ { + dst := make([]uint8, 4) + floatingAccumulateOpSrcSIMD(dst[:i], src[:i]) + for j := range dst { + if j < i { + if got := dst[j]; got == 0 { + t.Errorf("i=%d, j=%d: got %#02x, want non-zero", i, j, got) + } + } else { + if got := dst[j]; got != 0 { + t.Errorf("i=%d, j=%d: got %#02x, want zero", i, j, got) + } + } + } + } +} + +func TestFixedAccumulateOpOverShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "over") } +func TestFixedAccumulateOpSrcShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "src") } +func TestFixedAccumulateMaskShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "mask") } +func TestFloatingAccumulateOpOverShort(t *testing.T) { testAcc(t, flInShort, flMaskShort, "over") } +func TestFloatingAccumulateOpSrcShort(t *testing.T) { testAcc(t, flInShort, flMaskShort, "src") } +func TestFloatingAccumulateMaskShort(t *testing.T) { testAcc(t, flInShort, flMaskShort, "mask") } + +func TestFixedAccumulateOpOver16(t *testing.T) { testAcc(t, fxIn16, fxMask16, "over") } +func TestFixedAccumulateOpSrc16(t *testing.T) { testAcc(t, fxIn16, fxMask16, "src") } +func TestFixedAccumulateMask16(t *testing.T) { testAcc(t, fxIn16, fxMask16, "mask") } +func TestFloatingAccumulateOpOver16(t *testing.T) { testAcc(t, flIn16, flMask16, "over") } +func TestFloatingAccumulateOpSrc16(t *testing.T) { testAcc(t, flIn16, flMask16, "src") } +func TestFloatingAccumulateMask16(t *testing.T) { testAcc(t, flIn16, flMask16, "mask") } + +func testAcc(t 
*testing.T, in interface{}, mask []uint32, op string) { + for _, simd := range []bool{false, true} { + maxN := 0 + switch in := in.(type) { + case []uint32: + if simd && !haveFixedAccumulateSIMD { + continue + } + maxN = len(in) + case []float32: + if simd && !haveFloatingAccumulateSIMD { + continue + } + maxN = len(in) + } + + for _, n := range []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 33, 55, 79, 96, 120, 165, 256, maxN} { + + if n > maxN { + continue + } + + var ( + got8, want8 []uint8 + got32, want32 []uint32 + ) + switch op { + case "over": + const background = 0x40 + got8 = make([]uint8, n) + for i := range got8 { + got8[i] = background + } + want8 = make([]uint8, n) + for i := range want8 { + dstA := uint32(background * 0x101) + maskA := mask[i] + outA := dstA*(0xffff-maskA)/0xffff + maskA + want8[i] = uint8(outA >> 8) + } + + case "src": + got8 = make([]uint8, n) + want8 = make([]uint8, n) + for i := range want8 { + want8[i] = uint8(mask[i] >> 8) + } + + case "mask": + got32 = make([]uint32, n) + want32 = mask[:n] + } + + switch in := in.(type) { + case []uint32: + switch op { + case "over": + if simd { + fixedAccumulateOpOverSIMD(got8, in[:n]) + } else { + fixedAccumulateOpOver(got8, in[:n]) + } + case "src": + if simd { + fixedAccumulateOpSrcSIMD(got8, in[:n]) + } else { + fixedAccumulateOpSrc(got8, in[:n]) + } + case "mask": + copy(got32, in[:n]) + if simd { + fixedAccumulateMaskSIMD(got32) + } else { + fixedAccumulateMask(got32) + } + } + case []float32: + switch op { + case "over": + if simd { + floatingAccumulateOpOverSIMD(got8, in[:n]) + } else { + floatingAccumulateOpOver(got8, in[:n]) + } + case "src": + if simd { + floatingAccumulateOpSrcSIMD(got8, in[:n]) + } else { + floatingAccumulateOpSrc(got8, in[:n]) + } + case "mask": + if simd { + floatingAccumulateMaskSIMD(got32, in[:n]) + } else { + floatingAccumulateMask(got32, in[:n]) + } + } + } + + if op != "mask" { + if !bytes.Equal(got8, want8) { + t.Errorf("simd=%t, n=%d:\ngot: % x\nwant: % x", simd, n, got8, want8) + } + } else { + if !uint32sEqual(got32, want32) { + t.Errorf("simd=%t, n=%d:\ngot: % x\nwant: % x", simd, n, got32, want32) + } + } + } + } +} + +func uint32sEqual(xs, ys []uint32) bool { + if len(xs) != len(ys) { + return false + } + for i := range xs { + if xs[i] != ys[i] { + return false + } + } + return true +} + +func float32sEqual(xs, ys []float32) bool { + if len(xs) != len(ys) { + return false + } + for i := range xs { + if xs[i] != ys[i] { + return false + } + } + return true +} + +func BenchmarkFixedAccumulateOpOver16(b *testing.B) { benchAcc(b, fxIn16, "over", false) } +func BenchmarkFixedAccumulateOpOverSIMD16(b *testing.B) { benchAcc(b, fxIn16, "over", true) } +func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src", false) } +func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, fxIn16, "src", true) } +func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask", false) } +func BenchmarkFixedAccumulateMaskSIMD16(b *testing.B) { benchAcc(b, fxIn16, "mask", true) } +func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over", false) } +func BenchmarkFloatingAccumulateOpOverSIMD16(b *testing.B) { benchAcc(b, flIn16, "over", true) } +func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src", false) } +func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, flIn16, "src", true) } +func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, 
"mask", false) } +func BenchmarkFloatingAccumulateMaskSIMD16(b *testing.B) { benchAcc(b, flIn16, "mask", true) } + +func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over", false) } +func BenchmarkFixedAccumulateOpOverSIMD64(b *testing.B) { benchAcc(b, fxIn64, "over", true) } +func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src", false) } +func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, fxIn64, "src", true) } +func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask", false) } +func BenchmarkFixedAccumulateMaskSIMD64(b *testing.B) { benchAcc(b, fxIn64, "mask", true) } +func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over", false) } +func BenchmarkFloatingAccumulateOpOverSIMD64(b *testing.B) { benchAcc(b, flIn64, "over", true) } +func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src", false) } +func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, flIn64, "src", true) } +func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask", false) } +func BenchmarkFloatingAccumulateMaskSIMD64(b *testing.B) { benchAcc(b, flIn64, "mask", true) } + +func benchAcc(b *testing.B, in interface{}, op string, simd bool) { + var f func() + + switch in := in.(type) { + case []uint32: + if simd && !haveFixedAccumulateSIMD { + b.Skip("No SIMD implemention") + } + + switch op { + case "over": + dst := make([]uint8, len(in)) + if simd { + f = func() { fixedAccumulateOpOverSIMD(dst, in) } + } else { + f = func() { fixedAccumulateOpOver(dst, in) } + } + case "src": + dst := make([]uint8, len(in)) + if simd { + f = func() { fixedAccumulateOpSrcSIMD(dst, in) } + } else { + f = func() { fixedAccumulateOpSrc(dst, in) } + } + case "mask": + buf := make([]uint32, len(in)) + copy(buf, in) + if simd { + f = func() { fixedAccumulateMaskSIMD(buf) } + } else { + f = func() { fixedAccumulateMask(buf) } + } + } + + case []float32: + if simd && !haveFloatingAccumulateSIMD { + b.Skip("No SIMD implemention") + } + + switch op { + case "over": + dst := make([]uint8, len(in)) + if simd { + f = func() { floatingAccumulateOpOverSIMD(dst, in) } + } else { + f = func() { floatingAccumulateOpOver(dst, in) } + } + case "src": + dst := make([]uint8, len(in)) + if simd { + f = func() { floatingAccumulateOpSrcSIMD(dst, in) } + } else { + f = func() { floatingAccumulateOpSrc(dst, in) } + } + case "mask": + dst := make([]uint32, len(in)) + if simd { + f = func() { floatingAccumulateMaskSIMD(dst, in) } + } else { + f = func() { floatingAccumulateMask(dst, in) } + } + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + f() + } +} + +// itou exists because "uint32(int2ϕ(-1))" doesn't compile: constant -1 +// overflows uint32. 
+func itou(i int2ϕ) uint32 { + return uint32(i) +} + +var fxInShort = []uint32{ + itou(+0x08000), // +0.125, // Running sum: +0.125 + itou(-0x20000), // -0.500, // Running sum: -0.375 + itou(+0x10000), // +0.250, // Running sum: -0.125 + itou(+0x18000), // +0.375, // Running sum: +0.250 + itou(+0x08000), // +0.125, // Running sum: +0.375 + itou(+0x00000), // +0.000, // Running sum: +0.375 + itou(-0x40000), // -1.000, // Running sum: -0.625 + itou(-0x20000), // -0.500, // Running sum: -1.125 + itou(+0x10000), // +0.250, // Running sum: -0.875 + itou(+0x38000), // +0.875, // Running sum: +0.000 + itou(+0x10000), // +0.250, // Running sum: +0.250 + itou(+0x30000), // +0.750, // Running sum: +1.000 +} + +var flInShort = []float32{ + +0.125, // Running sum: +0.125 + -0.500, // Running sum: -0.375 + +0.250, // Running sum: -0.125 + +0.375, // Running sum: +0.250 + +0.125, // Running sum: +0.375 + +0.000, // Running sum: +0.375 + -1.000, // Running sum: -0.625 + -0.500, // Running sum: -1.125 + +0.250, // Running sum: -0.875 + +0.875, // Running sum: +0.000 + +0.250, // Running sum: +0.250 + +0.750, // Running sum: +1.000 +} + +// It's OK for fxMaskShort and flMaskShort to have slightly different values. +// Both the fixed and floating point implementations already have (different) +// rounding errors in the xxxLineTo methods before we get to accumulation. It's +// OK for 50% coverage (in ideal math) to be approximated by either 0x7fff or +// 0x8000. Both slices do contain checks that 0% and 100% map to 0x0000 and +// 0xffff, as does checkCornersCenter in vector_test.go. +// +// It is important, though, for the SIMD and non-SIMD fixed point +// implementations to give the exact same output, and likewise for the floating +// point implementations. + +var fxMaskShort = []uint32{ + 0x2000, + 0x6000, + 0x2000, + 0x4000, + 0x6000, + 0x6000, + 0xa000, + 0xffff, + 0xe000, + 0x0000, + 0x4000, + 0xffff, +} + +var flMaskShort = []uint32{ + 0x1fff, + 0x5fff, + 0x1fff, + 0x3fff, + 0x5fff, + 0x5fff, + 0x9fff, + 0xffff, + 0xdfff, + 0x0000, + 0x3fff, + 0xffff, +} + +func TestMakeFxInXxx(t *testing.T) { + dump := func(us []uint32) string { + var b bytes.Buffer + for i, u := range us { + if i%8 == 0 { + b.WriteByte('\n') + } + fmt.Fprintf(&b, "%#08x, ", u) + } + return b.String() + } + + if !uint32sEqual(fxIn16, hardCodedFxIn16) { + t.Errorf("height 16: got:%v\nwant:%v", dump(fxIn16), dump(hardCodedFxIn16)) + } +} + +func TestMakeFlInXxx(t *testing.T) { + dump := func(fs []float32) string { + var b bytes.Buffer + for i, f := range fs { + if i%8 == 0 { + b.WriteByte('\n') + } + fmt.Fprintf(&b, "%v, ", f) + } + return b.String() + } + + if !float32sEqual(flIn16, hardCodedFlIn16) { + t.Errorf("height 16: got:%v\nwant:%v", dump(flIn16), dump(hardCodedFlIn16)) + } +} + +func makeInXxx(height int, useFloatingPointMath bool) *Rasterizer { + width, data := scaledBenchmarkGlyphData(height) + z := NewRasterizer(width, height) + z.setUseFloatingPointMath(useFloatingPointMath) + for _, d := range data { + switch d.n { + case 0: + z.MoveTo(d.px, d.py) + case 1: + z.LineTo(d.px, d.py) + case 2: + z.QuadTo(d.px, d.py, d.qx, d.qy) + } + } + return z +} + +func makeFxInXxx(height int) []uint32 { + z := makeInXxx(height, false) + return z.bufU32 +} + +func makeFlInXxx(height int) []float32 { + z := makeInXxx(height, true) + return z.bufF32 +} + +// fxInXxx and flInXxx are the z.bufU32 and z.bufF32 inputs to the accumulate +// functions when rasterizing benchmarkGlyphData at a height of Xxx pixels. 
+// +// fxMaskXxx and flMaskXxx are the corresponding golden outputs of those +// accumulateMask functions. +// +// The hardCodedEtc versions are a sanity check for unexpected changes in the +// rasterization implementations up to but not including accumulation. + +var ( + fxIn16 = makeFxInXxx(16) + fxIn64 = makeFxInXxx(64) + flIn16 = makeFlInXxx(16) + flIn64 = makeFlInXxx(64) +) + +var hardCodedFxIn16 = []uint32{ + 0x00000000, 0x00000000, 0xffffe91d, 0xfffe7c4a, 0xfffeaa9f, 0xffff4e33, 0xffffc1c5, 0x00007782, + 0x00009619, 0x0001a857, 0x000129e9, 0x00000028, 0x00000000, 0x00000000, 0xffff6e70, 0xfffd3199, + 0xffff5ff8, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00014b29, + 0x0002acf3, 0x000007e2, 0xffffca5a, 0xfffcab73, 0xffff8a34, 0x00001b55, 0x0001b334, 0x0001449e, + 0x0000434d, 0xffff62ec, 0xfffe1443, 0xffff325d, 0x00000000, 0x0002234a, 0x0001dcb6, 0xfffe2948, + 0xfffdd6b8, 0x00000000, 0x00028cc0, 0x00017340, 0x00000000, 0x00000000, 0x00000000, 0xffffd2d6, + 0xfffcadd0, 0xffff7f5c, 0x00007400, 0x00038c00, 0xfffe9260, 0xffff2da0, 0x0000023a, 0x0002259b, + 0x0000182a, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffdc600, 0xfffe3a00, 0x00000059, + 0x0003a44d, 0x00005b59, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0xfffe33f3, 0xfffdcc0d, 0x00000000, 0x00033c02, 0x0000c3fe, 0x00000000, + 0x00000000, 0xffffa13d, 0xfffeeec8, 0xffff8c02, 0xffff8c48, 0xffffc7b5, 0x00000000, 0xffff5b68, + 0xffff3498, 0x00000000, 0x00033c00, 0x0000c400, 0xffff9bc4, 0xfffdf4a3, 0xfffe8df3, 0xffffe1a8, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00033c00, + 0x000092c7, 0xfffcf373, 0xffff3dc7, 0x00000fcc, 0x00011ae7, 0x000130c3, 0x0000680d, 0x00004a59, + 0x00000a20, 0xfffe9dc4, 0xfffe4a3c, 0x00000000, 0x00033c00, 0xfffe87ef, 0xfffe3c11, 0x0000105e, + 0x0002b9c4, 0x000135dc, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffe3600, 0xfffdca00, + 0x00000000, 0x00033c00, 0xfffd9000, 0xffff3400, 0x0000e400, 0x00031c00, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xfffe3600, 0xfffdca00, 0x00000000, 0x00033c00, 0xfffcf9a5, + 0xffffca5b, 0x000120e6, 0x0002df1a, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xfffdb195, 0xfffe4e6b, 0x00000000, 0x00033c00, 0xfffd9e00, 0xffff2600, 0x00002f0e, 0x00033ea3, + 0x0000924d, 0x00000000, 0x00000000, 0x00000000, 0xfffe83b3, 0xfffd881d, 0xfffff431, 0x00000000, + 0x00031f60, 0xffff297a, 0xfffdb726, 0x00000000, 0x000053a7, 0x0001b506, 0x0000a24b, 0xffffa32d, + 0xfffead9b, 0xffff0479, 0xffffffc9, 0x00000000, 0x00000000, 0x0002d800, 0x0001249d, 0xfffd67bb, + 0xfffe9baa, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ac03, 0x0001448b, + 0xfffe0f70, 0x00000000, 0x000229ea, 0x0001d616, 0xffffff8c, 0xfffebf76, 0xfffe54d9, 0xffff5d9e, + 0xffffd3eb, 0x0000c65e, 0x0000fc15, 0x0001d491, 0xffffb566, 0xfffd9433, 0x00000000, 0x0000e4ec, +} + +var hardCodedFlIn16 = []float32{ + 0, 0, -0.022306755, -0.3782405, -0.33334962, -0.1741521, -0.0607556, 0.11660573, + 0.14664596, 0.41462868, 0.2907673, 0.0001568835, 0, 0, -0.14239307, -0.7012868, + -0.15632017, 0, 0, 0, 0, 0, 0, 0.3230303, + 0.6690931, 0.007876594, -0.05189419, -0.832786, -0.11531975, 0.026225802, 0.42518616, 0.3154636, + 0.06598757, -0.15304244, -0.47969276, -0.20012794, 0, 0.5327272, 0.46727282, -0.45950258, + -0.5404974, 0, 0.63484025, 0.36515975, 0, 0, 0, -0.04351709, + -0.8293345, -0.12714837, 0.11087036, 0.88912964, -0.35792422, -0.2053554, 
0.0022513224, 0.5374398, + 0.023588525, 0, 0, 0, 0, -0.55346966, -0.44653034, 0.0002531938, + 0.9088273, 0.090919495, 0, 0, 0, 0, 0, 0, + 0, 0, -0.44745448, -0.5525455, 0, 0.80748945, 0.19251058, 0, + 0, -0.092476256, -0.2661464, -0.11322958, -0.11298219, -0.055094406, 0, -0.16045958, + -0.1996116, 0, 0.80748653, 0.19251347, -0.09804727, -0.51129663, -0.3610403, -0.029615778, + 0, 0, 0, 0, 0, 0, 0, 0.80748653, + 0.14411622, -0.76251525, -0.1890875, 0.01527351, 0.27528667, 0.29730347, 0.101477206, 0.07259522, + 0.009900213, -0.34395567, -0.42788061, 0, 0.80748653, -0.3648737, -0.44261283, 0.015778137, + 0.6826565, 0.30156538, 0, 0, 0, 0, -0.44563293, -0.55436707, + 0, 0.80748653, -0.60703933, -0.20044717, 0.22371745, 0.77628255, 0, 0, + 0, 0, 0, -0.44563293, -0.55436707, 0, 0.80748653, -0.7550391, + -0.05244744, 0.2797074, 0.72029257, 0, 0, 0, 0, 0, + -0.57440215, -0.42559785, 0, 0.80748653, -0.59273535, -0.21475118, 0.04544862, 0.81148535, + 0.14306602, 0, 0, 0, -0.369642, -0.61841226, -0.011945802, 0, + 0.7791623, -0.20691396, -0.57224834, 0, 0.08218567, 0.42637306, 0.1586175, -0.089709565, + -0.32935485, -0.24788953, -0.00022224105, 0, 0, 0.7085409, 0.28821066, -0.64765793, + -0.34909368, 0, 0, 0, 0, 0, 0.16679136, 0.31914657, + -0.48593786, 0, 0.537915, 0.462085, -0.00041967133, -0.3120329, -0.41914812, -0.15886839, + -0.042683028, 0.19370951, 0.24624406, 0.45803425, -0.07049577, -0.6091341, 0, 0.22253075, +} + +var fxMask16 = []uint32{ + 0x0000, 0x0000, 0x05b8, 0x66a6, 0xbbfe, 0xe871, 0xf800, 0xda20, 0xb499, 0x4a84, 0x0009, 0x0000, 0x0000, + 0x0000, 0x2463, 0xd7fd, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xad35, 0x01f8, 0x0000, + 0x0d69, 0xe28c, 0xffff, 0xf92a, 0x8c5d, 0x3b36, 0x2a62, 0x51a7, 0xcc97, 0xffff, 0xffff, 0x772d, 0x0000, + 0x75ad, 0xffff, 0xffff, 0x5ccf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0b4a, 0xdfd6, 0xffff, 0xe2ff, 0x0000, + 0x5b67, 0x8fff, 0x8f70, 0x060a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x8e7f, 0xffff, 0xffe9, 0x16d6, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x7303, 0xffff, 0xffff, 0x30ff, + 0x0000, 0x0000, 0x0000, 0x17b0, 0x5bfe, 0x78fe, 0x95ec, 0xa3fe, 0xa3fe, 0xcd24, 0xfffe, 0xfffe, 0x30fe, + 0x0001, 0x190d, 0x9be5, 0xf868, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0x30fe, + 0x0c4c, 0xcf6f, 0xfffe, 0xfc0b, 0xb551, 0x6920, 0x4f1d, 0x3c87, 0x39ff, 0x928e, 0xffff, 0xffff, 0x30ff, + 0x8f03, 0xffff, 0xfbe7, 0x4d76, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x727f, 0xffff, 0xffff, 0x30ff, + 0xccff, 0xffff, 0xc6ff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x727f, 0xffff, 0xffff, 0x30ff, + 0xf296, 0xffff, 0xb7c6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x939a, 0xffff, 0xffff, 0x30ff, + 0xc97f, 0xffff, 0xf43c, 0x2493, 0x0000, 0x0000, 0x0000, 0x0000, 0x5f13, 0xfd0c, 0xffff, 0xffff, 0x3827, + 0x6dc9, 0xffff, 0xffff, 0xeb16, 0x7dd4, 0x5541, 0x6c76, 0xc10f, 0xfff1, 0xffff, 0xffff, 0xffff, 0x49ff, + 0x00d8, 0xa6e9, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xd4fe, 0x83db, 0xffff, 0xffff, 0x7584, + 0x0000, 0x001c, 0x503e, 0xbb08, 0xe3a1, 0xeea6, 0xbd0e, 0x7e09, 0x08e5, 0x1b8b, 0xb67f, 0xb67f, 0x7d44, +} + +var flMask16 = []uint32{ + 0x0000, 0x0000, 0x05b5, 0x668a, 0xbbe0, 0xe875, 0xf803, 0xda29, 0xb49f, 0x4a7a, 0x000a, 0x0000, 0x0000, + 0x0000, 0x2473, 0xd7fb, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xad4d, 0x0204, 0x0000, + 0x0d48, 0xe27a, 0xffff, 0xf949, 0x8c70, 0x3bae, 0x2ac9, 0x51f7, 0xccc4, 0xffff, 0xffff, 0x779f, 0x0000, + 0x75a1, 0xffff, 0xffff, 0x5d7b, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0b23, 0xdf73, 0xffff, 0xe39d, 0x0000, + 0x5ba0, 0x9033, 0x8f9f, 0x0609, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x8db0, 0xffff, 0xffef, 0x1746, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x728c, 0xffff, 0xffff, 0x3148, + 0x0000, 0x0000, 0x0000, 0x17ac, 0x5bce, 0x78cb, 0x95b7, 0xa3d2, 0xa3d2, 0xcce6, 0xffff, 0xffff, 0x3148, + 0x0000, 0x1919, 0x9bfd, 0xf86b, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x3148, + 0x0c63, 0xcf97, 0xffff, 0xfc17, 0xb59d, 0x6981, 0x4f87, 0x3cf1, 0x3a68, 0x9276, 0xffff, 0xffff, 0x3148, + 0x8eb0, 0xffff, 0xfbf5, 0x4d33, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x7214, 0xffff, 0xffff, 0x3148, + 0xccaf, 0xffff, 0xc6ba, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x7214, 0xffff, 0xffff, 0x3148, + 0xf292, 0xffff, 0xb865, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x930c, 0xffff, 0xffff, 0x3148, + 0xc906, 0xffff, 0xf45d, 0x249f, 0x0000, 0x0000, 0x0000, 0x0000, 0x5ea0, 0xfcf1, 0xffff, 0xffff, 0x3888, + 0x6d81, 0xffff, 0xffff, 0xeaf5, 0x7dcf, 0x5533, 0x6c2b, 0xc07b, 0xfff1, 0xffff, 0xffff, 0xffff, 0x4a9d, + 0x00d4, 0xa6a1, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xd54d, 0x8399, 0xffff, 0xffff, 0x764b, + 0x0000, 0x001b, 0x4ffc, 0xbb4a, 0xe3f5, 0xeee3, 0xbd4c, 0x7e42, 0x0900, 0x1b0c, 0xb6fc, 0xb6fc, 0x7e04, +} + +// TestFixedFloatingCloseness compares the closeness of the fixed point and +// floating point rasterizer. +func TestFixedFloatingCloseness(t *testing.T) { + if len(fxMask16) != len(flMask16) { + t.Fatalf("len(fxMask16) != len(flMask16)") + } + + total := uint32(0) + for i := range fxMask16 { + a := fxMask16[i] + b := flMask16[i] + if a > b { + total += a - b + } else { + total += b - a + } + } + n := len(fxMask16) + + // This log message is useful when changing the fixed point rasterizer + // implementation, such as by changing ϕ. Assuming that the floating point + // rasterizer is accurate, the average difference is a measure of how + // inaccurate the (faster) fixed point rasterizer is. + // + // Smaller is better. + percent := float64(total*100) / float64(n*65535) + t.Logf("Comparing closeness of the fixed point and floating point rasterizer.\n"+ + "Specifically, the elements of fxMask16 and flMask16.\n"+ + "Total diff = %d, n = %d, avg = %.5f out of 65535, or %.5f%%.\n", + total, n, float64(total)/float64(n), percent) + + const thresholdPercent = 1.0 + if percent > thresholdPercent { + t.Errorf("average difference: got %.5f%%, want <= %.5f%%", percent, thresholdPercent) + } +} diff --git a/vendor/golang.org/x/image/vector/gen.go b/vendor/golang.org/x/image/vector/gen.go new file mode 100644 index 0000000..28b298b --- /dev/null +++ b/vendor/golang.org/x/image/vector/gen.go @@ -0,0 +1,447 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +package main + +import ( + "bytes" + "io/ioutil" + "log" + "strings" + "text/template" +) + +const ( + copyright = "" + + "// Copyright 2016 The Go Authors. 
All rights reserved.\n" + + "// Use of this source code is governed by a BSD-style\n" + + "// license that can be found in the LICENSE file.\n" + + doNotEdit = "// generated by go run gen.go; DO NOT EDIT\n" + + dashDashDash = "// --------" +) + +func main() { + tmpl, err := ioutil.ReadFile("gen_acc_amd64.s.tmpl") + if err != nil { + log.Fatalf("ReadFile: %v", err) + } + if !bytes.HasPrefix(tmpl, []byte(copyright)) { + log.Fatal("source template did not start with the copyright header") + } + tmpl = tmpl[len(copyright):] + + preamble := []byte(nil) + if i := bytes.Index(tmpl, []byte(dashDashDash)); i < 0 { + log.Fatalf("source template did not contain %q", dashDashDash) + } else { + preamble, tmpl = tmpl[:i], tmpl[i:] + } + + t, err := template.New("").Parse(string(tmpl)) + if err != nil { + log.Fatalf("Parse: %v", err) + } + + out := bytes.NewBuffer(nil) + out.WriteString(doNotEdit) + out.Write(preamble) + + for i, v := range instances { + if i != 0 { + out.WriteString("\n") + } + if strings.Contains(v.LoadArgs, "{{.ShortName}}") { + v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1) + } + if err := t.Execute(out, v); err != nil { + log.Fatalf("Execute(%q): %v", v.ShortName, err) + } + } + + if err := ioutil.WriteFile("acc_amd64.s", out.Bytes(), 0666); err != nil { + log.Fatalf("WriteFile: %v", err) + } +} + +var instances = []struct { + LongName string + ShortName string + FrameSize string + ArgsSize string + Args string + DstElemSize1 int + DstElemSize4 int + XMM3 string + XMM4 string + XMM5 string + XMM6 string + XMM8 string + XMM9 string + XMM10 string + LoadArgs string + Setup string + LoadXMMRegs string + Add string + ClampAndScale string + ConvertToInt32 string + Store4 string + Store1 string +}{{ + LongName: "fixedAccumulateOpOver", + ShortName: "fxAccOpOver", + FrameSize: fxFrameSize, + ArgsSize: twoArgArgsSize, + Args: "dst []uint8, src []uint32", + DstElemSize1: 1 * sizeOfUint8, + DstElemSize4: 4 * sizeOfUint8, + XMM3: fxXMM3, + XMM4: fxXMM4, + XMM5: fxXMM5, + XMM6: opOverXMM6, + XMM8: opOverXMM8, + XMM9: opOverXMM9, + XMM10: opOverXMM10, + LoadArgs: twoArgLoadArgs, + Setup: fxSetup, + LoadXMMRegs: fxLoadXMMRegs + "\n" + opOverLoadXMMRegs, + Add: fxAdd, + ClampAndScale: fxClampAndScale, + ConvertToInt32: fxConvertToInt32, + Store4: opOverStore4, + Store1: opOverStore1, +}, { + LongName: "fixedAccumulateOpSrc", + ShortName: "fxAccOpSrc", + FrameSize: fxFrameSize, + ArgsSize: twoArgArgsSize, + Args: "dst []uint8, src []uint32", + DstElemSize1: 1 * sizeOfUint8, + DstElemSize4: 4 * sizeOfUint8, + XMM3: fxXMM3, + XMM4: fxXMM4, + XMM5: fxXMM5, + XMM6: opSrcXMM6, + XMM8: opSrcXMM8, + XMM9: opSrcXMM9, + XMM10: opSrcXMM10, + LoadArgs: twoArgLoadArgs, + Setup: fxSetup, + LoadXMMRegs: fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs, + Add: fxAdd, + ClampAndScale: fxClampAndScale, + ConvertToInt32: fxConvertToInt32, + Store4: opSrcStore4, + Store1: opSrcStore1, +}, { + LongName: "fixedAccumulateMask", + ShortName: "fxAccMask", + FrameSize: fxFrameSize, + ArgsSize: oneArgArgsSize, + Args: "buf []uint32", + DstElemSize1: 1 * sizeOfUint32, + DstElemSize4: 4 * sizeOfUint32, + XMM3: fxXMM3, + XMM4: fxXMM4, + XMM5: fxXMM5, + XMM6: maskXMM6, + XMM8: maskXMM8, + XMM9: maskXMM9, + XMM10: maskXMM10, + LoadArgs: oneArgLoadArgs, + Setup: fxSetup, + LoadXMMRegs: fxLoadXMMRegs + "\n" + maskLoadXMMRegs, + Add: fxAdd, + ClampAndScale: fxClampAndScale, + ConvertToInt32: fxConvertToInt32, + Store4: maskStore4, + Store1: maskStore1, +}, { + LongName: "floatingAccumulateOpOver", + ShortName: 
"flAccOpOver", + FrameSize: flFrameSize, + ArgsSize: twoArgArgsSize, + Args: "dst []uint8, src []float32", + DstElemSize1: 1 * sizeOfUint8, + DstElemSize4: 4 * sizeOfUint8, + XMM3: flXMM3, + XMM4: flXMM4, + XMM5: flXMM5, + XMM6: opOverXMM6, + XMM8: opOverXMM8, + XMM9: opOverXMM9, + XMM10: opOverXMM10, + LoadArgs: twoArgLoadArgs, + Setup: flSetup, + LoadXMMRegs: flLoadXMMRegs + "\n" + opOverLoadXMMRegs, + Add: flAdd, + ClampAndScale: flClampAndScale, + ConvertToInt32: flConvertToInt32, + Store4: opOverStore4, + Store1: opOverStore1, +}, { + LongName: "floatingAccumulateOpSrc", + ShortName: "flAccOpSrc", + FrameSize: flFrameSize, + ArgsSize: twoArgArgsSize, + Args: "dst []uint8, src []float32", + DstElemSize1: 1 * sizeOfUint8, + DstElemSize4: 4 * sizeOfUint8, + XMM3: flXMM3, + XMM4: flXMM4, + XMM5: flXMM5, + XMM6: opSrcXMM6, + XMM8: opSrcXMM8, + XMM9: opSrcXMM9, + XMM10: opSrcXMM10, + LoadArgs: twoArgLoadArgs, + Setup: flSetup, + LoadXMMRegs: flLoadXMMRegs + "\n" + opSrcLoadXMMRegs, + Add: flAdd, + ClampAndScale: flClampAndScale, + ConvertToInt32: flConvertToInt32, + Store4: opSrcStore4, + Store1: opSrcStore1, +}, { + LongName: "floatingAccumulateMask", + ShortName: "flAccMask", + FrameSize: flFrameSize, + ArgsSize: twoArgArgsSize, + Args: "dst []uint32, src []float32", + DstElemSize1: 1 * sizeOfUint32, + DstElemSize4: 4 * sizeOfUint32, + XMM3: flXMM3, + XMM4: flXMM4, + XMM5: flXMM5, + XMM6: maskXMM6, + XMM8: maskXMM8, + XMM9: maskXMM9, + XMM10: maskXMM10, + LoadArgs: twoArgLoadArgs, + Setup: flSetup, + LoadXMMRegs: flLoadXMMRegs + "\n" + maskLoadXMMRegs, + Add: flAdd, + ClampAndScale: flClampAndScale, + ConvertToInt32: flConvertToInt32, + Store4: maskStore4, + Store1: maskStore1, +}} + +const ( + fxFrameSize = `0` + flFrameSize = `8` + + oneArgArgsSize = `24` + twoArgArgsSize = `48` + + sizeOfUint8 = 1 + sizeOfUint32 = 4 + + fxXMM3 = `-` + flXMM3 = `flSignMask` + + fxXMM4 = `-` + flXMM4 = `flOne` + + fxXMM5 = `fxAlmost65536` + flXMM5 = `flAlmost65536` + + oneArgLoadArgs = ` + MOVQ buf_base+0(FP), DI + MOVQ buf_len+8(FP), BX + MOVQ buf_base+0(FP), SI + MOVQ buf_len+8(FP), R10 + ` + twoArgLoadArgs = ` + MOVQ dst_base+0(FP), DI + MOVQ dst_len+8(FP), BX + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), R10 + // Sanity check that len(dst) >= len(src). + CMPQ BX, R10 + JLT {{.ShortName}}End + ` + + fxSetup = `` + flSetup = ` + // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is + // "Round To Zero". + STMXCSR mxcsrOrig-8(SP) + MOVL mxcsrOrig-8(SP), AX + ORL $0x6000, AX + MOVL AX, mxcsrNew-4(SP) + ` + + fxLoadXMMRegs = ` + // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16. + MOVOU fxAlmost65536<>(SB), X5 + ` + flLoadXMMRegs = ` + // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. + // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. + MOVOU flSignMask<>(SB), X3 + MOVOU flOne<>(SB), X4 + MOVOU flAlmost65536<>(SB), X5 + ` + + fxAdd = `PADDD` + flAdd = `ADDPS` + + fxClampAndScale = ` + // y = abs(x) + // y >>= 2 // Shift by 2*ϕ - 16. + // y = min(y, fxAlmost65536) + // + // pabsd %xmm1,%xmm2 + // psrld $0x2,%xmm2 + // pminud %xmm5,%xmm2 + // + // Hopefully we'll get these opcode mnemonics into the assembler for Go + // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but + // it's similar. 
+ BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + ` + flClampAndScale = ` + // y = x & flSignMask + // y = min(y, flOne) + // y = mul(y, flAlmost65536) + MOVOU X3, X2 + ANDPS X1, X2 + MINPS X4, X2 + MULPS X5, X2 + ` + + fxConvertToInt32 = ` + // z = convertToInt32(y) + // No-op. + ` + flConvertToInt32 = ` + // z = convertToInt32(y) + LDMXCSR mxcsrNew-4(SP) + CVTPS2PL X2, X2 + LDMXCSR mxcsrOrig-8(SP) + ` + + opOverStore4 = ` + // Blend over the dst's prior value. SIMD for i in 0..3: + // + // dstA := uint32(dst[i]) * 0x101 + // maskA := z@i + // outA := dstA*(0xffff-maskA)/0xffff + maskA + // dst[i] = uint8(outA >> 8) + // + // First, set X0 to dstA*(0xfff-maskA). + MOVL (DI), X0 + PSHUFB X8, X0 + MOVOU X9, X11 + PSUBL X2, X11 + PMULLD X11, X0 + // We implement uint32 division by 0xffff as multiplication by a magic + // constant (0x800080001) and then a shift by a magic constant (47). + // See TestDivideByFFFF for a justification. + // + // That multiplication widens from uint32 to uint64, so we have to + // duplicate and shift our four uint32s from one XMM register (X0) to + // two XMM registers (X0 and X11). + // + // Move the second and fourth uint32s in X0 to be the first and third + // uint32s in X11. + MOVOU X0, X11 + PSRLQ $32, X11 + // Multiply by magic, shift by magic. + // + // pmuludq %xmm10,%xmm0 + // pmuludq %xmm10,%xmm11 + BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2 + BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda + PSRLQ $47, X0 + PSRLQ $47, X11 + // Merge the two registers back to one, X11, and add maskA. + PSLLQ $32, X11 + XORPS X0, X11 + PADDD X11, X2 + // As per opSrcStore4, shuffle and copy the 4 second-lowest bytes. + PSHUFB X6, X2 + MOVL X2, (DI) + ` + opSrcStore4 = ` + // z = shuffleTheSecondLowestBytesOfEach4ByteElement(z) + // copy(dst[:4], low4BytesOf(z)) + PSHUFB X6, X2 + MOVL X2, (DI) + ` + maskStore4 = ` + // copy(dst[:4], z) + MOVOU X2, (DI) + ` + + opOverStore1 = ` + // Blend over the dst's prior value. + // + // dstA := uint32(dst[0]) * 0x101 + // maskA := z + // outA := dstA*(0xffff-maskA)/0xffff + maskA + // dst[0] = uint8(outA >> 8) + MOVBLZX (DI), R12 + IMULL $0x101, R12 + MOVL X2, R13 + MOVL $0xffff, AX + SUBL R13, AX + MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX. + MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant... + MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX. + SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15). + ADDL DX, R13 + SHRL $8, R13 + MOVB R13, (DI) + ` + opSrcStore1 = ` + // dst[0] = uint8(z>>8) + MOVL X2, BX + SHRL $8, BX + MOVB BX, (DI) + ` + maskStore1 = ` + // dst[0] = uint32(z) + MOVL X2, (DI) + ` + + opOverXMM6 = `gather` + opSrcXMM6 = `gather` + maskXMM6 = `-` + + opOverXMM8 = `scatterAndMulBy0x101` + opSrcXMM8 = `-` + maskXMM8 = `-` + + opOverXMM9 = `fxAlmost65536` + opSrcXMM9 = `-` + maskXMM9 = `-` + + opOverXMM10 = `inverseFFFF` + opSrcXMM10 = `-` + maskXMM10 = `-` + + opOverLoadXMMRegs = ` + // gather := XMM(see above) // PSHUFB shuffle mask. + // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. + // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff. + // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff. 
+ MOVOU gather<>(SB), X6 + MOVOU scatterAndMulBy0x101<>(SB), X8 + MOVOU fxAlmost65536<>(SB), X9 + MOVOU inverseFFFF<>(SB), X10 + ` + opSrcLoadXMMRegs = ` + // gather := XMM(see above) // PSHUFB shuffle mask. + MOVOU gather<>(SB), X6 + ` + maskLoadXMMRegs = `` +) diff --git a/vendor/golang.org/x/image/vector/gen_acc_amd64.s.tmpl b/vendor/golang.org/x/image/vector/gen_acc_amd64.s.tmpl new file mode 100644 index 0000000..66b21a1 --- /dev/null +++ b/vendor/golang.org/x/image/vector/gen_acc_amd64.s.tmpl @@ -0,0 +1,171 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build go1.6 +// +build !noasm + +#include "textflag.h" + +// fl is short for floating point math. fx is short for fixed point math. + +DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff +DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff +DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000 +DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000 +DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff +DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff + +// scatterAndMulBy0x101 is a PSHUFB mask that brings the low four bytes of an +// XMM register to the low byte of that register's four uint32 values. It +// duplicates those bytes, effectively multiplying each uint32 by 0x101. +// +// It transforms a little-endian 16-byte XMM value from +// ijkl???????????? +// to +// ii00jj00kk00ll00 +DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000 +DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202 + +// gather is a PSHUFB mask that brings the second-lowest byte of the XMM +// register's four uint32 values to the low four bytes of that register. +// +// It transforms a little-endian 16-byte XMM value from +// ?i???j???k???l?? +// to +// ijkl000000000000 +DATA gather<>+0x00(SB)/8, $0x808080800d090501 +DATA gather<>+0x08(SB)/8, $0x8080808080808080 + +DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff +DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff +DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001 +DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001 + +GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16 +GLOBL flOne<>(SB), (NOPTR+RODATA), $16 +GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16 +GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16 +GLOBL gather<>(SB), (NOPTR+RODATA), $16 +GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16 +GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16 + +// func haveSSE4_1() bool +TEXT ·haveSSE4_1(SB), NOSPLIT, $0 + MOVQ $1, AX + CPUID + SHRQ $19, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + +// ---------------------------------------------------------------------------- + +// func {{.LongName}}SIMD({{.Args}}) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 {{.XMM3}} +// xmm4 {{.XMM4}} +// xmm5 {{.XMM5}} +// xmm6 {{.XMM6}} +// xmm7 offset +// xmm8 {{.XMM8}} +// xmm9 {{.XMM9}} +// xmm10 {{.XMM10}} +TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-{{.ArgsSize}} + {{.LoadArgs}} + + // R10 = len(src) &^ 3 + // R11 = len(src) + MOVQ R10, R11 + ANDQ $-4, R10 + + {{.Setup}} + + {{.LoadXMMRegs}} + + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. 
+ XORPS X7, X7 + + // i := 0 + MOVQ $0, R9 + +{{.ShortName}}Loop4: + // for i < (len(src) &^ 3) + CMPQ R9, R10 + JAE {{.ShortName}}Loop1 + + // x = XMM(s0, s1, s2, s3) + // + // Where s0 is src[i+0], s1 is src[i+1], etc. + MOVOU (SI), X1 + + // scratch = XMM(0, s0, s1, s2) + // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) + MOVOU X1, X0 + PSLLO $4, X0 + {{.Add}} X0, X1 + + // scratch = XMM(0, 0, 0, 0) + // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) + // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) + XORPS X0, X0 + SHUFPS $0x40, X1, X0 + {{.Add}} X0, X1 + + // x += offset + {{.Add}} X7, X1 + + {{.ClampAndScale}} + + {{.ConvertToInt32}} + + {{.Store4}} + + // offset = XMM(x@3, x@3, x@3, x@3) + MOVOU X1, X7 + SHUFPS $0xff, X1, X7 + + // i += 4 + // dst = dst[4:] + // src = src[4:] + ADDQ $4, R9 + ADDQ ${{.DstElemSize4}}, DI + ADDQ $16, SI + JMP {{.ShortName}}Loop4 + +{{.ShortName}}Loop1: + // for i < len(src) + CMPQ R9, R11 + JAE {{.ShortName}}End + + // x = src[i] + offset + MOVL (SI), X1 + {{.Add}} X7, X1 + + {{.ClampAndScale}} + + {{.ConvertToInt32}} + + {{.Store1}} + + // offset = x + MOVOU X1, X7 + + // i += 1 + // dst = dst[1:] + // src = src[1:] + ADDQ $1, R9 + ADDQ ${{.DstElemSize1}}, DI + ADDQ $4, SI + JMP {{.ShortName}}Loop1 + +{{.ShortName}}End: + RET diff --git a/vendor/golang.org/x/image/vector/raster_fixed.go b/vendor/golang.org/x/image/vector/raster_fixed.go new file mode 100644 index 0000000..5b0fe7a --- /dev/null +++ b/vendor/golang.org/x/image/vector/raster_fixed.go @@ -0,0 +1,327 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package vector + +// This file contains a fixed point math implementation of the vector +// graphics rasterizer. + +const ( + // ϕ is the number of binary digits after the fixed point. + // + // For example, if ϕ == 10 (and int1ϕ is based on the int32 type) then we + // are using 22.10 fixed point math. + // + // When changing this number, also change the assembly code (search for ϕ + // in the .s files). + ϕ = 9 + + fxOne int1ϕ = 1 << ϕ + fxOneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1) + fxOneMinusIota int1ϕ = 1<<ϕ - 1 // Used for rounding up. +) + +// int1ϕ is a signed fixed-point number with 1*ϕ binary digits after the fixed +// point. +type int1ϕ int32 + +// int2ϕ is a signed fixed-point number with 2*ϕ binary digits after the fixed +// point. +// +// The Rasterizer's bufU32 field, nominally of type []uint32 (since that slice +// is also used by other code), can be thought of as a []int2ϕ during the +// fixedLineTo method. Lines of code that are actually like: +// buf[i] += uint32(etc) // buf has type []uint32. +// can be thought of as +// buf[i] += int2ϕ(etc) // buf has type []int2ϕ. +type int2ϕ int32 + +func fixedMax(x, y int1ϕ) int1ϕ { + if x > y { + return x + } + return y +} + +func fixedMin(x, y int1ϕ) int1ϕ { + if x < y { + return x + } + return y +} + +func fixedFloor(x int1ϕ) int32 { return int32(x >> ϕ) } +func fixedCeil(x int1ϕ) int32 { return int32((x + fxOneMinusIota) >> ϕ) } + +func (z *Rasterizer) fixedLineTo(bx, by float32) { + ax, ay := z.penX, z.penY + z.penX, z.penY = bx, by + dir := int1ϕ(1) + if ay > by { + dir, ax, ay, bx, by = -1, bx, by, ax, ay + } + // Horizontal line segments yield no change in coverage. 
Almost horizontal + // segments would yield some change, in ideal math, but the computation + // further below, involving 1 / (by - ay), is unstable in fixed point math, + // so we treat the segment as if it was perfectly horizontal. + if by-ay <= 0.000001 { + return + } + dxdy := (bx - ax) / (by - ay) + + ayϕ := int1ϕ(ay * float32(fxOne)) + byϕ := int1ϕ(by * float32(fxOne)) + + x := int1ϕ(ax * float32(fxOne)) + y := fixedFloor(ayϕ) + yMax := fixedCeil(byϕ) + if yMax > int32(z.size.Y) { + yMax = int32(z.size.Y) + } + width := int32(z.size.X) + + for ; y < yMax; y++ { + dy := fixedMin(int1ϕ(y+1)<<ϕ, byϕ) - fixedMax(int1ϕ(y)<<ϕ, ayϕ) + xNext := x + int1ϕ(float32(dy)*dxdy) + if y < 0 { + x = xNext + continue + } + buf := z.bufU32[y*width:] + d := dy * dir // d ranges up to ±1<<(1*ϕ). + x0, x1 := x, xNext + if x > xNext { + x0, x1 = x1, x0 + } + x0i := fixedFloor(x0) + x0Floor := int1ϕ(x0i) << ϕ + x1i := fixedCeil(x1) + x1Ceil := int1ϕ(x1i) << ϕ + + if x1i <= x0i+1 { + xmf := (x+xNext)>>1 - x0Floor + if i := clamp(x0i+0, width); i < uint(len(buf)) { + buf[i] += uint32(d * (fxOne - xmf)) + } + if i := clamp(x0i+1, width); i < uint(len(buf)) { + buf[i] += uint32(d * xmf) + } + } else { + oneOverS := x1 - x0 + twoOverS := 2 * oneOverS + x0f := x0 - x0Floor + oneMinusX0f := fxOne - x0f + oneMinusX0fSquared := oneMinusX0f * oneMinusX0f + x1f := x1 - x1Ceil + fxOne + x1fSquared := x1f * x1f + + // These next two variables are unused, as rounding errors are + // minimized when we delay the division by oneOverS for as long as + // possible. These lines of code (and the "In ideal math" comments + // below) are commented out instead of deleted in order to aid the + // comparison with the floating point version of the rasterizer. + // + // a0 := ((oneMinusX0f * oneMinusX0f) >> 1) / oneOverS + // am := ((x1f * x1f) >> 1) / oneOverS + + if i := clamp(x0i, width); i < uint(len(buf)) { + // In ideal math: buf[i] += uint32(d * a0) + D := oneMinusX0fSquared // D ranges up to ±1<<(2*ϕ). + D *= d // D ranges up to ±1<<(3*ϕ). + D /= twoOverS + buf[i] += uint32(D) + } + + if x1i == x0i+2 { + if i := clamp(x0i+1, width); i < uint(len(buf)) { + // In ideal math: buf[i] += uint32(d * (fxOne - a0 - am)) + // + // (x1i == x0i+2) and (twoOverS == 2 * (x1 - x0)) implies + // that twoOverS ranges up to +1<<(1*ϕ+2). + D := twoOverS<<ϕ - oneMinusX0fSquared - x1fSquared // D ranges up to ±1<<(2*ϕ+2). + D *= d // D ranges up to ±1<<(3*ϕ+2). + D /= twoOverS + buf[i] += uint32(D) + } + } else { + // This is commented out for the same reason as a0 and am. + // + // a1 := ((fxOneAndAHalf - x0f) << ϕ) / oneOverS + + if i := clamp(x0i+1, width); i < uint(len(buf)) { + // In ideal math: + // buf[i] += uint32(d * (a1 - a0)) + // or equivalently (but better in non-ideal, integer math, + // with respect to rounding errors), + // buf[i] += uint32(A * d / twoOverS) + // where + // A = (a1 - a0) * twoOverS + // = a1*twoOverS - a0*twoOverS + // Noting that twoOverS/oneOverS equals 2, substituting for + // a0 and then a1, given above, yields: + // A = a1*twoOverS - oneMinusX0fSquared + // = (fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared + // = fxOneAndAHalf<<(ϕ+1) - x0f<<(ϕ+1) - oneMinusX0fSquared + // + // This is a positive number minus two non-negative + // numbers. 
For an upper bound on A, the positive number is + // P = fxOneAndAHalf<<(ϕ+1) + // < (2*fxOne)<<(ϕ+1) + // = fxOne<<(ϕ+2) + // = 1<<(2*ϕ+2) + // + // For a lower bound on A, the two non-negative numbers are + // N = x0f<<(ϕ+1) + oneMinusX0fSquared + // ≤ x0f<<(ϕ+1) + fxOne*fxOne + // = x0f<<(ϕ+1) + 1<<(2*ϕ) + // < x0f<<(ϕ+1) + 1<<(2*ϕ+1) + // ≤ fxOne<<(ϕ+1) + 1<<(2*ϕ+1) + // = 1<<(2*ϕ+1) + 1<<(2*ϕ+1) + // = 1<<(2*ϕ+2) + // + // Thus, A ranges up to ±1<<(2*ϕ+2). It is possible to + // derive a tighter bound, but this bound is sufficient to + // reason about overflow. + D := (fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared // D ranges up to ±1<<(2*ϕ+2). + D *= d // D ranges up to ±1<<(3*ϕ+2). + D /= twoOverS + buf[i] += uint32(D) + } + dTimesS := uint32((d << (2 * ϕ)) / oneOverS) + for xi := x0i + 2; xi < x1i-1; xi++ { + if i := clamp(xi, width); i < uint(len(buf)) { + buf[i] += dTimesS + } + } + + // This is commented out for the same reason as a0 and am. + // + // a2 := a1 + (int1ϕ(x1i-x0i-3)<<(2*ϕ))/oneOverS + + if i := clamp(x1i-1, width); i < uint(len(buf)) { + // In ideal math: + // buf[i] += uint32(d * (fxOne - a2 - am)) + // or equivalently (but better in non-ideal, integer math, + // with respect to rounding errors), + // buf[i] += uint32(A * d / twoOverS) + // where + // A = (fxOne - a2 - am) * twoOverS + // = twoOverS<<ϕ - a2*twoOverS - am*twoOverS + // Noting that twoOverS/oneOverS equals 2, substituting for + // am and then a2, given above, yields: + // A = twoOverS<<ϕ - a2*twoOverS - x1f*x1f + // = twoOverS<<ϕ - a1*twoOverS - (int1ϕ(x1i-x0i-3)<<(2*ϕ))*2 - x1f*x1f + // = twoOverS<<ϕ - a1*twoOverS - int1ϕ(x1i-x0i-3)<<(2*ϕ+1) - x1f*x1f + // Substituting for a1, given above, yields: + // A = twoOverS<<ϕ - ((fxOneAndAHalf-x0f)<<ϕ)*2 - int1ϕ(x1i-x0i-3)<<(2*ϕ+1) - x1f*x1f + // = twoOverS<<ϕ - (fxOneAndAHalf-x0f)<<(ϕ+1) - int1ϕ(x1i-x0i-3)<<(2*ϕ+1) - x1f*x1f + // = B<<ϕ - x1f*x1f + // where + // B = twoOverS - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1) + // = (x1-x0)<<1 - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1) + // + // Re-arranging the defintions given above: + // x0Floor := int1ϕ(x0i) << ϕ + // x0f := x0 - x0Floor + // x1Ceil := int1ϕ(x1i) << ϕ + // x1f := x1 - x1Ceil + fxOne + // combined with fxOne = 1<<ϕ yields: + // x0 = x0f + int1ϕ(x0i)<<ϕ + // x1 = x1f + int1ϕ(x1i-1)<<ϕ + // so that expanding (x1-x0) yields: + // B = (x1f-x0f + int1ϕ(x1i-x0i-1)<<ϕ)<<1 - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1) + // = (x1f-x0f)<<1 + int1ϕ(x1i-x0i-1)<<(ϕ+1) - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1) + // A large part of the second and fourth terms cancel: + // B = (x1f-x0f)<<1 - (fxOneAndAHalf-x0f)<<1 - int1ϕ(-2)<<(ϕ+1) + // = (x1f-x0f)<<1 - (fxOneAndAHalf-x0f)<<1 + 1<<(ϕ+2) + // = (x1f - fxOneAndAHalf)<<1 + 1<<(ϕ+2) + // The first term, (x1f - fxOneAndAHalf)<<1, is a negative + // number, bounded below by -fxOneAndAHalf<<1, which is + // greater than -fxOne<<2, or -1<<(ϕ+2). Thus, B ranges up + // to ±1<<(ϕ+2). One final simplification: + // B = x1f<<1 + (1<<(ϕ+2) - fxOneAndAHalf<<1) + const C = 1<<(ϕ+2) - fxOneAndAHalf<<1 + D := x1f<<1 + C // D ranges up to ±1<<(1*ϕ+2). + D <<= ϕ // D ranges up to ±1<<(2*ϕ+2). + D -= x1fSquared // D ranges up to ±1<<(2*ϕ+3). + D *= d // D ranges up to ±1<<(3*ϕ+3). + D /= twoOverS + buf[i] += uint32(D) + } + } + + if i := clamp(x1i, width); i < uint(len(buf)) { + // In ideal math: buf[i] += uint32(d * am) + D := x1fSquared // D ranges up to ±1<<(2*ϕ). + D *= d // D ranges up to ±1<<(3*ϕ). 
+ D /= twoOverS + buf[i] += uint32(D) + } + } + + x = xNext + } +} + +func fixedAccumulateOpOver(dst []uint8, src []uint32) { + // Sanity check that len(dst) >= len(src). + if len(dst) < len(src) { + return + } + + acc := int2ϕ(0) + for i, v := range src { + acc += int2ϕ(v) + a := acc + if a < 0 { + a = -a + } + a >>= 2*ϕ - 16 + if a > 0xffff { + a = 0xffff + } + // This algorithm comes from the standard library's image/draw package. + dstA := uint32(dst[i]) * 0x101 + maskA := uint32(a) + outA := dstA*(0xffff-maskA)/0xffff + maskA + dst[i] = uint8(outA >> 8) + } +} + +func fixedAccumulateOpSrc(dst []uint8, src []uint32) { + // Sanity check that len(dst) >= len(src). + if len(dst) < len(src) { + return + } + + acc := int2ϕ(0) + for i, v := range src { + acc += int2ϕ(v) + a := acc + if a < 0 { + a = -a + } + a >>= 2*ϕ - 8 + if a > 0xff { + a = 0xff + } + dst[i] = uint8(a) + } +} + +func fixedAccumulateMask(buf []uint32) { + acc := int2ϕ(0) + for i, v := range buf { + acc += int2ϕ(v) + a := acc + if a < 0 { + a = -a + } + a >>= 2*ϕ - 16 + if a > 0xffff { + a = 0xffff + } + buf[i] = uint32(a) + } +} diff --git a/vendor/golang.org/x/image/vector/raster_floating.go b/vendor/golang.org/x/image/vector/raster_floating.go new file mode 100644 index 0000000..fd11db1 --- /dev/null +++ b/vendor/golang.org/x/image/vector/raster_floating.go @@ -0,0 +1,220 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package vector + +// This file contains a floating point math implementation of the vector +// graphics rasterizer. + +import ( + "math" +) + +func floatingMax(x, y float32) float32 { + if x > y { + return x + } + return y +} + +func floatingMin(x, y float32) float32 { + if x < y { + return x + } + return y +} + +func floatingFloor(x float32) int32 { return int32(math.Floor(float64(x))) } +func floatingCeil(x float32) int32 { return int32(math.Ceil(float64(x))) } + +func (z *Rasterizer) floatingLineTo(bx, by float32) { + ax, ay := z.penX, z.penY + z.penX, z.penY = bx, by + dir := float32(1) + if ay > by { + dir, ax, ay, bx, by = -1, bx, by, ax, ay + } + // Horizontal line segments yield no change in coverage. Almost horizontal + // segments would yield some change, in ideal math, but the computation + // further below, involving 1 / (by - ay), is unstable in floating point + // math, so we treat the segment as if it was perfectly horizontal. + if by-ay <= 0.000001 { + return + } + dxdy := (bx - ax) / (by - ay) + + x := ax + y := floatingFloor(ay) + yMax := floatingCeil(by) + if yMax > int32(z.size.Y) { + yMax = int32(z.size.Y) + } + width := int32(z.size.X) + + for ; y < yMax; y++ { + dy := floatingMin(float32(y+1), by) - floatingMax(float32(y), ay) + + // The "float32" in expressions like "float32(foo*bar)" here and below + // look redundant, since foo and bar already have type float32, but are + // explicit in order to disable the compiler's Fused Multiply Add (FMA) + // instruction selection, which can improve performance but can result + // in different rounding errors in floating point computations. + // + // This package aims to have bit-exact identical results across all + // GOARCHes, and across pure Go code and assembly, so it disables FMA. 
+ // + // See the discussion at + // https://groups.google.com/d/topic/golang-dev/Sti0bl2xUXQ/discussion + xNext := x + float32(dy*dxdy) + if y < 0 { + x = xNext + continue + } + buf := z.bufF32[y*width:] + d := float32(dy * dir) + x0, x1 := x, xNext + if x > xNext { + x0, x1 = x1, x0 + } + x0i := floatingFloor(x0) + x0Floor := float32(x0i) + x1i := floatingCeil(x1) + x1Ceil := float32(x1i) + + if x1i <= x0i+1 { + xmf := float32(0.5*(x+xNext)) - x0Floor + if i := clamp(x0i+0, width); i < uint(len(buf)) { + buf[i] += d - float32(d*xmf) + } + if i := clamp(x0i+1, width); i < uint(len(buf)) { + buf[i] += float32(d * xmf) + } + } else { + s := 1 / (x1 - x0) + x0f := x0 - x0Floor + oneMinusX0f := 1 - x0f + a0 := float32(0.5 * s * oneMinusX0f * oneMinusX0f) + x1f := x1 - x1Ceil + 1 + am := float32(0.5 * s * x1f * x1f) + + if i := clamp(x0i, width); i < uint(len(buf)) { + buf[i] += float32(d * a0) + } + + if x1i == x0i+2 { + if i := clamp(x0i+1, width); i < uint(len(buf)) { + buf[i] += float32(d * (1 - a0 - am)) + } + } else { + a1 := float32(s * (1.5 - x0f)) + if i := clamp(x0i+1, width); i < uint(len(buf)) { + buf[i] += float32(d * (a1 - a0)) + } + dTimesS := float32(d * s) + for xi := x0i + 2; xi < x1i-1; xi++ { + if i := clamp(xi, width); i < uint(len(buf)) { + buf[i] += dTimesS + } + } + a2 := a1 + float32(s*float32(x1i-x0i-3)) + if i := clamp(x1i-1, width); i < uint(len(buf)) { + buf[i] += float32(d * (1 - a2 - am)) + } + } + + if i := clamp(x1i, width); i < uint(len(buf)) { + buf[i] += float32(d * am) + } + } + + x = xNext + } +} + +const ( + // almost256 scales a floating point value in the range [0, 1] to a uint8 + // value in the range [0x00, 0xff]. + // + // 255 is too small. Floating point math accumulates rounding errors, so a + // fully covered src value that would in ideal math be float32(1) might be + // float32(1-ε), and uint8(255 * (1-ε)) would be 0xfe instead of 0xff. The + // uint8 conversion rounds to zero, not to nearest. + // + // 256 is too big. If we multiplied by 256, below, then a fully covered src + // value of float32(1) would translate to uint8(256 * 1), which can be 0x00 + // instead of the maximal value 0xff. + // + // math.Float32bits(almost256) is 0x437fffff. + almost256 = 255.99998 + + // almost65536 scales a floating point value in the range [0, 1] to a + // uint16 value in the range [0x0000, 0xffff]. + // + // math.Float32bits(almost65536) is 0x477fffff. + almost65536 = almost256 * 256 +) + +func floatingAccumulateOpOver(dst []uint8, src []float32) { + // Sanity check that len(dst) >= len(src). + if len(dst) < len(src) { + return + } + + acc := float32(0) + for i, v := range src { + acc += v + a := acc + if a < 0 { + a = -a + } + if a > 1 { + a = 1 + } + // This algorithm comes from the standard library's image/draw package. + dstA := uint32(dst[i]) * 0x101 + maskA := uint32(almost65536 * a) + outA := dstA*(0xffff-maskA)/0xffff + maskA + dst[i] = uint8(outA >> 8) + } +} + +func floatingAccumulateOpSrc(dst []uint8, src []float32) { + // Sanity check that len(dst) >= len(src). + if len(dst) < len(src) { + return + } + + acc := float32(0) + for i, v := range src { + acc += v + a := acc + if a < 0 { + a = -a + } + if a > 1 { + a = 1 + } + dst[i] = uint8(almost256 * a) + } +} + +func floatingAccumulateMask(dst []uint32, src []float32) { + // Sanity check that len(dst) >= len(src). 
+ if len(dst) < len(src) { + return + } + + acc := float32(0) + for i, v := range src { + acc += v + a := acc + if a < 0 { + a = -a + } + if a > 1 { + a = 1 + } + dst[i] = uint32(almost65536 * a) + } +} diff --git a/vendor/golang.org/x/image/vector/vector.go b/vendor/golang.org/x/image/vector/vector.go new file mode 100644 index 0000000..852a4f8 --- /dev/null +++ b/vendor/golang.org/x/image/vector/vector.go @@ -0,0 +1,472 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:generate go run gen.go +//go:generate asmfmt -w acc_amd64.s + +// asmfmt is https://github.com/klauspost/asmfmt + +// Package vector provides a rasterizer for 2-D vector graphics. +package vector // import "golang.org/x/image/vector" + +// The rasterizer's design follows +// https://medium.com/@raphlinus/inside-the-fastest-font-renderer-in-the-world-75ae5270c445 +// +// Proof of concept code is in +// https://github.com/google/font-go +// +// See also: +// http://nothings.org/gamedev/rasterize/ +// http://projects.tuxee.net/cl-vectors/section-the-cl-aa-algorithm +// https://people.gnome.org/~mathieu/libart/internals.html#INTERNALS-SCANLINE + +import ( + "image" + "image/color" + "image/draw" + "math" +) + +// floatingPointMathThreshold is the width or height above which the rasterizer +// chooses to used floating point math instead of fixed point math. +// +// Both implementations of line segmentation rasterization (see raster_fixed.go +// and raster_floating.go) implement the same algorithm (in ideal, infinite +// precision math) but they perform differently in practice. The fixed point +// math version is roughtly 1.25x faster (on GOARCH=amd64) on the benchmarks, +// but at sufficiently large scales, the computations will overflow and hence +// show rendering artifacts. The floating point math version has more +// consistent quality over larger scales, but it is significantly slower. +// +// This constant determines when to use the faster implementation and when to +// use the better quality implementation. +// +// The rationale for this particular value is that TestRasterizePolygon in +// vector_test.go checks the rendering quality of polygon edges at various +// angles, inscribed in a circle of diameter 512. It may be that a higher value +// would still produce acceptable quality, but 512 seems to work. +const floatingPointMathThreshold = 512 + +func lerp(t, px, py, qx, qy float32) (x, y float32) { + return px + t*(qx-px), py + t*(qy-py) +} + +func clamp(i, width int32) uint { + if i < 0 { + return 0 + } + if i < width { + return uint(i) + } + return uint(width) +} + +// NewRasterizer returns a new Rasterizer whose rendered mask image is bounded +// by the given width and height. +func NewRasterizer(w, h int) *Rasterizer { + z := &Rasterizer{} + z.Reset(w, h) + return z +} + +// Raster is a 2-D vector graphics rasterizer. +// +// The zero value is usable, in that it is a Rasterizer whose rendered mask +// image has zero width and zero height. Call Reset to change its bounds. +type Rasterizer struct { + // bufXxx are buffers of float32 or uint32 values, holding either the + // individual or cumulative area values. + // + // We don't actually need both values at any given time, and to conserve + // memory, the integration of the individual to the cumulative could modify + // the buffer in place. 
In other words, we could use a single buffer, say + // of type []uint32, and add some math.Float32bits and math.Float32frombits + // calls to satisfy the compiler's type checking. As of Go 1.7, though, + // there is a performance penalty between: + // bufF32[i] += x + // and + // bufU32[i] = math.Float32bits(x + math.Float32frombits(bufU32[i])) + // + // See golang.org/issue/17220 for some discussion. + bufF32 []float32 + bufU32 []uint32 + + useFloatingPointMath bool + + size image.Point + firstX float32 + firstY float32 + penX float32 + penY float32 + + // DrawOp is the operator used for the Draw method. + // + // The zero value is draw.Over. + DrawOp draw.Op + + // TODO: an exported field equivalent to the mask point in the + // draw.DrawMask function in the stdlib image/draw package? +} + +// Reset resets a Rasterizer as if it was just returned by NewRasterizer. +// +// This includes setting z.DrawOp to draw.Over. +func (z *Rasterizer) Reset(w, h int) { + z.size = image.Point{w, h} + z.firstX = 0 + z.firstY = 0 + z.penX = 0 + z.penY = 0 + z.DrawOp = draw.Over + + z.setUseFloatingPointMath(w > floatingPointMathThreshold || h > floatingPointMathThreshold) +} + +func (z *Rasterizer) setUseFloatingPointMath(b bool) { + z.useFloatingPointMath = b + + // Make z.bufF32 or z.bufU32 large enough to hold width * height samples. + if z.useFloatingPointMath { + if n := z.size.X * z.size.Y; n > cap(z.bufF32) { + z.bufF32 = make([]float32, n) + } else { + z.bufF32 = z.bufF32[:n] + for i := range z.bufF32 { + z.bufF32[i] = 0 + } + } + } else { + if n := z.size.X * z.size.Y; n > cap(z.bufU32) { + z.bufU32 = make([]uint32, n) + } else { + z.bufU32 = z.bufU32[:n] + for i := range z.bufU32 { + z.bufU32[i] = 0 + } + } + } +} + +// Size returns the width and height passed to NewRasterizer or Reset. +func (z *Rasterizer) Size() image.Point { + return z.size +} + +// Bounds returns the rectangle from (0, 0) to the width and height passed to +// NewRasterizer or Reset. +func (z *Rasterizer) Bounds() image.Rectangle { + return image.Rectangle{Max: z.size} +} + +// Pen returns the location of the path-drawing pen: the last argument to the +// most recent XxxTo call. +func (z *Rasterizer) Pen() (x, y float32) { + return z.penX, z.penY +} + +// ClosePath closes the current path. +func (z *Rasterizer) ClosePath() { + z.LineTo(z.firstX, z.firstY) +} + +// MoveTo starts a new path and moves the pen to (ax, ay). +// +// The coordinates are allowed to be out of the Rasterizer's bounds. +func (z *Rasterizer) MoveTo(ax, ay float32) { + z.firstX = ax + z.firstY = ay + z.penX = ax + z.penY = ay +} + +// LineTo adds a line segment, from the pen to (bx, by), and moves the pen to +// (bx, by). +// +// The coordinates are allowed to be out of the Rasterizer's bounds. +func (z *Rasterizer) LineTo(bx, by float32) { + if z.useFloatingPointMath { + z.floatingLineTo(bx, by) + } else { + z.fixedLineTo(bx, by) + } +} + +// QuadTo adds a quadratic Bézier segment, from the pen via (bx, by) to (cx, +// cy), and moves the pen to (cx, cy). +// +// The coordinates are allowed to be out of the Rasterizer's bounds. 
+func (z *Rasterizer) QuadTo(bx, by, cx, cy float32) { + ax, ay := z.penX, z.penY + devsq := devSquared(ax, ay, bx, by, cx, cy) + if devsq >= 0.333 { + const tol = 3 + n := 1 + int(math.Sqrt(math.Sqrt(tol*float64(devsq)))) + t, nInv := float32(0), 1/float32(n) + for i := 0; i < n-1; i++ { + t += nInv + abx, aby := lerp(t, ax, ay, bx, by) + bcx, bcy := lerp(t, bx, by, cx, cy) + z.LineTo(lerp(t, abx, aby, bcx, bcy)) + } + } + z.LineTo(cx, cy) +} + +// CubeTo adds a cubic Bézier segment, from the pen via (bx, by) and (cx, cy) +// to (dx, dy), and moves the pen to (dx, dy). +// +// The coordinates are allowed to be out of the Rasterizer's bounds. +func (z *Rasterizer) CubeTo(bx, by, cx, cy, dx, dy float32) { + ax, ay := z.penX, z.penY + devsq := devSquared(ax, ay, bx, by, dx, dy) + if devsqAlt := devSquared(ax, ay, cx, cy, dx, dy); devsq < devsqAlt { + devsq = devsqAlt + } + if devsq >= 0.333 { + const tol = 3 + n := 1 + int(math.Sqrt(math.Sqrt(tol*float64(devsq)))) + t, nInv := float32(0), 1/float32(n) + for i := 0; i < n-1; i++ { + t += nInv + abx, aby := lerp(t, ax, ay, bx, by) + bcx, bcy := lerp(t, bx, by, cx, cy) + cdx, cdy := lerp(t, cx, cy, dx, dy) + abcx, abcy := lerp(t, abx, aby, bcx, bcy) + bcdx, bcdy := lerp(t, bcx, bcy, cdx, cdy) + z.LineTo(lerp(t, abcx, abcy, bcdx, bcdy)) + } + } + z.LineTo(dx, dy) +} + +// devSquared returns a measure of how curvy the sequence (ax, ay) to (bx, by) +// to (cx, cy) is. It determines how many line segments will approximate a +// Bézier curve segment. +// +// http://lists.nongnu.org/archive/html/freetype-devel/2016-08/msg00080.html +// gives the rationale for this evenly spaced heuristic instead of a recursive +// de Casteljau approach: +// +// The reason for the subdivision by n is that I expect the "flatness" +// computation to be semi-expensive (it's done once rather than on each +// potential subdivision) and also because you'll often get fewer subdivisions. +// Taking a circular arc as a simplifying assumption (ie a spherical cow), +// where I get n, a recursive approach would get 2^⌈lg n⌉, which, if I haven't +// made any horrible mistakes, is expected to be 33% more in the limit. +func devSquared(ax, ay, bx, by, cx, cy float32) float32 { + devx := ax - 2*bx + cx + devy := ay - 2*by + cy + return devx*devx + devy*devy +} + +// Draw implements the Drawer interface from the standard library's image/draw +// package. +// +// The vector paths previously added via the XxxTo calls become the mask for +// drawing src onto dst. +func (z *Rasterizer) Draw(dst draw.Image, r image.Rectangle, src image.Image, sp image.Point) { + // TODO: adjust r and sp (and mp?) if src.Bounds() doesn't contain + // r.Add(sp.Sub(r.Min)). + + if src, ok := src.(*image.Uniform); ok { + srcR, srcG, srcB, srcA := src.RGBA() + switch dst := dst.(type) { + case *image.Alpha: + // Fast path for glyph rendering. 
+ if srcA == 0xffff { + if z.DrawOp == draw.Over { + z.rasterizeDstAlphaSrcOpaqueOpOver(dst, r) + } else { + z.rasterizeDstAlphaSrcOpaqueOpSrc(dst, r) + } + return + } + case *image.RGBA: + if z.DrawOp == draw.Over { + z.rasterizeDstRGBASrcUniformOpOver(dst, r, srcR, srcG, srcB, srcA) + } else { + z.rasterizeDstRGBASrcUniformOpSrc(dst, r, srcR, srcG, srcB, srcA) + } + return + } + } + + if z.DrawOp == draw.Over { + z.rasterizeOpOver(dst, r, src, sp) + } else { + z.rasterizeOpSrc(dst, r, src, sp) + } +} + +func (z *Rasterizer) accumulateMask() { + if z.useFloatingPointMath { + if n := z.size.X * z.size.Y; n > cap(z.bufU32) { + z.bufU32 = make([]uint32, n) + } else { + z.bufU32 = z.bufU32[:n] + } + if haveFloatingAccumulateSIMD { + floatingAccumulateMaskSIMD(z.bufU32, z.bufF32) + } else { + floatingAccumulateMask(z.bufU32, z.bufF32) + } + } else { + if haveFixedAccumulateSIMD { + fixedAccumulateMaskSIMD(z.bufU32) + } else { + fixedAccumulateMask(z.bufU32) + } + } +} + +func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image.Rectangle) { + // TODO: non-zero vs even-odd winding? + if r == dst.Bounds() && r == z.Bounds() { + // We bypass the z.accumulateMask step and convert straight from + // z.bufF32 or z.bufU32 to dst.Pix. + if z.useFloatingPointMath { + if haveFloatingAccumulateSIMD { + floatingAccumulateOpOverSIMD(dst.Pix, z.bufF32) + } else { + floatingAccumulateOpOver(dst.Pix, z.bufF32) + } + } else { + if haveFixedAccumulateSIMD { + fixedAccumulateOpOverSIMD(dst.Pix, z.bufU32) + } else { + fixedAccumulateOpOver(dst.Pix, z.bufU32) + } + } + return + } + + z.accumulateMask() + pix := dst.Pix[dst.PixOffset(r.Min.X, r.Min.Y):] + for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { + for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { + ma := z.bufU32[y*z.size.X+x] + i := y*dst.Stride + x + + // This formula is like rasterizeOpOver's, simplified for the + // concrete dst type and opaque src assumption. + a := 0xffff - ma + pix[i] = uint8((uint32(pix[i])*0x101*a/0xffff + ma) >> 8) + } + } +} + +func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpSrc(dst *image.Alpha, r image.Rectangle) { + // TODO: non-zero vs even-odd winding? + if r == dst.Bounds() && r == z.Bounds() { + // We bypass the z.accumulateMask step and convert straight from + // z.bufF32 or z.bufU32 to dst.Pix. + if z.useFloatingPointMath { + if haveFloatingAccumulateSIMD { + floatingAccumulateOpSrcSIMD(dst.Pix, z.bufF32) + } else { + floatingAccumulateOpSrc(dst.Pix, z.bufF32) + } + } else { + if haveFixedAccumulateSIMD { + fixedAccumulateOpSrcSIMD(dst.Pix, z.bufU32) + } else { + fixedAccumulateOpSrc(dst.Pix, z.bufU32) + } + } + return + } + + z.accumulateMask() + pix := dst.Pix[dst.PixOffset(r.Min.X, r.Min.Y):] + for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { + for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { + ma := z.bufU32[y*z.size.X+x] + + // This formula is like rasterizeOpSrc's, simplified for the + // concrete dst type and opaque src assumption. + pix[y*dst.Stride+x] = uint8(ma >> 8) + } + } +} + +func (z *Rasterizer) rasterizeDstRGBASrcUniformOpOver(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) { + z.accumulateMask() + pix := dst.Pix[dst.PixOffset(r.Min.X, r.Min.Y):] + for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { + for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { + ma := z.bufU32[y*z.size.X+x] + + // This formula is like rasterizeOpOver's, simplified for the + // concrete dst type and uniform src assumption. 
+ a := 0xffff - (sa * ma / 0xffff) + i := y*dst.Stride + 4*x + pix[i+0] = uint8(((uint32(pix[i+0])*0x101*a + sr*ma) / 0xffff) >> 8) + pix[i+1] = uint8(((uint32(pix[i+1])*0x101*a + sg*ma) / 0xffff) >> 8) + pix[i+2] = uint8(((uint32(pix[i+2])*0x101*a + sb*ma) / 0xffff) >> 8) + pix[i+3] = uint8(((uint32(pix[i+3])*0x101*a + sa*ma) / 0xffff) >> 8) + } + } +} + +func (z *Rasterizer) rasterizeDstRGBASrcUniformOpSrc(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) { + z.accumulateMask() + pix := dst.Pix[dst.PixOffset(r.Min.X, r.Min.Y):] + for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { + for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { + ma := z.bufU32[y*z.size.X+x] + + // This formula is like rasterizeOpSrc's, simplified for the + // concrete dst type and uniform src assumption. + i := y*dst.Stride + 4*x + pix[i+0] = uint8((sr * ma / 0xffff) >> 8) + pix[i+1] = uint8((sg * ma / 0xffff) >> 8) + pix[i+2] = uint8((sb * ma / 0xffff) >> 8) + pix[i+3] = uint8((sa * ma / 0xffff) >> 8) + } + } +} + +func (z *Rasterizer) rasterizeOpOver(dst draw.Image, r image.Rectangle, src image.Image, sp image.Point) { + z.accumulateMask() + out := color.RGBA64{} + outc := color.Color(&out) + for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { + for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { + sr, sg, sb, sa := src.At(sp.X+x, sp.Y+y).RGBA() + ma := z.bufU32[y*z.size.X+x] + + // This algorithm comes from the standard library's image/draw + // package. + dr, dg, db, da := dst.At(r.Min.X+x, r.Min.Y+y).RGBA() + a := 0xffff - (sa * ma / 0xffff) + out.R = uint16((dr*a + sr*ma) / 0xffff) + out.G = uint16((dg*a + sg*ma) / 0xffff) + out.B = uint16((db*a + sb*ma) / 0xffff) + out.A = uint16((da*a + sa*ma) / 0xffff) + + dst.Set(r.Min.X+x, r.Min.Y+y, outc) + } + } +} + +func (z *Rasterizer) rasterizeOpSrc(dst draw.Image, r image.Rectangle, src image.Image, sp image.Point) { + z.accumulateMask() + out := color.RGBA64{} + outc := color.Color(&out) + for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { + for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { + sr, sg, sb, sa := src.At(sp.X+x, sp.Y+y).RGBA() + ma := z.bufU32[y*z.size.X+x] + + // This algorithm comes from the standard library's image/draw + // package. + out.R = uint16(sr * ma / 0xffff) + out.G = uint16(sg * ma / 0xffff) + out.B = uint16(sb * ma / 0xffff) + out.A = uint16(sa * ma / 0xffff) + + dst.Set(r.Min.X+x, r.Min.Y+y, outc) + } + } +} diff --git a/vendor/golang.org/x/image/vector/vector_test.go b/vendor/golang.org/x/image/vector/vector_test.go new file mode 100644 index 0000000..012968e --- /dev/null +++ b/vendor/golang.org/x/image/vector/vector_test.go @@ -0,0 +1,519 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package vector + +// TODO: add tests for NaN and Inf coordinates. + +import ( + "fmt" + "image" + "image/color" + "image/draw" + "image/png" + "math" + "math/rand" + "os" + "path/filepath" + "testing" +) + +// encodePNG is useful for manually debugging the tests. 
+func encodePNG(dstFilename string, src image.Image) error { + f, err := os.Create(dstFilename) + if err != nil { + return err + } + encErr := png.Encode(f, src) + closeErr := f.Close() + if encErr != nil { + return encErr + } + return closeErr +} + +func pointOnCircle(center, radius, index, number int) (x, y float32) { + c := float64(center) + r := float64(radius) + i := float64(index) + n := float64(number) + return float32(c + r*(math.Cos(2*math.Pi*i/n))), + float32(c + r*(math.Sin(2*math.Pi*i/n))) +} + +func TestRasterizeOutOfBounds(t *testing.T) { + // Set this to a non-empty string such as "/tmp" to manually inspect the + // rasterization. + // + // If empty, this test simply checks that calling LineTo with points out of + // the rasterizer's bounds doesn't panic. + const tmpDir = "" + + const center, radius, n = 16, 20, 16 + var z Rasterizer + for i := 0; i < n; i++ { + for j := 1; j < n/2; j++ { + z.Reset(2*center, 2*center) + z.MoveTo(1*center, 1*center) + z.LineTo(pointOnCircle(center, radius, i+0, n)) + z.LineTo(pointOnCircle(center, radius, i+j, n)) + z.ClosePath() + + z.MoveTo(0*center, 0*center) + z.LineTo(0*center, 2*center) + z.LineTo(2*center, 2*center) + z.LineTo(2*center, 0*center) + z.ClosePath() + + dst := image.NewAlpha(z.Bounds()) + z.Draw(dst, dst.Bounds(), image.Opaque, image.Point{}) + + if tmpDir == "" { + continue + } + + filename := filepath.Join(tmpDir, fmt.Sprintf("out-%02d-%02d.png", i, j)) + if err := encodePNG(filename, dst); err != nil { + t.Error(err) + } + t.Logf("wrote %s", filename) + } + } +} + +func TestRasterizePolygon(t *testing.T) { + var z Rasterizer + for radius := 4; radius <= 256; radius *= 2 { + for n := 3; n <= 19; n += 4 { + z.Reset(2*radius, 2*radius) + z.MoveTo(float32(2*radius), float32(1*radius)) + for i := 1; i < n; i++ { + z.LineTo(pointOnCircle(radius, radius, i, n)) + } + z.ClosePath() + + dst := image.NewAlpha(z.Bounds()) + z.Draw(dst, dst.Bounds(), image.Opaque, image.Point{}) + + if err := checkCornersCenter(dst); err != nil { + t.Errorf("radius=%d, n=%d: %v", radius, n, err) + } + } + } +} + +func TestRasterizeAlmostAxisAligned(t *testing.T) { + z := NewRasterizer(8, 8) + z.MoveTo(2, 2) + z.LineTo(6, math.Nextafter32(2, 0)) + z.LineTo(6, 6) + z.LineTo(math.Nextafter32(2, 0), 6) + z.ClosePath() + + dst := image.NewAlpha(z.Bounds()) + z.Draw(dst, dst.Bounds(), image.Opaque, image.Point{}) + + if err := checkCornersCenter(dst); err != nil { + t.Error(err) + } +} + +func TestRasterizeWideAlmostHorizontalLines(t *testing.T) { + var z Rasterizer + for i := uint(3); i < 16; i++ { + x := float32(int(1 << i)) + + z.Reset(8, 8) + z.MoveTo(-x, 3) + z.LineTo(+x, 4) + z.LineTo(+x, 6) + z.LineTo(-x, 6) + z.ClosePath() + + dst := image.NewAlpha(z.Bounds()) + z.Draw(dst, dst.Bounds(), image.Opaque, image.Point{}) + + if err := checkCornersCenter(dst); err != nil { + t.Errorf("i=%d: %v", i, err) + } + } +} + +func TestRasterize30Degrees(t *testing.T) { + z := NewRasterizer(8, 8) + z.MoveTo(4, 4) + z.LineTo(8, 4) + z.LineTo(4, 6) + z.ClosePath() + + dst := image.NewAlpha(z.Bounds()) + z.Draw(dst, dst.Bounds(), image.Opaque, image.Point{}) + + if err := checkCornersCenter(dst); err != nil { + t.Error(err) + } +} + +func TestRasterizeRandomLineTos(t *testing.T) { + var z Rasterizer + for i := 5; i < 50; i++ { + n, rng := 0, rand.New(rand.NewSource(int64(i))) + + z.Reset(i+2, i+2) + z.MoveTo(float32(i/2), float32(i/2)) + for ; rng.Intn(16) != 0; n++ { + x := 1 + rng.Intn(i) + y := 1 + rng.Intn(i) + z.LineTo(float32(x), float32(y)) + } + 
z.ClosePath() + + dst := image.NewAlpha(z.Bounds()) + z.Draw(dst, dst.Bounds(), image.Opaque, image.Point{}) + + if err := checkCorners(dst); err != nil { + t.Errorf("i=%d (%d nodes): %v", i, n, err) + } + } +} + +// checkCornersCenter checks that the corners of the image are all 0x00 and the +// center is 0xff. +func checkCornersCenter(m *image.Alpha) error { + if err := checkCorners(m); err != nil { + return err + } + size := m.Bounds().Size() + center := m.Pix[(size.Y/2)*m.Stride+(size.X/2)] + if center != 0xff { + return fmt.Errorf("center: got %#02x, want 0xff", center) + } + return nil +} + +// checkCorners checks that the corners of the image are all 0x00. +func checkCorners(m *image.Alpha) error { + size := m.Bounds().Size() + corners := [4]uint8{ + m.Pix[(0*size.Y+0)*m.Stride+(0*size.X+0)], + m.Pix[(0*size.Y+0)*m.Stride+(1*size.X-1)], + m.Pix[(1*size.Y-1)*m.Stride+(0*size.X+0)], + m.Pix[(1*size.Y-1)*m.Stride+(1*size.X-1)], + } + if corners != [4]uint8{} { + return fmt.Errorf("corners were not all zero: %v", corners) + } + return nil +} + +var basicMask = []byte{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xe3, 0xaa, 0x3e, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfa, 0x5f, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc, 0x24, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xa1, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc, 0x14, 0x00, 0x00, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x4a, 0x00, 0x00, + 0x00, 0x00, 0xcc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x81, 0x00, 0x00, + 0x00, 0x00, 0x66, 0xff, 0xff, 0xff, 0xff, 0xff, 0xef, 0xe4, 0xff, 0xff, 0xff, 0xb6, 0x00, 0x00, + 0x00, 0x00, 0x0c, 0xf2, 0xff, 0xff, 0xfe, 0x9e, 0x15, 0x00, 0x15, 0x96, 0xff, 0xce, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x88, 0xfc, 0xe3, 0x43, 0x00, 0x00, 0x00, 0x00, 0x06, 0xcd, 0xdc, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x10, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x25, 0xde, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +} + +func testBasicPath(t *testing.T, prefix string, dst draw.Image, src image.Image, op draw.Op, want []byte) { + z := NewRasterizer(16, 16) + z.MoveTo(2, 2) + z.LineTo(8, 2) + z.QuadTo(14, 2, 14, 14) + z.CubeTo(8, 2, 5, 20, 2, 8) + z.ClosePath() + + z.DrawOp = op + z.Draw(dst, z.Bounds(), src, image.Point{}) + + var got []byte + switch dst := dst.(type) { + case *image.Alpha: + got = dst.Pix + case *image.RGBA: + got = dst.Pix + default: + t.Errorf("%s: unrecognized dst image type %T", prefix, dst) + } + + if len(got) != len(want) { + t.Errorf("%s: len(got)=%d and len(want)=%d differ", prefix, len(got), len(want)) + return + } + for i := range got { + delta := int(got[i]) - int(want[i]) + // The +/- 2 allows different implementations to give different + // rounding errors. 
+ if delta < -2 || +2 < delta { + t.Errorf("%s: i=%d: got %#02x, want %#02x", prefix, i, got[i], want[i]) + return + } + } +} + +func TestBasicPathDstAlpha(t *testing.T) { + for _, background := range []uint8{0x00, 0x80} { + for _, op := range []draw.Op{draw.Over, draw.Src} { + for _, xPadding := range []int{0, 7} { + bounds := image.Rect(0, 0, 16+xPadding, 16) + dst := image.NewAlpha(bounds) + for i := range dst.Pix { + dst.Pix[i] = background + } + + want := make([]byte, len(dst.Pix)) + copy(want, dst.Pix) + + if op == draw.Over && background == 0x80 { + for y := 0; y < 16; y++ { + for x := 0; x < 16; x++ { + ma := basicMask[16*y+x] + i := dst.PixOffset(x, y) + want[i] = 0xff - (0xff-ma)/2 + } + } + } else { + for y := 0; y < 16; y++ { + for x := 0; x < 16; x++ { + ma := basicMask[16*y+x] + i := dst.PixOffset(x, y) + want[i] = ma + } + } + } + + prefix := fmt.Sprintf("background=%#02x, op=%v, xPadding=%d", background, op, xPadding) + testBasicPath(t, prefix, dst, image.Opaque, op, want) + } + } + } +} + +func TestBasicPathDstRGBA(t *testing.T) { + blue := image.NewUniform(color.RGBA{0x00, 0x00, 0xff, 0xff}) + + for _, op := range []draw.Op{draw.Over, draw.Src} { + for _, xPadding := range []int{0, 7} { + bounds := image.Rect(0, 0, 16+xPadding, 16) + dst := image.NewRGBA(bounds) + for y := bounds.Min.Y; y < bounds.Max.Y; y++ { + for x := bounds.Min.X; x < bounds.Max.X; x++ { + dst.SetRGBA(x, y, color.RGBA{ + R: uint8(y * 0x07), + G: uint8(x * 0x05), + B: 0x00, + A: 0x80, + }) + } + } + + want := make([]byte, len(dst.Pix)) + copy(want, dst.Pix) + + if op == draw.Over { + for y := 0; y < 16; y++ { + for x := 0; x < 16; x++ { + ma := basicMask[16*y+x] + i := dst.PixOffset(x, y) + want[i+0] = uint8((uint32(0xff-ma) * uint32(y*0x07)) / 0xff) + want[i+1] = uint8((uint32(0xff-ma) * uint32(x*0x05)) / 0xff) + want[i+2] = ma + want[i+3] = ma/2 + 0x80 + } + } + } else { + for y := 0; y < 16; y++ { + for x := 0; x < 16; x++ { + ma := basicMask[16*y+x] + i := dst.PixOffset(x, y) + want[i+0] = 0x00 + want[i+1] = 0x00 + want[i+2] = ma + want[i+3] = ma + } + } + } + + prefix := fmt.Sprintf("op=%v, xPadding=%d", op, xPadding) + testBasicPath(t, prefix, dst, blue, op, want) + } + } +} + +const ( + benchmarkGlyphWidth = 893 + benchmarkGlyphHeight = 1122 +) + +type benchmarkGlyphDatum struct { + // n being 0, 1 or 2 means moveTo, lineTo or quadTo. + n uint32 + px float32 + py float32 + qx float32 + qy float32 +} + +// benchmarkGlyphData is the 'a' glyph from the Roboto Regular font, translated +// so that its top left corner is (0, 0). 
+var benchmarkGlyphData = []benchmarkGlyphDatum{ + {0, 699, 1102, 0, 0}, + {2, 683, 1070, 673, 988}, + {2, 544, 1122, 365, 1122}, + {2, 205, 1122, 102.5, 1031.5}, + {2, 0, 941, 0, 802}, + {2, 0, 633, 128.5, 539.5}, + {2, 257, 446, 490, 446}, + {1, 670, 446, 0, 0}, + {1, 670, 361, 0, 0}, + {2, 670, 264, 612, 206.5}, + {2, 554, 149, 441, 149}, + {2, 342, 149, 275, 199}, + {2, 208, 249, 208, 320}, + {1, 22, 320, 0, 0}, + {2, 22, 239, 79.5, 163.5}, + {2, 137, 88, 235.5, 44}, + {2, 334, 0, 452, 0}, + {2, 639, 0, 745, 93.5}, + {2, 851, 187, 855, 351}, + {1, 855, 849, 0, 0}, + {2, 855, 998, 893, 1086}, + {1, 893, 1102, 0, 0}, + {1, 699, 1102, 0, 0}, + {0, 392, 961, 0, 0}, + {2, 479, 961, 557, 916}, + {2, 635, 871, 670, 799}, + {1, 670, 577, 0, 0}, + {1, 525, 577, 0, 0}, + {2, 185, 577, 185, 776}, + {2, 185, 863, 243, 912}, + {2, 301, 961, 392, 961}, +} + +func scaledBenchmarkGlyphData(height int) (width int, data []benchmarkGlyphDatum) { + scale := float32(height) / benchmarkGlyphHeight + + // Clone the benchmarkGlyphData slice and scale its coordinates. + data = append(data, benchmarkGlyphData...) + for i := range data { + data[i].px *= scale + data[i].py *= scale + data[i].qx *= scale + data[i].qy *= scale + } + + return int(math.Ceil(float64(benchmarkGlyphWidth * scale))), data +} + +// benchGlyph benchmarks rasterizing a TrueType glyph. +// +// Note that, compared to the github.com/google/font-go prototype, the height +// here is the height of the bounding box, not the pixels per em used to scale +// a glyph's vectors. A height of 64 corresponds to a ppem greater than 64. +func benchGlyph(b *testing.B, colorModel byte, loose bool, height int, op draw.Op) { + width, data := scaledBenchmarkGlyphData(height) + z := NewRasterizer(width, height) + + bounds := z.Bounds() + if loose { + bounds.Max.X++ + } + dst, src := draw.Image(nil), image.Image(nil) + switch colorModel { + case 'A': + dst = image.NewAlpha(bounds) + src = image.Opaque + case 'N': + dst = image.NewNRGBA(bounds) + src = image.NewUniform(color.NRGBA{0x40, 0x80, 0xc0, 0xff}) + case 'R': + dst = image.NewRGBA(bounds) + src = image.NewUniform(color.RGBA{0x40, 0x80, 0xc0, 0xff}) + default: + b.Fatal("unsupported color model") + } + bounds = z.Bounds() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + z.Reset(width, height) + z.DrawOp = op + for _, d := range data { + switch d.n { + case 0: + z.MoveTo(d.px, d.py) + case 1: + z.LineTo(d.px, d.py) + case 2: + z.QuadTo(d.px, d.py, d.qx, d.qy) + } + } + z.Draw(dst, bounds, src, image.Point{}) + } +} + +// The heights 16, 32, 64, 128, 256, 1024 include numbers both above and below +// the floatingPointMathThreshold constant (512). 
+ +func BenchmarkGlyphAlpha16Over(b *testing.B) { benchGlyph(b, 'A', false, 16, draw.Over) } +func BenchmarkGlyphAlpha16Src(b *testing.B) { benchGlyph(b, 'A', false, 16, draw.Src) } +func BenchmarkGlyphAlpha32Over(b *testing.B) { benchGlyph(b, 'A', false, 32, draw.Over) } +func BenchmarkGlyphAlpha32Src(b *testing.B) { benchGlyph(b, 'A', false, 32, draw.Src) } +func BenchmarkGlyphAlpha64Over(b *testing.B) { benchGlyph(b, 'A', false, 64, draw.Over) } +func BenchmarkGlyphAlpha64Src(b *testing.B) { benchGlyph(b, 'A', false, 64, draw.Src) } +func BenchmarkGlyphAlpha128Over(b *testing.B) { benchGlyph(b, 'A', false, 128, draw.Over) } +func BenchmarkGlyphAlpha128Src(b *testing.B) { benchGlyph(b, 'A', false, 128, draw.Src) } +func BenchmarkGlyphAlpha256Over(b *testing.B) { benchGlyph(b, 'A', false, 256, draw.Over) } +func BenchmarkGlyphAlpha256Src(b *testing.B) { benchGlyph(b, 'A', false, 256, draw.Src) } +func BenchmarkGlyphAlpha1024Over(b *testing.B) { benchGlyph(b, 'A', false, 1024, draw.Over) } +func BenchmarkGlyphAlpha1024Src(b *testing.B) { benchGlyph(b, 'A', false, 1024, draw.Src) } + +func BenchmarkGlyphAlphaLoose16Over(b *testing.B) { benchGlyph(b, 'A', true, 16, draw.Over) } +func BenchmarkGlyphAlphaLoose16Src(b *testing.B) { benchGlyph(b, 'A', true, 16, draw.Src) } +func BenchmarkGlyphAlphaLoose32Over(b *testing.B) { benchGlyph(b, 'A', true, 32, draw.Over) } +func BenchmarkGlyphAlphaLoose32Src(b *testing.B) { benchGlyph(b, 'A', true, 32, draw.Src) } +func BenchmarkGlyphAlphaLoose64Over(b *testing.B) { benchGlyph(b, 'A', true, 64, draw.Over) } +func BenchmarkGlyphAlphaLoose64Src(b *testing.B) { benchGlyph(b, 'A', true, 64, draw.Src) } +func BenchmarkGlyphAlphaLoose128Over(b *testing.B) { benchGlyph(b, 'A', true, 128, draw.Over) } +func BenchmarkGlyphAlphaLoose128Src(b *testing.B) { benchGlyph(b, 'A', true, 128, draw.Src) } +func BenchmarkGlyphAlphaLoose256Over(b *testing.B) { benchGlyph(b, 'A', true, 256, draw.Over) } +func BenchmarkGlyphAlphaLoose256Src(b *testing.B) { benchGlyph(b, 'A', true, 256, draw.Src) } +func BenchmarkGlyphAlphaLoose1024Over(b *testing.B) { benchGlyph(b, 'A', true, 1024, draw.Over) } +func BenchmarkGlyphAlphaLoose1024Src(b *testing.B) { benchGlyph(b, 'A', true, 1024, draw.Src) } + +func BenchmarkGlyphRGBA16Over(b *testing.B) { benchGlyph(b, 'R', false, 16, draw.Over) } +func BenchmarkGlyphRGBA16Src(b *testing.B) { benchGlyph(b, 'R', false, 16, draw.Src) } +func BenchmarkGlyphRGBA32Over(b *testing.B) { benchGlyph(b, 'R', false, 32, draw.Over) } +func BenchmarkGlyphRGBA32Src(b *testing.B) { benchGlyph(b, 'R', false, 32, draw.Src) } +func BenchmarkGlyphRGBA64Over(b *testing.B) { benchGlyph(b, 'R', false, 64, draw.Over) } +func BenchmarkGlyphRGBA64Src(b *testing.B) { benchGlyph(b, 'R', false, 64, draw.Src) } +func BenchmarkGlyphRGBA128Over(b *testing.B) { benchGlyph(b, 'R', false, 128, draw.Over) } +func BenchmarkGlyphRGBA128Src(b *testing.B) { benchGlyph(b, 'R', false, 128, draw.Src) } +func BenchmarkGlyphRGBA256Over(b *testing.B) { benchGlyph(b, 'R', false, 256, draw.Over) } +func BenchmarkGlyphRGBA256Src(b *testing.B) { benchGlyph(b, 'R', false, 256, draw.Src) } +func BenchmarkGlyphRGBA1024Over(b *testing.B) { benchGlyph(b, 'R', false, 1024, draw.Over) } +func BenchmarkGlyphRGBA1024Src(b *testing.B) { benchGlyph(b, 'R', false, 1024, draw.Src) } + +func BenchmarkGlyphNRGBA16Over(b *testing.B) { benchGlyph(b, 'N', false, 16, draw.Over) } +func BenchmarkGlyphNRGBA16Src(b *testing.B) { benchGlyph(b, 'N', false, 16, draw.Src) } +func BenchmarkGlyphNRGBA32Over(b 
*testing.B) { benchGlyph(b, 'N', false, 32, draw.Over) } +func BenchmarkGlyphNRGBA32Src(b *testing.B) { benchGlyph(b, 'N', false, 32, draw.Src) } +func BenchmarkGlyphNRGBA64Over(b *testing.B) { benchGlyph(b, 'N', false, 64, draw.Over) } +func BenchmarkGlyphNRGBA64Src(b *testing.B) { benchGlyph(b, 'N', false, 64, draw.Src) } +func BenchmarkGlyphNRGBA128Over(b *testing.B) { benchGlyph(b, 'N', false, 128, draw.Over) } +func BenchmarkGlyphNRGBA128Src(b *testing.B) { benchGlyph(b, 'N', false, 128, draw.Src) } +func BenchmarkGlyphNRGBA256Over(b *testing.B) { benchGlyph(b, 'N', false, 256, draw.Over) } +func BenchmarkGlyphNRGBA256Src(b *testing.B) { benchGlyph(b, 'N', false, 256, draw.Src) } +func BenchmarkGlyphNRGBA1024Over(b *testing.B) { benchGlyph(b, 'N', false, 1024, draw.Over) } +func BenchmarkGlyphNRGBA1024Src(b *testing.B) { benchGlyph(b, 'N', false, 1024, draw.Src) } -- cgit v1.2.3
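
For orientation when reviewing this vendored package, the following is a minimal usage sketch assembled only from the exported API added by this patch (NewRasterizer, MoveTo, LineTo, ClosePath, Bounds, Draw, and the draw.Over default of DrawOp). It is a hedged illustration, not part of the vendored files; the triangle coordinates and the output file name "out.png" are arbitrary choices for the example.

package main

import (
	"image"
	"image/png"
	"os"

	"golang.org/x/image/vector"
)

func main() {
	// A 64x64 mask; sizes above floatingPointMathThreshold (512) would
	// switch the rasterizer to its floating point implementation.
	z := vector.NewRasterizer(64, 64)

	// Describe a closed triangular path. Coordinates are allowed to fall
	// outside the rasterizer's bounds.
	z.MoveTo(8, 56)
	z.LineTo(56, 56)
	z.LineTo(32, 8)
	z.ClosePath()

	// Rasterize the accumulated path. With an *image.Alpha destination and
	// an opaque uniform source, Draw takes the fast glyph-rendering path.
	dst := image.NewAlpha(z.Bounds())
	z.Draw(dst, dst.Bounds(), image.Opaque, image.Point{})

	// Write the mask out for inspection (file name is illustrative).
	f, err := os.Create("out.png")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if err := png.Encode(f, dst); err != nil {
		panic(err)
	}
}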