Diffstat (limited to 'vendor/golang.org/x/net/html/token_test.go')
-rw-r--r-- vendor/golang.org/x/net/html/token_test.go 748
1 file changed, 748 insertions(+), 0 deletions(-)
diff --git a/vendor/golang.org/x/net/html/token_test.go b/vendor/golang.org/x/net/html/token_test.go
new file mode 100644
index 0000000..20221c3
--- /dev/null
+++ b/vendor/golang.org/x/net/html/token_test.go
@@ -0,0 +1,748 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+ "bytes"
+ "io"
+ "io/ioutil"
+ "reflect"
+ "runtime"
+ "strings"
+ "testing"
+)
+
+type tokenTest struct {
+ // A short description of the test case.
+ desc string
+ // The HTML to parse.
+ html string
+ // The string representations of the expected tokens, joined by '$'.
+ golden string
+}
+
+var tokenTests = []tokenTest{
+ {
+ "empty",
+ "",
+ "",
+ },
+ // A single text node. The tokenizer should not break text nodes on whitespace,
+ // nor should it normalize whitespace within a text node.
+ {
+ "text",
+ "foo bar",
+ "foo bar",
+ },
+ // An entity.
+ {
+ "entity",
+ "one < two",
+ "one < two",
+ },
+ // A start, self-closing and end tag. The tokenizer does not care if the start
+ // and end tokens don't match; that is the job of the parser.
+ {
+ "tags",
+ "<a>b<c/>d</e>",
+ "<a>$b$<c/>$d$</e>",
+ },
+ // Angle brackets that aren't a tag.
+ {
+ "not a tag #0",
+ "<",
+ "&lt;",
+ },
+ {
+ "not a tag #1",
+ "</",
+ "&lt;/",
+ },
+ {
+ "not a tag #2",
+ "</>",
+ "<!---->",
+ },
+ {
+ "not a tag #3",
+ "a</>b",
+ "a$<!---->$b",
+ },
+ {
+ "not a tag #4",
+ "</ >",
+ "<!-- -->",
+ },
+ {
+ "not a tag #5",
+ "</.",
+ "<!--.-->",
+ },
+ {
+ "not a tag #6",
+ "</.>",
+ "<!--.-->",
+ },
+ {
+ "not a tag #7",
+ "a < b",
+ "a &lt; b",
+ },
+ {
+ "not a tag #8",
+ "<.>",
+ "&lt;.&gt;",
+ },
+ {
+ "not a tag #9",
+ "a<<<b>>>c",
+ "a&lt;&lt;$<b>$&gt;&gt;c",
+ },
+ {
+ "not a tag #10",
+ "if x<0 and y < 0 then x*y>0",
+ "if x&lt;0 and y &lt; 0 then x*y&gt;0",
+ },
+ {
+ "not a tag #11",
+ "<<p>",
+ "&lt;$<p>",
+ },
+ // EOF in a tag name.
+ {
+ "tag name eof #0",
+ "<a",
+ "",
+ },
+ {
+ "tag name eof #1",
+ "<a ",
+ "",
+ },
+ {
+ "tag name eof #2",
+ "a<b",
+ "a",
+ },
+ {
+ "tag name eof #3",
+ "<a><b",
+ "<a>",
+ },
+ {
+ "tag name eof #4",
+ `<a x`,
+ ``,
+ },
+ // Some malformed tags that are missing a '>'.
+ {
+ "malformed tag #0",
+ `<p</p>`,
+ `<p< p="">`,
+ },
+ {
+ "malformed tag #1",
+ `<p </p>`,
+ `<p <="" p="">`,
+ },
+ {
+ "malformed tag #2",
+ `<p id`,
+ ``,
+ },
+ {
+ "malformed tag #3",
+ `<p id=`,
+ ``,
+ },
+ {
+ "malformed tag #4",
+ `<p id=>`,
+ `<p id="">`,
+ },
+ {
+ "malformed tag #5",
+ `<p id=0`,
+ ``,
+ },
+ {
+ "malformed tag #6",
+ `<p id=0</p>`,
+ `<p id="0&lt;/p">`,
+ },
+ {
+ "malformed tag #7",
+ `<p id="0</p>`,
+ ``,
+ },
+ {
+ "malformed tag #8",
+ `<p id="0"</p>`,
+ `<p id="0" <="" p="">`,
+ },
+ {
+ "malformed tag #9",
+ `<p></p id`,
+ `<p>`,
+ },
+ // Raw text and RCDATA.
+ {
+ "basic raw text",
+ "<script><a></b></script>",
+ "<script>$&lt;a&gt;&lt;/b&gt;$</script>",
+ },
+ {
+ "unfinished script end tag",
+ "<SCRIPT>a</SCR",
+ "<script>$a&lt;/SCR",
+ },
+ {
+ "broken script end tag",
+ "<SCRIPT>a</SCR ipt>",
+ "<script>$a&lt;/SCR ipt&gt;",
+ },
+ {
+ "EOF in script end tag",
+ "<SCRIPT>a</SCRipt",
+ "<script>$a&lt;/SCRipt",
+ },
+ {
+ "scriptx end tag",
+ "<SCRIPT>a</SCRiptx",
+ "<script>$a&lt;/SCRiptx",
+ },
+ {
+ "' ' completes script end tag",
+ "<SCRIPT>a</SCRipt ",
+ "<script>$a",
+ },
+ {
+ "'>' completes script end tag",
+ "<SCRIPT>a</SCRipt>",
+ "<script>$a$</script>",
+ },
+ {
+ "self-closing script end tag",
+ "<SCRIPT>a</SCRipt/>",
+ "<script>$a$</script>",
+ },
+ {
+ "nested script tag",
+ "<SCRIPT>a</SCRipt<script>",
+ "<script>$a&lt;/SCRipt&lt;script&gt;",
+ },
+ {
+ "script end tag after unfinished",
+ "<SCRIPT>a</SCRipt</script>",
+ "<script>$a&lt;/SCRipt$</script>",
+ },
+ {
+ "script/style mismatched tags",
+ "<script>a</style>",
+ "<script>$a&lt;/style&gt;",
+ },
+ {
+ "style element with entity",
+ "<style>&apos;",
+ "<style>$&amp;apos;",
+ },
+ {
+ "textarea with tag",
+ "<textarea><div></textarea>",
+ "<textarea>$&lt;div&gt;$</textarea>",
+ },
+ {
+ "title with tag and entity",
+ "<title><b>K&amp;R C</b></title>",
+ "<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
+ },
+ // DOCTYPE tests.
+ {
+ "Proper DOCTYPE",
+ "<!DOCTYPE html>",
+ "<!DOCTYPE html>",
+ },
+ {
+ "DOCTYPE with no space",
+ "<!doctypehtml>",
+ "<!DOCTYPE html>",
+ },
+ {
+ "DOCTYPE with two spaces",
+ "<!doctype html>",
+ "<!DOCTYPE html>",
+ },
+ {
+ "looks like DOCTYPE but isn't",
+ "<!DOCUMENT html>",
+ "<!--DOCUMENT html-->",
+ },
+ {
+ "DOCTYPE at EOF",
+ "<!DOCtype",
+ "<!DOCTYPE >",
+ },
+ // XML processing instructions.
+ {
+ "XML processing instruction",
+ "<?xml?>",
+ "<!--?xml?-->",
+ },
+ // Comments.
+ {
+ "comment0",
+ "abc<b><!-- skipme --></b>def",
+ "abc$<b>$<!-- skipme -->$</b>$def",
+ },
+ {
+ "comment1",
+ "a<!-->z",
+ "a$<!---->$z",
+ },
+ {
+ "comment2",
+ "a<!--->z",
+ "a$<!---->$z",
+ },
+ {
+ "comment3",
+ "a<!--x>-->z",
+ "a$<!--x>-->$z",
+ },
+ {
+ "comment4",
+ "a<!--x->-->z",
+ "a$<!--x->-->$z",
+ },
+ {
+ "comment5",
+ "a<!>z",
+ "a$<!---->$z",
+ },
+ {
+ "comment6",
+ "a<!->z",
+ "a$<!----->$z",
+ },
+ {
+ "comment7",
+ "a<!---<>z",
+ "a$<!---<>z-->",
+ },
+ {
+ "comment8",
+ "a<!--z",
+ "a$<!--z-->",
+ },
+ {
+ "comment9",
+ "a<!--z-",
+ "a$<!--z-->",
+ },
+ {
+ "comment10",
+ "a<!--z--",
+ "a$<!--z-->",
+ },
+ {
+ "comment11",
+ "a<!--z---",
+ "a$<!--z--->",
+ },
+ {
+ "comment12",
+ "a<!--z----",
+ "a$<!--z---->",
+ },
+ {
+ "comment13",
+ "a<!--x--!>z",
+ "a$<!--x-->$z",
+ },
+ // An attribute with a backslash.
+ {
+ "backslash",
+ `<p id="a\"b">`,
+ `<p id="a\" b"="">`,
+ },
+ // Entities, tag name and attribute key lower-casing, and whitespace
+ // normalization within a tag.
+ {
+ "tricky",
+ "<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
+ `<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
+ },
+ // A nonexistent entity. Tokenizing and converting back to a string should
+ // escape the "&" to become "&amp;".
+ {
+ "noSuchEntity",
+ `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
+ `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
+ },
+ {
+ "entity without semicolon",
+ `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
+ `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
+ },
+ {
+ "entity with digits",
+ "&frac12;",
+ "½",
+ },
+ // Attribute tests:
+ // http://dev.w3.org/html5/pf-summary/Overview.html#attributes
+ {
+ "Empty attribute",
+ `<input disabled FOO>`,
+ `<input disabled="" foo="">`,
+ },
+ {
+ "Empty attribute, whitespace",
+ `<input disabled FOO >`,
+ `<input disabled="" foo="">`,
+ },
+ {
+ "Unquoted attribute value",
+ `<input value=yes FOO=BAR>`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Unquoted attribute value, spaces",
+ `<input value = yes FOO = BAR>`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Unquoted attribute value, trailing space",
+ `<input value=yes FOO=BAR >`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Single-quoted attribute value",
+ `<input value='yes' FOO='BAR'>`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Single-quoted attribute value, trailing space",
+ `<input value='yes' FOO='BAR' >`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Double-quoted attribute value",
+ `<input value="I'm an attribute" FOO="BAR">`,
+ `<input value="I&#39;m an attribute" foo="BAR">`,
+ },
+ {
+ "Attribute name characters",
+ `<meta http-equiv="content-type">`,
+ `<meta http-equiv="content-type">`,
+ },
+ {
+ "Mixed attributes",
+ `a<P V="0 1" w='2' X=3 y>z`,
+ `a$<p v="0 1" w="2" x="3" y="">$z`,
+ },
+ {
+ "Attributes with a solitary single quote",
+ `<p id=can't><p id=won't>`,
+ `<p id="can&#39;t">$<p id="won&#39;t">`,
+ },
+}
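+
+// TestGoldenConvention is an illustrative companion (an added sketch, not
+// part of the upstream file) that restates how the '$'-joined golden strings
+// above map onto tokenizer calls, using the "tags" case as a worked example.
+func TestGoldenConvention(t *testing.T) {
+ z := NewTokenizer(strings.NewReader("<a>b<c/>d</e>"))
+ for i, want := range strings.Split("<a>$b$<c/>$d$</e>", "$") {
+ if z.Next() == ErrorToken {
+ t.Fatalf("token %d: unexpected error: %v", i, z.Err())
+ }
+ if got := z.Token().String(); got != want {
+ t.Errorf("token %d: got %q, want %q", i, got, want)
+ }
+ }
+}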
+
+func TestTokenizer(t *testing.T) {
+loop:
+ for _, tt := range tokenTests {
+ z := NewTokenizer(strings.NewReader(tt.html))
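+ // An empty golden string means the input should produce no tokens at all;
+ // in that case skip straight to the EOF check below.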
+ if tt.golden != "" {
+ for i, s := range strings.Split(tt.golden, "$") {
+ if z.Next() == ErrorToken {
+ t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
+ continue loop
+ }
+ actual := z.Token().String()
+ if s != actual {
+ t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
+ continue loop
+ }
+ }
+ }
+ z.Next()
+ if z.Err() != io.EOF {
+ t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
+ }
+ }
+}
+
+func TestMaxBuffer(t *testing.T) {
+ // Exceeding the maximum buffer size generates ErrBufferExceeded.
+ z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
+ z.SetMaxBuf(5)
+ tt := z.Next()
+ if got, want := tt, ErrorToken; got != want {
+ t.Fatalf("token type: got: %v want: %v", got, want)
+ }
+ if got, want := z.Err(), ErrBufferExceeded; got != want {
+ t.Errorf("error type: got: %v want: %v", got, want)
+ }
+ if got, want := string(z.Raw()), "<tttt"; got != want {
+ t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
+ }
+}
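+
+// A minimal recovery sketch (an illustration, not upstream code; pending is a
+// hypothetical bytes.Buffer): because Raw still returns the bytes buffered
+// before the overflow, a caller that hits ErrBufferExceeded can keep the
+// unconsumed input rather than lose it:
+//
+// if z.Next() == ErrorToken && z.Err() == ErrBufferExceeded {
+// pending.Write(z.Raw()) // "<tttt" in the test above
+// }
+//
+// TestMaxBufferReconstruction below depends on exactly this property.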
+
+func TestMaxBufferReconstruction(t *testing.T) {
+ // Exceeding the maximum buffer size at any point while tokenizing permits
+ // reconstructing the original input.
+tests:
+ for _, test := range tokenTests {
+ for maxBuf := 1; ; maxBuf++ {
+ r := strings.NewReader(test.html)
+ z := NewTokenizer(r)
+ z.SetMaxBuf(maxBuf)
+ var tokenized bytes.Buffer
+ for {
+ tt := z.Next()
+ tokenized.Write(z.Raw())
+ if tt == ErrorToken {
+ if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
+ t.Errorf("%s: unexpected error: %v", test.desc, err)
+ }
+ break
+ }
+ }
+ // Anything tokenized, along with untokenized input still buffered by the
+ // tokenizer and data left unread in r, should reassemble the original input.
+ assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
+ if err != nil {
+ t.Errorf("%s: ReadAll: %v", test.desc, err)
+ continue tests
+ }
+ if got, want := string(assembled), test.html; got != want {
+ t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
+ continue tests
+ }
+ // EOF means tokenization completed without overflowing the buffer, so every
+ // smaller maxBuf has already exercised ErrBufferExceeded; move on to the
+ // next test.
+ if z.Err() == io.EOF {
+ break
+ }
+ } // buffer sizes
+ } // tests
+}
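+
+// Stated directly, the invariant checked above is, for any maxBuf and with r
+// the original reader:
+//
+// test.html == tokenized bytes + z.Buffered() + unread remainder of r
+//
+// That is, overflowing the buffer never loses input; it only leaves it
+// untokenized.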
+
+func TestPassthrough(t *testing.T) {
+ // Accumulating the raw output for each parse event should reconstruct the
+ // original input.
+ for _, test := range tokenTests {
+ z := NewTokenizer(strings.NewReader(test.html))
+ var parsed bytes.Buffer
+ for {
+ tt := z.Next()
+ parsed.Write(z.Raw())
+ if tt == ErrorToken {
+ break
+ }
+ }
+ if got, want := parsed.String(), test.html; got != want {
+ t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
+ }
+ }
+}
+
+func TestBufAPI(t *testing.T) {
+ s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
+ z := NewTokenizer(bytes.NewBufferString(s))
+ var result bytes.Buffer
+ depth := 0
+loop:
+ for {
+ tt := z.Next()
+ switch tt {
+ case ErrorToken:
+ if z.Err() != io.EOF {
+ t.Error(z.Err())
+ }
+ break loop
+ case TextToken:
+ if depth > 0 {
+ result.Write(z.Text())
+ }
+ case StartTagToken, EndTagToken:
+ tn, _ := z.TagName()
+ if len(tn) == 1 && tn[0] == 'a' {
+ if tt == StartTagToken {
+ depth++
+ } else {
+ depth--
+ }
+ }
+ }
+ }
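+ // "14567" is every text token that appears while at least one <a> element
+ // is open: "1", then "4", "5" and "6" across the nested <a> tags, and "7",
+ // which is still inside the outer <a> even though </b> has closed. "3" is
+ // skipped because only <b> is open, and <a/> is self-closing, so "9" is
+ // skipped too.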
+ u := "14567"
+ v := string(result.Bytes())
+ if u != v {
+ t.Errorf("TestBufAPI: want %q got %q", u, v)
+ }
+}
+
+func TestConvertNewlines(t *testing.T) {
+ testCases := map[string]string{
+ "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n",
+ "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n",
+ "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
+ "": "",
+ "\n": "\n",
+ "\n\r": "\n\n",
+ "\r": "\n",
+ "\r\n": "\n",
+ "\r\n\n": "\n\n",
+ "\r\n\r": "\n\n",
+ "\r\n\r\n": "\n\n",
+ "\r\r": "\n\n",
+ "\r\r\n": "\n\n",
+ "\r\r\n\n": "\n\n\n",
+ "\r\r\r\n": "\n\n\n",
+ "\r \n": "\n \n",
+ "xyz": "xyz",
+ }
+ for in, want := range testCases {
+ if got := string(convertNewlines([]byte(in))); got != want {
+ t.Errorf("input %q: got %q, want %q", in, got, want)
+ }
+ }
+}
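+
+// The table above encodes the HTML5 newline normalization rule: every bare CR
+// and every CRLF pair becomes a single LF, while existing LFs pass through
+// unchanged.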
+
+func TestReaderEdgeCases(t *testing.T) {
+ const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
+ testCases := []io.Reader{
+ &zeroOneByteReader{s: s},
+ &eofStringsReader{s: s},
+ &stuckReader{},
+ }
+ for i, tc := range testCases {
+ got := []TokenType{}
+ z := NewTokenizer(tc)
+ for {
+ tt := z.Next()
+ if tt == ErrorToken {
+ break
+ }
+ got = append(got, tt)
+ }
+ if err := z.Err(); err != nil && err != io.EOF {
+ if err != io.ErrNoProgress {
+ t.Errorf("i=%d: %v", i, err)
+ }
+ continue
+ }
+ want := []TokenType{
+ StartTagToken,
+ TextToken,
+ EndTagToken,
+ }
+ if !reflect.DeepEqual(got, want) {
+ t.Errorf("i=%d: got %v, want %v", i, got, want)
+ continue
+ }
+ }
+}
+
+// zeroOneByteReader is like a strings.Reader that alternates between
+// returning 0 bytes and 1 byte at a time.
+type zeroOneByteReader struct {
+ s string
+ n int
+}
+
+func (r *zeroOneByteReader) Read(p []byte) (int, error) {
+ if len(p) == 0 {
+ return 0, nil
+ }
+ if len(r.s) == 0 {
+ return 0, io.EOF
+ }
+ r.n++
+ if r.n%2 != 0 {
+ return 0, nil
+ }
+ p[0], r.s = r.s[0], r.s[1:]
+ return 1, nil
+}
+
+// eofStringsReader is like a strings.Reader but can return an (n, err) where
+// n > 0 && err != nil.
+type eofStringsReader struct {
+ s string
+}
+
+func (r *eofStringsReader) Read(p []byte) (int, error) {
+ n := copy(p, r.s)
+ r.s = r.s[n:]
+ if r.s != "" {
+ return n, nil
+ }
+ return n, io.EOF
+}
+
+// stuckReader is an io.Reader that always returns no data and no error.
+type stuckReader struct{}
+
+func (*stuckReader) Read(p []byte) (int, error) {
+ return 0, nil
+}
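+
+// Together, the three stub readers above cover the awkward corners of the
+// io.Reader contract that TestReaderEdgeCases exercises: zeroOneByteReader
+// returns (0, nil) on alternate calls, eofStringsReader returns n > 0
+// alongside io.EOF, and stuckReader never makes progress at all, which the
+// tokenizer is expected to surface as io.ErrNoProgress rather than loop
+// forever.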
+
+const (
+ rawLevel = iota
+ lowLevel
+ highLevel
+)
+
+func benchmarkTokenizer(b *testing.B, level int) {
+ buf, err := ioutil.ReadFile("testdata/go1.html")
+ if err != nil {
+ b.Fatalf("could not read testdata/go1.html: %v", err)
+ }
+ b.SetBytes(int64(len(buf)))
+ runtime.GC()
+ b.ReportAllocs()
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ z := NewTokenizer(bytes.NewBuffer(buf))
+ for {
+ tt := z.Next()
+ if tt == ErrorToken {
+ if err := z.Err(); err != nil && err != io.EOF {
+ b.Fatalf("tokenizer error: %v", err)
+ }
+ break
+ }
+ switch level {
+ case rawLevel:
+ // Calling z.Raw just returns the raw bytes of the token. It does
+ // not unescape &lt; to <, or lower-case tag names and attribute keys.
+ z.Raw()
+ case lowLevel:
+ // Calling z.Text, z.TagName and z.TagAttr returns []byte values
+ // whose contents may change on the next call to z.Next.
+ switch tt {
+ case TextToken, CommentToken, DoctypeToken:
+ z.Text()
+ case StartTagToken, SelfClosingTagToken:
+ _, more := z.TagName()
+ for more {
+ _, _, more = z.TagAttr()
+ }
+ case EndTagToken:
+ z.TagName()
+ }
+ case highLevel:
+ // Calling z.Token converts []byte values to strings whose validity
+ // extends beyond the next call to z.Next.
+ z.Token()
+ }
+ }
+ }
+}
+
+func BenchmarkRawLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, rawLevel) }
+func BenchmarkLowLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, lowLevel) }
+func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }
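+
+// For reference, a minimal high-level consumption loop (the pattern that
+// BenchmarkHighLevelTokenizer measures; a sketch, not code from the file
+// above):
+//
+// z := NewTokenizer(r)
+// for z.Next() != ErrorToken {
+// _ = z.Token() // copied into strings, safe to retain across Next
+// }
+// if err := z.Err(); err != io.EOF {
+// // handle the error
+// }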