a<b>c</b>d

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"errors"
	"io"
	"strconv"
	"strings"

	"golang.org/x/net/html/atom"
)

// A TokenType is the type of a Token.
type TokenType uint32

const (
	// ErrorToken means that an error occurred during tokenization.
	ErrorToken TokenType = iota
	// TextToken means a text node.
	TextToken
	// A StartTagToken looks like <a>.
	StartTagToken
	// An EndTagToken looks like </a>.
	EndTagToken
	// A SelfClosingTagToken tag looks like <br/>.
	SelfClosingTagToken
	// A CommentToken looks like <!--x-->.
	CommentToken
	// A DoctypeToken looks like <!DOCTYPE x>.
	DoctypeToken
)

// ErrBufferExceeded means that the buffering limit was exceeded.
var ErrBufferExceeded = errors.New("max buffer exceeded")

// String returns a string representation of the TokenType.
//
// Each declared constant maps to its name without the "Token" suffix. Any
// other value is rendered as "Invalid(n)", where n is the decimal value of t.
func (t TokenType) String() string {
	switch t {
	case ErrorToken:
		return "Error"
	case TextToken:
		return "Text"
	case StartTagToken:
		return "StartTag"
	case EndTagToken:
		return "EndTag"
	case SelfClosingTagToken:
		return "SelfClosingTag"
	case CommentToken:
		return "Comment"
	case DoctypeToken:
		return "Doctype"
	}
	return "Invalid(" + strconv.Itoa(int(t)) + ")"
}

// An Attribute is an attribute namespace-key-value triple. Namespace is
// non-empty for foreign attributes like xlink, Key is alphabetic (and hence
// does not contain escapable characters like '&', '<' or '>'), and Val is
// unescaped (it looks like "a"
//
// NOTE(review): this copy of the file is damaged at this point. The rest of
// the comment above, the declaration it documents, and the beginning of the
// function whose tail follows are all missing, so the statements below have no
// enclosing function here. They switch on t.Type and call t.tagString, neither
// of which is declared in the visible text. The empty string literals returned
// for EndTagToken, CommentToken and DoctypeToken presumably lost markup-like
// content in the same way; confirm against a pristine copy before relying on
// them. Everything below is preserved exactly as found.
	case EndTagToken:
		return ""
	case SelfClosingTagToken:
		return "<" + t.tagString() + "/>"
	case CommentToken:
		return ""
	case DoctypeToken:
		return ""
	}
	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
}

// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive.
type span struct {
	start, end int
}

// A Tokenizer returns a stream of HTML Tokens.
type Tokenizer struct {
	// r is the source of the HTML text.
	r io.Reader
	// tt is the TokenType of the current token.
	tt TokenType
	// err is the first error encountered during tokenization. It is possible
	// for tt != ErrorToken && err != nil to hold: this means that Next
	// returned a valid token but the subsequent Next call will return an
	// error token.
	// For example, if the HTML text input was just "plain", then the first
	// Next call would set z.err to io.EOF but return a TextToken, and all
	// subsequent Next calls would return an ErrorToken.
	// err is never reset. Once it becomes non-nil, it stays non-nil.
	err error
	// readErr is the error returned by the io.Reader r. It is separate from
	// err because it is valid for an io.Reader to return (n int, err1 error)
	// such that n > 0 && err1 != nil, and callers should always process the
	// n > 0 bytes before considering the error err1.
	readErr error
	// buf[raw.start:raw.end] holds the raw bytes of the current token.
	// buf[raw.end:] is buffered input that will yield future tokens.
	raw span
	buf []byte
	// maxBuf limits the data buffered in buf. A value of 0 means unlimited.
	maxBuf int
	// buf[data.start:data.end] holds the raw bytes of the current token's data:
	// a text token's text, a tag token's tag name, etc.
	data span
	// pendingAttr is the attribute key and value currently being tokenized.
	// When complete, pendingAttr is pushed onto attr. nAttrReturned is
	// incremented on each call to TagAttr.
	pendingAttr   [2]span
	attr          [][2]span
	nAttrReturned int
	// rawTag is the "script" in "</script>" that closes the next token. If
	// non-empty, the subsequent call to Next will return a raw or RCDATA text
	// token: one that treats "<p>" as text instead of an element.
	// rawTag's contents are lower-cased.
	rawTag string
	// textIsRaw is whether the current text token's data is not escaped.
	textIsRaw bool
	// convertNUL is whether NUL bytes in the current token's data should
	// be converted into \ufffd replacement characters.
	convertNUL bool
	// allowCDATA is whether CDATA sections are allowed in the current context.
	allowCDATA bool
}

// AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]> as
// the text "foo". The default value is false, which means to recognize it as
// a bogus comment "<!--[CDATA[foo]]-->" instead.
//
// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
// only if tokenizing foreign content, such as MathML and SVG. However,
// tracking foreign-contentness is difficult to do purely in the tokenizer,
// as opposed to the parser, due to HTML integration points: an