summary refs log tree commit diff
path: root/vendor/golang.org/x/net/html/parse.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/golang.org/x/net/html/parse.go')
-rw-r--r-- vendor/golang.org/x/net/html/parse.go 529
1 files changed, 426 insertions, 103 deletions
diff --git a/vendor/golang.org/x/net/html/parse.go b/vendor/golang.org/x/net/html/parse.go
index be4b2bf..992cff2 100644
--- a/vendor/golang.org/x/net/html/parse.go
+++ b/vendor/golang.org/x/net/html/parse.go
@@ -25,20 +25,22 @@ type parser struct {
hasSelfClosingToken bool
// doc is the document root element.
doc *Node
- // The stack of open elements (section 12.2.3.2) and active formatting
- // elements (section 12.2.3.3).
+ // The stack of open elements (section 12.2.4.2) and active formatting
+ // elements (section 12.2.4.3).
oe, afe nodeStack
- // Element pointers (section 12.2.3.4).
+ // Element pointers (section 12.2.4.4).
head, form *Node
- // Other parsing state flags (section 12.2.3.5).
+ // Other parsing state flags (section 12.2.4.5).
scripting, framesetOK bool
+ // The stack of template insertion modes.
+ templateStack insertionModeStack
// im is the current insertion mode.
im insertionMode
// originalIM is the insertion mode to go back to after completing a text
// or inTableText insertion mode.
originalIM insertionMode
// fosterParenting is whether new elements should be inserted according to
- // the foster parenting rules (section 12.2.5.3).
+ // the foster parenting rules (section 12.2.6.1).
fosterParenting bool
// quirks is whether the parser is operating in "quirks mode."
quirks bool
@@ -56,7 +58,7 @@ func (p *parser) top() *Node {
return p.doc
}
-// Stop tags for use in popUntil. These come from section 12.2.3.2.
+// Stop tags for use in popUntil. These come from section 12.2.4.2.
var (
defaultScopeStopTags = map[string][]a.Atom{
"": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
@@ -79,7 +81,7 @@ const (
// popUntil pops the stack of open elements at the highest element whose tag
// is in matchTags, provided there is no higher element in the scope's stop
-// tags (as defined in section 12.2.3.2). It returns whether or not there was
+// tags (as defined in section 12.2.4.2). It returns whether or not there was
// such an element. If there was not, popUntil leaves the stack unchanged.
//
// For example, the set of stop tags for table scope is: "html", "table". If
@@ -126,7 +128,7 @@ func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
return -1
}
case tableScope:
- if tagAtom == a.Html || tagAtom == a.Table {
+ if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
return -1
}
case selectScope:
@@ -162,17 +164,17 @@ func (p *parser) clearStackToContext(s scope) {
tagAtom := p.oe[i].DataAtom
switch s {
case tableScope:
- if tagAtom == a.Html || tagAtom == a.Table {
+ if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
p.oe = p.oe[:i+1]
return
}
case tableRowScope:
- if tagAtom == a.Html || tagAtom == a.Tr {
+ if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template {
p.oe = p.oe[:i+1]
return
}
case tableBodyScope:
- if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead {
+ if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template {
p.oe = p.oe[:i+1]
return
}
@@ -183,7 +185,7 @@ func (p *parser) clearStackToContext(s scope) {
}
// generateImpliedEndTags pops nodes off the stack of open elements as long as
-// the top node has a tag name of dd, dt, li, option, optgroup, p, rp, or rt.
+// the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.
// If exceptions are specified, nodes with that name will not be popped off.
func (p *parser) generateImpliedEndTags(exceptions ...string) {
var i int
@@ -192,7 +194,7 @@ loop:
n := p.oe[i]
if n.Type == ElementNode {
switch n.DataAtom {
- case a.Dd, a.Dt, a.Li, a.Option, a.Optgroup, a.P, a.Rp, a.Rt:
+ case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc:
for _, except := range exceptions {
if n.Data == except {
break loop
@@ -234,9 +236,9 @@ func (p *parser) shouldFosterParent() bool {
}
// fosterParent adds a child node according to the foster parenting rules.
-// Section 12.2.5.3, "foster parenting".
+// Section 12.2.6.1, "foster parenting".
func (p *parser) fosterParent(n *Node) {
- var table, parent, prev *Node
+ var table, parent, prev, template *Node
var i int
for i = len(p.oe) - 1; i >= 0; i-- {
if p.oe[i].DataAtom == a.Table {
@@ -245,6 +247,19 @@ func (p *parser) fosterParent(n *Node) {
}
}
+ var j int
+ for j = len(p.oe) - 1; j >= 0; j-- {
+ if p.oe[j].DataAtom == a.Template {
+ template = p.oe[j]
+ break
+ }
+ }
+
+ if template != nil && (table == nil || j > i) {
+ template.AppendChild(n)
+ return
+ }
+
if table == nil {
// The foster parent is the html element.
parent = p.oe[0]
@@ -304,7 +319,7 @@ func (p *parser) addElement() {
})
}
-// Section 12.2.3.3.
+// Section 12.2.4.3.
func (p *parser) addFormattingElement() {
tagAtom, attr := p.tok.DataAtom, p.tok.Attr
p.addElement()
@@ -351,7 +366,7 @@ findIdenticalElements:
p.afe = append(p.afe, p.top())
}
-// Section 12.2.3.3.
+// Section 12.2.4.3.
func (p *parser) clearActiveFormattingElements() {
for {
n := p.afe.pop()
@@ -361,7 +376,7 @@ func (p *parser) clearActiveFormattingElements() {
}
}
-// Section 12.2.3.3.
+// Section 12.2.4.3.
func (p *parser) reconstructActiveFormattingElements() {
n := p.afe.top()
if n == nil {
@@ -390,12 +405,12 @@ func (p *parser) reconstructActiveFormattingElements() {
}
}
-// Section 12.2.4.
+// Section 12.2.5.
func (p *parser) acknowledgeSelfClosingTag() {
p.hasSelfClosingToken = false
}
-// An insertion mode (section 12.2.3.1) is the state transition function from
+// An insertion mode (section 12.2.4.1) is the state transition function from
// a particular state in the HTML5 parser's state machine. It updates the
// parser's fields depending on parser.tok (where ErrorToken means EOF).
// It returns whether the token was consumed.
@@ -403,7 +418,7 @@ type insertionMode func(*parser) bool
// setOriginalIM sets the insertion mode to return to after completing a text or
// inTableText insertion mode.
-// Section 12.2.3.1, "using the rules for".
+// Section 12.2.4.1, "using the rules for".
func (p *parser) setOriginalIM() {
if p.originalIM != nil {
panic("html: bad parser state: originalIM was set twice")
@@ -411,18 +426,35 @@ func (p *parser) setOriginalIM() {
p.originalIM = p.im
}
-// Section 12.2.3.1, "reset the insertion mode".
+// Section 12.2.4.1, "reset the insertion mode".
func (p *parser) resetInsertionMode() {
for i := len(p.oe) - 1; i >= 0; i-- {
n := p.oe[i]
- if i == 0 && p.context != nil {
+ last := i == 0
+ if last && p.context != nil {
n = p.context
}
switch n.DataAtom {
case a.Select:
+ if !last {
+ for ancestor, first := n, p.oe[0]; ancestor != first; {
+ ancestor = p.oe[p.oe.index(ancestor)-1]
+ switch ancestor.DataAtom {
+ case a.Template:
+ p.im = inSelectIM
+ return
+ case a.Table:
+ p.im = inSelectInTableIM
+ return
+ }
+ }
+ }
p.im = inSelectIM
case a.Td, a.Th:
+ // TODO: remove this divergence from the HTML5 spec.
+ //
+ // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
p.im = inCellIM
case a.Tr:
p.im = inRowIM
@@ -434,25 +466,41 @@ func (p *parser) resetInsertionMode() {
p.im = inColumnGroupIM
case a.Table:
p.im = inTableIM
+ case a.Template:
+ // TODO: remove this divergence from the HTML5 spec.
+ if n.Namespace != "" {
+ continue
+ }
+ p.im = p.templateStack.top()
case a.Head:
- p.im = inBodyIM
+ // TODO: remove this divergence from the HTML5 spec.
+ //
+ // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
+ p.im = inHeadIM
case a.Body:
p.im = inBodyIM
case a.Frameset:
p.im = inFramesetIM
case a.Html:
- p.im = beforeHeadIM
+ if p.head == nil {
+ p.im = beforeHeadIM
+ } else {
+ p.im = afterHeadIM
+ }
default:
+ if last {
+ p.im = inBodyIM
+ return
+ }
continue
}
return
}
- p.im = inBodyIM
}
const whitespace = " \t\r\n\f"
-// Section 12.2.5.4.1.
+// Section 12.2.6.4.1.
func initialIM(p *parser) bool {
switch p.tok.Type {
case TextToken:
@@ -479,7 +527,7 @@ func initialIM(p *parser) bool {
return false
}
-// Section 12.2.5.4.2.
+// Section 12.2.6.4.2.
func beforeHTMLIM(p *parser) bool {
switch p.tok.Type {
case DoctypeToken:
@@ -517,7 +565,7 @@ func beforeHTMLIM(p *parser) bool {
return false
}
-// Section 12.2.5.4.3.
+// Section 12.2.6.4.3.
func beforeHeadIM(p *parser) bool {
switch p.tok.Type {
case TextToken:
@@ -560,7 +608,7 @@ func beforeHeadIM(p *parser) bool {
return false
}
-// Section 12.2.5.4.4.
+// Section 12.2.6.4.4.
func inHeadIM(p *parser) bool {
switch p.tok.Type {
case TextToken:
@@ -582,7 +630,16 @@ func inHeadIM(p *parser) bool {
p.oe.pop()
p.acknowledgeSelfClosingTag()
return true
- case a.Script, a.Title, a.Noscript, a.Noframes, a.Style:
+ case a.Noscript:
+ p.addElement()
+ if p.scripting {
+ p.setOriginalIM()
+ p.im = textIM
+ } else {
+ p.im = inHeadNoscriptIM
+ }
+ return true
+ case a.Script, a.Title, a.Noframes, a.Style:
p.addElement()
p.setOriginalIM()
p.im = textIM
@@ -590,19 +647,41 @@ func inHeadIM(p *parser) bool {
case a.Head:
// Ignore the token.
return true
+ case a.Template:
+ p.addElement()
+ p.afe = append(p.afe, &scopeMarker)
+ p.framesetOK = false
+ p.im = inTemplateIM
+ p.templateStack = append(p.templateStack, inTemplateIM)
+ return true
}
case EndTagToken:
switch p.tok.DataAtom {
case a.Head:
- n := p.oe.pop()
- if n.DataAtom != a.Head {
- panic("html: bad parser state: <head> element not found, in the in-head insertion mode")
- }
+ p.oe.pop()
p.im = afterHeadIM
return true
case a.Body, a.Html, a.Br:
p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
return false
+ case a.Template:
+ if !p.oe.contains(a.Template) {
+ return true
+ }
+ // TODO: remove this divergence from the HTML5 spec.
+ //
+ // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
+ p.generateImpliedEndTags()
+ for i := len(p.oe) - 1; i >= 0; i-- {
+ if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
+ p.oe = p.oe[:i]
+ break
+ }
+ }
+ p.clearActiveFormattingElements()
+ p.templateStack.pop()
+ p.resetInsertionMode()
+ return true
default:
// Ignore the token.
return true
@@ -622,7 +701,50 @@ func inHeadIM(p *parser) bool {
return false
}
-// Section 12.2.5.4.6.
+// Section 12.2.6.4.5.
+func inHeadNoscriptIM(p *parser) bool {
+ switch p.tok.Type {
+ case DoctypeToken:
+ // Ignore the token.
+ return true
+ case StartTagToken:
+ switch p.tok.DataAtom {
+ case a.Html:
+ return inBodyIM(p)
+ case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:
+ return inHeadIM(p)
+ case a.Head, a.Noscript:
+ // Ignore the token.
+ return true
+ }
+ case EndTagToken:
+ switch p.tok.DataAtom {
+ case a.Noscript, a.Br:
+ default:
+ // Ignore the token.
+ return true
+ }
+ case TextToken:
+ s := strings.TrimLeft(p.tok.Data, whitespace)
+ if len(s) == 0 {
+ // It was all whitespace.
+ return inHeadIM(p)
+ }
+ case CommentToken:
+ return inHeadIM(p)
+ }
+ p.oe.pop()
+ if p.top().DataAtom != a.Head {
+ panic("html: the new current node will be a head element.")
+ }
+ p.im = inHeadIM
+ if p.tok.DataAtom == a.Noscript {
+ return true
+ }
+ return false
+}
+
+// Section 12.2.6.4.6.
func afterHeadIM(p *parser) bool {
switch p.tok.Type {
case TextToken:
@@ -648,7 +770,7 @@ func afterHeadIM(p *parser) bool {
p.addElement()
p.im = inFramesetIM
return true
- case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title:
+ case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
p.oe = append(p.oe, p.head)
defer p.oe.remove(p.head)
return inHeadIM(p)
@@ -660,6 +782,8 @@ func afterHeadIM(p *parser) bool {
switch p.tok.DataAtom {
case a.Body, a.Html, a.Br:
// Drop down to creating an implied <body> tag.
+ case a.Template:
+ return inHeadIM(p)
default:
// Ignore the token.
return true
@@ -697,7 +821,7 @@ func copyAttributes(dst *Node, src Token) {
}
}
-// Section 12.2.5.4.7.
+// Section 12.2.6.4.7.
func inBodyIM(p *parser) bool {
switch p.tok.Type {
case TextToken:
@@ -727,10 +851,16 @@ func inBodyIM(p *parser) bool {
case StartTagToken:
switch p.tok.DataAtom {
case a.Html:
+ if p.oe.contains(a.Template) {
+ return true
+ }
copyAttributes(p.oe[0], p.tok)
- case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title:
+ case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
return inHeadIM(p)
case a.Body:
+ if p.oe.contains(a.Template) {
+ return true
+ }
if len(p.oe) >= 2 {
body := p.oe[1]
if body.Type == ElementNode && body.DataAtom == a.Body {
@@ -767,9 +897,13 @@ func inBodyIM(p *parser) bool {
// The newline, if any, will be dealt with by the TextToken case.
p.framesetOK = false
case a.Form:
- if p.form == nil {
- p.popUntil(buttonScope, a.P)
- p.addElement()
+ if p.form != nil && !p.oe.contains(a.Template) {
+ // Ignore the token.
+ return true
+ }
+ p.popUntil(buttonScope, a.P)
+ p.addElement()
+ if !p.oe.contains(a.Template) {
p.form = p.top()
}
case a.Li:
@@ -819,7 +953,7 @@ func inBodyIM(p *parser) bool {
case a.A:
for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
- p.inBodyEndTagFormatting(a.A)
+ p.inBodyEndTagFormatting(a.A, "a")
p.oe.remove(n)
p.afe.remove(n)
break
@@ -833,7 +967,7 @@ func inBodyIM(p *parser) bool {
case a.Nobr:
p.reconstructActiveFormattingElements()
if p.elementInScope(defaultScope, a.Nobr) {
- p.inBodyEndTagFormatting(a.Nobr)
+ p.inBodyEndTagFormatting(a.Nobr, "nobr")
p.reconstructActiveFormattingElements()
}
p.addFormattingElement()
@@ -903,6 +1037,14 @@ func inBodyIM(p *parser) bool {
p.acknowledgeSelfClosingTag()
p.popUntil(buttonScope, a.P)
p.parseImpliedToken(StartTagToken, a.Form, a.Form.String())
+ if p.form == nil {
+ // NOTE: The 'isindex' element has been removed,
+ // and the 'template' element has not been designed to
+ // work together with the 'isindex' element.
+ //
+ // Ignore the token.
+ return true
+ }
if action != "" {
p.form.Attr = []Attribute{{Key: "action", Val: action}}
}
@@ -952,11 +1094,16 @@ func inBodyIM(p *parser) bool {
}
p.reconstructActiveFormattingElements()
p.addElement()
- case a.Rp, a.Rt:
+ case a.Rb, a.Rtc:
if p.elementInScope(defaultScope, a.Ruby) {
p.generateImpliedEndTags()
}
p.addElement()
+ case a.Rp, a.Rt:
+ if p.elementInScope(defaultScope, a.Ruby) {
+ p.generateImpliedEndTags("rtc")
+ }
+ p.addElement()
case a.Math, a.Svg:
p.reconstructActiveFormattingElements()
if p.tok.DataAtom == a.Math {
@@ -993,15 +1140,29 @@ func inBodyIM(p *parser) bool {
case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
p.popUntil(defaultScope, p.tok.DataAtom)
case a.Form:
- node := p.form
- p.form = nil
- i := p.indexOfElementInScope(defaultScope, a.Form)
- if node == nil || i == -1 || p.oe[i] != node {
- // Ignore the token.
- return true
+ if p.oe.contains(a.Template) {
+ i := p.indexOfElementInScope(defaultScope, a.Form)
+ if i == -1 {
+ // Ignore the token.
+ return true
+ }
+ p.generateImpliedEndTags()
+ if p.oe[i].DataAtom != a.Form {
+ // Ignore the token.
+ return true
+ }
+ p.popUntil(defaultScope, a.Form)
+ } else {
+ node := p.form
+ p.form = nil
+ i := p.indexOfElementInScope(defaultScope, a.Form)
+ if node == nil || i == -1 || p.oe[i] != node {
+ // Ignore the token.
+ return true
+ }
+ p.generateImpliedEndTags()
+ p.oe.remove(node)
}
- p.generateImpliedEndTags()
- p.oe.remove(node)
case a.P:
if !p.elementInScope(buttonScope, a.P) {
p.parseImpliedToken(StartTagToken, a.P, a.P.String())
@@ -1014,7 +1175,7 @@ func inBodyIM(p *parser) bool {
case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
- p.inBodyEndTagFormatting(p.tok.DataAtom)
+ p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
case a.Applet, a.Marquee, a.Object:
if p.popUntil(defaultScope, p.tok.DataAtom) {
p.clearActiveFormattingElements()
@@ -1022,20 +1183,37 @@ func inBodyIM(p *parser) bool {
case a.Br:
p.tok.Type = StartTagToken
return false
+ case a.Template:
+ return inHeadIM(p)
default:
- p.inBodyEndTagOther(p.tok.DataAtom)
+ p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
}
case CommentToken:
p.addChild(&Node{
Type: CommentNode,
Data: p.tok.Data,
})
+ case ErrorToken:
+ // TODO: remove this divergence from the HTML5 spec.
+ if len(p.templateStack) > 0 {
+ p.im = inTemplateIM
+ return false
+ } else {
+ for _, e := range p.oe {
+ switch e.DataAtom {
+ case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
+ a.Thead, a.Tr, a.Body, a.Html:
+ default:
+ return true
+ }
+ }
+ }
}
return true
}
-func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
+func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
// This is the "adoption agency" algorithm, described at
// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency
@@ -1057,7 +1235,7 @@ func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
}
}
if formattingElement == nil {
- p.inBodyEndTagOther(tagAtom)
+ p.inBodyEndTagOther(tagAtom, tagName)
return
}
feIndex := p.oe.index(formattingElement)
@@ -1160,11 +1338,19 @@ func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
}
// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
-// "Any other end tag" handling from 12.2.5.5 The rules for parsing tokens in foreign content
+// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
-func (p *parser) inBodyEndTagOther(tagAtom a.Atom) {
+func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
for i := len(p.oe) - 1; i >= 0; i-- {
- if p.oe[i].DataAtom == tagAtom {
+ // Two element nodes have the same tag if they have the same Data (a
+ // string-typed field). As an optimization, for common HTML tags, each
+ // Data string is assigned a unique, non-zero DataAtom (a uint32-typed
+ // field), since integer comparison is faster than string comparison.
+ // Uncommon (custom) tags get a zero DataAtom.
+ //
+ // The if condition here is equivalent to (p.oe[i].Data == tagName).
+ if (p.oe[i].DataAtom == tagAtom) &&
+ ((tagAtom != 0) || (p.oe[i].Data == tagName)) {
p.oe = p.oe[:i]
break
}
@@ -1174,7 +1360,7 @@ func (p *parser) inBodyEndTagOther(tagAtom a.Atom) {
}
}
-// Section 12.2.5.4.8.
+// Section 12.2.6.4.8.
func textIM(p *parser) bool {
switch p.tok.Type {
case ErrorToken:
@@ -1203,12 +1389,9 @@ func textIM(p *parser) bool {
return p.tok.Type == EndTagToken
}
-// Section 12.2.5.4.9.
+// Section 12.2.6.4.9.
func inTableIM(p *parser) bool {
switch p.tok.Type {
- case ErrorToken:
- // Stop parsing.
- return true
case TextToken:
p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)
switch p.oe.top().DataAtom {
@@ -1249,7 +1432,7 @@ func inTableIM(p *parser) bool {
}
// Ignore the token.
return true
- case a.Style, a.Script:
+ case a.Style, a.Script, a.Template:
return inHeadIM(p)
case a.Input:
for _, t := range p.tok.Attr {
@@ -1261,7 +1444,7 @@ func inTableIM(p *parser) bool {
}
// Otherwise drop down to the default action.
case a.Form:
- if p.form != nil {
+ if p.oe.contains(a.Template) || p.form != nil {
// Ignore the token.
return true
}
@@ -1291,6 +1474,8 @@ func inTableIM(p *parser) bool {
case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
// Ignore the token.
return true
+ case a.Template:
+ return inHeadIM(p)
}
case CommentToken:
p.addChild(&Node{
@@ -1301,6 +1486,8 @@ func inTableIM(p *parser) bool {
case DoctypeToken:
// Ignore the token.
return true
+ case ErrorToken:
+ return inBodyIM(p)
}
p.fosterParenting = true
@@ -1309,7 +1496,7 @@ func inTableIM(p *parser) bool {
return inBodyIM(p)
}
-// Section 12.2.5.4.11.
+// Section 12.2.6.4.11.
func inCaptionIM(p *parser) bool {
switch p.tok.Type {
case StartTagToken:
@@ -1355,7 +1542,7 @@ func inCaptionIM(p *parser) bool {
return inBodyIM(p)
}
-// Section 12.2.5.4.12.
+// Section 12.2.6.4.12.
func inColumnGroupIM(p *parser) bool {
switch p.tok.Type {
case TextToken:
@@ -1386,11 +1573,13 @@ func inColumnGroupIM(p *parser) bool {
p.oe.pop()
p.acknowledgeSelfClosingTag()
return true
+ case a.Template:
+ return inHeadIM(p)
}
case EndTagToken:
switch p.tok.DataAtom {
case a.Colgroup:
- if p.oe.top().DataAtom != a.Html {
+ if p.oe.top().DataAtom == a.Colgroup {
p.oe.pop()
p.im = inTableIM
}
@@ -1398,17 +1587,21 @@ func inColumnGroupIM(p *parser) bool {
case a.Col:
// Ignore the token.
return true
+ case a.Template:
+ return inHeadIM(p)
}
+ case ErrorToken:
+ return inBodyIM(p)
}
- if p.oe.top().DataAtom != a.Html {
- p.oe.pop()
- p.im = inTableIM
- return false
+ if p.oe.top().DataAtom != a.Colgroup {
+ return true
}
- return true
+ p.oe.pop()
+ p.im = inTableIM
+ return false
}
-// Section 12.2.5.4.13.
+// Section 12.2.6.4.13.
func inTableBodyIM(p *parser) bool {
switch p.tok.Type {
case StartTagToken:
@@ -1460,7 +1653,7 @@ func inTableBodyIM(p *parser) bool {
return inTableIM(p)
}
-// Section 12.2.5.4.14.
+// Section 12.2.6.4.14.
func inRowIM(p *parser) bool {
switch p.tok.Type {
case StartTagToken:
@@ -1511,7 +1704,7 @@ func inRowIM(p *parser) bool {
return inTableIM(p)
}
-// Section 12.2.5.4.15.
+// Section 12.2.6.4.15.
func inCellIM(p *parser) bool {
switch p.tok.Type {
case StartTagToken:
@@ -1551,8 +1744,9 @@ func inCellIM(p *parser) bool {
return true
}
// Close the cell and reprocess.
- p.popUntil(tableScope, a.Td, a.Th)
- p.clearActiveFormattingElements()
+ if p.popUntil(tableScope, a.Td, a.Th) {
+ p.clearActiveFormattingElements()
+ }
p.im = inRowIM
return false
}
@@ -1560,12 +1754,9 @@ func inCellIM(p *parser) bool {
return inBodyIM(p)
}
-// Section 12.2.5.4.16.
+// Section 12.2.6.4.16.
func inSelectIM(p *parser) bool {
switch p.tok.Type {
- case ErrorToken:
- // Stop parsing.
- return true
case TextToken:
p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))
case StartTagToken:
@@ -1586,8 +1777,12 @@ func inSelectIM(p *parser) bool {
}
p.addElement()
case a.Select:
- p.tok.Type = EndTagToken
- return false
+ if p.popUntil(selectScope, a.Select) {
+ p.resetInsertionMode()
+ } else {
+ // Ignore the token.
+ return true
+ }
case a.Input, a.Keygen, a.Textarea:
if p.elementInScope(selectScope, a.Select) {
p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
@@ -1597,7 +1792,7 @@ func inSelectIM(p *parser) bool {
p.tokenizer.NextIsNotRawText()
// Ignore the token.
return true
- case a.Script:
+ case a.Script, a.Template:
return inHeadIM(p)
}
case EndTagToken:
@@ -1617,7 +1812,12 @@ func inSelectIM(p *parser) bool {
case a.Select:
if p.popUntil(selectScope, a.Select) {
p.resetInsertionMode()
+ } else {
+ // Ignore the token.
+ return true
}
+ case a.Template:
+ return inHeadIM(p)
}
case CommentToken:
p.addChild(&Node{
@@ -1627,30 +1827,107 @@ func inSelectIM(p *parser) bool {
case DoctypeToken:
// Ignore the token.
return true
+ case ErrorToken:
+ return inBodyIM(p)
}
return true
}
-// Section 12.2.5.4.17.
+// Section 12.2.6.4.17.
func inSelectInTableIM(p *parser) bool {
switch p.tok.Type {
case StartTagToken, EndTagToken:
switch p.tok.DataAtom {
case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:
- if p.tok.Type == StartTagToken || p.elementInScope(tableScope, p.tok.DataAtom) {
- p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
- return false
- } else {
+ if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) {
// Ignore the token.
return true
}
+ // This is like p.popUntil(selectScope, a.Select), but it also
+ // matches <math select>, not just <select>. Matching the MathML
+ // tag is arguably incorrect (conceptually), but it mimics what
+ // Chromium does.
+ for i := len(p.oe) - 1; i >= 0; i-- {
+ if n := p.oe[i]; n.DataAtom == a.Select {
+ p.oe = p.oe[:i]
+ break
+ }
+ }
+ p.resetInsertionMode()
+ return false
}
}
return inSelectIM(p)
}
-// Section 12.2.5.4.18.
+// Section 12.2.6.4.18.
+func inTemplateIM(p *parser) bool {
+ switch p.tok.Type {
+ case TextToken, CommentToken, DoctypeToken:
+ return inBodyIM(p)
+ case StartTagToken:
+ switch p.tok.DataAtom {
+ case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
+ return inHeadIM(p)
+ case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
+ p.templateStack.pop()
+ p.templateStack = append(p.templateStack, inTableIM)
+ p.im = inTableIM
+ return false
+ case a.Col:
+ p.templateStack.pop()
+ p.templateStack = append(p.templateStack, inColumnGroupIM)
+ p.im = inColumnGroupIM
+ return false
+ case a.Tr:
+ p.templateStack.pop()
+ p.templateStack = append(p.templateStack, inTableBodyIM)
+ p.im = inTableBodyIM
+ return false
+ case a.Td, a.Th:
+ p.templateStack.pop()
+ p.templateStack = append(p.templateStack, inRowIM)
+ p.im = inRowIM
+ return false
+ default:
+ p.templateStack.pop()
+ p.templateStack = append(p.templateStack, inBodyIM)
+ p.im = inBodyIM
+ return false
+ }
+ case EndTagToken:
+ switch p.tok.DataAtom {
+ case a.Template:
+ return inHeadIM(p)
+ default:
+ // Ignore the token.
+ return true
+ }
+ case ErrorToken:
+ if !p.oe.contains(a.Template) {
+ // Ignore the token.
+ return true
+ }
+ // TODO: remove this divergence from the HTML5 spec.
+ //
+ // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
+ p.generateImpliedEndTags()
+ for i := len(p.oe) - 1; i >= 0; i-- {
+ if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
+ p.oe = p.oe[:i]
+ break
+ }
+ }
+ p.clearActiveFormattingElements()
+ p.templateStack.pop()
+ p.resetInsertionMode()
+ return false
+ }
+ return false
+}
+
+// Section 12.2.6.4.19.
func afterBodyIM(p *parser) bool {
switch p.tok.Type {
case ErrorToken:
@@ -1688,7 +1965,7 @@ func afterBodyIM(p *parser) bool {
return false
}
-// Section 12.2.5.4.19.
+// Section 12.2.6.4.20.
func inFramesetIM(p *parser) bool {
switch p.tok.Type {
case CommentToken:
@@ -1738,7 +2015,7 @@ func inFramesetIM(p *parser) bool {
return true
}
-// Section 12.2.5.4.20.
+// Section 12.2.6.4.21.
func afterFramesetIM(p *parser) bool {
switch p.tok.Type {
case CommentToken:
@@ -1777,7 +2054,7 @@ func afterFramesetIM(p *parser) bool {
return true
}
-// Section 12.2.5.4.21.
+// Section 12.2.6.4.22.
func afterAfterBodyIM(p *parser) bool {
switch p.tok.Type {
case ErrorToken:
@@ -1806,7 +2083,7 @@ func afterAfterBodyIM(p *parser) bool {
return false
}
-// Section 12.2.5.4.22.
+// Section 12.2.6.4.23.
func afterAfterFramesetIM(p *parser) bool {
switch p.tok.Type {
case CommentToken:
@@ -1844,7 +2121,7 @@ func afterAfterFramesetIM(p *parser) bool {
const whitespaceOrNUL = whitespace + "\x00"
-// Section 12.2.5.5.
+// Section 12.2.6.5.
func parseForeignContent(p *parser) bool {
switch p.tok.Type {
case TextToken:
@@ -1924,7 +2201,7 @@ func parseForeignContent(p *parser) bool {
return true
}
-// Section 12.2.5.
+// Section 12.2.6.
func (p *parser) inForeignContent() bool {
if len(p.oe) == 0 {
return false
@@ -2012,8 +2289,44 @@ func (p *parser) parse() error {
}
// Parse returns the parse tree for the HTML from the given Reader.
+//
+// It implements the HTML5 parsing algorithm
+// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),
+// which is very complicated. The resultant tree can contain implicitly created
+// nodes that have no explicit <tag> listed in r's data, and nodes' parents can
+// differ from the nesting implied by a naive processing of start and end
+// <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,
+// with no corresponding node in the resulting tree.
+//
// The input is assumed to be UTF-8 encoded.
func Parse(r io.Reader) (*Node, error) {
+ return ParseWithOptions(r)
+}
+
+// ParseFragment parses a fragment of HTML and returns the nodes that were
+// found. If the fragment is the InnerHTML for an existing element, pass that
+// element in context.
+//
+// It has the same intricacies as Parse.
+func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
+ return ParseFragmentWithOptions(r, context)
+}
+
+// ParseOption configures a parser.
+type ParseOption func(p *parser)
+
+// ParseOptionEnableScripting configures the scripting flag.
+// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting
+//
+// By default, scripting is enabled.
+func ParseOptionEnableScripting(enable bool) ParseOption {
+ return func(p *parser) {
+ p.scripting = enable
+ }
+}
+
+// ParseWithOptions is like Parse, with options.
+func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {
p := &parser{
tokenizer: NewTokenizer(r),
doc: &Node{
@@ -2023,6 +2336,11 @@ func Parse(r io.Reader) (*Node, error) {
framesetOK: true,
im: initialIM,
}
+
+ for _, f := range opts {
+ f(p)
+ }
+
err := p.parse()
if err != nil {
return nil, err
@@ -2030,10 +2348,8 @@ func Parse(r io.Reader) (*Node, error) {
return p.doc, nil
}
-// ParseFragment parses a fragment of HTML and returns the nodes that were
-// found. If the fragment is the InnerHTML for an existing element, pass that
-// element in context.
-func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
+// ParseFragmentWithOptions is like ParseFragment, with options.
+func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) {
contextTag := ""
if context != nil {
if context.Type != ElementNode {
@@ -2057,6 +2373,10 @@ func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
context: context,
}
+ for _, f := range opts {
+ f(p)
+ }
+
root := &Node{
Type: ElementNode,
DataAtom: a.Html,
@@ -2064,6 +2384,9 @@ func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
}
p.doc.AppendChild(root)
p.oe = nodeStack{root}
+ if context != nil && context.DataAtom == a.Template {
+ p.templateStack = append(p.templateStack, inTemplateIM)
+ }
p.resetInsertionMode()
for n := context; n != nil; n = n.Parent {