From 67b2df6cffdb637ab8cdc63cd356d8603b61a372 Mon Sep 17 00:00:00 2001
From: Dimitri Sokolyuk
Date: Tue, 24 Jul 2018 13:57:08 +0200
Subject: tokenize instead of tree parsing

---
 href.go | 44 +++++++++++++++++++-------------------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/href.go b/href.go
index d5b0c0f..f094ff4 100644
--- a/href.go
+++ b/href.go
@@ -18,28 +18,27 @@ var (
 	errNoTitle = errors.New("no title")
 )
 
-const (
-	maxLength = 10 * 1024 * 1024 // 10MB
-	maxDepth  = 10
-)
+const maxLength = 10 * 1024 * 1024 // 10MB
 
-func title(n *html.Node, depth int) (string, error) {
-	var s string
-	if depth <= 0 {
-		return "", errTooDeep
-	}
-	if n.Type == html.ElementNode && n.Data == "title" {
-		for c := n.FirstChild; c != nil; c = c.NextSibling {
-			s += c.Data
-		}
-		return strings.TrimSpace(s), nil
-	}
-	for c := n.FirstChild; c != nil; c = c.NextSibling {
-		if t, err := title(c, depth-1); err == nil {
-			return t, nil
+func title(r io.Reader) (string, error) {
+	var inTitle bool
+	z := html.NewTokenizer(r)
+	for {
+		switch tt := z.Next(); tt {
+		case html.ErrorToken:
+			return "", z.Err()
+		case html.StartTagToken:
+			name, _ := z.TagName()
+			if string(name) == "title" {
+				inTitle = true
+			}
+		case html.TextToken:
+			if inTitle {
+				return string(z.Text()), nil
+			}
 		}
 	}
-	return "", errNoTitle
+	return "", nil
 }
 
 func getTitle(uri string) (string, error) {
@@ -67,12 +66,7 @@ func getTitle(uri string) (string, error) {
 		return "", err
 	}
 
-	doc, err := html.Parse(r)
-	if err != nil {
-		return "", err
-	}
-
-	return title(doc, maxDepth)
+	return title(r)
 }
 
 func getLinks(s string) (ret []string) {
-- 
cgit v1.2.3