summary refs log tree commit diff
diff options
context:
space:
mode:
author Dimitri Sokolyuk <demon@dim13.org> 2018-07-24 13:57:08 +0200
committer Dimitri Sokolyuk <demon@dim13.org> 2018-07-24 13:57:08 +0200
commit 67b2df6cffdb637ab8cdc63cd356d8603b61a372 (patch)
tree 89382dc694c6b736686e9d3082bc61524b504fae
parent 634f3fd022ba8ad3f064944ab29f930689f14b38 (diff)
tokenize instead of tree parsing
-rw-r--r-- href.go 44
1 files changed, 19 insertions, 25 deletions
diff --git a/href.go b/href.go
index d5b0c0f..f094ff4 100644
--- a/href.go
+++ b/href.go
@@ -18,28 +18,27 @@ var (
errNoTitle = errors.New("no title")
)
-const (
- maxLength = 10 * 1024 * 1024 // 10MB
- maxDepth = 10
-)
+const maxLength = 10 * 1024 * 1024 // 10MB
-func title(n *html.Node, depth int) (string, error) {
- var s string
- if depth <= 0 {
- return "", errTooDeep
- }
- if n.Type == html.ElementNode && n.Data == "title" {
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- s += c.Data
- }
- return strings.TrimSpace(s), nil
- }
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- if t, err := title(c, depth-1); err == nil {
- return t, nil
+func title(r io.Reader) (string, error) {
+ var inTitle bool
+ z := html.NewTokenizer(r)
+ for {
+ switch tt := z.Next(); tt {
+ case html.ErrorToken:
+ return "", z.Err()
+ case html.StartTagToken:
+ name, _ := z.TagName()
+ if string(name) == "title" {
+ inTitle = true
+ }
+ case html.TextToken:
+ if inTitle {
+ return string(z.Text()), nil
+ }
}
}
- return "", errNoTitle
+ return "", nil
}
func getTitle(uri string) (string, error) {
@@ -67,12 +66,7 @@ func getTitle(uri string) (string, error) {
return "", err
}
- doc, err := html.Parse(r)
- if err != nil {
- return "", err
- }
-
- return title(doc, maxDepth)
+ return title(r)
}
func getLinks(s string) (ret []string) {