summaryrefslogtreecommitdiff
path: root/internal/href/href.go
blob: d134f67523f3f945917ff2ab6d3da169bad0d012 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
package href

import (
	"context"
	"errors"
	"io"
	"net/http"
	"strings"
	"time"
	"unicode/utf8"

	"golang.org/x/net/html"
	"golang.org/x/net/html/charset"
)

// Sentinel errors returned by this package.
//
//   - errNotHTML: the response Content-Type does not start with "text/html".
//   - errNotOK:   the response status code is not 200 OK.
//   - errTooBig:  the declared Content-Length exceeds maxLength.
//   - errNoTitle: no non-empty title text was found in the document.
//   - errNotText: the title text is not valid UTF-8.
var (
	errNotHTML = errors.New("not HTML")
	errNotOK   = errors.New("not OK")
	errTooBig  = errors.New("content too big")
	errNoTitle = errors.New("no title")
	errNotText = errors.New("invalid UTF-8")
)

// maxLength is the largest response body, in bytes, that Title will accept:
// responses declaring a bigger Content-Length are rejected, and reading of the
// body is capped at this many bytes.
const maxLength = 10 * 1024 * 1024 // 10 MiB

// title scans the HTML document read from r and returns the text of its first
// <title> element with surrounding whitespace trimmed.
//
// It returns errNotText if the title text is not valid UTF-8, errNoTitle if
// the title is empty (or whitespace only) or the document ends without one,
// and the tokenizer's error for any other read failure.
func title(r io.Reader) (string, error) {
	inTitle := false
	z := html.NewTokenizer(r)
	for {
		switch z.Next() {
		case html.ErrorToken:
			// io.EOF only means the input ended before a title was seen;
			// report that with the dedicated sentinel instead of a bare EOF.
			if err := z.Err(); err != io.EOF {
				return "", err
			}
			return "", errNoTitle
		case html.StartTagToken:
			if name, _ := z.TagName(); string(name) == "title" {
				inTitle = true
			}
		case html.EndTagToken:
			// Reaching </title> while still inside the element means it had
			// no text at all. Stop here so that later body text is not
			// mistaken for the title.
			if name, _ := z.TagName(); inTitle && string(name) == "title" {
				return "", errNoTitle
			}
		case html.TextToken:
			if !inTitle {
				continue
			}
			text := z.Text()
			if !utf8.Valid(text) {
				return "", errNotText
			}
			if s := strings.TrimSpace(string(text)); s != "" {
				return s, nil
			}
			return "", errNoTitle
		}
	}
}

// Title fetches uri with an HTTP GET and returns the text of the page's
// <title> element.
//
// The whole exchange, including reading the body, is bounded by a one-minute
// timeout. Title returns errNotHTML when the response Content-Type is not
// text/html, errNotOK for any status other than 200 OK, and errTooBig when the
// declared Content-Length exceeds maxLength. At most maxLength bytes of the
// body are read, and the body is passed through charset.NewReader (using the
// Content-Type header as a hint) before the title is extracted.
func Title(uri string) (string, error) {
	req, err := http.NewRequest("GET", uri, nil)
	if err != nil {
		return "", err
	}
	ctx, cancel := context.WithTimeout(req.Context(), time.Minute)
	defer cancel()

	resp, err := http.DefaultClient.Do(req.WithContext(ctx))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Media type names are case-insensitive, so compare in lower case; the
	// original header value is still handed to the charset detector below.
	ct := resp.Header.Get("Content-Type")
	if !strings.HasPrefix(strings.ToLower(ct), "text/html") {
		return "", errNotHTML
	}

	if resp.StatusCode != http.StatusOK {
		return "", errNotOK
	}

	// ContentLength is -1 when unknown, in which case only the LimitReader
	// below bounds how much is read.
	if resp.ContentLength > maxLength {
		return "", errTooBig
	}

	r, err := charset.NewReader(io.LimitReader(resp.Body, maxLength), ct)
	if err != nil {
		return "", err
	}

	return title(r)
}

// Links returns the whitespace-separated words of s that look like web links,
// in the order they appear. Words starting with "http:" or "https:" are kept
// unchanged; words starting with "www." are kept with "http://" prepended.
// The result is nil when s contains no such word.
func Links(s string) (ret []string) {
	for _, word := range strings.Fields(s) {
		if strings.HasPrefix(word, "www.") {
			ret = append(ret, "http://"+word)
			continue
		}
		if strings.HasPrefix(word, "http:") || strings.HasPrefix(word, "https:") {
			ret = append(ret, word)
		}
	}
	return ret
}