1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
|
package main
import (
"context"
"errors"
"io"
"net/http"
"strings"
"time"
"unicode/utf8"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
)
// Sentinel errors reported while fetching a page and extracting its title.
var (
// errNotHTML is returned by getTitle when the response Content-Type is
// not text/html.
errNotHTML = errors.New("not HTML")
// errNotOK is returned by getTitle when the response status is not 200 OK.
errNotOK = errors.New("not OK")
// errTooBig is returned by getTitle when the declared Content-Length
// exceeds maxLength.
errTooBig = errors.New("content too big")
// errNoTitle is returned by title when it finds no non-blank title text.
errNoTitle = errors.New("no title")
// errNotText is returned by title when the title text is not valid UTF-8.
errNotText = errors.New("invalid UTF-8")
)
// maxLength is the largest response body, in bytes, that getTitle accepts:
// larger declared Content-Lengths are rejected and at most this many bytes
// of the body are read.
const maxLength = 10 * 1024 * 1024 // 10 MiB
// title scans the HTML document read from r and returns the text of its
// first <title> element with surrounding whitespace trimmed.
//
// It returns errNoTitle when the input ends before any title text is seen or
// when the first title element is empty or contains only whitespace,
// errNotText when the title text is not valid UTF-8, and the tokenizer's
// error for any failure other than a clean end of input.
func title(r io.Reader) (string, error) {
	var inTitle bool
	z := html.NewTokenizer(r)
	for {
		switch z.Next() {
		case html.ErrorToken:
			// A clean end of input just means there was no title; only
			// genuine read or tokenizer failures are passed through.
			if err := z.Err(); err != io.EOF {
				return "", err
			}
			return "", errNoTitle
		case html.StartTagToken:
			name, _ := z.TagName()
			if string(name) == "title" {
				inTitle = true
			}
		case html.EndTagToken:
			// The tokenizer emits no text token for an empty title, so
			// reaching an end tag while still inside the title means the
			// element had no content. Stop here instead of mistaking later
			// body text for the title.
			if inTitle {
				return "", errNoTitle
			}
		case html.TextToken:
			if !inTitle {
				continue
			}
			t := z.Text()
			if !utf8.Valid(t) {
				return "", errNotText
			}
			if s := strings.TrimSpace(string(t)); len(s) > 0 {
				return s, nil
			}
			return "", errNoTitle
		}
	}
}
// getTitle fetches uri with an HTTP GET and returns the text of the page's
// <title> element as extracted by title.
//
// The request is made with a context that expires after one minute. The
// response is rejected with errNotHTML when its Content-Type is not
// text/html, with errNotOK when its status is not 200 OK, and with errTooBig
// when its declared Content-Length exceeds maxLength. At most maxLength bytes
// of the body are read, passed through charset.NewReader together with the
// Content-Type value, and then handed to title. Errors from building or
// performing the request, from charset.NewReader, and from title are
// returned unchanged.
func getTitle(uri string) (string, error) {
	req, err := http.NewRequest("GET", uri, nil)
	if err != nil {
		return "", err
	}

	// cancel is deferred until after title has consumed the body, so the
	// context stays live for the whole fetch.
	ctx, cancel := context.WithTimeout(req.Context(), time.Minute)
	defer cancel()

	resp, err := http.DefaultClient.Do(req.WithContext(ctx))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Media types are case-insensitive, so compare against a lower-cased
	// copy; the original header value is still what charset.NewReader gets.
	ct := resp.Header.Get("Content-Type")
	if !strings.HasPrefix(strings.ToLower(ct), "text/html") {
		return "", errNotHTML
	}
	if resp.StatusCode != http.StatusOK {
		return "", errNotOK
	}
	// ContentLength is only what the server declared (and may be unknown),
	// so the body is additionally capped by the LimitReader below.
	if resp.ContentLength > maxLength {
		return "", errTooBig
	}

	r, err := charset.NewReader(io.LimitReader(resp.Body, maxLength), ct)
	if err != nil {
		return "", err
	}
	return title(r)
}
// getLinks returns the whitespace-separated words of s that look like web
// links, in order of appearance. A word starting with "www." is returned
// with "http://" prepended; a word starting with "http:" or "https:" is
// returned verbatim. The result is nil when s contains no such word.
func getLinks(s string) (ret []string) {
	for _, word := range strings.Fields(s) {
		if strings.HasPrefix(word, "www.") {
			ret = append(ret, "http://"+word)
			continue
		}
		if strings.HasPrefix(word, "http:") || strings.HasPrefix(word, "https:") {
			ret = append(ret, word)
		}
	}
	return ret
}
|