// Package href extracts link-like words from text and fetches the titles of
// HTML pages.
package href

import (
	"context"
	"errors"
	"io"
	"net/http"
	"strings"
	"time"
	"unicode/utf8"

	"golang.org/x/net/html"
	"golang.org/x/net/html/charset"
)

// Sentinel errors returned by Title and title.
var (
	errNotHTML = errors.New("not HTML")        // Content-Type is not text/html
	errNotOK   = errors.New("not OK")          // response status is not 200
	errTooBig  = errors.New("content too big") // declared length exceeds maxLength
	errNoTitle = errors.New("no title")        // no non-empty <title> text found
	errNotText = errors.New("invalid UTF-8")   // title bytes are not valid UTF-8
)

// maxLength is the largest response body, in bytes, that Title will read.
const maxLength = 10 * 1024 * 1024 // 10 MiB

// title scans the HTML document read from r and returns the whitespace-trimmed
// text of the first <title> element.
//
// It returns errNoTitle when the first <title> is empty or whitespace-only, or
// when the document ends without a <title>; errNotText when the title bytes
// are not valid UTF-8; and the tokenizer's error for any other failure.
func title(r io.Reader) (string, error) {
	var inTitle bool

	z := html.NewTokenizer(r)
	for {
		switch z.Next() {
		case html.ErrorToken:
			// io.EOF means the document simply ended without a usable
			// title; anything else is a real read or parse failure.
			if err := z.Err(); err != io.EOF {
				return "", err
			}
			return "", errNoTitle

		case html.StartTagToken:
			name, _ := z.TagName()
			if string(name) == "title" {
				inTitle = true
			}

		case html.EndTagToken:
			// Reaching an end tag while still waiting for title text means
			// the element was empty (<title></title>). Stop here rather
			// than mistaking later, unrelated text for the title.
			if inTitle {
				return "", errNoTitle
			}

		case html.TextToken:
			if !inTitle {
				continue
			}
			t := z.Text()
			if !utf8.Valid(t) {
				return "", errNotText
			}
			if s := strings.TrimSpace(string(t)); len(s) > 0 {
				return s, nil
			}
			return "", errNoTitle
		}
	}
}

// Title fetches uri with an HTTP GET request and returns the title of the
// HTML page it serves.
//
// The request is made with http.DefaultClient under a one-minute timeout.
// Title returns errNotHTML when the Content-Type does not start with
// "text/html", errNotOK when the status is not 200, and errTooBig when the
// declared Content-Length exceeds maxLength. At most maxLength bytes of the
// body are read, decoded according to the charset implied by the Content-Type
// and the content itself, and passed to title.
func Title(uri string) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, uri, nil)
	if err != nil {
		return "", err
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	ct := resp.Header.Get("Content-Type")
	// Media types are case-insensitive, so compare in lower case.
	if !strings.HasPrefix(strings.ToLower(ct), "text/html") {
		return "", errNotHTML
	}
	if resp.StatusCode != http.StatusOK {
		return "", errNotOK
	}
	if resp.ContentLength > maxLength {
		return "", errTooBig
	}

	// ContentLength may be unknown (-1), so also cap the bytes actually read.
	r, err := charset.NewReader(io.LimitReader(resp.Body, maxLength), ct)
	if err != nil {
		return "", err
	}
	return title(r)
}

// Links returns the whitespace-separated words of s that look like web links.
//
// Words starting with "http:" or "https:" are returned unchanged; words
// starting with "www." are returned with "http://" prepended. All other words
// are skipped. The result is nil when s contains no such words.
func Links(s string) (ret []string) {
	for _, v := range strings.Fields(s) {
		switch {
		case strings.HasPrefix(v, "www."):
			ret = append(ret, "http://"+v)
		case strings.HasPrefix(v, "http:"), strings.HasPrefix(v, "https:"):
			ret = append(ret, v)
		}
	}
	return ret
}