-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhtml_meta_parser.go
114 lines (108 loc) · 2.7 KB
/
html_meta_parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// Implements a basic HTML parser that extracts the <title> text and the
// description <meta> tag. It also annotates the MIME type if possible.
package unfurlist
import (
"bytes"
"errors"
"io"
"net/http"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"golang.org/x/net/html/charset"
)
// basicParseHTML builds an unfurlResult for chunk based on the content type
// sniffed from chunk.data.
//
// Images are reported with Type "image" and Image set to the chunk URL, videos
// with Type "video". Any text/* content is reported with Type "website", with
// Title and Description filled in when extractData finds them. For every other
// content the sniffed MIME type itself is left in Type.
func basicParseHTML(chunk *pageChunk) *unfurlResult {
	detected := http.DetectContentType(chunk.data)
	res := &unfurlResult{Type: detected}

	if strings.HasPrefix(detected, "image/") {
		res.Type = "image"
		res.Image = chunk.url.String()
		return res
	}
	if strings.HasPrefix(detected, "video/") {
		res.Type = "video"
		return res
	}
	if !strings.HasPrefix(detected, "text/") {
		return res
	}

	res.Type = "website"
	// Prefer the Content-Type taken from the response headers, since it may
	// carry a charset definition such as "text/html; charset=windows-1251".
	// When the header has no charset but the sniffed type does (for example
	// header "text/html" versus sniffed "text/html; charset=utf-8"), fall
	// back to the more specific sniffed value.
	contentType := chunk.ct
	if strings.Contains(detected, "charset=") && !strings.Contains(contentType, "charset=") {
		contentType = detected
	}

	title, desc, err := extractData(chunk.data, contentType)
	if err != nil {
		// Metadata is optional: keep the bare "website" result.
		return res
	}
	res.Title, res.Description = title, desc
	return res
}
// extractData scans an HTML document for the text of its <title> element and
// the content attribute of its <meta name="description"> tag.
//
// htmlBody is the raw document and ct is the content type passed to
// charset.NewReader to decode it. Scanning stops at the first <body> tag, at
// end of input, or as soon as both values have been found.
//
// It returns errNoMetadataFound when neither a title nor a description was
// found, and the underlying error when the charset reader cannot be created
// or the tokenizer fails with anything other than io.EOF.
func extractData(htmlBody []byte, ct string) (title, description string, err error) {
	bodyReader, err := charset.NewReader(bytes.NewReader(htmlBody), ct)
	if err != nil {
		return "", "", err
	}
	z := html.NewTokenizer(bodyReader)
tokenize:
	for {
		switch z.Next() {
		case html.ErrorToken:
			if z.Err() == io.EOF {
				break tokenize
			}
			return "", "", z.Err()
		case html.StartTagToken, html.SelfClosingTagToken:
			// XHTML-style void elements such as <meta ... /> are reported by
			// the tokenizer as self-closing tags, so both kinds are handled.
			name, hasAttr := z.TagName()
			switch atom.Lookup(name) {
			case atom.Body:
				break tokenize // title/meta should precede body tag
			case atom.Title:
				if title != "" {
					continue tokenize
				}
				// The title text, if any, is the token right after the tag.
				if z.Next() == html.TextToken {
					title = string(z.Text())
					if description != "" {
						break tokenize
					}
				}
			case atom.Meta:
				if description != "" {
					continue tokenize
				}
				var content []byte
				var isDescription bool
				for hasAttr {
					var k, v []byte
					k, v, hasAttr = z.TagAttr()
					switch string(k) {
					case "name":
						// Metadata names are case-insensitive, so
						// name="Description" must match as well.
						if !bytes.EqualFold(v, []byte("description")) {
							continue tokenize
						}
						isDescription = true
					case "content":
						content = v
					}
				}
				if isDescription && len(content) > 0 {
					description = string(content)
					if title != "" {
						break tokenize
					}
				}
			}
		}
	}
	if title != "" || description != "" {
		return title, description, nil
	}
	return "", "", errNoMetadataFound
}
var (
errNoMetadataFound = errors.New("no metadata found")
)