-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathpipeline.go
148 lines (121 loc) · 2.85 KB
/
pipeline.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
package pipeline
import (
"bytes"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
)
// mode selects the kind of output a Pipeline produces when Call is invoked.
type mode int
const (
// modeHTML renders HTML output. It is the zero value of mode.
modeHTML mode = iota
// modePlain renders plain text output.
modePlain
)
var (
	// stripHTMLTagRe matches anything shaped like an HTML tag: a "<", one or
	// more characters (as few as possible), then ">".
	//
	// The (?s) flag makes "." match newlines as well. Without it a tag whose
	// attributes are split across lines (for example "<img\nsrc=x onerror=...>")
	// would not match and would survive the plain-text stripping step.
	stripHTMLTagRe = regexp.MustCompile(`(?s)<.+?>`)
)
// Pipeline is a list of filters together with the output mode used by Call.
type Pipeline struct {
// Filters are applied to the parsed document in slice order.
Filters []Filter
// mode chooses between HTML and plain text rendering in Call.
mode mode
}
// NewPipeline returns a Pipeline that runs the given filters and renders
// its result as HTML.
func NewPipeline(filters []Filter) Pipeline {
	var p Pipeline
	p.Filters = filters
	p.mode = modeHTML
	return p
}
// NewPlainPipeline returns a Pipeline that runs the given filters and renders
// its result as plain text, with HTML tags removed from the output.
func NewPlainPipeline(filters []Filter) Pipeline {
	var p Pipeline
	p.Filters = filters
	p.mode = modePlain
	return p
}
// Call renders raw through the pipeline. The pipeline's mode decides whether
// the HTML renderer or the plain text renderer is used; any mode other than
// modeHTML is rendered as plain text.
func (p Pipeline) Call(raw string) (out string, err error) {
	switch p.mode {
	case modeHTML:
		return p.callWithHTML(raw)
	default:
		return p.callWithPlain(raw)
	}
}
// callWithHTML parses raw as an HTML document, applies every filter in order
// and returns the inner HTML of the resulting <body> element.
//
// The first error from parsing, from a filter or from serialization aborts
// the call and is returned. When none of the filters is an HTMLEscapeFilter
// the serialized output is additionally passed through unescapeSingleQuote.
func (p Pipeline) callWithHTML(raw string) (out string, err error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(raw))
	if err != nil {
		return out, err
	}

	// Track whether an escape filter takes part in this run; it decides the
	// post-processing of the serialized output below.
	escaping := false
	for _, f := range p.Filters {
		if _, ok := f.(HTMLEscapeFilter); ok {
			escaping = true
		}
		if err = f.Call(doc); err != nil {
			return out, err
		}
	}

	if out, err = doc.Find("body").Html(); err != nil {
		return out, err
	}
	if escaping {
		return out, nil
	}
	return unescapeSingleQuote(out), nil
}
// callWithPlain parses raw as an HTML document, applies every filter except
// HTMLEscapeFilter in order and returns the text content of <body> with
// tag-shaped substrings removed and surrounding whitespace trimmed.
//
// The first error from parsing or from a filter aborts the call and is
// returned.
func (p Pipeline) callWithPlain(raw string) (out string, err error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(raw))
	if err != nil {
		return out, err
	}

	for _, f := range p.Filters {
		// HTMLEscapeFilter is not run in plain mode.
		if _, isEscape := f.(HTMLEscapeFilter); isEscape {
			continue
		}
		if err = f.Call(doc); err != nil {
			return out, err
		}
	}

	// Plain mode supports no HTML tags at all, so everything that still looks
	// like a tag in the collected text is removed to avoid XSS.
	text := getRawHTML(doc.Find("body"))
	text = stripHTMLTagRe.ReplaceAllString(text, "")
	return strings.TrimSpace(text), nil
}
// unescapeSingleQuote replaces every "&#39;" numeric character reference in
// in with a literal single quote and returns the result. All other entities
// are left untouched.
//
// It is applied to serialized HTML output, where apostrophes in text are
// expected to appear in their escaped "&#39;" form.
func unescapeSingleQuote(in string) (out string) {
	return strings.ReplaceAll(in, "&#39;", "'")
}
// getRawHTML returns the concatenated Data of every text node and raw node
// found in the selection's nodes and all of their descendants, in document
// order. Whitespace and newlines are kept as they are.
//
// Adapted from goquery's Selection.Text:
// https://github.com/PuerkitoBio/goquery/blob/v1.6.0/property.go#L62
func getRawHTML(s *goquery.Selection) string {
	var out bytes.Buffer

	// walk visits node first and then each of its children, left to right.
	var walk func(node *html.Node)
	walk = func(node *html.Node) {
		switch node.Type {
		case html.TextNode, html.RawNode:
			out.WriteString(node.Data)
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	for i := range s.Nodes {
		walk(s.Nodes[i])
	}
	return out.String()
}