-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse.go
189 lines (170 loc) · 5.42 KB
/
parse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/**
* Go uBlock
*
* Copyright 2021 Tenta, LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* For any questions, please contact [email protected]
*/
package ublock
import (
"fmt"
"regexp"
"strings"
)
/* regex capture groups
0 - full match
1 - comment
2 - exception
3 - host entry
4 - regex entry
5 - domain entry
6 - pipe start
7 - url part entry
8 - pipe end
9 - filter options
10 - dom/css filter options
*/
/// regex classes
const (
/// regex defining legal chars in url part describing classes, or eats up justdomain type of lists too
/// [\|]* -- removed
reURLPart = `(?:[\w\.\-\:\/\+\&\=\?\*\;\,\~\%\^\@]+[\w\.\-\:\/\+\&\=\?\*\;\,\~\!\]\[\^\@]*)*`
/// three types of comments occuring in adblock plus filter definitions
reCommentGroup = `((?:\[[\w .]+\])|(?:\#.*)|(?:\!.*))`
/// exact address type filter
reFreeFormEntry = `(?:(` + reURLPart + `))`
/// fixed start and/or end type filter
reStartEndDefined = `(?:(\|)?(` + reURLPart + `)(\|)?)`
/// domain filter
reDomain = `(?:\|\|(` + reURLPart + `))`
/// host format
reHost = `[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}[\s]+([\w\.\-]+)`
/// regular expression filters (this one is ugly, it basically hangs on the idea of the regex having at least one char class)
reRegex = `\/((?:[\w\.\-\:\+\&\=\?\*\|\;\,\{\}\$\^\\\/\(\)]*[\[\]]*[\w\.\-\:\+\&\=\?\*\|\;\,\{\}\$\\\/\(\)]*)+)\/`
/// filtering parameters -- aggregated form, v0.1 can't handle these
reFilterParams = `(?:\$(.+))?`
/// exception token catcher
reException = `(\@\@)?`
reDOMFilter = `(\#\#.*|\#\?\#.*|\#\@\#.*)?` /// android/other will receive this as tokens[10] which it will ignore
reDOMFilterIOS = `(?:\#\#(.*))?(?:\#\@\#(.*))?(|\#\?\#.*|\#\$\#.*)?` /// for iOS (JSON content filtering) we capture CSS selectors and CSS selector exceptions
reArgumentGroup = `(?:` + reHost + `|` + reRegex + `|` + reDomain + `|` + reStartEndDefined + `)`
reEntry = `^` + reCommentGroup + `|` + reException + reArgumentGroup + reFilterParams + reDOMFilter + `$`
reEntryIOS = `^` + reCommentGroup + `|` + reException + reArgumentGroup + reFilterParams + reDOMFilterIOS + `$`
/// regex for protocol+subdomain parsing from URL (used in double pipe case)
reProtocolSubdomain = `^(?:[\w-+.]+://)?(?:(?:` + reURLPart + `)\.)*$`
)
const (
separatorExpansion = `:/?=&`
separatorCharClass = "[" + separatorExpansion + "]+"
)
var (
pEntry = regexp.MustCompile(reEntry)
pEntryIOS = regexp.MustCompile(reEntryIOS)
)
type rawGraphElement struct {
token string
suffix string
flags int
original string
filter string
opts *filterOptions
domfilter, domexception string
}
func parseLine(line string, foriOS bool) (r *rawGraphElement, e error) {
var tokens []string
if foriOS {
tokens = pEntryIOS.FindStringSubmatch(line)
} else {
tokens = pEntry.FindStringSubmatch(line)
}
/// check for full match, as in the whole pattern is matched, aka use only the formats supported
if tokens == nil {
return nil, fmt.Errorf("unrecognized format [%s]", line)
}
if tokens[0] == "" {
return nil, fmt.Errorf("unhandled format [%s]", line)
}
r = &rawGraphElement{}
var err error
/// can't handle fine grained filters as of yet
if tokens[9] != "" {
if foriOS {
//log("Options: [%s]\n", tokens[9])
if r.opts, err = newFilterOptions(tokens[9]); err != nil {
//log("Aborting construction of element due to Opts error: [%s]\n", err.Error())
return nil, err
}
} else {
return nil, fmt.Errorf("cannot handle options [%s]", tokens[9])
}
}
if tokens[10] != "" {
if foriOS {
//log("CSS selectors: [%s]\n", tokens[10])
r.domfilter = tokens[10]
} else {
return nil, fmt.Errorf("cannot handle DOM alteration")
}
}
if foriOS && tokens[11] != "" {
//log("CSS exception selectors: [%s]\n", tokens[10])
r.domexception = tokens[11]
}
if foriOS && tokens[12] != "" {
return nil, fmt.Errorf("cannot handle exntended/ABP css selectors")
}
if tokens[2] != "" {
r.flags |= flagException
}
/// host entry
if tokens[3] != "" {
r.token = tokens[3]
r.flags |= flagDoublePipe
} else
/// fixed domain
if tokens[5] != "" {
r.token = tokens[5]
// DANGER DANGER!!!!!
r.flags |= flagDoublePipe
}
/// url part or fixed end/start
if tokens[7] != "" {
if tokens[6] != "" {
r.flags |= flagPipeStart
}
if tokens[8] != "" {
r.flags |= flagPipeEnd
}
r.token = tokens[7]
}
if strings.Contains(r.token, "*") {
r.flags |= flagWildCard
}
if strings.Contains(r.token, "^") {
r.flags |= flagSeparator
}
r.original = r.token
//log("LINE::[%v]\n", tokens)
return r, nil
}
// determine end of line sequence and split the
// content into multiple lines
func splitIntoLines(content string) []string {
le := "\n"
if strings.Contains(content, "\r\n") {
le = "\r\n"
}
return strings.Split(content, le)
}