-
Notifications
You must be signed in to change notification settings - Fork 14
/
inputstream.lisp
344 lines (305 loc) · 12.2 KB
/
inputstream.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
;;;; HTML5 parser for Common Lisp
;;;;
;;;; Copyright (C) 2012 Thomas Bakketun <[email protected]>
;;;; Copyright (C) 2012 Asgeir Bjørlykke <[email protected]>
;;;; Copyright (C) 2012 Mathias Hellevang
;;;; Copyright (C) 2012 Stian Sletner <[email protected]>
;;;;
;;;; This library is free software: you can redistribute it and/or modify
;;;; it under the terms of the GNU Lesser General Public License as published
;;;; by the Free Software Foundation, either version 3 of the License, or
;;;; (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;;; GNU General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU General Public License
;;;; along with this library. If not, see <http://www.gnu.org/licenses/>.
(in-package :html5-parser)
(deftype array-length ()
"Type of an array index."
'(integer 0 #.array-dimension-limit))
(deftype chunk ()
"Type of the input stream buffer."
'(vector character *))
(defparameter *default-encoding* :utf-8)
(defclass html-input-stream ()
((source :initarg :source)
(encoding :reader html5-stream-encoding)
(char-stream :initform nil)
(chunk)
(chunk-offset)
(pending-cr)
(errors :initform nil :accessor html5-stream-errors)))
(defun make-html-input-stream (source &key override-encoding fallback-encoding)
(when (stringp source)
;; Encoding is not relevant when input is a string,
;; but we set it utf-8 here to avoid auto detecting taking place.
(setf override-encoding :utf-8))
(let ((self (make-instance 'html-input-stream :source source)))
(with-slots (encoding stream) self
(setf encoding (detect-encoding self
(find-encoding override-encoding)
(find-encoding fallback-encoding)))
(open-char-stream self))
self))
;; 12.2.2.2 Character encodings
(defun find-encoding (encoding-name)
;; Normalize the string designator
(setf encoding-name (string-upcase (substitute #\- #\_ (string-trim +space-characters+ (string encoding-name)))))
;; All known encoding will already be interned in the keyword package so find-symbol is fine here
(setf encoding-name (find-symbol encoding-name :keyword))
(handler-case
;; Verfiy that flexi-streams knows the encoding and resolve aliases
(case (flex:external-format-name (flex:make-external-format encoding-name))
;; Some encoding should be replaced by some other.
;; Only those supported by flexi-streams are listed here.
;; iso-8859-11 should be replaced by windows-874, but flexi-streams doesn't that encoding.
(:iso-8859-1 :windows-1252)
(:iso-8859-9 :windows-1254)
(:us-ascii :windows-1252)
(otherwise encoding-name))
(flex:external-format-error ())))
;; 12.2.2.1 Determining the character encoding
(defun detect-encoding (stream override-encoding fallback-encoding)
(with-slots (encoding) stream
(block nil
;; 1. and 2. encoding overridden by user or transport layer
(when override-encoding
(return (cons override-encoding :certain)))
;; 3. wait for 1024 bytes, not implemented
;; 4. Detect BOM
(let ((bom-encoding (detect-bom stream)))
(when bom-encoding
(return (cons bom-encoding :certain))))
;; 5. Prescan not implemented
;; 6. Use fallback encoding
(when fallback-encoding
(return (cons encoding :tentative)))
;; 7. Autodect not implemented
;; 8. Implementation-defined default
(return (cons *default-encoding* :tentative)))))
(defmacro handle-encoding-errors (stream &body body)
`(handler-bind ((flex:external-format-encoding-error
(lambda (x)
(declare (ignore x))
(push :invalid-codepoint (html5-stream-errors ,stream))
(use-value #\uFFFD))))
,@body))
(defun open-char-stream (self)
(with-slots (source encoding char-stream chunk chunk-offset pending-cr) self
(setf chunk (make-array (* 10 1024) :element-type 'character :fill-pointer 0))
(setf chunk-offset 0)
(setf pending-cr nil)
(when char-stream
(close char-stream))
(setf char-stream
(if (stringp source)
(make-string-input-stream source)
(flex:make-flexi-stream
(etypecase source
(pathname
(open source :element-type '(unsigned-byte 8)))
(stream
source)
(vector
(flex:make-in-memory-input-stream source)))
:external-format (flex:make-external-format (car encoding) :eol-style :lf))))
;; 12.2.2.4 says we should always skip the first byte order mark
(handle-encoding-errors self
(let ((first-char (peek-char nil char-stream nil)))
(when (eql first-char #\ufeff)
(read-char char-stream))))))
(defun detect-bom (self)
(with-slots (source) self
(let (byte-0 byte-1 byte-2)
(etypecase source
(vector
(when (> (length source) 0) (setf byte-0 (aref source 0)))
(when (> (length source) 1) (setf byte-1 (aref source 1)))
(when (> (length source) 2) (setf byte-2 (aref source 2))))
(pathname
(with-open-file (in source :element-type '(unsigned-byte 8))
(setf byte-0 (read-byte in nil))
(setf byte-1 (read-byte in nil))
(setf byte-2 (read-byte in nil))))
(stream
(error "Can't detect encoding when source is a stream.")))
(cond ((and (eql byte-0 #xfe)
(eql byte-1 #xff))
:utf-16be)
((and (eql byte-0 #xff)
(eql byte-1 #xfe))
:utf-16le)
((and (eql byte-0 #xef)
(eql byte-1 #xbb)
(eql byte-2 #xbf))
:utf-8)))))
;; 12.2.2.3 Changing the encoding while parsing
(defun html5-stream-change-encoding (stream new-encoding)
(setf new-encoding (find-encoding new-encoding))
(with-slots (encoding char-stream) stream
;; 1.
(when (member (car encoding) '(:utf-16le :utf-16be))
(setf encoding (cons (car encoding) :certain))
(return-from html5-stream-change-encoding))
;; 2.
(when (member new-encoding '(:utf-16le :utf-16be))
(setf new-encoding :utf-8))
;; 3.
(when (eql (car encoding) new-encoding)
(setf encoding (cons (car encoding) :certain))
(return-from html5-stream-change-encoding))
;; 4. Not impleneted
;; 5. Restart paring from scratch
(setf encoding (cons new-encoding :certain))
(open-char-stream stream)
(throw 'please-reparse t)))
(defun html5-stream-char (stream)
(with-slots (chunk chunk-offset) stream
(when (>= chunk-offset (length chunk))
(unless (read-chunk stream)
(return-from html5-stream-char +eof+)))
(prog1 (char chunk chunk-offset)
(incf chunk-offset))))
(defun our-scan (chars opposite-p chunk &key start)
(loop for i from start below (length chunk)
for char = (char chunk i)
while (if opposite-p
(position char chars)
(not (position char chars)))
finally (return i)))
(defun html5-stream-chars-until (stream characters &optional opposite-p)
"Returns a string of characters from the stream up to but not
including any character in characters or end of file.
"
(with-slots (chunk chunk-offset) stream
(declare (array-length chunk-offset) (chunk chunk))
(with-output-to-string (data)
(loop for end = (our-scan characters opposite-p chunk :start chunk-offset) do
;; If nothing matched, and it wasn't because we ran out of chunk,
;; then stop
(when (and (not end)
(/= chunk-offset (length chunk)))
(return))
;; If not the whole chunk matched, return everything
;; up to the part that didn't match
(when (and end
(/= chunk-offset (length chunk)))
(write-string chunk data :start chunk-offset :end end)
(setf chunk-offset end)
(return))
;; If the whole remainder of the chunk matched,
;; use it all and read the next chunk
(write-string chunk data :start chunk-offset)
(unless (read-chunk stream)
(return))))))
(defun html5-stream-unget (stream char)
(with-slots (chunk chunk-offset) stream
(unless (eql char +eof+)
(cond ((zerop chunk-offset)
(cond ((< (fill-pointer chunk) (array-dimension chunk 0))
(incf (fill-pointer chunk))
(replace chunk chunk :start1 1))
(t
(let ((new-chunk (make-array (1+ (array-dimension chunk 0))
:element-type 'character
:fill-pointer (1+ (fill-pointer chunk)))))
(replace new-chunk chunk :start1 1)
(setf chunk new-chunk))))
(setf (char chunk 0) char))
(t
(decf chunk-offset)
(assert (char= char (char chunk chunk-offset))))))))
(defun read-chunk (stream)
(declare (optimize speed))
(with-slots (char-stream chunk chunk-offset pending-cr) stream
(declare (array-length chunk-offset)
(chunk chunk))
(setf chunk-offset 0)
(let ((start 0))
(when pending-cr
(setf (char chunk 0) #\Return)
(setf start 1)
(setf pending-cr nil))
(setf (fill-pointer chunk) (array-dimension chunk 0))
(handle-encoding-errors stream
(setf (fill-pointer chunk) (read-sequence chunk char-stream :start start)))
(unless (zerop (length chunk))
;; check if last char is CR and EOF was not reached
(when (and (= (length chunk) (array-dimension chunk 0))
(eql (char chunk (1- (length chunk))) #\Return))
(setf pending-cr t)
(decf (fill-pointer chunk)))
(report-character-errors stream chunk)
;; Python code replaces surrugate pairs with U+FFFD here. Why?
;; Normalize line endings (CR LF)
(loop for previous = nil then current
for current across chunk
for index of-type array-length from 0
with offset of-type array-length = 0
do (unless (and (eql previous #\Return)
(eql current #\Newline))
(unless (= index offset)
(setf (char chunk offset) current))
(when (eql current #\Return)
(setf (char chunk offset) #\Newline))
(incf offset))
finally (setf (fill-pointer chunk) offset))
t))))
(defun char-range (char1 char2)
(loop for i from (char-code char1) to (char-code char2)
collect (code-char i)))
(defparameter *invalid-unicode*
`(,@(char-range #\u0001 #\u0008)
#\u000B
,@(char-range #\u000E #\u001F)
,@(char-range #\u007F #\u009F)
;; The following are noncharacter as defined by Unicode.
;; Clozure Common Lisp doesn't like them.
#-(or abcl ccl mezzano) ,@`(
,@(char-range #\uD800 #\uDFFF)
,@(char-range #\uFDD0 #\uFDEF)
#\uFFFE
#\uFFFF
#\u0001FFFE
#\u0001FFFF
#\u0002FFFE
#\u0002FFFF
#\u0003FFFE
#\u0003FFFF
#\u0004FFFE
#\u0004FFFF
#\u0005FFFE
#\u0005FFFF
#\u0006FFFE
#\u0006FFFF
#\u0007FFFE
#\u0007FFFF
#\u0008FFFE
#\u0008FFFF
#\u0009FFFE
#\u0009FFFF
#\u000AFFFE
#\u000AFFFF
#\u000BFFFE
#\u000BFFFF
#\u000CFFFE
#\u000CFFFF
#\u000DFFFE
#\u000DFFFF
#\u000EFFFE
#\u000EFFFF
#\u000FFFFE
#\u000FFFFF
#\u0010FFFE
#\u0010FFFF)))
(defparameter *invalid-unicode-hash* (make-hash-table))
(dolist (char *invalid-unicode*)
(setf (gethash char *invalid-unicode-hash*) char))
(defun report-character-errors (stream data)
(loop for char across data
when (gethash char *invalid-unicode-hash*)
do (push :invalid-codepoint (html5-stream-errors stream))))