-
Notifications
You must be signed in to change notification settings - Fork 17
/
KBWebArchiver.m
365 lines (299 loc) · 10.2 KB
/
KBWebArchiver.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
//
// KBWebArchiver.m (patched by John Winter)
// ---------------
//
// Original: Keith Blount 2005
// Page timeout fix: John Winter 2006
// Keith Blount 2008
// Code Cleanup: Jan Weiß 2011
//
#import "KBWebArchiver.h"
#import "NSURL+ValidityChecking.h"
// Error domain for the NSError objects this class creates itself
// (invalid URL, page load failure, web archive creation failure).
NSString *const KBWebArchiverErrorDomain = @"KBWebArchiverErrorDomain";
// Private API. Declared in a class extension (rather than a named category) so
// the compiler verifies that the main @implementation provides these methods.
@interface KBWebArchiver ()

/// Loads the page at the current URL and rebuilds the cached archive information.
- (void)getWebPage;

@end
@implementation KBWebArchiver
@synthesize URL = _URL;
@synthesize localResourceLoadingOnly = _localResourceLoadingOnly;
/// Initializes the archiver from a string that is either a file-system path or a URL string.
/// @param aURLString The path or URL string. May be nil, in which case no URL is set.
/// @param flag YES to treat aURLString as a file path, NO to parse it as a URL string.
/// @return The result of -initWithURL: called with the URL built from the arguments.
- (id)initWithURLString:(NSString *)aURLString isFilePath:(BOOL)flag
{
    NSURL *resolvedURL = nil;

    // Only build a URL when there is a string to build it from.
    if (aURLString != nil)
    {
        if (flag)
            resolvedURL = [NSURL fileURLWithPath:aURLString];
        else
            resolvedURL = [NSURL URLWithString:aURLString];
    }

    return [self initWithURL:resolvedURL];
}
/// Initializes the archiver from a string, deciding automatically whether it is a URL or a file path.
/// A string that parses to a URL with a scheme is used as that URL; anything else
/// (including nil) is handed to -initWithURLString:isFilePath: as a file path.
/// @param aURLString The URL string or file path. May be nil.
- (id)initWithURLString:(NSString *)aURLString
{
    NSURL *parsedURL = (aURLString != nil) ? [NSURL URLWithString:aURLString] : nil;

    // No usable URL, or a URL without a scheme: interpret the string as a path instead.
    if (parsedURL == nil || parsedURL.scheme == nil) {
        return [self initWithURLString:aURLString isFilePath:YES];
    }

    return [self initWithURL:parsedURL];
}
/// Initializes the archiver with the URL to archive.
/// The other initializers in this file funnel into this one; it is the only one that calls super.
/// @param aURL The URL to archive. May be nil.
- (id)initWithURL:(NSURL *)aURL
{
    self = [super init];
    if (self == nil)
        return nil;

    // Start with no cached archive information and network resource loading allowed.
    _URL = aURL;
    _archiveInformation = nil;
    _localResourceLoadingOnly = NO;

    return self;
}
/// Initializes an archiver that has no URL yet.
- (id)init
{
    NSURL *noURL = nil;
    return [self initWithURL:noURL];
}
/// Replaces the archiver's URL with one built from a file path or a URL string.
/// @param aURLString The path or URL string. Passing nil clears the URL, matching
///        how -initWithURLString:isFilePath: treats a nil string.
/// @param isFilePath YES to treat aURLString as a file path, NO to parse it as a URL string.
- (void)setURLString:(NSString *)aURLString isFilePath:(BOOL)isFilePath
{
    // The NSURL initializers below must not be handed nil, so map nil to "no URL" up front.
    if (aURLString == nil)
    {
        self.URL = nil;
        return;
    }

    self.URL = (isFilePath ? [[NSURL alloc] initFileURLWithPath:aURLString] : [[NSURL alloc] initWithString:aURLString]);
}
/// Returns the URL as a string: the path for a file URL, the absolute string otherwise.
/// Returns nil when no URL is set.
- (NSString *)URLString
{
    if ([_URL isFileURL])
        return [_URL path];

    return [_URL absoluteString];
}
/// Returns YES when the current URL is a file URL, NO otherwise (including when no URL is set).
- (BOOL)isFilePath
{
    BOOL refersToFile = [_URL isFileURL];
    return refersToFile;
}
/// Returns the web archive for the current URL, loading the page first if the cached
/// information belongs to a different URL (or does not exist yet).
/// @return The cached WebArchive, or nil if none was produced.
- (WebArchive *)webArchive
{
    NSURL *archivedURL = _archiveInformation[@"URL"];

    // The cache is keyed on the URL it was built from; a mismatch means it is stale.
    if (![_URL isEqual:archivedURL])
    {
        [self getWebPage];
    }

    return _archiveInformation[@"WebArchive"];
}
/// Returns the text of the page at the current URL, loading the page first if the cached
/// information belongs to a different URL (or does not exist yet).
/// @return The cached text, or nil if none was stored.
- (NSString *)string
{
    NSURL *archivedURL = _archiveInformation[@"URL"];

    // The cache is keyed on the URL it was built from; a mismatch means it is stale.
    if (![_URL isEqual:archivedURL])
    {
        [self getWebPage];
    }

    return _archiveInformation[@"String"];
}
/// Returns the title of the page at the current URL, loading the page first if the cached
/// information belongs to a different URL (or does not exist yet).
/// @return The cached title, or nil if none was stored.
- (NSString *)title
{
    NSURL *archivedURL = _archiveInformation[@"URL"];

    // The cache is keyed on the URL it was built from; a mismatch means it is stale.
    if (![_URL isEqual:archivedURL])
    {
        [self getWebPage];
    }

    return _archiveInformation[@"Title"];
}
/// Returns the error recorded while archiving the current URL.
/// Unlike the other accessors this never triggers a load: if the cached information
/// belongs to a different URL (or does not exist), there is no error to report.
/// @return The recorded NSError, or nil.
- (NSError *)error
{
    BOOL cacheMatchesURL = [_URL isEqual:_archiveInformation[@"URL"]];

    return cacheMatchesURL ? _archiveInformation[@"Error"] : nil;
}
/// Synchronously loads the page at _URL into a temporary WebView and rebuilds
/// _archiveInformation from the result.
///
/// _archiveInformation is replaced with a fresh dictionary that may contain:
///   @"URL"        - the URL that was loaded (absent when _URL is nil)
///   @"Title"      - the page title; defaults to the localized string "Web Page"
///   @"String"     - the document text (only when the load did not fail)
///   @"WebArchive" - the archive built from the main frame's DOM document
///   @"Error"      - an NSError describing why loading or archiving failed
///
/// The method blocks by running the current run loop in NSDefaultRunLoopMode until a
/// frame-load delegate callback sets _finishedLoading, or -runMode:beforeDate: returns NO.
///
/// For file URLs whose first load fails with code 102, a second attempt reads the file
/// with NSData and feeds it to the frame as UTF-8 HTML.
- (void)getWebPage
{
    _archiveInformation = [[NSMutableDictionary alloc] init];

    if (_URL == nil)
    {
        //NSBeep();
        NSLog (@"*** KBWebArchiver error: No URL passed in. ***");
        return;
    }

    // Add the URL.
    _archiveInformation[@"URL"] = _URL;

    // We also set a default title for the web page - if all goes well, this will be changed to something more
    // meaningful in -webView:didReceiveTitle:forFrame:.
    _archiveInformation[@"Title"] = NSLocalizedString(@"Web Page", nil);

    // Check the URL is valid if it is to be downloaded from the 'net.
    if ([_URL isFileURL] == NO && [_URL httpIsValid] == NO)
    {
        NSMutableDictionary *userInfo = [NSMutableDictionary dictionary];
        userInfo[NSLocalizedDescriptionKey] = NSLocalizedString(@"Invalid URL", @"");
        userInfo[NSLocalizedRecoverySuggestionErrorKey] = NSLocalizedString(@"The URL was invalid and so could not be converted to a web archive.",nil);
        _archiveInformation[@"Error"] = [NSError errorWithDomain:KBWebArchiverErrorDomain
                                                            code:KBWebArchiverErrorCodeInvalidURL
                                                        userInfo:userInfo];
        return;
    }

    // We have to create a web view, load the web page into this web view, and then grab the web archive and information from there.
    WebView *webView = [[WebView alloc] initWithFrame:NSMakeRect(0, 0, 1024, 768)];
    [webView setFrameLoadDelegate:self];
    [webView setResourceLoadDelegate:self];
    [webView setPolicyDelegate:self];

    NSError *localLoadingError = nil;
    BOOL tryLocalLoad = NO;

    // At most two passes: the regular request, then (for file URLs only) the raw-data fallback.
    while (1) {
        _finishedLoading = NO;
        _loadFailed = NO;

        if (!tryLocalLoad)
        {
            // Set up the load request and try to load the page.
            NSURLRequestCachePolicy cachePolicy;
#if (MAC_OS_X_VERSION_MIN_REQUIRED < 1050)
            cachePolicy = NSURLRequestReloadIgnoringCacheData;
#else
            cachePolicy = NSURLRequestReloadIgnoringLocalCacheData;
#endif
            NSURLRequest *theRequest = [NSURLRequest requestWithURL:_URL
                                                        cachePolicy:cachePolicy
                                                    timeoutInterval:30];
            [[webView mainFrame] loadRequest:theRequest];
        }
        else
        {
            // Falling back to loading data from local file
            NSData *data = [NSData dataWithContentsOfURL:_URL
                                                 options:0
                                                   error:&localLoadingError];
            if (data != nil)
            {
                [[webView mainFrame] loadData:data
                                     MIMEType:@"text/html" // CHANGEME: Assuming html
                             textEncodingName:@"UTF-8" // CHANGEME: Assuming UTF8
                                      baseURL:_URL];
            }
            else
            {
                // The fallback could not read the file either. Mark the load as failed so
                // that we do not go on to harvest text and an archive from a web view
                // whose load never succeeded.
                _loadFailed = YES;
                if (localLoadingError != nil)
                {
                    _archiveInformation[@"Error"] = localLoadingError;
                }
                break;
            }
        }

        // Wait until the site has finished loading.
        NSRunLoop *currentRunLoop = [NSRunLoop currentRunLoop];
        NSTimeInterval resolution = _localResourceLoadingOnly ? 0.1 : 0.01;
        BOOL isRunning = YES;
        while (isRunning && _finishedLoading == NO) {
            NSDate *next = [NSDate dateWithTimeIntervalSinceNow:resolution];
            isRunning = [currentRunLoop runMode:NSDefaultRunLoopMode beforeDate:next];
        }

        if (_customJS != nil) {
            [webView stringByEvaluatingJavaScriptFromString: _customJS];
        }

        [[webView mainFrame] stopLoading]; // Ensure the frame stops loading, otherwise will crash when released!

        if (!tryLocalLoad
            && _loadFailed
            && [_URL isFileURL]
            && ((localLoadingError = _archiveInformation[@"Error"]) != nil)
            && ([localLoadingError code] == 102)) // Frame load interrupted
        {
            // This can occur if the local file we are trying to load is missing its extension (usually “.html”)
            tryLocalLoad = YES;
            [_archiveInformation removeObjectForKey:@"Error"];
            continue;
        }

        break;
    }

    // Detach the delegates so the web view cannot call back into us after this point.
    [webView setFrameLoadDelegate:nil];
    [webView setResourceLoadDelegate:nil];
    [webView setPolicyDelegate:nil];

    // If the load failed, don't set any more data - just return.
    if (_loadFailed)
    {
        // Keep any more specific error recorded earlier; otherwise supply a generic one.
        if (_archiveInformation[@"Error"] == nil)
        {
            NSMutableDictionary *userInfo = [NSMutableDictionary dictionary];
            userInfo[NSLocalizedDescriptionKey] = NSLocalizedString(@"Web Page Failed to Load", @"");
            userInfo[NSLocalizedRecoverySuggestionErrorKey] = NSLocalizedString(@"The web page at the given URL failed to load and so could not be converted to a WebArchive.",nil);
            _archiveInformation[@"Error"] = [NSError errorWithDomain:KBWebArchiverErrorDomain
                                                                code:KBWebArchiverErrorCodeLoadFailed
                                                            userInfo:userInfo];
        }
        return;
    }

    // Get the text if the web view has any.
    NSString *string = @"";
    if ([[[[webView mainFrame] frameView] documentView] conformsToProtocol:@protocol(WebDocumentText)])
        string = [(id <WebDocumentText>)[[[webView mainFrame] frameView] documentView] string];
    _archiveInformation[@"String"] = string;

    // the -dataSource method was causing some crashes and also some web pages only half-loaded;
    // using the -DOMDocument method seems to work much better.
    //WebArchive *webArchive = [[[webView mainFrame] dataSource] webArchive];
    WebArchive *webArchive = [[[webView mainFrame] DOMDocument] webArchive];

    if (webArchive)
    {
        _archiveInformation[@"WebArchive"] = webArchive;
    }
    else if (_archiveInformation[@"Error"] == nil)
    {
        NSMutableDictionary *userInfo = [NSMutableDictionary dictionary];
        userInfo[NSLocalizedDescriptionKey] = NSLocalizedString(@"Web Archive Creation Failed", @"");
        userInfo[NSLocalizedRecoverySuggestionErrorKey] = NSLocalizedString(@"A web archive could not be created from the page at the given URL.",nil);
        _archiveInformation[@"Error"] = [NSError errorWithDomain:KBWebArchiverErrorDomain
                                                            code:KBWebArchiverErrorCodeArchiveCreationFailed
                                                        userInfo:userInfo];
    }
}
// Frame-load delegate callback: the main frame finished loading, so the wait in -getWebPage may end.
// Known issue noted by the original author: this can cause some crashes - e.g. importing Yahoo...
- (void)webView:(WebView *)sender didFinishLoadForFrame:(WebFrame *)frame
{
    // Sub-frames finishing does not mean the page is done.
    if (frame != [sender mainFrame])
        return;

    _finishedLoading = YES;
}
// Frame-load delegate callback: the main frame failed before any content was committed.
// Marks the load as failed and finished, and records the error if one was supplied.
- (void)webView:(WebView *)sender didFailProvisionalLoadWithError:(NSError *)error forFrame:(WebFrame *)frame
{
    // Only failures of the main frame count as a failed page load.
    if (frame != [sender mainFrame])
        return;

    _loadFailed = YES;
    _finishedLoading = YES;

    if (error != nil)
    {
        _archiveInformation[@"Error"] = error;
    }
}
// Frame-load delegate callback: the main frame failed after content was committed.
- (void)webView:(WebView *)sender didFailLoadWithError:(NSError *)error forFrame:(WebFrame *)frame
{
    // Only failures of the main frame count as a failed page load.
    if (frame != [sender mainFrame])
        return;

    // UPDATE: Some pages automatically report being cancelled and fail even though they load,
    // so a cancellation does not end the wait - but the error is still stored below.
    BOOL wasCancelled = ([error code] == NSURLErrorCancelled);
    if (!wasCancelled)
    {
        _loadFailed = YES;
        _finishedLoading = YES;
    }

    if (error != nil)
    {
        _archiveInformation[@"Error"] = error;
    }
}
// Frame-load delegate callback: stores the main frame's title, replacing the default one.
- (void)webView:(WebView *)sender didReceiveTitle:(NSString *)title forFrame:(WebFrame *)frame
{
    // Ignore titles of sub-frames, and never store nil in the dictionary.
    if (frame != [sender mainFrame] || title == nil)
        return;

    _archiveInformation[@"Title"] = title;
}
// Policy delegate callback. This method handles loading web archives - without this,
// a lot of web archives will not load...
// Content whose MIME type WebView can show is used; everything else is ignored.
- (void)webView:(WebView *)sender decidePolicyForMIMEType:(NSString *)type request:(NSURLRequest *)request frame:(WebFrame *)frame decisionListener:(id<WebPolicyDecisionListener>)listener
{
    if ([WebView canShowMIMEType:type])
        [listener use];
    else
        [listener ignore];
}
// Resource-load delegate callback: filters outgoing resource requests.
// With local-only loading enabled, only requests whose URL scheme is exactly "file"
// are let through; returning nil drops the request. Otherwise every request passes unchanged.
- (NSURLRequest *)webView:(WebView *)sender resource:(id)identifier willSendRequest:(NSURLRequest *)request redirectResponse:(NSURLResponse *)redirectResponse fromDataSource:(WebDataSource *)dataSource
{
    if (!_localResourceLoadingOnly)
        return request;

    BOOL isFileRequest = [[[request URL] scheme] isEqualToString:@"file"];
    return isFileRequest ? request : nil;
}
@end