-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathebook_downloader.rb
executable file
·157 lines (136 loc) · 3.92 KB
/
ebook_downloader.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env ruby
# TODO require bundler?
# TODO include cover image, pass to pandoc
require 'digest/sha1'
require 'fileutils'
require 'open-uri'
require 'io/wait'
require 'yaml'
require 'optparse'
require 'rubygems'
require 'open_uri_redirections'
require 'mechanize'
require 'addressable/uri'
require 'hashie'
# Opening of the generated HTML book. It is a format string with two %s
# placeholders, filled in order with the book title and the author.
# NOTE(review): <author> is not a standard HTML element; confirm that the
# conversion command actually picks the author up from it.
HTML_HEADER = <<EOF
<!DOCTYPE HTML>
<html>
<head>
<title>%s</title>
<author>%s</author>
</head>
<body>
EOF
# Closing tags appended after the last chapter.
HTML_FOOTER = "</body>\n</html>"
# Built-in defaults. Later configuration sources are merged over these.
#   sleep_time         - seconds to pause after each chapter download
#   output_directory   - where the HTML file and the images directory go
#   css_file           - stylesheet handed to the conversion command
#   conversion_command - format string; %1$s is the HTML file, %2$s the
#                        configured filename, %3$s the css file
options = Hashie::Mash.new(
  'sleep_time' => 0.1,
  'output_directory' => '.',
  'css_file' => "http://github.com/Huluk/blog-to-ebook/css.css",
  'conversion_command' => "pandoc -s -c %3$s -o %2$s.epub %1$s"
)

# A config.yml sitting next to this script overrides the defaults.
local_config = File.join(File.dirname(__FILE__), 'config.yml')
options.merge!(YAML.load(File.read(local_config))) if File.exist?(local_config)
# Command line interface: -c names an extra configuration file, -o sets the
# output directory, -h prints the usage text and exits.
optparse = OptionParser.new do |parser|
  parser.banner = "Usage: #{$0} [-c config] -o output_directory"

  parser.on('-c', '--config CONFIG', 'path to configuration file') { |config_path| options.config = config_path }
  parser.on('-o', '--output PATH', 'write path') { |output_path| options.output_directory = output_path }
  parser.on('-h', '--help', 'displays this message') do
    puts parser
    exit
  end
end

# Consumes the recognised switches from ARGV.
optparse.parse!
# Merge the configuration file named with -c/--config, if one was given.
# Any problem with the file ends the script with a message on stderr and a
# non-zero exit status (abort), so callers can tell the run failed.
if options.config?
  begin
    file_config = YAML.load(File.read(options.config))
  rescue Errno::ENOENT
    # File.read reports a missing file with Errno::ENOENT, not LoadError.
    abort "no such file: #{options.config}"
  rescue SystemCallError => e
    # Other OS-level failures, e.g. the path is unreadable or a directory.
    abort "cannot read config file: #{e.message}"
  rescue Psych::SyntaxError, TypeError => e
    abort "cannot read config file: #{e.message}"
  end

  if file_config.is_a?(Hash)
    options.merge! file_config
  elsif file_config
    # A scalar or list at the top level cannot be merged into the options.
    abort "cannot read config file: expected a mapping, got #{file_config.class}"
  end
  # An empty file parses to a false value and is treated as "no overrides".
end
# Configuration can also be piped in on stdin; it is merged last and so wins
# over every other source. Only attempted when input is already waiting.
if $stdin.ready?
  begin
    stdin_config = YAML.load($stdin.read)
  rescue Psych::SyntaxError, TypeError => e
    # abort prints to stderr and exits with a non-zero status.
    abort "invalid config syntax in stdin: #{e.message}"
  end

  if stdin_config.is_a?(Hash)
    options.merge! stdin_config
  elsif stdin_config
    # A scalar or list at the top level cannot be merged into the options.
    abort "invalid config syntax in stdin: expected a mapping, got #{stdin_config.class}"
  end
  # Empty input parses to a false value and is ignored rather than merged.
end
# Make sure the output directory exists. mkdir_p also creates any missing
# parent directories and does nothing when the directory is already there.
FileUtils.mkdir_p(options.output_directory)
# Collect chapter links from a table-of-contents page.
#
# toc_url - address of the page to fetch
# path    - selector handed to the page's #at to find the element that
#           holds the links
# regex   - only href values matching this pattern are kept
#
# Returns the matching href values in page order, without duplicates.
# Anchors that have no href are dropped.
def toc_url_to_chapter_links(toc_url, path, regex)
  toc_node = Mechanize.new.get(toc_url).at(path)
  hrefs = toc_node.search('a').map { |anchor| anchor['href'] }
  hrefs.select { |href| href =~ regex }.uniq
end
# Fetch +url+ and return the response body.
#
# The string is parsed into a URI first and fetched through that object
# instead of through bare Kernel#open. Kernel#open treats its argument as a
# local path (and, on Rubies that support it, a leading "|" as a command to
# run), which is unsafe for addresses taken from downloaded pages; it also
# stopped opening URLs in Ruby 3.0.
#
# If the string cannot be parsed as it stands, it is percent-encoded with
# Addressable and parsed once more.
#
# url - String address, normally http or https
#
# Returns the body as a String.
# Raises ArgumentError when +url+ is not something open-uri can open
# (a relative or local path, for example).
# Raises URI::InvalidURIError when the address is still invalid after
# encoding. Errors raised while fetching propagate unchanged.
def read_url(url)
  uri = begin
    URI.parse(url)
  rescue URI::InvalidURIError
    URI.parse(Addressable::URI.encode_component(url))
  end

  unless uri.respond_to?(:open)
    raise ArgumentError, "not a fetchable URL: #{url}"
  end

  # The block form lets open-uri close the handle once the body is read.
  uri.open(allow_redirections: :all, &:read)
end
# Strip unwanted nodes from +document+ and optionally post-process it.
#
# document     - object responding to #search (the chapter content node)
# garbage_path - selector for nodes to remove; nothing is removed when it is
#                nil or false
# block        - optional; receives the document after removal
#
# Returns the block's result when a block is given, otherwise the document.
def clean_document(document, garbage_path=nil, &block)
  if garbage_path
    unwanted = document.search(garbage_path)
    unwanted.remove
  end
  return document unless block

  block.call(document)
end
# Download every image referenced in +document+ into +outdir+ and point each
# img element at its local copy.
#
# document - object responding to #search('img'); each result must support
#            ['src'] and ['src']=
# outdir   - existing directory that receives the image files
#
# Any query string is stripped from the src before downloading. The local
# file name is the SHA1 of that address plus its original extension, so the
# same image maps to the same file on every run and is fetched only once.
# Images without a src are left alone. A failed download is reported on
# stderr and skipped; the src is still rewritten, as before.
#
# Returns the document.
def download_images_and_update_urls(document, outdir)
  document.search('img').each do |img|
    src = img['src']
    next if src.nil? || src.empty?

    url = src.gsub(/\?.*$/, '')
    outfile = File.join(outdir, "#{Digest::SHA1.hexdigest(url)}#{File.extname(url)}")
    img['src'] = outfile
    next if File.exist?(outfile)

    begin
      # Fetch before creating the file, so a failure never leaves an empty
      # file behind that later runs would mistake for a finished download.
      data = read_url(url)
      # Binary mode: image bytes must be written untranslated.
      File.open(outfile, 'wb') { |file| file.write(data) }
    rescue StandardError => e
      # StandardError only, so Ctrl-C and exit still stop the script.
      $stderr.puts "error while saving image: #{url}\n" +
                   "#{e.class}: #{e.message}"
    end
  end
  document
end
# Find the chapter links on the table-of-contents page, then download each
# chapter and keep its title node and its content node. titles and contents
# stay index-aligned: entry i of both belongs to the same chapter.
# NOTE(review): the table of contents is located with content_path, the same
# selector used for the chapter body; confirm that is intended.
regex = /#{options.toc_regex}/i
links = toc_url_to_chapter_links(options.toc_url, options.content_path, regex)
puts 'downloading:'
titles = []
contents = []
links.each do |url|
  puts url
  document = Nokogiri::HTML.parse(read_url(url))
  title = document.at(options.title_path)
  content = document.at(options.content_path)
  if title && content
    titles << title
    contents << content
  else
    # Skip the page instead of storing nil, which would crash the later
    # processing steps after everything has already been downloaded.
    missing = title ? 'content_path' : 'title_path'
    $stderr.puts "skipping #{url}: nothing matches #{missing}"
  end
  # Pause between requests.
  sleep options.sleep_time
end
# Images are stored in an "images" directory inside the output directory.
image_dir = File.join(options.output_directory, 'images')
unless File.directory?(image_dir)
  Dir.mkdir(image_dir)
end

# Clean each chapter, localise its images, then serialise it to HTML.
contents.map! do |chapter|
  cleaned = clean_document(chapter, options.garbage_path)
  download_images_and_update_urls(cleaned, image_dir).inner_html
end

# Render each title node's text through the configured heading format.
titles.map! { |heading| options.chapter_title_format % heading.text }

# Book = header with title and author, every heading followed by its
# chapter body, then the footer.
chapters = titles.zip(contents).map { |heading, body| [heading, body].join }
book = (HTML_HEADER % [options.title, options.author]) + chapters.join + HTML_FOOTER
# Write the assembled book as <filename>.html in the output directory, then
# run the conversion command on it.
html_file = File.join(options.output_directory, options.filename + '.html')
File.write(html_file, book)

# Format arguments: 1 = HTML file, 2 = configured filename, 3 = css file.
# With the default command the .epub path is built from the bare filename,
# so it is written relative to the working directory.
command = options.conversion_command %
          [html_file, options.filename, options.css_file]

# system returns false on a non-zero exit status and nil when the command
# could not be run at all; either way, report it and exit non-zero.
unless system(command)
  abort "conversion command failed: #{command}"
end