# bin/import

#!/usr/bin/env ruby
require 'rexml/document'
require 'date'
require 'fileutils'
require 'net/http'
require 'uri'
require 'cgi'

# Maps podcast episode numbers to the `length` attribute of their MP3
# enclosure, as read from an RSS feed file.
class Encloser
  # Lengths hard-coded for episodes 1-9. They seed the table on every
  # #retrieve_enclosures run and are overridden by any matching enclosure
  # found in the feed.
  # NOTE(review): presumably the feed lacks (or misreports) these episodes --
  # confirm before removing.
  DEFAULT_SIZES = {
    1 => 71902534,
    2 => 15911287,
    3 => 28003038,
    4 => 19005003,
    5 => 34507554,
    6 => 19724935,
    7 => 31708241,
    8 => 30610041,
    9 => 33890098
  }.freeze

  # Captures the episode number from an enclosure URL such as
  # ".../Something-Episode-12.mp3". At least one digit is required and the
  # dot before the extension is literal.
  EPISODE_URL = /Episode-([0-9]+)\.mp3/

  # import_file - path of the RSS feed (XML) whose enclosures are read.
  def initialize(import_file)
    @import_file = import_file
    # Seeded here so #enclosure_size is usable before #retrieve_enclosures.
    @enclosures_size = DEFAULT_SIZES.dup
  end

  # Resets the table to the hard-coded defaults, then records the length of
  # every `channel/item/enclosure` element whose URL names an episode.
  #
  # Raises whatever File.read / REXML raise when the feed is missing or is
  # not well-formed XML.
  def retrieve_enclosures
    @enclosures_size = DEFAULT_SIZES.dup
    feed = REXML::Document.new(File.read(@import_file))
    feed.root.get_elements('channel/item/enclosure').each do |enclosure|
      retrieve_enclosure(enclosure)
    end
  end

  # Returns the recorded length for episode number +index+, or nil when the
  # episode is unknown. Hard-coded defaults are Integers; values taken from
  # the feed are kept as the attribute's String.
  def enclosure_size(index)
    @enclosures_size[index]
  end

  private

  # Records one enclosure. Enclosures without a `length`, or whose `url` does
  # not match EPISODE_URL, are ignored.
  def retrieve_enclosure(enclosure)
    length = enclosure.attributes['length']
    episode = enclosure.attributes['url'].to_s[EPISODE_URL, 1]
    return if length.nil? || episode.nil?

    puts "#{length} for #{episode}"
    # Explicit base 10: Integer("08") would otherwise treat the leading zero
    # as an octal prefix and raise ArgumentError.
    @enclosures_size[Integer(episode, 10)] = length
  end
end

# Converts the published, non-page items of an export file that uses `wp:`
# elements (presumably a WordPress WXR export) into front-matter `.html.erb`
# posts, downloading site-hosted images and PDFs along the way.
class Importer
  # Base URL prepended to site-relative ("/...") image and link targets.
  SITE_ROOT = 'http://lescastcodeurs.com'.freeze
  # Where downloaded images and PDFs are written. This is relative to the
  # current working directory, not to the +output_dir+ given to #initialize.
  ASSETS_DIR = 'posts/assets'.freeze
  # Author written into every post's front matter.
  AUTHOR = 'Emmanuel Bernard'.freeze

  # import_file - path of the export XML holding the posts.
  # output_dir  - directory the generated post files are written to.
  # rss_feed    - path of the RSS feed handed to Encloser for MP3 lengths.
  #
  # The feed is read immediately, so a missing or malformed feed raises here.
  def initialize(import_file, output_dir, rss_feed)
    @import_file = import_file
    @output_dir  = output_dir
    @encloser = Encloser.new(rss_feed)
    @encloser.retrieve_enclosures
  end

  # Imports every `channel/item` element of the export file.
  def import_file
    doc = REXML::Document.new(File.read(@import_file))
    doc.root.get_elements('channel/item').each { |item| import_item(item) }
  end

  private

  # Writes one post file for +item_xml+. Items whose post type is "page" or
  # whose status is not "publish" are skipped. An existing post file is
  # always overwritten.
  def import_item(item_xml)
    return if item_xml.get_text('wp:post_type').to_s == 'page'
    return unless item_xml.get_text('wp:status').to_s == 'publish'

    title = item_xml.get_text('title').to_s
    link  = item_xml.get_text('link').to_s

    # "pisode" without its first letter, so differently cased or accented
    # initials still match.
    episode = title[/pisode[s]? ([0-9]+)/, 1]
    # Explicit base 10: Integer("08") would otherwise be parsed as (invalid)
    # octal and raise ArgumentError.
    enclosure_size = @encloser.enclosure_size(Integer(episode, 10)) if episode

    # Last path segment of a link ending in "/"; nil when the link has
    # another shape, which yields a file name with an empty slug.
    slug = link[%r{/([^/]+)/$}, 1]
    published = DateTime.parse(item_xml.get_text('pubDate').to_s)

    content = wrap_paragraphs(item_xml.get_text('content:encoded').to_s)
    content = import_images(content)
    content = import_assets(content)

    # The slug is appended after formatting the date: passing it through
    # strftime would expand any "%xx" in a URL-encoded slug as a directive.
    file_name = "#{published.strftime('%Y-%m-%d')}-#{slug}.html.erb"
    write_post(File.join(@output_dir, file_name), title, episode, enclosure_size, content)
  end

  # Wraps bare text lines in <p>...</p>. Lines inside a <pre> block (including
  # the lines that open and close it) and lines starting with a tag are kept
  # verbatim; blank lines are dropped.
  def wrap_paragraphs(content)
    in_pre = false
    lines = []
    content.each_line do |line|
      in_pre = true if line =~ /<pre[\s>]/
      verbatim = in_pre || line =~ /^\s*</
      # Cleared only after deciding, so a closing line like "code</pre>" is
      # still treated as part of the block instead of being wrapped in <p>.
      in_pre = false if line.include?('</pre>')

      if verbatim
        lines << line
      elsif !line.strip.empty?
        lines << "<p>#{line}</p>\n"
      end
    end
    lines.join
  end

  # Writes the front matter followed by +content+ to +output_path+, creating
  # the parent directory when needed.
  def write_post(output_path, title, episode, enclosure_size, content)
    # YAML single-quoted scalars escape an apostrophe by doubling it.
    quoted_title = title.gsub("'", "''")

    puts "writing post: #{output_path}"
    FileUtils.mkdir_p(File.dirname(output_path))
    File.open(output_path, 'w') do |f|
      f.puts '---'
      f.puts "title: '#{quoted_title}'"
      f.puts "author: '#{AUTHOR}'"
      f.puts 'layout: blog-post'
      f.puts "episode: #{episode}" if episode
      f.puts "mp3_length: #{enclosure_size}" unless enclosure_size.nil?
      f.puts '---'
      f.puts content
    end
  end

  # Downloads every site-relative <img> into ASSETS_DIR (re-downloading on
  # each run) and points its `src` at the local copy. Other images are left
  # untouched. Returns the rewritten markup.
  #
  # Raises REXML::ParseException when +content+ is not well-formed XML.
  def import_images(content)
    rewrite_fragment(content) do |doc|
      REXML::XPath.each(doc, '//img') do |img|
        src = img.attributes['src']
        next unless site_local?(src)

        output_path = "#{ASSETS_DIR}/#{File.basename(src)}"
        puts "writing image: #{output_path}"
        download("#{SITE_ROOT}#{src}", output_path)
        img.attributes['src'] = "/#{output_path}"
      end
    end
  end

  # Downloads every site-relative link to a ".pdf" into ASSETS_DIR, unless a
  # file of that name is already there, and points its `href` at the local
  # copy. Returns the rewritten markup.
  #
  # Raises REXML::ParseException when +content+ is not well-formed XML.
  def import_assets(content)
    rewrite_fragment(content) do |doc|
      REXML::XPath.each(doc, '//a') do |a|
        href = a.attributes['href']
        next unless site_local?(href) && href =~ /\.(pdf)$/

        output_path = "#{ASSETS_DIR}/#{File.basename(href)}"
        unless File.exist?(output_path)
          url = "#{SITE_ROOT}#{href}"
          puts "fetching: #{url}"
          download(url, output_path) do
            # One dot per received chunk as a progress indicator.
            $stderr.putc '.'
            $stderr.flush
          end
          puts "writing asset: #{output_path}"
        end

        a.attributes['href'] = "/#{output_path}"
      end
    end
  end

  # Parses +content+ inside a synthetic <entry> root, yields the document so
  # the caller can edit it, and returns the serialized children of the root.
  def rewrite_fragment(content)
    doc = REXML::Document.new("<entry>#{content}</entry>")
    yield doc
    doc.root.children.map(&:to_s).join
  end

  # True for targets starting with a single "/". Protocol-relative
  # "//host/..." targets name another host and are not site-local.
  def site_local?(target)
    path = target.to_s
    path.start_with?('/') && !path.start_with?('//')
  end

  # Streams +url+ into +output_path+, yielding each received chunk to an
  # optional block. Data goes to a ".part" file that is moved into place only
  # once the transfer finished, so an interrupted download never leaves a
  # truncated file that later runs would mistake for a complete one.
  #
  # NOTE(review): the HTTP status is not checked, so an error page body is
  # saved like any other response.
  def download(url, output_path)
    uri = URI.parse(url)
    partial = "#{output_path}.part"
    FileUtils.mkdir_p(File.dirname(output_path))
    begin
      File.open(partial, 'wb') do |file|
        Net::HTTP.start(uri.host, uri.port) do |http|
          http.get(uri.path) do |chunk|
            file.write(chunk)
            yield chunk if block_given?
          end
        end
      end
      FileUtils.mv(partial, output_path)
    ensure
      # No-op after a successful move; removes the leftover otherwise.
      FileUtils.rm_f(partial)
    end
  end
end

# Script entry point; runs only when this file is executed directly.
# Arguments, in order: the export XML to import, the directory posts are
# written to, and the RSS feed read for MP3 enclosure lengths.
if $0 == __FILE__
  # Without this check a missing argument only surfaces later as a TypeError
  # from File.read(nil).
  if ARGV.size < 3
    abort "usage: #{File.basename($0)} <export.xml> <output-dir> <rss-feed.xml>"
  end

  importer = Importer.new(ARGV[0], ARGV[1], ARGV[2])
  importer.import_file
end