diff --git a/lib/section_extractor/toc_parser.rb b/lib/section_extractor/toc_parser.rb index 5c99294..dfabbbd 100644 --- a/lib/section_extractor/toc_parser.rb +++ b/lib/section_extractor/toc_parser.rb @@ -5,6 +5,10 @@ class TocParser ROMAN_SERIES = %w[I II III IV V VI VII VIII IX X XI XII XIII XIV XV].freeze ALPHA_SERIES = ("a".."z").to_a MAX_TOC_ITEM_SIZE = 70 + RE_NUMERIC = /\n(\d+(?:\.\d+)*\.?\-?\s+[^\n]+)\n/m + RE_ROMAN = /\n((?:IX|IV|V?I{1,3}|VI{1,3})\.?\-?\s+[^\n]+)\n/m + RE_ALPHA = /\n([a-zA-Z][\).-]+\s+[^\n]+)\n/m + RE_SPECIAL = /\n((?:ANEXO|CAPITULO|CAPÍTULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi attr_reader :content @@ -14,24 +18,18 @@ def initialize(content) def call tocs = [] - re_numeric = /\n(\d+(?:\.\d+)*\.?\-?\s+[^\n]+)\n/m - re_roman = /\n((?:IX|IV|V?I{1,3}|VI{1,3})\.?\-?\s+[^\n]+)\n/m - re_alpha = /\n([a-zA-Z][\).-]+\s+[^\n]+)\n/m - re_special = /\n((?:ANEXO|CAPITULO|CAPÍTULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi - - [re_numeric, re_roman, re_alpha, re_special].map do |re| + [RE_NUMERIC, RE_ROMAN, RE_ALPHA, RE_SPECIAL].map do |re| toc = Toc.new content.scan(re).each do |match| toc_item_title = match.first.strip.gsub(/\n/, "").gsub(/\s+/, " ") toc_item_title = toc_item_title.split(":").first.strip if toc_item_title.include?(":") # Skip the TOC item if it has more than 5 dots # (this happens in the docs with a TOC) - next if toc_item_title.include?(".....") - if toc_item_title.size > MAX_TOC_ITEM_SIZE - toc_item_title = toc_item_title.slice(0, MAX_TOC_ITEM_SIZE) - end + next if toc_item_title.include?(".....") || toc_item_title.include?("_____") + + toc_item_title = toc_item_title.slice(0, MAX_TOC_ITEM_SIZE) if toc_item_title.size > MAX_TOC_ITEM_SIZE - puts " - Adding TOC item: #{toc_item_title}" + # puts " - Adding TOC item: #{toc_item_title}" toc.add_item(toc_item_title, content.rindex(match.first)) end