diff --git a/lib/section_extractor/document_parser.rb b/lib/section_extractor/document_parser.rb index 02a4c51..007577a 100644 --- a/lib/section_extractor/document_parser.rb +++ b/lib/section_extractor/document_parser.rb @@ -46,7 +46,8 @@ def extract_sections(content, tocs) # rubocop:disable Metrics/AbcSize end def extract_tocs(content) - SectionExtractor::TocParser.new(content).call + all_tocs = SectionExtractor::TocParser.new(content).call + all_tocs.values.map(&:values).flatten end def section_exists?(sections, section) diff --git a/lib/section_extractor/toc.rb b/lib/section_extractor/toc.rb index 7852ff6..747d39f 100644 --- a/lib/section_extractor/toc.rb +++ b/lib/section_extractor/toc.rb @@ -4,14 +4,14 @@ module SectionExtractor class Toc attr_accessor :toc_series_type, :toc_separator_chars, :toc_items - def initialize + def initialize(toc_series_type, toc_separator_chars) @toc_items = [] # The type of toc series can be: # - numeric: 1, 2, 3, ... # - roman: I, II, III, ... # - alpha: a), b), c), ... - @toc_series_type = nil - @toc_separator_chars = "" + @toc_series_type = toc_series_type + @toc_separator_chars = toc_separator_chars end def add_item(raw_title, position) diff --git a/lib/section_extractor/toc_parser.rb b/lib/section_extractor/toc_parser.rb index 307b786..b4fc8c5 100644 --- a/lib/section_extractor/toc_parser.rb +++ b/lib/section_extractor/toc_parser.rb @@ -5,21 +5,28 @@ class TocParser ROMAN_SERIES = %w[I II III IV V VI VII VIII IX X XI XII XIII XIV XV].freeze ALPHA_SERIES = ("a".."z").to_a MAX_TOC_ITEM_SIZE = 60 - RE_NUMERIC = /\n(\d+(?:\.\d+)*\.?\-?\s+[^\n]+)\n/m - RE_ROMAN = /\n((?:IX|IV|V?I{1,3}|VI{1,3})\.?\-?\s+[^\n]+)\n/m - RE_ALPHA = /\n([a-zA-Z][\).-]+\s+[^\n]+)\n/m - RE_SPECIAL = /\n((?:ANEXO|CAPITULO|CAPÍTULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi - - attr_reader :content + RE_NUMERIC = /\n(\d+(?:\.\d+)*\.?-?\s+[^\n]+)\n/m + RE_NUMERIC_WITH_CLAUSE = /\n((?:Cláusula\s+)(\d+(?:\.\d+)*\.?-?\s+[^\n]+))\n/m + RE_ROMAN = /\n((?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?\.?-?\s+[^\n]+)\n/mi + RE_ROMAN_WITH_TITLE = /\n((?:ANEXO|CAPITULO|CAPÍTULO|TÍTULO|TITULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi + RE_ALPHA = /\n([a-zA-Z][).-]+\s+[^\n]+)\n/m + REGEXES_WITH_TYPES = { + numeric: RE_NUMERIC, + numeric_with_clause: RE_NUMERIC_WITH_CLAUSE, + roman: RE_ROMAN, + roman_with_title: RE_ROMAN_WITH_TITLE, + alpha: RE_ALPHA + } + + attr_reader :content, :tocs def initialize(content) @content = content + @tocs = {} end def call - tocs = [] - [RE_NUMERIC, RE_ROMAN, RE_ALPHA, RE_SPECIAL].map do |re| - toc = Toc.new + REGEXES_WITH_TYPES.map do |type, re| content.scan(re).each do |match| toc_item_title = match.first.strip.gsub(/\n/, "").gsub(/\s+/, " ") toc_item_title = toc_item_title.split(":").first.strip if toc_item_title.include?(":") @@ -28,162 +35,59 @@ def call next if toc_item_title.include?(".....") || toc_item_title.include?("_____") toc_item_title = toc_item_title.slice(0, MAX_TOC_ITEM_SIZE) if toc_item_title.size > MAX_TOC_ITEM_SIZE + separator_char = detect_separator_chars(toc_item_title, type) + if separator_char.nil? + puts " - Skipping #{toc_item_title} because separator_char is nil (type: #{type})" + next + end + tocs[type] ||= {} + tocs[type][separator_char] ||= Toc.new(type, separator_char) puts " - Adding TOC item: #{toc_item_title}" - toc.add_item(toc_item_title, content.rindex(toc_item_title) || content.rindex(match.first)) + tocs[type][separator_char].add_item( + toc_item_title, content.rindex(toc_item_title) || content.rindex(match.first) + ) end - - tocs << toc if toc.toc_items.any? end - analyze_and_close(tocs) - end - - def analyze_and_close(tocs) - tocs.map do |toc| - toc.toc_series_type = detect_series_type(toc) - toc_separator_chars = detect_separator_chars(toc) - if toc_separator_chars.size > 1 - extract_tocs_with_different_separators(toc, toc_separator_chars) - else - toc.toc_separator_chars = toc_separator_chars.first - toc - end - end.flatten.map do |toc| - calculate_titles(toc) - # TODO, for the moment is not necessary - # cleanup_toc_items(toc) - - toc - end - end - - private - - def extract_tocs_with_different_separators(toc, toc_separator_chars) - tocs = [] - toc_separator_chars.sort_by(&:size).reverse.each do |separator_char| - new_toc = Toc.new - new_toc.toc_separator_chars = separator_char - next if new_toc.toc_separator_chars.empty? - - new_toc.toc_series_type = toc.toc_series_type - toc.toc_items.each do |item| - new_toc.add_item(item.title, item.position) if item.title.include?(separator_char) - end - - # Delete the items from the original TOC - new_toc.toc_items.each do |new_item| - toc.toc_items.delete_if { |item| item.title == new_item.title } - end - tocs << new_toc - end tocs end - def calculate_titles(toc) - toc.toc_items.each do |item| - item.title = item.raw_title.split(toc.toc_separator_chars).last&.strip - control = item.raw_title.split(toc.toc_separator_chars).first - toc.toc_items.delete(item) if control.size > 10 - toc.toc_items.delete(item) if toc.toc_separator_chars.nil? || toc.toc_separator_chars.size > 5 - case toc.toc_series_type - when :numeric - if control !~ /\A\d+/ - puts " - Skipping #{item.title}, should start with a number" - toc.toc_items.delete(item) - end - when :roman, :alpha - if control !~ /\A[A-Za-z]/ - puts " - Skipping #{item.title}, should start with a letter" - toc.toc_items.delete(item) - end - else - raise "series type not detected" - end - end - end - - def detect_series_type(toc) - random_items = toc.toc_items.sample(5) - types = random_items.map { |item| detect_series_type_from_item(item) } - # return the most common type - types.max_by { |type| types.count(type) } - end - - def cleanup_toc_items(toc) - raise "series type not detected" unless toc.toc_series_type - - # toc_items are sorted, - current_series_item = nil - next_series_item_should_be = expected_next_series_item(current_series_item) - new_toc_items = [] - - puts " - Cleaning up TOC items" - puts " - Toc separator chars: #{toc.toc_separator_chars}" - - toc.toc_items.each_with_index do |item, _i| - if item.title !~ /\A#{next_series_item_should_be}\s*#{Regexp.quote(toc_separator_chars)}/ - puts "- Skipping #{item.title}, should be #{next_series_item_should_be}#{toc_separator_chars}" - next - end - - new_toc_items << item - current_series_item = next_series_item_should_be - next_series_item_should_be = expected_next_series_item(current_series_item) - end - - toc.toc_items = new_toc_items - end + private - def detect_series_type_from_item(item) - case item.title - when /\A\d+/ - :numeric - when /\A\b(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|IVX|XV)+\b/ - :roman - when /\A[a-zA-Z]+/ - :alpha - else - raise "series type not detected from title #{item.title}" + def detect_separator_chars(title, toc_series_type) # rubocop:disable Metrics/MethodLength + case toc_series_type + when :numeric + detect_numeric_series_separator_chars(title) + when :numeric_with_clause + detect_numeric_with_clause_series_separator_chars(title) + when :roman + detect_roman_series_separator_chars(title) + when :roman_with_title + detect_roman_with_title_series_separator_chars(title) + when :alpha + detect_alpha_series_separator_chars(title) end end - def detect_separator_chars(toc) - separators_chars = case toc.toc_series_type - when :numeric - toc.toc_items.map { |item| detect_numeric_series_separator_chars(item) } - when :roman - toc.toc_items.map { |item| detect_roman_series_separator_chars(item) } - when :alpha - toc.toc_items.map { |item| detect_alpha_series_separator_chars(item) } - else - raise "series type not detected" - end - separators_chars.compact.uniq + def detect_numeric_series_separator_chars(title) + title.match(/(\d+(?:\.\d+)*(\.?-?)\s+[^\n]+)/) ? ::Regexp.last_match(2) : nil end - def detect_numeric_series_separator_chars(item) - item.title.split(" ")[0].match(/.*\d([^\d]*)\z/) ? ::Regexp.last_match(1) : nil + def detect_numeric_with_clause_series_separator_chars(title) + title.match(/(?:Cláusula\s+)(\d+(?:\.\d+)*\s*(\.?-?)\s+[^\n]+)/m) ? ::Regexp.last_match(2) : nil end - def detect_roman_series_separator_chars(item) - item.title.match(/\b(IX|IV|V|VI|I|II|III)\b([^\s]+)\s/) ? ::Regexp.last_match(2) : nil + def detect_roman_series_separator_chars(title) + title.match(/((?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?(\.?-?)\s+[^\n]+)/) ? ::Regexp.last_match(2) : nil end - def detect_alpha_series_separator_chars(item) - item.title.match(/([a-zA-Z])([^\s]+)\s/) ? ::Regexp.last_match(2) : nil + def detect_roman_with_title_series_separator_chars(title) + title.match(/((?:ANEXO|CAPITULO|CAPÍTULO|TÍTULO|TITULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?(\.?-?)\s+[^\n]+)/) ? ::Regexp.last_match(2) : nil # rubocop:disable Layout/LineLength end - def expected_next_series_item(current_item) - case @toc_series_type - when :numeric - (current_item || 0) + 1 - when :roman - ROMAN_SERIES[(ROMAN_SERIES.index(current_item) || -1) + 1] - when :alpha - ALPHA_SERIES[(ALPHA_SERIES.index(current_item) || -1) + 1] - end + def detect_alpha_series_separator_chars(title) + title.match(/([a-zA-Z]([).-]+)\s+[^\n]+)/) ? ::Regexp.last_match(2) : nil end end end diff --git a/spec/document_parser_spec.rb b/spec/document_parser_spec.rb index f7f6733..2749a81 100644 --- a/spec/document_parser_spec.rb +++ b/spec/document_parser_spec.rb @@ -134,7 +134,6 @@ def assert_sections_absent(unexpected_sections) let(:file_path) { "spec/files/66067442.txt" } it "has these sections" do - binding.pry assert_sections_present([ ["TÍTULO I. DISPOSICIONES GENERALES", ""], ["Cláusula 1 Régimen jurídico.", ""],