From a24899f7d30a0adf7110145079ec79b6ac981440 Mon Sep 17 00:00:00 2001 From: Fernando Blat Date: Tue, 19 Nov 2024 10:01:31 +0100 Subject: [PATCH] Refactor regexes and add new test --- lib/section_extractor/toc_parser.rb | 25 ++++++++++++------------- spec/document_parser_spec.rb | 13 +++++++++++++ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/lib/section_extractor/toc_parser.rb b/lib/section_extractor/toc_parser.rb index 9348473..5c99294 100644 --- a/lib/section_extractor/toc_parser.rb +++ b/lib/section_extractor/toc_parser.rb @@ -14,24 +14,23 @@ def initialize(content) def call tocs = [] - # TODO: delete me - # re1 = /\n(\d{1,3}[.-][.-]?\s+[^\n]+)\n/mi - re2 = /\n((IX|IV|V|VI|I|II|III)([.-]+\s+[^\n]+))\n/m - re3 = /\n^([a-zA-Z][).-]+\s+[^\n]+)\n/m - re4 = /^(\d+[.\d+]*\.?\-?\s.*)/ - re5 = /\n(ANEXO\s(IX|IV|V|VI|VII|VIII|I|II|III)[.-]*\s+[^\n]+)\n/mi - re6 = /\n(CAPITULO\s(IX|IV|V|VI|VII|VIII|I|II|III|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)[.-]*\s+[^\n]+)\n/mi - re7 = /\n(CAPÍTULO\s(IX|IV|V|VI|VII|VIII|I|II|III|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)[.-]*\s+[^\n]+)\n/mi - - [re2, re3, re4, re5].map do |re| + re_numeric = /\n(\d+(?:\.\d+)*\.?\-?\s+[^\n]+)\n/m + re_roman = /\n((?:IX|IV|V?I{1,3}|VI{1,3})\.?\-?\s+[^\n]+)\n/m + re_alpha = /\n([a-zA-Z][\).-]+\s+[^\n]+)\n/m + re_special = /\n((?:ANEXO|CAPITULO|CAPÍTULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi + + [re_numeric, re_roman, re_alpha, re_special].map do |re| toc = Toc.new content.scan(re).each do |match| toc_item_title = match.first.strip.gsub(/\n/, "").gsub(/\s+/, " ") + toc_item_title = toc_item_title.split(":").first.strip if toc_item_title.include?(":") # Skip the TOC item if it has more than 5 dots + # (this happens in the docs with a TOC) next if toc_item_title.include?(".....") - next if toc_item_title.size > MAX_TOC_ITEM_SIZE + if toc_item_title.size > MAX_TOC_ITEM_SIZE + toc_item_title = toc_item_title.slice(0, MAX_TOC_ITEM_SIZE) + end - toc_item_title = toc_item_title.split(":").first.strip if toc_item_title.include?(":") puts " - Adding TOC item: #{toc_item_title}" toc.add_item(toc_item_title, content.rindex(match.first)) end @@ -89,7 +88,7 @@ def calculate_titles(toc) item.title = item.raw_title.split(toc.toc_separator_chars).last&.strip control = item.raw_title.split(toc.toc_separator_chars).first toc.toc_items.delete(item) if control.size > 10 - toc.toc_items.delete(item) if toc.toc_separator_chars.size > 5 + toc.toc_items.delete(item) if toc.toc_separator_chars.nil? || toc.toc_separator_chars.size > 5 case toc.toc_series_type when :numeric if control !~ /\A\d+/ diff --git a/spec/document_parser_spec.rb b/spec/document_parser_spec.rb index 7b9a0f5..1540004 100644 --- a/spec/document_parser_spec.rb +++ b/spec/document_parser_spec.rb @@ -116,4 +116,17 @@ def assert_sections_absent(unexpected_sections) ]) end end + + context "in 66067386 doc" do + let(:file_path) { "spec/files/66067386.txt" } + + it "has these sections" do + assert_sections_present([ + ["I.- CARACTERÍSTICAS DE LA PRESTACIÓN", ""], + ["1.1.- Definición", ""], + ["3.1.- El presupuesto base", ""], + ["5.2.- Modalidad de pago del precio", ""] + ]) + end + end end