Skip to content

Commit

Permalink
Improve regexes
Browse files Browse the repository at this point in the history
  • Loading branch information
ferblape committed Nov 21, 2024
1 parent 742be43 commit 28b9080
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 20 deletions.
54 changes: 39 additions & 15 deletions lib/section_extractor/document_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,20 @@ def extract_sections(content, tocs) # rubocop:disable Metrics/AbcSize
# toc_items_to_skip = []

0.upto(toc.toc_items.size - 1) do |index|
section = Section.new(content, toc.toc_items[index], toc.toc_items[index + 1])
section = Section.new(
document_content: content,
toc_item: toc.toc_items[index],
next_toc_item: toc.toc_items[index + 1],
toc_series_type: toc.toc_series_type,
toc_separator_chars: toc.toc_separator_chars
)
sections << section unless section_exists?(sections, section)

# TODO: re-activate when we use the content again
# Skip empty sections, because they are not real sections, but just sentences that start with
# toc item title format
# if section.content.empty?
# toc_items_to_skip << index
# else
# sections << section
# end
end

# TODO: re-activate when we use the content again
# puts "- Skipping #{toc_items_to_skip.join(", ")} empty sections" if toc_items_to_skip.any?
# toc_items_to_skip.each { |index| toc.toc_items.delete_at(index) }
end

sections.sort_by { |s| s.positions.first }
remove_suspicious_sections(
sections.sort_by { |s| s.positions.first }
)
end

def extract_tocs(content)
Expand All @@ -53,5 +48,34 @@ def extract_tocs(content)
# Reports whether an equivalent section has already been collected.
#
# Two sections are considered the same when their raw titles are equal and the first
# entry of their positions is equal. A nil `positions` on either side is tolerated
# through safe navigation and compares as nil.
#
# @param sections [Array] sections collected so far
# @param section the candidate section (must respond to `raw_title` and `positions`)
# @return [Boolean] true when a matching section is already present, false otherwise
def section_exists?(sections, section)
  title = section.raw_title
  first_position = section.positions&.first

  # `any?` makes this predicate return a real boolean instead of the matched object or nil.
  sections.any? { |s| s.raw_title == title && s.positions&.first == first_position }
end

# Prunes suspicious sections from an already sorted list of sections.
#
# Lists with fewer than two sections are returned untouched; anything longer is
# handed to `find_suspicious_section_indexes`, whose result is returned as is.
#
# @param sections [Array] the sections to filter
# @return [Array] the given list, or the result of `find_suspicious_section_indexes`
def remove_suspicious_sections(sections)
  if sections.size >= 2
    find_suspicious_section_indexes(sections)
  else
    sections
  end
end

# Deletes suspicious sections from `sections` in place and returns the same array.
#
# Only interior sections are candidates: the first and the last section are never
# removed. Each candidate is checked with `suspicious_section?` against the section
# that immediately precedes it.
#
# @param sections [Array] sections, expected to be sorted by position by the caller
# @return [Array] the same array instance, with suspicious sections removed
def find_suspicious_section_indexes(sections)
  # Start at 1 explicitly: `sections[0 - 1]` would wrap around to the last element
  # instead of yielding nil, so the first section must be excluded by range.
  interior_indexes = 1...(sections.size - 1)

  # Decide everything before deleting anything, so indexes stay valid during the scan.
  suspicious_indexes = interior_indexes.select do |index|
    suspicious_section?(sections[index - 1], sections[index])
  end

  if ENV["DEBUG"]
    suspicious_indexes.each do |index|
      section = sections[index]
      puts " - Removing section: #{section.raw_title} - #{section.positions.first}"
    end
  end

  # Delete from the end so earlier indexes are not shifted by previous deletions.
  suspicious_indexes.reverse_each { |index| sections.delete_at(index) }

  sections
end

# Tells whether `section` looks like a false positive relative to the section before it.
#
# A section is suspicious when all of the following hold:
# - its TOC series type or separator chars differ from those of `previous_section`;
# - its separator chars are the empty string;
# - its series type is `:numeric`;
# - its raw title contains no "." character.
#
# @param previous_section the section immediately before `section`
# @param section the section being evaluated
# @return [Boolean]
def suspicious_section?(previous_section, section)
  same_format = previous_section.toc_series_type == section.toc_series_type &&
                previous_section.toc_separator_chars == section.toc_separator_chars
  return false if same_format
  return false unless section.toc_separator_chars == ""
  return false unless section.toc_series_type == :numeric

  !section.raw_title.include?(".")
end
end
end
10 changes: 5 additions & 5 deletions lib/section_extractor/toc_types.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

module SectionExtractor
class TocTypes
RE_NUMERIC = /\n(\d+(?:\.\d+)*\.?-?\s+[^\n]+)\n/m
RE_NUMERIC_WITH_CLAUSE = /\n((?:Cláusula\s+)(\d+(?:\.\d+)*\.?-?\s+[^\n]+))\n/m
RE_ROMAN = /\n((?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?\.?-?\s+[^\n]+)\n/mi
RE_ROMAN_WITH_TITLE = /\n((?:ANEXO|CAPITULO|CAPÍTULO|TÍTULO|TITULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi
RE_ALPHA = /\n([a-zA-Z][).-]+\s+[^\n]+)\n/m
# Patterns for lines that look like TOC item titles. Each one anchors at a line start (`^`),
# skips leading newlines/whitespace, captures the title line in group 1 and requires a
# trailing newline.
# NOTE(review): none of these patterns contains an unescaped `.`, so the `m` flag (which in
# Ruby only changes what `.` matches) appears to have no effect — confirm before relying on it.

# Numeric prefix such as "1", "1.2" or "1.2.3", optionally followed by "." and/or "-", then
# whitespace and a title whose first character is an ASCII letter.
# NOTE(review): `[A-Za-z]` rejects titles starting with an accented letter (e.g. "Á") — confirm
# this is intended.
RE_NUMERIC = /^\n*\s*(\d+(?:\.\d+)*\.?-?\s+[A-Za-z][^\n]+)\n/m
# Same numeric prefix preceded by the literal word "Cláusula" (case-sensitive). Group 2 holds
# the numbered part without the "Cláusula" prefix. The title's first character is unrestricted.
RE_NUMERIC_WITH_CLAUSE = /^\n*\s*((?:Cláusula\s+)(\d+(?:\.\d+)*\.?-?\s+[^\n]+))\n/m
# Roman numeral prefix (case-insensitive), optionally followed by a space, "." and/or "-".
# NOTE(review): as written, the alternation does not accept a lone "V" nor numerals such as
# "XIV"/"XIX" — verify whether those should match.
RE_ROMAN = /^\n*\s*((?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?\.?-?\s+[^\n]+)\n/mi
# Heading keyword (ANEXO, CAPITULO/CAPÍTULO, TITULO/TÍTULO; case-insensitive) followed by a
# roman numeral built from the same alternation as RE_ROMAN, then any run of "." or "-".
RE_ROMAN_WITH_TITLE = /^\n*\s*((?:ANEXO|CAPITULO|CAPÍTULO|TÍTULO|TITULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi
# A single ASCII letter followed by one or more of ")", "." or "-", then whitespace and a title.
RE_ALPHA = /^\n*\s*([a-zA-Z][).-]+\s+[^\n]+)\n/m

def self.all
{
Expand Down

0 comments on commit 28b9080

Please sign in to comment.