Skip to content

Commit

Permalink
Extract toc types
Browse files Browse the repository at this point in the history
  • Loading branch information
ferblape committed Nov 21, 2024
1 parent 3c9a0cd commit 77dd836
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 16 deletions.
1 change: 1 addition & 0 deletions lib/section_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ class Error < StandardError; end
require_relative "section_extractor/toc_item"
require_relative "section_extractor/toc_parser"
require_relative "section_extractor/toc"
require_relative "section_extractor/toc_types"
require_relative "section_extractor/version"
18 changes: 2 additions & 16 deletions lib/section_extractor/toc_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,7 @@

module SectionExtractor
class TocParser
ROMAN_SERIES = %w[I II III IV V VI VII VIII IX X XI XII XIII XIV XV].freeze
ALPHA_SERIES = ("a".."z").to_a
MAX_TOC_ITEM_SIZE = 60
RE_NUMERIC = /\n(\d+(?:\.\d+)*\.?-?\s+[^\n]+)\n/m
RE_NUMERIC_WITH_CLAUSE = /\n((?:Cláusula\s+)(\d+(?:\.\d+)*\.?-?\s+[^\n]+))\n/m
RE_ROMAN = /\n((?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?\.?-?\s+[^\n]+)\n/mi
RE_ROMAN_WITH_TITLE = /\n((?:ANEXO|CAPITULO|CAPÍTULO|TÍTULO|TITULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi
RE_ALPHA = /\n([a-zA-Z][).-]+\s+[^\n]+)\n/m
REGEXES_WITH_TYPES = {
numeric: RE_NUMERIC,
numeric_with_clause: RE_NUMERIC_WITH_CLAUSE,
roman: RE_ROMAN,
roman_with_title: RE_ROMAN_WITH_TITLE,
alpha: RE_ALPHA
}

attr_reader :content, :tocs

Expand All @@ -26,8 +12,8 @@ def initialize(content)
end

def call
REGEXES_WITH_TYPES.map do |type, re|
content.scan(re).each do |match|
TocTypes.all.map do |type, options|
content.scan(options[:regexp]).each do |match|
toc_item_title = match.first.strip.gsub(/\n/, "").gsub(/\s+/, " ")
toc_item_title = toc_item_title.split(":").first.strip if toc_item_title.include?(":")
# Skip the TOC item if it has more than 5 dots
Expand Down
36 changes: 36 additions & 0 deletions lib/section_extractor/toc_types.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# frozen_string_literal: true

module SectionExtractor
  # Catalogue of the table-of-contents numbering styles that can be detected.
  #
  # Every regexp below matches one complete line of text: it requires a
  # newline immediately before and after the line, and capture group 1 holds
  # the line itself (marker plus title).
  class TocTypes
    # Source fragment matching a Roman numeral from I to XXXIX.
    #
    # Kept as a String (not a Regexp) so that, when interpolated, it inherits
    # the flags of the enclosing regexp (notably /i). The leading lookahead
    # guarantees at least one numeral character is present, because every
    # other part of the pattern is optional.
    #
    # The previous hand-written alternation could not match a lone "V" or
    # numerals such as "XIV" / "XIX", so those headings were silently skipped.
    ROMAN_NUMERAL = "(?=[IVX])X{0,3}(?:IX|IV|V?I{0,3})".freeze

    # Lines such as "1 Title", "1.2. Title" or "3.- Title".
    RE_NUMERIC = /\n(\d+(?:\.\d+)*\.?-?\s+[^\n]+)\n/m

    # Same as RE_NUMERIC but prefixed with the word "Cláusula".
    # Group 1 is the whole line; group 2 is the part after "Cláusula".
    RE_NUMERIC_WITH_CLAUSE = /\n((?:Cláusula\s+)(\d+(?:\.\d+)*\.?-?\s+[^\n]+))\n/m

    # Lines starting with a Roman numeral, e.g. "IV. Title" (case-insensitive).
    RE_ROMAN = /\n((?:#{ROMAN_NUMERAL})\s?\.?-?\s+[^\n]+)\n/mi

    # Lines starting with a heading word followed by a Roman numeral,
    # e.g. "CAPÍTULO V. Title" or "ANEXO II Title" (case-insensitive).
    RE_ROMAN_WITH_TITLE =
      /\n((?:ANEXO|CAPITULO|CAPÍTULO|TÍTULO|TITULO)\s+(?:#{ROMAN_NUMERAL})[.-]*\s+[^\n]+)\n/mi

    # Lines starting with a single ASCII letter followed by one or more of
    # ")", "." or "-", e.g. "a) Title" or "B.- Title".
    RE_ALPHA = /\n([a-zA-Z][).-]+\s+[^\n]+)\n/m

    # Returns the supported TOC types.
    #
    # A new Hash is built on every call, so callers may modify the result
    # without affecting later calls.
    #
    # @return [Hash{Symbol => Hash}] type name => options, where options has:
    #   - :regexp      [Regexp] pattern used to find items of that type
    #   - :first_value [Integer, String] presumably the marker that opens a
    #     series of that type (1, "I" or "a") -- confirm against the consumer
    def self.all
      {
        numeric: {
          regexp: RE_NUMERIC,
          first_value: 1
        },
        numeric_with_clause: {
          regexp: RE_NUMERIC_WITH_CLAUSE,
          first_value: 1
        },
        roman: {
          regexp: RE_ROMAN,
          first_value: "I"
        },
        roman_with_title: {
          regexp: RE_ROMAN_WITH_TITLE,
          first_value: "I"
        },
        alpha: {
          regexp: RE_ALPHA,
          first_value: "a"
        }
      }
    end
  end
end

0 comments on commit 77dd836

Please sign in to comment.