diff --git a/.aspelldict b/.aspelldict index 214511d..ceeea03 100644 --- a/.aspelldict +++ b/.aspelldict @@ -1,4 +1,4 @@ -personal_ws-1.1 en 164 +personal_ws-1.1 en 166 CDATA CommonMark CounterClockwiseContourIntegral @@ -160,6 +160,8 @@ utf wantarray wget whitespace +www +wwww xfffd xmp zA diff --git a/lib/Markdown/Perl/Inlines.pm b/lib/Markdown/Perl/Inlines.pm index c21988e..4189847 100644 --- a/lib/Markdown/Perl/Inlines.pm +++ b/lib/Markdown/Perl/Inlines.pm @@ -45,6 +45,10 @@ sub render { process_styles($that, $tree); + if ($that->get_use_extended_autolinks) { + $tree->map(sub { create_autolinks($that, $_) }); + } + # At this point we have added the emphasis, strong emphasis, etc. in the tree. $tree->apply( @@ -713,4 +717,54 @@ sub delim_characters { return join('', uniq @c); } +sub create_autolinks { + my ($that, $n) = @_; + if ($n->{type} ne 'text') { + return $n; + } + + my @nodes; + + # TODO: technically we should forbid the presence of _ in the last two parts + # of the domain, according to the gfm spec. + ## no critic (ProhibitComplexRegexes) + while ( + $n->{content} =~ m/ + (?<prefix> ^ | [ \t\n*_~\(] ) # The link must start after a whitespace or some specific delimiters. + (?<url> + (?: (?<scheme>https?:\/\/) | www\. ) # It must start with a scheme or the string www. + [-_a-zA-Z0-9]+ (?: \. [-_a-zA-Z0-9]+ )* # Then there must be something that looks like a domain + (?: \/ [^ \t\n<]*? )? # Some characters are forbidden in the link. + ) + [?!.,:*_~]* (?: [ \t\n<] | $) # We remove some punctuation from the end of the link. 
+ /x + ## use critic + ) { + my $url = $+{url}; + my $match_start = $LAST_MATCH_START[0] + length($LAST_PAREN_MATCH{prefix}); + my $match_end = $match_start + length($url); + my $has_scheme = exists $LAST_PAREN_MATCH{scheme}; + if ($url =~ m/\)+$/) { + my $nb_final_closing_parens = $LAST_MATCH_END[0] - $LAST_MATCH_START[0]; + my $open = 0; + () = $url =~ m/ \( (?{$open++}) | \) (?{$open--}) /gx; + my $remove = min($nb_final_closing_parens, -$open); + if ($remove > 0) { + $match_end -= $remove; + substr $url, -$remove, $remove, ''; + } + } + # TODO: handle an HTML entity at the end of the link. + if ($match_start > 0) { + push @nodes, new_text(substr $n->{content}, 0, $match_start); + } + my $scheme = $has_scheme ? '' : $that->get_default_extended_autolinks_scheme.'://'; + push @nodes, + new_link($url, type => 'autolink', target => $scheme.$url, debug => 'extended autolink'); + $n = new_text(substr $n->{content}, $match_end); + } + push @nodes, $n if length($n->{content}) > 0; + return @nodes; +} + 1; diff --git a/lib/Markdown/Perl/Options.pm b/lib/Markdown/Perl/Options.pm index e5275eb..b3fed27 100644 --- a/lib/Markdown/Perl/Options.pm +++ b/lib/Markdown/Perl/Options.pm @@ -156,6 +156,14 @@ sub _regex { }; } +sub _word_list { + return sub { + my @a = ref $_[0] eq 'ARRAY' ? @{$_[0]} : split(/,/, $_[0]); + # TODO: validate the values of a. + return \@a; + }; +} + =pod =head2 B I<(boolean, default: true)> @@ -477,19 +485,39 @@ they will be deactivated in the output. =cut -# TODO: this is just a "word list" for now, see if this can be shared with other -# options. -sub _tag_list { - return sub { - my @a = ref $_[0] eq 'ARRAY' ? @{$_[0]} : split(/,/, $_[0]); - # TODO: validate the values of a. 
- return \@a; - }; -} - _make_option( disallowed_html_tags => [], - _tag_list, + _word_list, github => [qw(title textarea style xmp iframe noembed noframes script plaintext)]); +=pod + +=head2 B<use_extended_autolinks> I<(boolean, default: true)> + +Allow some links to be recognised when they appear in plain text. These links +must start with C<http://>, C<https://>, or C<www.>. + +=cut + +_make_option( + use_extended_autolinks => 1, + _boolean, ( + markdown => 0, + cmark => 0 + )); + +=pod + +=head2 B<default_extended_autolinks_scheme> I<(enum, default: https)> + +Specify which scheme is added to the beginning of extended autolinks when none +was present initially. + +=cut + +_make_option( + default_extended_autolinks_scheme => 'https', + _enum(qw(http https)), + github => 'http'); + 1; diff --git a/t/901-github-test-suite.t b/t/901-github-test-suite.t index 2bb42bb..6b0a043 100644 --- a/t/901-github-test-suite.t +++ b/t/901-github-test-suite.t @@ -10,10 +10,19 @@ use Test2::V0; # TODO: remove these todos. my %opt = (todo => [198 .. 202, 204, 205, 279, 280, 398, 426, 434 .. 436, - 473 .. 475, 477, 621 .. 631, 652], - # The spec says that some HTML tags are forbidden in the output, but - # they still have examples with these tags. - bugs => [140 .. 142, 145, 147], + 473 .. 475, 477, 626, 628 .. 631, 652], + # These are bugs in the GitHub spec, not in our implementation. All + # of these have been tested to be buggy in the real cmark-gfm + # implementation. + bugs => [ + # The spec says that some HTML tags are forbidden in the output, but + # they still have examples with these tags. + 140 .. 142, 145, 147, + # Some things that are not cmark autolinks are matched by the + # extended autolinks syntax (but the cmark part of the spec is not + # updated). + 616, 619, + ], json_file => "${FindBin::Bin}/data/github.tests.json", test_url => 'https://github.github.com/gfm/#example-%d', spec_tool => "${FindBin::Bin}/../third_party/commonmark-spec/test/spec_tests.py",