Skip to content

Commit

Permalink
Add URL checking feature
Browse files Browse the repository at this point in the history
  • Loading branch information
ejb committed Apr 15, 2015
1 parent 9cc8eaa commit bcc0682
Show file tree
Hide file tree
Showing 8 changed files with 84 additions and 18 deletions.
12 changes: 11 additions & 1 deletion Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ pattern | \/(^http:\\/\\/)\/ | Regex (regular expression) which validates conten
hint | URL of a promo image. | Human-readable description of the pattern.
group | meta | (optional) Used to sort tags into groups.
longname | Teaser image | Descriptive name for humans.
type | image | (optional) Set to 'image' if result should be a JPG/PNG/GIF.
type | image | (optional) Set to 'image' if result should be a JPG/PNG/GIF. Other options: 'url', 'strict-url' (see below for more details).
url | https://dev.twitter.com/cards/types | (optional) URL with further information about meta tag.

### Example regex rules
Expand Down Expand Up @@ -94,12 +94,22 @@ Then in `schema.json`, specify the function's name in the *pattern* field of you
}
```

## Content types

- 'image': Renders the content on the frontend in an `<img>` tag.
- 'url': Requests the URL and checks the response status to make sure the link is valid, then renders the content in an `<a>` tag.
- 'strict-url': Same as 'url', but also rejects redirect responses (e.g. 301s). Useful for canonical URLs.

## Running behind a proxy

Add a file called *proxy.php* to the config directory with a function called `file_get_contents_with_proxy`. This function should accept a URL as an argument, and return the HTML of the specified URL.

## Changelog

### 1.0.1

- New content types: 'url' and 'strict-url'

### 1.0.0

- Initial release
Expand Down
42 changes: 39 additions & 3 deletions api/scraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ function checkUrl( $url ) {
if (!$file) {
return false;
}

$html = new simple_html_dom();
$html->load($file);

Expand All @@ -50,6 +50,7 @@ function checkUrl( $url ) {
$attribute = 'innertext';
}
$regex = $item['pattern'];
$type = strtolower($item['type']);

if (substr( $selector, 0, 6 ) === "custom") {
$customFunctionName = str_replace( 'custom:', '', $selector );
Expand Down Expand Up @@ -97,7 +98,24 @@ function checkUrl( $url ) {
}
}


// check URLs are valid
if (($type === 'url') || ($type === 'strict-url')) {
$strict = false;
if ($type === 'strict-url') {
$strict = true;
}
$item_url = $schema[$i]['contents'];
if (function_exists('file_get_contents_with_proxy')) {
file_get_contents_with_proxy( $item_url );
} else {
file_get_contents( $item_url );
}
$headerStatus = checkHeaderStatus($http_response_header, $strict);
$schema[$i]['_requestHeader'] = $http_response_header;
if ( !is_null($http_response_header) && ($headerStatus === false) ) {
$schema[$i]['ok'] = false;
}
}

}
return $schema;
Expand All @@ -110,4 +128,22 @@ function overallCheck( $results ) {
}
}
return true;
}
}

/**
 * Decide whether a set of HTTP response headers describes a valid link.
 *
 * $headers is expected in the format of PHP's $http_response_header: a list
 * of header lines whose first element is the status line of the first
 * response. When redirects were followed, the status line of every later
 * response appears further down the same list.
 *
 * @param array|null $headers Response header lines; null or an empty array
 *                            when no response was received.
 * @param bool       $strict  true: the first response must be 2xx, so any
 *                            redirect is rejected. false: redirects are
 *                            allowed, but the final response must be 2xx/3xx.
 * @return bool true when the status is acceptable, false otherwise.
 */
function checkHeaderStatus( $headers, $strict = false ){
    // The caller may pass null when the request produced no headers;
    // that can never count as a successful check.
    if ( !is_array($headers) || !isset($headers[0]) || !is_string($headers[0]) ) {
        return false;
    }

    // Status of the first response: first three-digit number on the first line.
    if ( !preg_match('/\d\d\d/', $headers[0], $matches) ) {
        return false;
    }
    $firstCode = intval( $matches[0] );

    // Status of the last response in the chain. Without redirects this is
    // the same as the first one.
    $finalCode = $firstCode;
    foreach ( array_slice($headers, 1) as $line ) {
        if ( is_string($line) && preg_match('/^HTTP\/\S+\s+(\d\d\d)/i', $line, $statusMatch) ) {
            $finalCode = intval( $statusMatch[1] );
        }
    }

    if ( ($strict === true) && ($firstCode >= 200) && ($firstCode < 300) ) {
        return true;
    } else if ( ($strict === false) && ($finalCode >= 200) && ($finalCode < 400) ) {
        // Judge the final response so that a redirect leading to a
        // 404/500 page is not reported as a valid link.
        return true;
    }
    return false;
}



6 changes: 4 additions & 2 deletions config/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@
"group": "Main tags",
"longname": "Canonical URL",
"pattern": "\/(^http)\/",
"hint": "Should be the page's URL."
"hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.",
"type": "strict-url"
},
{
"selector": "link[rel='image_src']",
Expand Down Expand Up @@ -78,7 +79,8 @@
"group": "Facebook",
"longname": "Facebook URL",
"pattern": "\/(^http)\/",
"hint": "Should be page URL."
"hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.",
"type": "strict-url"
},
{
"selector": "meta[property='og:site_name']",
Expand Down
6 changes: 4 additions & 2 deletions config_examples/schema.long-example.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@
"group": "Main tags",
"longname": "Canonical URL",
"pattern": "\/(^http)\/",
"hint": "Should be the page's URL."
"hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.",
"type": "strict-url"
},
{
"selector": "link[rel='image_src']",
Expand Down Expand Up @@ -78,7 +79,8 @@
"group": "Facebook",
"longname": "Facebook URL",
"pattern": "\/(^http)\/",
"hint": "Should be page URL."
"hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.",
"type": "strict-url"
},
{
"selector": "meta[property='og:site_name']",
Expand Down
3 changes: 2 additions & 1 deletion config_examples/schema.short-example.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@
"group": "Facebook",
"longname": "Facebook URL",
"pattern": "\/(^http)\/",
"hint": "Should be page URL."
"hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.",
"type": "strict-url"
},
{
"selector": "meta[property='og:site_name']",
Expand Down
6 changes: 4 additions & 2 deletions config_examples/schema.wsjgraphics.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@
"group": "Main tags",
"longname": "Canonical URL",
"pattern": "\/(^http)\/",
"hint": "Should be the graphics's final URL."
"hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.",
"type": "strict-url"
},
{
"selector": "link[rel='image_src']",
Expand Down Expand Up @@ -85,7 +86,8 @@
"group": "Facebook",
"longname": "Facebook URL",
"pattern": "\/(^http)\/",
"hint": "Should be page URL."
"hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.",
"type": "strict-url"
},
{
"selector": "meta[property='og:site_name']",
Expand Down
18 changes: 18 additions & 0 deletions frontend/script.js
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,24 @@ app.factory('metaHelper', function(){
});


// <formatted-content contents="..." type="..."> renders a scraped tag value
// according to its schema type:
//   - 'image'                -> <img class="teaser-image">
//   - any type holding 'url' -> <a> link (http/https targets only)
//   - anything else          -> plain text
// The value comes from an arbitrary scraped page, so it is never
// concatenated into an HTML string: it is set through attr()/text(),
// which prevents markup or script injection.
app.directive('formattedContent', function() {
    return {
        restrict: 'E',
        link: function(scope, element, attrs){
            var contents = attrs.contents || '';
            // The type attribute is optional; normalise case once so that
            // 'Image' and 'URL' behave like 'image' and 'url'.
            var type = (attrs.type || '').toLowerCase();
            var $target = $(element).empty();

            if (type === 'image') {
                $('<img>')
                    .addClass('teaser-image')
                    .attr('src', contents)
                    .appendTo($target);
            } else if (type.indexOf('url') > -1 && /^https?:\/\//i.test(contents)) {
                // Only link real web addresses; other schemes such as
                // "javascript:" fall through and are shown as plain text.
                $('<a>')
                    .addClass('teaser-image')
                    .attr('href', contents)
                    .text(contents)
                    .appendTo($target);
            } else {
                $target.text(contents);
            }
        }
    };
});





Expand Down
9 changes: 2 additions & 7 deletions index.html
Original file line number Diff line number Diff line change
Expand Up @@ -46,20 +46,15 @@ <h3>{{group}}</h3>
<div class="selector" ng-if="(showInfo || !val.ok)">{{val.selector}}</div>
<div class="error-internal" ng-if="(showInfo || !val.ok)">{{val.hint}} <a ng-if="val.url" href={{val.url}}>More info</a></div>
</div><!--
--><div class="col-item" ng-if="val.type === 'image'">
<img class="teaser-image" ng-src="{{val.contents}}">
</div><!--
--><div class="col-item" ng-if="val.type !== 'image'">
{{val.contents}}
</div>
--><formatted-content class="col-item" contents="{{val.contents}}" type="{{val.type}}">
</div>
</div>
</div>


</div>

<p class="footer"><a target="_self" ng-href="./api/index.php?url={{currentUrl}}">API version of this check</a> | The Meta Tag Checker. v1.0.0 | <a href="https://github.com/dowjones/the-meta-tag-checker">View source on Github</a></p>
<p class="footer"><a target="_self" ng-href="./api/index.php?url={{currentUrl}}">API version of this check</a> | The Meta Tag Checker. v1.0.1 | <a href="https://github.com/dowjones/the-meta-tag-checker">View source on Github</a></p>



Expand Down

0 comments on commit bcc0682

Please sign in to comment.