From bcc0682b96da7484825e3414094502d6e999ddf8 Mon Sep 17 00:00:00 2001 From: ejb Date: Wed, 15 Apr 2015 13:26:46 +0100 Subject: [PATCH] Add URL checking feature --- Readme.md | 12 ++++++- api/scraper.php | 42 +++++++++++++++++++++-- config/schema.json | 6 ++-- config_examples/schema.long-example.json | 6 ++-- config_examples/schema.short-example.json | 3 +- config_examples/schema.wsjgraphics.json | 6 ++-- frontend/script.js | 18 ++++++++++ index.html | 9 ++--- 8 files changed, 84 insertions(+), 18 deletions(-) diff --git a/Readme.md b/Readme.md index 81d2a6a..8615a61 100644 --- a/Readme.md +++ b/Readme.md @@ -34,7 +34,7 @@ pattern | \/(^http:\\/\\/)\/ | Regex (regular expression) which validates conten hint | URL of a promo image. | Human-readable description of the pattern. group | meta | (optional) Used to sort tags into groups. longname | Teaser image | Descriptive name for humans. -type | image | (optional) Set to 'image' if result should be a JPG/PNG/GIF. +type | image | (optional) Set to 'image' if result should be a JPG/PNG/GIF. Other options: 'url', 'strict-url' (see below for more details). url | https://dev.twitter.com/cards/types | (optional) URL with further information about meta tag. ### Example regex rules @@ -94,12 +94,22 @@ Then in `schema.json`, specify the function's name in the *pattern* field of you } ``` +## Content types + +- 'image': Renders content on frontend in `<img>` tag. +- 'url': Checks header of URL to make sure link is valid, and renders in `<a>` tag. +- 'strict-url': Same as 'url', but rejects redirect headers (eg. 301s). Useful for canonical URLs. + ## Running behind a proxy Add a file called *proxy.php* to the config directory with a function called `file_get_contents_with_proxy`. This function should accept a URL as an argument, and return the HTML of the specified URL. 
## Changelog
 
+### 1.0.1
+
+- New content types: 'url' and 'strict-url'
+
 ### 1.0.0
 
 - Initial release
diff --git a/api/scraper.php b/api/scraper.php
index 597d885..3acccac 100644
--- a/api/scraper.php
+++ b/api/scraper.php
@@ -30,7 +30,7 @@ function checkUrl( $url ) {
     if (!$file) {
         return false;
     }
-    
+
     $html = new simple_html_dom();
     $html->load($file);
 
@@ -50,6 +50,7 @@ function checkUrl( $url ) {
             $attribute = 'innertext';
         }
         $regex = $item['pattern'];
+        $type = strtolower($item['type']);
 
         if (substr( $selector, 0, 6 ) === "custom") {
             $customFunctionName = str_replace( 'custom:', '', $selector );
@@ -97,7 +98,24 @@ function checkUrl( $url ) {
             }
         }
 
-        
+        // check URLs are valid
+        if (($type === 'url') || ($type === 'strict-url')) {
+            $strict = false;
+            if ($type === 'strict-url') {
+                $strict = true;
+            }
+            $item_url = $schema[$i]['contents'];
+            if (function_exists('file_get_contents_with_proxy')) {
+                file_get_contents_with_proxy( $item_url );
+            } else {
+                file_get_contents( $item_url );
+            }
+            $headerStatus = checkHeaderStatus($http_response_header, $strict);
+            $schema[$i]['_requestHeader'] = $http_response_header;
+            if ( !is_null($http_response_header) && ($headerStatus === false) ) {
+                $schema[$i]['ok'] = false;
+            }
+        }
     }
 
     return $schema;
@@ -110,4 +128,46 @@ function overallCheck( $results ) {
         }
     }
     return true;
-}
\ No newline at end of file
+}
+
+/**
+ * Decide whether a set of HTTP response headers describes an acceptable response.
+ *
+ * When PHP's HTTP wrapper follows redirects, $http_response_header contains the
+ * headers of every response in the chain, so the first status line is not
+ * necessarily the final outcome (a 301 may lead to a 404).
+ *
+ * @param array|null $headers Header lines as found in $http_response_header;
+ *                            status lines are the ones starting with "HTTP/".
+ * @param bool       $strict  true: the FIRST response must itself be 2xx, so any
+ *                            redirect is rejected. false: the LAST response in
+ *                            the chain must be 2xx or 3xx.
+ * @return bool true when the relevant status code is acceptable; false otherwise,
+ *              including when $headers holds no status line at all.
+ */
+function checkHeaderStatus( $headers, $strict = false ){
+    if ( !is_array($headers) ) {
+        return false;
+    }
+
+    // Collect the status code of every response, in the order received.
+    $codes = array();
+    foreach ($headers as $line) {
+        if ( is_string($line) && preg_match('/^HTTP\/\S+\s+(\d{3})/i', $line, $matches) ) {
+            $codes[] = intval( $matches[1] );
+        }
+    }
+    if ( count($codes) === 0 ) {
+        return false;
+    }
+
+    if ( $strict === true ) {
+        // Canonical-style check: the URL itself must answer 2xx, no redirect allowed.
+        $code = $codes[0];
+        return ($code >= 200) && ($code < 300);
+    }
+
+    // Lenient check: redirects are fine as long as the chain does not end in an error.
+    $code = $codes[count($codes) - 1];
+    return ($code >= 200) && ($code < 400);
+}
diff --git a/config/schema.json b/config/schema.json
index c73ea56..80edb4d 100644
--- a/config/schema.json
+++ b/config/schema.json
@@ -44,7 +44,8 @@
       "group": "Main tags",
       "longname": "Canonical URL",
       "pattern": "\/(^http)\/",
-      "hint": "Should be the
page's URL." + "hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.", + "type": "strict-url" }, { "selector": "link[rel='image_src']", @@ -78,7 +79,8 @@ "group": "Facebook", "longname": "Facebook URL", "pattern": "\/(^http)\/", - "hint": "Should be page URL." + "hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.", + "type": "strict-url" }, { "selector": "meta[property='og:site_name']", diff --git a/config_examples/schema.long-example.json b/config_examples/schema.long-example.json index c73ea56..80edb4d 100644 --- a/config_examples/schema.long-example.json +++ b/config_examples/schema.long-example.json @@ -44,7 +44,8 @@ "group": "Main tags", "longname": "Canonical URL", "pattern": "\/(^http)\/", - "hint": "Should be the page's URL." + "hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.", + "type": "strict-url" }, { "selector": "link[rel='image_src']", @@ -78,7 +79,8 @@ "group": "Facebook", "longname": "Facebook URL", "pattern": "\/(^http)\/", - "hint": "Should be page URL." + "hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.", + "type": "strict-url" }, { "selector": "meta[property='og:site_name']", diff --git a/config_examples/schema.short-example.json b/config_examples/schema.short-example.json index 3518ca4..193bc32 100644 --- a/config_examples/schema.short-example.json +++ b/config_examples/schema.short-example.json @@ -46,7 +46,8 @@ "group": "Facebook", "longname": "Facebook URL", "pattern": "\/(^http)\/", - "hint": "Should be page URL." + "hint": "Should be the page's exact URL. 
May need a slash (/) on the end of the URL.", + "type": "strict-url" }, { "selector": "meta[property='og:site_name']", diff --git a/config_examples/schema.wsjgraphics.json b/config_examples/schema.wsjgraphics.json index 75f0c7c..cd0e447 100644 --- a/config_examples/schema.wsjgraphics.json +++ b/config_examples/schema.wsjgraphics.json @@ -51,7 +51,8 @@ "group": "Main tags", "longname": "Canonical URL", "pattern": "\/(^http)\/", - "hint": "Should be the graphics's final URL." + "hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.", + "type": "strict-url" }, { "selector": "link[rel='image_src']", @@ -85,7 +86,8 @@ "group": "Facebook", "longname": "Facebook URL", "pattern": "\/(^http)\/", - "hint": "Should be page URL." + "hint": "Should be the page's exact URL. May need a slash (/) on the end of the URL.", + "type": "strict-url" }, { "selector": "meta[property='og:site_name']", diff --git a/frontend/script.js b/frontend/script.js index 5ee9abc..02f42e9 100644 --- a/frontend/script.js +++ b/frontend/script.js @@ -86,6 +86,24 @@ app.factory('metaHelper', function(){ }); +app.directive('formattedContent', function() { + return { + restrict: 'E', + link: function(scope, element, attrs){ + var html = attrs.contents; + if (attrs.type === 'image') { + var html = ''; + } + if (attrs.type.toLowerCase().indexOf('url') > -1) { + var html = ''+attrs.contents+''; + } + $(element).html( html ); + + } + }; +}); + + diff --git a/index.html b/index.html index 8f7f79e..23223b2 100644 --- a/index.html +++ b/index.html @@ -46,12 +46,7 @@

{{group}}

{{val.selector}}
{{val.hint}} More info
- -
- {{val.contents}} -
+ --> @@ -59,7 +54,7 @@

{{group}}

- +