Skip to content

Commit

Permalink
Merge pull request #202 from jshemas/oct-21
Browse files Browse the repository at this point in the history
adding a fallback for charset using http-equiv
  • Loading branch information
jshemas authored Oct 22, 2023
2 parents 1802183 + 95d506e commit c4419f2
Show file tree
Hide file tree
Showing 10 changed files with 364 additions and 226 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Change Log

## 6.3.1

- Adding a fallback for `charset` using `http-equiv`
- Updating dependencies to fix npm vulnerabilities

## 6.3.0

- Export `SuccessResult` and `ErrorResult` types
Expand Down
4 changes: 4 additions & 0 deletions lib/fallback.ts
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,10 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt
ogObject.charset = $('meta').attr('charset');
} else if (doesElementExist('head > meta[name="charset"]', 'content', $)) {
ogObject.charset = $('head > meta[name="charset"]').attr('content');
} else if (doesElementExist('head > meta[http-equiv="content-type"]', 'content', $)) {
const content = $('head > meta[http-equiv="content-type"]').attr('content');
const charsetRegEx = /charset=([^()<>@,;:"/[\]?.=\s]*)/i;
ogObject.charset = charsetRegEx.test(content) ? charsetRegEx.exec(content)[1] : 'UTF-8';
} else if (body) {
ogObject.charset = chardet.detect(Buffer.from(body)) || '';
}
Expand Down
448 changes: 252 additions & 196 deletions package-lock.json

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "open-graph-scraper",
"description": "Node.js scraper module for Open Graph and Twitter Card info",
"version": "6.3.0",
"version": "6.3.1",
"license": "MIT",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
Expand All @@ -27,7 +27,7 @@
"dependencies": {
"chardet": "^2.0.0",
"cheerio": "^1.0.0-rc.12",
"undici": "^5.25.4",
"undici": "^5.26.4",
"validator": "^13.11.0"
},
"files": [
Expand All @@ -36,22 +36,22 @@
"index.ts"
],
"devDependencies": {
"@snyk/protect": "^1.1230.0",
"@types/mocha": "^10.0.2",
"@snyk/protect": "^1.1236.0",
"@types/mocha": "^10.0.3",
"@types/node": "^18.18.3",
"@types/validator": "^13.11.2",
"@typescript-eslint/eslint-plugin": "^6.7.4",
"@typescript-eslint/parser": "^6.7.4",
"@types/validator": "^13.11.5",
"@typescript-eslint/eslint-plugin": "^6.8.0",
"@typescript-eslint/parser": "^6.8.0",
"chai": "^4.3.10",
"eslint": "^8.50.0",
"eslint": "^8.52.0",
"eslint-config-airbnb-base": "^15.0.0",
"eslint-config-airbnb-typescript": "^17.1.0",
"eslint-plugin-import": "^2.28.1",
"eslint-plugin-mocha": "^10.2.0",
"eslint-plugin-promise": "^6.1.1",
"mocha": "^10.2.0",
"nyc": "^15.1.0",
"sinon": "^16.0.0",
"sinon": "^17.0.0",
"ts-mocha": "^10.0.0",
"typescript": "^5.2.2"
},
Expand Down
56 changes: 51 additions & 5 deletions tests/integration/basic.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -325,12 +325,58 @@ describe('basic', function () {
expect(response).to.be.an('Response');
});
});
it('congress.gov - should return a 403 error', function () {
// this test works locally but fails during the github CI
it('congress.gov - should return og data', function () {
return ogs({ url: 'https://www.congress.gov/bill/117th-congress/house-bill/2617/text' })
.then(function () {
expect().fail('this should not happen');
})
.catch(function ({ error, result, response }) {
.then(function ({ error, result, response }) {
console.log('error:', error);
console.log('result:', result);
expect(error).to.be.eql(false);
expect(result.dcCoverage).to.be.eql('2021-04-16');
expect(result.dcCreator).to.be.eql('Rep. Connolly, Gerald E. [D-VA-11]');
expect(result.dcDate).to.be.eql('12/29/2022');
expect(result.dcIdentifier).to.be.eql('https://www.congress.gov/bill/117th-congress/house-bill/2617/text');
expect(result.dcLanguage).to.be.eql('eng');
expect(result.dcRights).to.be.eql('Text is government work');
expect(result.dcSubject).to.be.eql('Economics and Public Finance');
expect(result.dcTitle).to.be.eql('Text - H.R.2617 - 117th Congress (2021-2022): Consolidated Appropriations Act, 2023');
expect(result.dcType).to.be.eql('webpage');
expect(result.ogDescription).to.be.eql('Text for H.R.2617 - 117th Congress (2021-2022): Consolidated Appropriations Act, 2023');
expect(result.ogTitle).to.be.eql('Text - H.R.2617 - 117th Congress (2021-2022): Consolidated Appropriations Act, 2023');
expect(result.ogType).to.be.eql('website');
expect(result.ogUrl).to.be.eql('https://www.congress.gov/bill/117th-congress/house-bill/2617/text');
expect(result.ogImage).to.be.eql([
{
url: 'https://www.congress.gov/img/opengraph1200by630.jpg',
type: 'jpg',
},
]);
expect(result.ogLocale).to.be.eql('en');
expect(result.charset).to.be.eql('UTF-8');
expect(result.requestUrl).to.be.eql('https://www.congress.gov/bill/117th-congress/house-bill/2617/text');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'dcCoverage',
'dcCreator',
'dcDate',
'dcIdentifier',
'dcLanguage',
'dcRights',
'dcSubject',
'dcTitle',
'dcType',
'ogDescription',
'ogTitle',
'ogType',
'ogUrl',
'ogImage',
'ogLocale',
'charset',
'requestUrl',
'success',
);
expect(response).to.be.an('Response');
}).catch(function ({ error, result, response }) {
console.log('error:', error);
console.log('result:', result);
expect(error).to.be.eql(true);
Expand Down
24 changes: 22 additions & 2 deletions tests/integration/encoding.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ describe('encoding', function () {
type: 'jpeg',
},
]);
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/baidu');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
Expand Down Expand Up @@ -541,7 +541,7 @@ describe('encoding', function () {
type: 'jpg',
},
]);
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/xinhuanet');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
Expand Down Expand Up @@ -577,5 +577,25 @@ describe('encoding', function () {
expect(response).to.be.an('Response');
});
});
// Scrapes the hosted 'abehiroshi' fixture page and verifies that the title is
// decoded correctly and that the reported charset is Shift_JIS.
it('abehiroshi', function () {
  const pageUrl = 'https://jshemas.github.io/openGraphScraperPages/abehiroshi';
  // Exactly these keys (and no others) are expected on the result object.
  const expectedKeys = ['charset', 'ogTitle', 'requestUrl', 'success'];

  return ogs({ url: pageUrl }).then(function ({ error, result, response }) {
    console.log('error:', error);
    console.log('result:', result);

    // The scrape itself must succeed before any field is inspected.
    expect(error).to.be.eql(false);

    expect(result.ogTitle).to.be.eql('阿部寛のホームページ');
    expect(result.charset).to.be.eql('Shift_JIS');
    expect(result.requestUrl).to.be.eql(pageUrl);
    expect(result.success).to.be.eql(true);
    expect(result).to.have.all.keys(...expectedKeys);

    expect(response).to.be.an('Response');
  });
});
});
});
5 changes: 3 additions & 2 deletions tests/integration/redirect.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ import ogs from '../../index';

describe('redirect', function () {
context('should return correct Open Graph Info', function () {
it('nytimes page', function () {
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36';
// nytimes keeps blocking requests; we will need to find a way to bypass this
it.skip('nytimes page', function () {

Check warning on line 8 in tests/integration/redirect.spec.ts

View workflow job for this annotation

GitHub Actions / buildAndTest (18)

Unexpected skipped mocha test

Check warning on line 8 in tests/integration/redirect.spec.ts

View workflow job for this annotation

GitHub Actions / buildAndTest (20)

Unexpected skipped mocha test
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36';
return ogs({
url: 'https://www.nytimes.com/2016/09/01/arts/design/gallery-hopes-to-sell-kanye-wests-famous-sculpture-for-4-million.html?_r=0',
fetchOptions: { headers: { 'user-agent': userAgent } },
Expand Down
22 changes: 11 additions & 11 deletions tests/integration/static.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ describe('static', function () {
height: '320',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/arstechnica');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down Expand Up @@ -501,7 +501,7 @@ describe('static', function () {
url: 'https://cdn1.cloudpro.co.uk/sites/cloudprod7/files/2020/05/outreach_founding_members.jpg',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/cloudpro');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down Expand Up @@ -556,7 +556,7 @@ describe('static', function () {
url: 'https://cnet3.cbsistatic.com/img/0IjS4wIUDkC77PSDb-eyF0aZNw8=/756x567/2020/01/22/931a3fa2-4e0e-4def-bdc2-448926f8da02/5g-phone-2.jpg',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/cnet');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down Expand Up @@ -701,7 +701,7 @@ describe('static', function () {
url: 'https://twimgs.com/nojitter/darkreading/dr-logo.jpg',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/darkreading');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('iso-8859-1');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down Expand Up @@ -1480,7 +1480,7 @@ describe('static', function () {
url: 'https://www.incimages.com/uploaded_files/image/1024x576/GettyImages-1223427650_430893.jpg',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/inc');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down Expand Up @@ -1580,7 +1580,7 @@ describe('static', function () {
url: 'http://is5.mzstatic.com/image/thumb/Purple71/v4/97/0a/71/970a71f1-9c94-cc61-c960-304191a8dc42/source/1200x630bf.jpg',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/itunes');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'ogDate',
Expand Down Expand Up @@ -1941,7 +1941,7 @@ describe('static', function () {
url: 'https://images.macrumors.com/article-new/2020/05/greyhoundappletvplus.jpg',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/macrumors');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down Expand Up @@ -2125,7 +2125,7 @@ describe('static', function () {
height: '400',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/michaelkors');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down Expand Up @@ -2503,7 +2503,7 @@ describe('static', function () {
},
]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/rottentomatoes');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down Expand Up @@ -2749,7 +2749,7 @@ describe('static', function () {
url: 'http://www.thinkgeek.com/images/products/frontsquare/jjip_citizen_playing_cards.jpg',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/thinkgeek');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('ISO-8859-1');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down Expand Up @@ -2809,7 +2809,7 @@ describe('static', function () {
url: 'https://cdn.vox-cdn.com/thumbor/OR6JkRz2SCfX5Ecx6JSCVWk5vs0=/0x0:4746x2373/fit-in/1200x600/cdn.vox-cdn.com/uploads/chorus_asset/file/20030444/524250960.jpg.jpg', alt: 'Jack Dorsey Sydney Photo Shoot',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/vox');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/twitter.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ describe('twitter', function () {
height: '500',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/twitter-dev');
expect(result.charset).to.be.eql('UTF-8');
expect(result.charset).to.be.eql('utf-8');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down
6 changes: 6 additions & 0 deletions tests/unit/fallback.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,12 @@ describe('fallback', function () {
expect(ogObject.charset).to.be.eql('bar');
expect(ogObject).to.have.all.keys('charset');
});
// The charset declared inside an http-equiv Content-Type meta tag should be
// the only field the fallback populates for this document.
it('when there is a meta tag with http-equiv charset', function () {
  const markup = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=foo_bar"></head></html>';
  // Empty og object, empty options and an empty body: only the parsed markup is supplied.
  const scraped = fallback({}, {}, load(markup), '');

  expect(scraped.charset).to.be.eql('foo_bar');
  expect(scraped).to.have.all.keys('charset');
});
it('when trying to get a charset from the body', function () {
const body = '<html><head></head></html>';
const $ = load(body);
Expand Down

0 comments on commit c4419f2

Please sign in to comment.