Skip to content

Commit

Permalink
Support iframes (#12)
Browse files Browse the repository at this point in the history
* Support iframes.
* Polish README and fix small nits.
* Run tests in Node env.
  • Loading branch information
msindwan authored Mar 4, 2020
1 parent b4a6b74 commit 091148c
Show file tree
Hide file tree
Showing 8 changed files with 20,058 additions and 108 deletions.
40 changes: 20 additions & 20 deletions .eslintrc
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
{
"parserOptions": {
"ecmaVersion": 6,
"sourceType": "module"
},
"env": {
"browser": true,
"node": true
},
"rules": {
"semi": 2,
"indent": [1, 4, {"SwitchCase": 1}],
"no-console": 0,
"eol-last": 1,
"react/prop-types": 0,
"no-control-regex": 0,
"no-fallthrough": 0,
"no-useless-escape": 0,
"max-len": ["error", 131]
},
"extends": ["eslint:recommended"]
"parserOptions": {
"ecmaVersion": 6,
"sourceType": "module"
},
"env": {
"browser": true,
"node": true
},
"rules": {
"semi": 2,
"indent": [1, 4, {"SwitchCase": 1}],
"no-console": 0,
"eol-last": 1,
"react/prop-types": 0,
"no-control-regex": 0,
"no-fallthrough": 0,
"no-useless-escape": 0,
"max-len": ["error", 131]
},
"extends": ["eslint:recommended"]
}
55 changes: 32 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,9 @@

**It has only been tested with MHTML files built and used with the latest versions of Chrome**

To get started, import mhtml2html:

```js
import mhtml2html from 'mhtml2html';
```

Or include it as a script in your webpage:

```html
<script src="https://unpkg.com/mhtml2html@<VERSION>" type="javascript" />
```
### Node.js Example

mhtml2html can be used via the command line:
mhtml2html can be used via the command line (use the `--help` flag to view all options):

```sh
$ mhtml2html <input.mhtml> <output.html>
Expand All @@ -36,15 +24,27 @@ For programmatic usage, mhtml2html can be used provided a WHATWG DOM parser impl

```js
const mhtml2html = require('mhtml2html');
const { JSDOM } = require("jsdom");
const { JSDOM } = require('jsdom');

const mhtml = '<your MHTML string>';
const htmlDoc = mhtml2html.convert(mhtml, (html) => new JSDOM(html));
const htmlDoc = mhtml2html.convert(mhtml, { parseDOM: (html) => new JSDOM(html) });
console.log(htmlDoc);
```

### Browser Example

To get started, import mhtml2html:

```js
import mhtml2html from 'mhtml2html';
```

Or include it as a script in your webpage:

```html
<script src="https://unpkg.com/mhtml2html@<VERSION>"></script>
```
By default, mhtml2html will use the DOMParser available in most browsers:
```js
Expand All @@ -59,21 +59,29 @@ console.log(html);
### parse
`mhtml2html.parse(mhtml, htmlOnly = false, parseDOM = <function>);`
`mhtml2html.parse(mhtml, { htmlOnly = false, parseDOM = <function> });`
* mhtml: An MHTML String.
* htmlOnly: If set to true, returns the html document without resources.
* parseDOM: A callback that accepts a DOM string and returns a window object (defaults to `DOMParser` only available in browsers)
* options.htmlOnly: If set to true, returns the html document without resources.
* options.parseDOM: A callback that accepts a DOM string and returns a window object (defaults to `DOMParser` only available in browsers).
* Returns an html document without resources if `htmlOnly` is set to true. Otherwise it returns an MHTML parsed object:
``` json
{
"index" : "<html-index-url>",
"assets": {
"media": {
"<asset-url>" : {
"data" : "<resource-string>",
"id": "<frame-id>",
"type": "<resource-type",
"type": "<resource-type>",
"encoding": "<resource-encoding>"
}
},
"frames": {
"<frame-id>": {
"data": "<resource-string>",
"id": "<frame-id>",
"type": "<resource-type>",
"encoding": "<resource-encoding>"
}
}
Expand All @@ -82,10 +90,11 @@ console.log(html);
### convert
`mhtml2html.convert(mhtml, parseDOM = <function>);`
`mhtml2html.convert(mhtml, { convertIframes = false, parseDOM = <function> });`
* mhtml: An MHTML String or MHTML parsed object.
* parseDOM: A callback that accepts a DOM string and returns a window object (defaults to `DOMParser` only available in browsers)
* options.convertIframes: Whether or not to include iframes in the converted response (defaults to false).
* options.parseDOM: A callback that accepts a DOM string and returns a window object (defaults to `DOMParser` only available in browsers).
* Returns an html window element.
## Development
Expand All @@ -97,7 +106,7 @@ console.log(html);
To build and test mhtml2html:
1. If node_modules haven't been installed already, run `yarn install` from the root directory.
1. If `node_modules` haven't been installed already, run `yarn install` from the root directory.
2. Run `yarn test` to build and test the source code.
## License
Expand Down
48 changes: 30 additions & 18 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,36 @@
**/

const mhtml2html = require('./dist/mhtml2html');
const { JSDOM } = require("jsdom");
const { JSDOM } = require('jsdom');
const yargs = require('yargs');
const fs = require('fs');

// Ensure that an input and output path is provided.
if (process.argv[2] === undefined || process.argv[3] === undefined) {
throw new Error("Path is required. Usage : mhtml2html <input.mhtml> <output.html>");
}
yargs
.command('$0 <input> <output>', 'Converts an mhtml file to a single html file', (yargs) => {
yargs.positional('input', {
describe: 'The path to the input mhtml file',
type: 'string',
}).positional('output', {
describe: 'The path to the output html file',
type: 'string'
})
}, (argv) => {
fs.readFile(argv.input, 'utf8', (err, data) => {
if (err) {
throw err;
}

// Read the file provided and return the html document as a string.
fs.readFile(process.argv[2], 'utf8', (err, data) => {
if (err) {
throw err;
}

const doc = mhtml2html.convert(data, (html) => new JSDOM(html));
fs.writeFile(process.argv[3], doc.serialize(), err => {
if (err) {
return console.log(err);
}
});
});
const doc = mhtml2html.convert(data, { convertIframes: argv.convertIframes, parseDOM: (html) => new JSDOM(html) });
fs.writeFile(argv.output, doc.serialize(), err => {
if (err) {
return console.log(err);
}
});
});
})
.option('convertIframes', {
alias: 'i',
type: 'boolean',
description: 'Include iframes in the converted output'
})
.argv
7 changes: 4 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "mhtml2html",
"version": "2.0.0",
"version": "3.0.0",
"description": "Converts an mhtml document to a single html document",
"author": "Mayank Sindwani",
"license": "MIT",
Expand All @@ -24,15 +24,16 @@
"prebuild": "yarn lint",
"build": "webpack --mode=production",
"pretest": "yarn build",
"test": "karma start --single-run"
"test": "yarn mocha tests/ && karma start --single-run"
},
"engines": {
"node": ">=10"
},
"dependencies": {
"base-64": "^0.1.0",
"jsdom": "^15.1.1",
"quoted-printable": "^1.0.1"
"quoted-printable": "^1.0.1",
"yargs": "^15.1.0"
},
"devDependencies": {
"@babel/core": "^7.6.0",
Expand Down
64 changes: 41 additions & 23 deletions src/mhtml2html.js
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,18 @@ function replaceReferences(media, base, asset) {
return asset;
}

// Builds a data URI for the given asset, picking the payload form from asset.encoding:
//   'quoted-printable' -> data is passed through QuotedPrintable.decode, then escape()'d
//   'base64'           -> data is embedded unchanged
//   anything else      -> data is passed through Base64.encode before embedding
function convertAssetToDataURI(asset) {
    const transferEncoding = asset.encoding;

    // Quoted-printable content is the only case emitted with the utf8 marker.
    if (transferEncoding === 'quoted-printable') {
        const utf8Prefix = `data:${asset.type};utf8,`;
        const decoded = QuotedPrintable.decode(asset.data);
        return `${utf8Prefix}${escape(decoded)}`;
    }

    // Every remaining case yields a base64 data URI; only the payload source differs.
    const base64Prefix = `data:${asset.type};base64,`;
    if (transferEncoding === 'base64') {
        return `${base64Prefix}${asset.data}`;
    }

    return `${base64Prefix}${Base64.encode(asset.data)}`;
}

// Main module.
const mhtml2html = {

Expand All @@ -93,11 +105,11 @@ const mhtml2html = {
*
* Description: Returns an object representing the mhtml and its resources.
* @param {mhtml} // The mhtml string.
* @param {htmlOnly} // A flag to determine which parsed object to return.
* @param {parseDOM} // The callback to parse an HTML string.
* @param {options.htmlOnly} // A flag to determine which parsed object to return.
* @param {options.parseDOM} // The callback to parse an HTML string.
* @returns an html document without resources if htmlOnly === true; an MHTML parsed object otherwise.
*/
parse: (mhtml, htmlOnly = false, parseDOM = defaultDOMParser) => {
parse: (mhtml, { htmlOnly = false, parseDOM = defaultDOMParser } = {}) => {
const MHTML_FSM = {
MHTML_HEADERS : 0,
MTHML_CONTENT : 1,
Expand Down Expand Up @@ -224,7 +236,7 @@ const mhtml2html = {
frames[id] = asset;
}

// Associate the first frame with the location.
// Keep track of resources by location.
if (typeof location !== 'undefined' && typeof media[location] === 'undefined') {
media[location] = asset;
}
Expand Down Expand Up @@ -275,11 +287,11 @@ const mhtml2html = {
*
* Description: Accepts an mhtml string or parsed object and returns the converted html.
* @param {mhtml} // The mhtml string or object.
* @param {parseDOM} // The callback to parse an HTML string.
* @param {options.convertIframes} // Whether or not to include iframes in the converted response (defaults to false).
* @param {options.parseDOM} // The callback to parse an HTML string.
* @returns an html document element.
*/
convert: (mhtml, parseDOM = defaultDOMParser) => {
let utf8String, b64String; // Encoded references.
convert: (mhtml, { convertIframes = false, parseDOM = defaultDOMParser } = {}) => {
let index, media, frames; // Record-keeping.
let style, base, img; // DOM objects.
let href, src; // References.
Expand Down Expand Up @@ -346,22 +358,10 @@ const mhtml2html = {
img = null;
if (typeof media[src] !== 'undefined' && media[src].type.includes('image')) {
// Embed the image into the document.
switch(media[src].encoding) {
case 'quoted-printable':
utf8String = QuotedPrintable.decode(media[src].data);
img = `data:${media[src].type};utf8,${escape(utf8String)}`;
break;
case 'base64':
img = `data:${media[src].type};base64,${media[src].data}`;
break;
default:
try {
b64String = Base64.encode(media[src].data);
img = `data:${media[src].type};base64,${b64String}`;
} catch(e) {
console.warn(e);
}
break;
try {
img = convertAssetToDataURI(media[src]);
} catch(e) {
console.warn(e);
}
if (img !== null) {
child.setAttribute('src', img);
Expand All @@ -370,6 +370,24 @@ const mhtml2html = {
child.style.cssText = replaceReferences(media, index, child.style.cssText);
break;

case 'IFRAME':
if (convertIframes === true && src) {
const id = `<${src.split('cid:')[1]}>`;
const frame = frames[id];

if (frame && frame.type === 'text/html') {
const iframe = mhtml2html.convert({
media: Object.assign({}, media, { [id] : frame }),
frames: frames,
index: id,
}, { convertIframes, parseDOM });
child.src = `data:text/html;charset=utf-8,${encodeURIComponent(
iframe.window.document.documentElement.outerHTML
)}`;
}
}
break;

default:
if (child.style) {
child.style.cssText = replaceReferences(media, index, child.style.cssText);
Expand Down
Loading

0 comments on commit 091148c

Please sign in to comment.