Skip to content

Commit

Permalink
initial import
Browse files Browse the repository at this point in the history
  • Loading branch information
Unknown authored and Unknown committed Sep 26, 2018
0 parents commit bb36479
Show file tree
Hide file tree
Showing 2 changed files with 412 additions and 0 deletions.
398 changes: 398 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,398 @@
const puppeteer = require('puppeteer');

// Baseline settings; the ScrapeTweet constructor shallow-merges caller
// supplied options over these.
const DEFAULT_OPTIONS = {
  // Extra options merged into the puppeteer.launch() call in getBrowser().
  launch: {
    // headless: false
  },

  // loadStream hovers the stream footer at most (pages - 1) times.
  pages: 99,

  // 'light' -> parseTimeline returns tweet ids only; 'full' -> every tweet is
  // parsed with parseTweet. parseContainer skips replies/parents entirely
  // when this loosely equals false.
  timeline: 'light',

  // Whether parseContainer collects the reply / parent (ancestor) lists.
  replies: true,
  parents: true,

  // Whether parseTweet also parses an embedded quoted tweet.
  quote: true,

  // Delay handed to page.waitFor() after each footer hover in loadStream.
  loadWait: 1000,

  // Screenshot switch, the options merged into each .screenshot() call, and
  // the content type used as the prefix of the resulting data URI.
  doScreenshot: false,
  screenshot: {
    type: 'png',
  },
  ss_content_type: 'image/png',

  // Passed to page.setViewport() when getPage() creates a new page.
  viewport: {
    width: 1280,
    height: 800,
  },

  // Enables the extra console output in parseTweet ('quote found').
  debug: false,
};

/**
 * Extract the data of one tweet element.
 *
 * @param {Object} page    object exposing evaluate(fn, handle)
 * @param {Object} tweet   handle of the tweet element; must expose $, $eval,
 *                         $$eval and (when screenshots are on) screenshot
 * @param {Object} options reads quote, debug, doScreenshot, screenshot and
 *                         ss_content_type
 * @returns {Promise<Object>} the tweet's data-* attributes plus avatar,
 *          timestamp, text/HTML, image urls, the three counters, quoteTweet
 *          (null when absent or disabled, undefined when parsing it failed)
 *          and screenshot (a base64 data URI or null)
 */
const parseTweet = async (page, tweet, options) => {
  // Callbacks handed to $eval/$$eval are kept free of outer-scope references.
  const textOf = node => node.innerText;
  const htmlOf = node => node.innerHTML;
  const statCountOf = node => node.getAttribute('data-tweet-stat-count');
  const imageUrlsOf = nodes => nodes.map(n => n.getAttribute('data-image-url'));

  const statSelector = action =>
    '.stream-item-footer .ProfileTweet-action--' + action + ' .ProfileTweet-actionCount';

  // Attributes stored directly on the tweet element.
  const attributes = await page.evaluate(element => {
    const attr = name => element.getAttribute(name);
    const rawMentions = attr('data-mentions');

    return {
      tweetId: attr('data-tweet-id'),
      screenName: attr('data-screen-name'),
      fullName: attr('data-name'),
      permaLink: attr('data-permalink-path'),
      userId: attr('data-user-id'),
      retweetId: attr('data-retweet-id'),
      retweeter: attr('data-retweeter'),
      conversationId: attr('data-conversation-id'),
      mentions: rawMentions ? rawMentions.split(' ') : null,
      quality: attr('data-conversation-section-quality'),
    };
  }, tweet);

  const timestamp = await tweet.$eval('.tweet-timestamp span._timestamp',
    node => node.getAttribute('data-time'));
  const tweetText = await tweet.$eval('.tweet-text', textOf);
  const tweetHTML = await tweet.$eval('.tweet-text', htmlOf);
  const avatar = await tweet.$eval('.account-group img.avatar',
    node => node.getAttribute("src"));

  // Photo urls first, then the poster frames of any videos.
  const photoUrls = await tweet.$$eval('.AdaptiveMedia-photoContainer', imageUrlsOf);
  const posterUrls = await tweet.$$eval('.AdaptiveMedia-photoContainer video',
    nodes => nodes.map(node => node.getAttribute('poster')));
  const images = photoUrls.concat(posterUrls);

  const retweet_num = await tweet.$eval(statSelector('retweet'), statCountOf);
  const favorite_num = await tweet.$eval(statSelector('favorite'), statCountOf);
  const reply_num = await tweet.$eval(statSelector('reply'), statCountOf);

  // Reads the embedded quoted tweet; resolves to null when there is none.
  const readQuote = async quote => {
    if (!quote) {
      return null;
    }
    if (options.debug) {
      console.log('quote found');
    }

    const quoteAttributes = await page.evaluate(element => {
      const attr = name => element.getAttribute(name);

      return {
        tweetId: attr('data-item-id'),
        screenName: attr('data-screen-name'),
        permaLink: attr('href'),
        userId: attr('data-user-id'),
        conversationId: attr('data-conversation-id'),
        quality: attr('data-conversation-section-quality'),
      };
    }, quote);

    const fullName = await quote.$eval(
      '.tweet-content .QuoteTweet-originalAuthor .QuoteTweet-fullname', textOf);
    const quoteText = await quote.$eval('.tweet-content .QuoteTweet-text', textOf);
    const quoteHTML = await quote.$eval('.tweet-content .QuoteTweet-text', htmlOf);

    const quotePhotos = await quote.$$eval('.tweet-content .QuoteMedia-photoContainer',
      imageUrlsOf);
    const quotePreviews = await quote.$$eval('.tweet-content .QuoteMedia-videoPreview img',
      nodes => nodes.map(n => n.getAttribute('src')));

    const mentions = await quote.$$eval(
      '.tweet-content .QuoteTweet-authorAndText .ReplyingToContextBelowAuthor .username > b',
      nodes => nodes.map(node => node.innerText));

    return Object.assign({}, quoteAttributes, {
      fullName,
      mentions,
      quoteText,
      quoteHTML,
      images: quotePhotos.concat(quotePreviews),
    });
  };

  let quoteTweet = null;
  if (options.quote) {
    const quoteLookup = tweet.$('.QuoteTweet .QuoteTweet-container .QuoteTweet-innerContainer');
    try {
      quoteTweet = await readQuote(await quoteLookup);
    } catch (err) {
      // Best effort: a broken quote is logged and reported as undefined.
      console.log('Quote Error or Not Found', err);
      quoteTweet = undefined;
    }
  }

  let screenshot = null;
  if (options.doScreenshot) {
    const shotOptions = Object.assign({}, { type: "png" }, options.screenshot || null);
    const imageBuffer = await tweet.screenshot(shotOptions);
    if (imageBuffer) {
      const contentType = options.ss_content_type || 'image/png';
      screenshot = 'data:' + contentType + ';base64,' + imageBuffer.toString('base64');
    }
  }

  return Object.assign({}, attributes, {
    avatar,
    timestamp,
    tweetText,
    tweetHTML,
    images,
    retweet_num,
    favorite_num,
    reply_num,
    quoteTweet,
    screenshot,
  });
};

/**
 * Grow a tweet stream and expand its collapsed reply threads.
 *
 * First the stream footer is hovered repeatedly (presumably to trigger the
 * site's incremental loading) until either the container's
 * data-min-position stops changing or the page budget is used up. Then every
 * "li[data-expansion-url] > a" link is clicked once.
 *
 * @param {Object} page      object exposing waitFor(ms)
 * @param {Object} container handle of the stream's parent element; must
 *                           expose $, $$ and $eval
 * @param {Object} options   reads pages (at most pages - 1 hovers, default 1)
 *                           and loadWait (ms to wait after a hover, default 1000)
 * @returns {Promise<void>}  rejects only when the footer lookup itself fails;
 *                           hover, position and click errors are logged
 */
const loadStream = async (page, container, options) => {
  // Resolves to the current data-min-position, or undefined (after logging)
  // when the stream container cannot be read.
  const readPosition = () => container
    .$eval('div.stream-container', node => node.getAttribute('data-min-position'))
    .catch(err => {
      console.log("Cannot Find Data Position Error: ", err);
    });

  const footer = await container.$('div.stream-container > div.stream > div.stream-footer');

  // Without a footer there is nothing to hover, so paging is skipped instead
  // of failing on a null handle.
  if (footer) {
    let remaining = (options.pages || 1) - 1;
    let position = await readPosition();
    let previous = null;

    // Loose inequality on purpose: an unreadable position (undefined) equals
    // the initial null and therefore stops the loop.
    while (position != previous && remaining > 0) {
      previous = position;

      try {
        await footer.hover();
        await page.waitFor(options.loadWait || 1000);
      } catch (err) {
        console.log("Page Seek Error: ", err);
      }

      position = await readPosition();
      remaining--;
    }
  }

  try {
    const expanders = await container.$$(
      'div.stream-container > div.stream > .stream-items li[data-expansion-url] > a');

    for (const [index, link] of expanders.entries()) {
      // A failed click must not prevent the remaining links from being tried.
      await link.click().catch(err => {
        console.log(`Click ${index} Failed.`, err);
      });
      await page.waitFor(50);
    }
  } catch (err) {
    console.log('Hidden Reply Expansion Error.', err);
  }
};

/**
 * Parse a permalink (single tweet) container: the tweet itself and, unless
 * options.timeline is (loosely) false, its ancestors and replies.
 *
 * @param {Object} page      page object forwarded to parseTweet/loadStream
 * @param {Object} container handle of the permalink container; must expose
 *                           $, $$, $$eval and (when screenshots are on) screenshot
 * @param {Object} options   reads timeline, parents, replies, debug,
 *                           doScreenshot, screenshot and ss_content_type
 * @returns {Promise<Object>} { tweetData, parents, replies, screenshot } where
 *          tweetData carries parent_list / reply_list (id arrays or null),
 *          parents / replies are parsed tweets (only for timeline 'full',
 *          otherwise null) and screenshot is a base64 data URI or null
 * @throws rethrows any error raised while locating or parsing the main tweet
 */
const parseContainer = async (page, container, options) => {
  const ANCESTOR_ITEMS = '.permalink-in-reply-tos div#ancestors .stream-items li[data-item-id]';
  const REPLY_ITEMS = '.permalink-replies div#descendants .stream-items li[data-item-id]';

  let tweetData;
  try {
    const mainTweet = await container.$('.permalink-tweet-container > div.tweet');
    tweetData = await parseTweet(page, mainTweet, options);
  } catch (err) {
    console.log("Tweet Parsing Failed.");
    throw err;
  }

  // Parses every tweet handle in order, one at a time.
  const parseAll = async handles => {
    if (options.debug) {
      console.log(handles);
    }
    const parsed = [];
    for (const handle of handles) {
      parsed.push(await parseTweet(page, handle, options));
    }
    return parsed;
  };

  let parent_list = null;
  let reply_list = null;
  let parents = null;
  let replies = null;

  // Loose comparison kept deliberately so that falsy-but-not-nullish values
  // such as 0 or '' keep disabling this section as they always did.
  if (options.timeline != false) {
    // Only try to grow the reply stream when the page actually has one.
    const descendants = await container.$('.permalink-replies div#descendants');
    if (descendants) {
      await loadStream(page, descendants, options);
    }

    if (options.parents) {
      parent_list = await container.$$eval(ANCESTOR_ITEMS,
        nodes => nodes.map(node => node.getAttribute('data-item-id')));
    }

    if (options.replies) {
      // A retweeted entry is reported under its retweet id.
      reply_list = await container.$$eval(REPLY_ITEMS,
        nodes => nodes.map(node =>
          node.getAttribute('data-retweet-id') || node.getAttribute('data-item-id')));
    }

    if (options.timeline === 'full') {
      if (options.replies) {
        replies = await parseAll(await container.$$(REPLY_ITEMS + ' .tweet'));
      }
      if (options.parents) {
        parents = await parseAll(await container.$$(ANCESTOR_ITEMS + ' .tweet'));
      }
    }
  }

  tweetData = Object.assign({}, tweetData, { parent_list, reply_list });

  let screenshot = null;
  if (options.doScreenshot) {
    const imageBuffer = await container.screenshot(
      Object.assign({}, { type: "png" }, options.screenshot || null));
    if (imageBuffer) {
      const contentType = options.ss_content_type || 'image/png';
      screenshot = 'data:' + contentType + ';base64,' + imageBuffer.toString('base64');
    }
  }

  return { tweetData, parents, replies, screenshot };
};

/**
 * Load a timeline stream and return its tweets.
 *
 * @param {Object} page      page object forwarded to loadStream/parseTweet
 * @param {Object} container handle of the timeline element ($$ / $$eval)
 * @param {Object} options   options.timeline selects the result: 'full' gives
 *                           parsed tweets, 'light' gives tweet ids
 * @returns {Promise<Array|undefined>} the tweet list, or undefined for any
 *          other options.timeline value
 */
const parseTimeline = async (page, container, options) => {
  await loadStream(page, container, options);

  const mode = options.timeline;

  if (mode == 'full') {
    // Each handle is replaced in place by its parsed tweet, in order.
    const entries = await container.$$('.stream-items li[data-item-id] .tweet');
    for (let index = 0; index < entries.length; index++) {
      entries[index] = await parseTweet(page, entries[index], options);
    }
    return entries;
  }

  if (mode == 'light') {
    // Ids only; a retweet is listed under its retweet id.
    return container.$$eval('.stream-items li[data-item-id]',
      nodes => nodes.map(node => {
        const retweetId = node.getAttribute('data-retweet-id');
        return retweetId ? retweetId : node.getAttribute('data-item-id');
      }));
  }

  return undefined;
};

/**
 * Puppeteer-backed scraper for tweet permalink, timeline and search pages.
 *
 * A page can be injected through props.page; otherwise a browser is launched
 * and a page created the first time one is needed.
 */
class ScrapeTweet {

  /**
   * @param {Object} [props]
   * @param {Object} [props.page]    existing page to reuse
   * @param {Object} [props.options] overrides, shallow-merged over
   *                                 DEFAULT_OPTIONS (props itself is not modified)
   */
  constructor(props) {
    const settings = props || {};

    this.page = settings.page || null;
    this.options = Object.assign({}, DEFAULT_OPTIONS, settings.options || null);
  }

  /**
   * Close the browser if one has been resolved by getBrowser().
   * @returns {Promise<*>} whatever browser.close() resolves to, or undefined
   */
  async close() {
    if (this.browser) {
      return this.browser.close();
    }
    return undefined;
  }

  /**
   * Return the browser in use: the injected page's browser when there is
   * one, otherwise a freshly launched instance (cached on this.browser).
   * @returns {Promise<Object>}
   */
  async getBrowser() {
    if (!this.browser && this.page) {
      this.browser = this.page.browser();
    }

    if (!this.browser) {
      // Caller supplied launch options win over these defaults.
      const launchOptions = Object.assign({}, {
        args: ['--disable-gpu', '--no-sandbox', '--single-process', '--disable-web-security',
          '--disable-dev-profile', '--disable-dev-shm-usage', '--no-zygote'],
        ignoreHTTPSErrors: true
      }, this.options ? (this.options.launch || null) : null);

      this.browser = await puppeteer.launch(launchOptions);
    }

    return this.browser;
  }

  /**
   * Return the page in use, creating and configuring one on first call.
   * Viewport and user agent are fully applied before the page is returned,
   * so a navigation started right afterwards cannot race them.
   * @returns {Promise<Object>}
   */
  async getPage() {
    if (!this.page) {
      const browser = await this.getBrowser();
      this.page = await browser.newPage();

      await this.page.setViewport(Object.assign({}, {
        width: 1280,
        height: 800
      }, this.options ? (this.options.viewport || null) : null));

      // twitter doesn't take kindly to linux
      let userAgent = await browser.userAgent();
      userAgent = userAgent.replace(/(Mozilla\/\d+\.\d+\s+)\([^\)]+\)/i, '$1(Windows NT 10.0; Win64; x64)');
      userAgent = userAgent.replace(/HeadlessChrome/, 'Chrome');

      await this.page.setUserAgent(userAgent);
    }

    return this.page;
  }

  /**
   * Scrape a single tweet (permalink) page.
   * @param {string} url
   * @returns {Promise<Object|undefined>} the parseContainer result, or
   *          undefined when navigation or parsing failed (the error is logged)
   */
  async getTweet(url) {
    const page = await this.getPage();

    try {
      const response = await page.goto(url);
      if (!response.ok()) {
        throw new Error('Bad Response');
      }

      try {
        const container = await page.$('div.permalink-container');
        return await parseContainer(page, container, this.options);
      } catch (err) {
        console.log('Container Parse Failed.', err);
        return undefined;
      }
    } catch (err) {
      console.log(err);
      return undefined;
    }
  }

  /**
   * Scrape a timeline page.
   * @param {string} url
   * @returns {Promise<Object|undefined>} { tweets, screenshot } where tweets
   *          is the parseTimeline result (undefined if parsing failed) and
   *          screenshot a base64 data URI or null; undefined when navigation
   *          failed (the error is logged)
   */
  async getTimeline(url) {
    const page = await this.getPage();

    try {
      const response = await page.goto(url);
      if (!response.ok()) {
        throw new Error('Bad Response');
      }

      // The element handle is kept separately from the parsed tweets because
      // the screenshot has to be taken of the element, not of the tweet list.
      let timeline = null;
      let tweets;
      try {
        timeline = await page.$('div#timeline');
        tweets = await parseTimeline(page, timeline, this.options);
      } catch (err) {
        console.log('Timeline Parse Failed.', err);
      }

      let screenshot = null;
      if (this.options.doScreenshot && timeline) {
        const imageBuffer = await timeline.screenshot(Object.assign({}, {
          type: "png",
        }, this.options.screenshot || null));

        if (imageBuffer) {
          const contentType = this.options.ss_content_type || 'image/png';
          screenshot = 'data:' + contentType + ';base64,' + imageBuffer.toString('base64');
        }
      }

      return { tweets, screenshot };
    } catch (err) {
      console.log(err);
      return undefined;
    }
  }

  /**
   * Scrape a search result page.
   * @param {string} url
   * @returns {Promise<Array|undefined>} the parseTimeline result, or
   *          undefined when navigation or parsing failed (the error is logged)
   */
  async getSearch(url) {
    const page = await this.getPage();

    try {
      const response = await page.goto(url);
      if (!response.ok()) {
        throw new Error('Bad Response');
      }

      try {
        const container = await page.$('div#timeline');
        return await parseTimeline(page, container, this.options);
      } catch (err) {
        console.log('Timeline Parse Failed.', err);
        return undefined;
      }
    } catch (err) {
      console.log(err);
      return undefined;
    }
  }
}

// The ScrapeTweet class is the module's only export (CommonJS).
module.exports = ScrapeTweet;
Loading

0 comments on commit bb36479

Please sign in to comment.