Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Output args, refactoring #102

Merged
merged 4 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 202 additions & 0 deletions __tests__/pptr-utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
import { dedupLinks, getSocialLinks } from '../src/pptr-utils/get-links';
import { SOCIAL_URLS } from '../src/pptr-utils/default';
import { LinkObject } from '../src/types';

describe('get-links', () => {
describe('dedupLinks', () => {
  // Fixture factories: every call yields a brand-new object, so duplicated
  // entries are equal in content but never the same reference.
  const helloLink = (): LinkObject => ({
    href: "www.url1.com",
    innerHtml: "html",
    innerText: "Hello, world"
  });
  const sheepLink = (): LinkObject => ({
    href: "www.url2.com",
    innerHtml: "html2",
    innerText: "bah bah black sheep"
  });

  it('removes duplicates from an array of links', () => {
    // Two structurally identical entries followed by one distinct entry.
    const input: LinkObject[] = [helloLink(), helloLink(), sheepLink()];
    const deduped = dedupLinks(input);
    expect(deduped.length).toBe(2);
    expect(deduped).toStrictEqual([helloLink(), sheepLink()]);
  });

  it('returns original array of links if no duplicates', () => {
    const input: LinkObject[] = [helloLink(), sheepLink()];
    const deduped = dedupLinks(input);
    expect(deduped).toHaveLength(2);
    expect(deduped).toStrictEqual(input);
  });
});

describe('getSocialLinks', () => {
  // Builds a fresh LinkObject on every call so inputs and expectations never share references.
  const makeLink = (href: string, innerHtml: string, innerText: string): LinkObject => ({
    href,
    innerHtml,
    innerText
  });

  it('filters out only social links from a list of LinkObjects', () => {
    const input: LinkObject[] = [
      makeLink('url1.com', 'html', 'hello world!'),
      makeLink('www.facebook.com', 'facebook', 'fb'),
      makeLink('www.x.com', 'x', 'x')
    ];
    const social = getSocialLinks(input);
    expect(social).toHaveLength(2);
    expect(social).toStrictEqual([
      makeLink('www.facebook.com', 'facebook', 'fb'),
      makeLink('www.x.com', 'x', 'x')
    ]);
  });

  it('doesn\'t recognize urls ending in a social url', () => {
    // "fix.com" ends with the text "x.com" but must not be reported as social.
    const input: LinkObject[] = [
      makeLink('www.x.com', 'x', 'x'),
      makeLink('x.com', 'x', 'x'),
      makeLink('www.fix.com', 'fix', 'fix'),
      makeLink('fix.com', 'fix', 'fix')
    ];
    const social = getSocialLinks(input);
    expect(social).toHaveLength(2);
    expect(social).toStrictEqual([
      makeLink('www.x.com', 'x', 'x'),
      makeLink('x.com', 'x', 'x')
    ]);
  });

  it('returns original array if it contains only social links', () => {
    const input: LinkObject[] = [
      makeLink('www.evernote.com', 'evernote', 'evernote'),
      makeLink('www.tiktok.com', 'tiktok', 'tiktok')
    ];
    const social = getSocialLinks(input);
    expect(social).toHaveLength(input.length);
    expect(social).toStrictEqual(input);
  });

  it('recognizes every social link', () => {
    // One "www."-prefixed link per configured social host.
    const input: LinkObject[] = SOCIAL_URLS.map(url => ({
      href: `www.${url}`,
      innerText: 'text',
      innerHtml: 'html'
    }));
    const social = getSocialLinks(input);
    expect(social).toHaveLength(input.length);
    expect(social).toStrictEqual(input);
  });

  it('recognizes different versions of a social link', () => {
    // Bare host, www, both schemes, paths and a subdomain of the same site.
    const hrefVariants = [
      'snapchat.com',
      'www.snapchat.com',
      'http://snapchat.com',
      'https://snapchat.com',
      'http://www.snapchat.com',
      'snapchat.com/page',
      'www.snapchat.com/page',
      'subdomain.snapchat.com/page'
    ];
    const input: LinkObject[] = hrefVariants.map(href => makeLink(href, 'snapchat', 'snapchat'));
    const social = getSocialLinks(input);
    expect(social).toHaveLength(input.length);
    expect(social).toStrictEqual(input);
  });
});
});
12 changes: 10 additions & 2 deletions src/collector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,26 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
const logger = getLogger({ outDir: args.outDir, quiet: args.quiet });

const output: any = {
args: args.title,
title: args.title,
uri_ins: inUrl,
uri_dest: null,
uri_redirects: null,
secure_connection: {},
host: new URL(inUrl).hostname,
config: {
emulateDevice: args.emulateDevice,
cleareCache: args.clearCache,
captureHar: args.captureHar,
captureLinks: args.captureLinks,
enableAdBlock: args.enableAdBlock,
numPages: args.numPages
saveBrowserProfile: args.saveBrowserProfile,
numPages: args.numPages,
defaultTimeout: args.defaultTimeout,
defaultWaitUntil: args.defaultWaitUntil,
headless: args.headless,
headers: args.headers,
extraChromiumArgs: args.extraChromiumArgs,
extraPuppeteerOptions: args.extraPuppeteerOptions,
},
browser: null,
script: {
Expand Down
33 changes: 33 additions & 0 deletions src/pptr-utils/default.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,36 @@ export const defaultPuppeteerBrowserOptions = {
defaultViewport: null,
headless: true
};

/**
 * Host names (without scheme or "www." prefix) that are treated as social links.
 * Entry order is kept stable; the list is joined into a single pattern by its consumer.
 */
export const SOCIAL_URLS = [
  'facebook.com', 'linkedin.com', 'twitter.com', 'youtube.com', 'instagram.com',
  'flickr.com', 'tumblr.com', 'snapchat.com', 'whatsapp.com', 'docs.google.com',
  'goo.gl', 'pinterest.com', 'bit.ly', 'evernote.com', 'eventbrite.com',
  'dropbox.com', 'slideshare.net', 'vimeo.com', 'x.com', 'bsky.app',
  'tiktok.com', 'mastodon.social', 'threads.net', 'wechat.com', 'messenger.com',
  'telegram.org', 'douyin.com', 'kuaishou.com', 'weibo.com', 'im.qq.com',
];
43 changes: 9 additions & 34 deletions src/pptr-utils/get-links.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { LinkObject } from '../types';
import { hasOwnProperty } from '../utils';
import { SOCIAL_URLS } from './default';

export const getLinks = async (page): Promise<LinkObject[]> => {
return page.evaluate(() => {
Expand All @@ -25,44 +26,18 @@ export const getLinks = async (page): Promise<LinkObject[]> => {
});
};

/**
 * Removes duplicate links, keeping the first LinkObject seen for each distinct `href`.
 *
 * Entries that are falsy, or that fail the `hasOwnProperty(entry, 'href')` check, are
 * dropped. The result preserves the order in which each href first appears in the input.
 *
 * @param links_with_duplicates - links to dedupe; may contain repeated hrefs
 * @returns one LinkObject per unique href, first occurrence wins
 */
export const dedupLinks = (links_with_duplicates: LinkObject[]): LinkObject[] => {
  // A Map iterates in insertion order, so first occurrences keep their relative positions
  // and each href is resolved in a single pass instead of a lookup per href.
  const firstByHref = new Map<string, LinkObject>();

  for (const link of links_with_duplicates) {
    // Validate before reading link.href: the input may contain null/undefined entries.
    if (!link || !hasOwnProperty(link, 'href')) {
      continue;
    }
    if (!firstByHref.has(link.href)) {
      firstByHref.set(link.href, link);
    }
  }

  return Array.from(firstByHref.values());
};

export const getSocialLinks = (links: LinkObject[]): LinkObject[] => {
const spRegex = new RegExp(`\\b(${SOCIAL_URLS.join('|')})\\b`, 'i');
console.log(spRegex);
BatMiles marked this conversation as resolved.
Show resolved Hide resolved
return links.filter(link => {
return link.href.match(spRegex);
});
Expand Down
Loading