Commit

Merge pull request #4155 from omnivore-app/fingerprint
inject fingerprint
sywhb authored Jul 11, 2024
2 parents 57d546b + 8e5439f commit 3d791a7
Showing 12 changed files with 216 additions and 71 deletions.
1 change: 1 addition & 0 deletions packages/api/package.json
@@ -68,6 +68,7 @@
"express-prom-bundle": "^7.0.0",
"express-rate-limit": "^6.3.0",
"fast-safe-stringify": "^2.1.1",
"fingerprint-generator": "^2.1.52",
"firebase-admin": "^11.5.0",
"googleapis": "^125.0.0",
"graphql": "^15.3.0",
8 changes: 2 additions & 6 deletions packages/api/src/jobs/rss/refreshFeed.ts
@@ -18,6 +18,7 @@ import createHttpTaskWithToken from '../../utils/createTask'
import { cleanUrl } from '../../utils/helpers'
import { createThumbnailProxyUrl } from '../../utils/imageproxy'
import { logger } from '../../utils/logger'
import { rssParserConfig } from '../../utils/parser'
import { RSSRefreshContext } from './refreshAllFeeds'

type FolderType = 'following' | 'inbox'
@@ -181,15 +182,10 @@ const getThumbnail = (item: RssFeedItem) => {
export const fetchAndChecksum = async (url: string) => {
try {
const response = await axios.get(url, {
...rssParserConfig(),
responseType: 'arraybuffer',
timeout: 60_000,
maxRedirects: 10,
headers: {
'User-Agent':
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
Accept:
'application/rss+xml, application/rdf+xml;q=0.8, application/atom+xml;q=0.6, application/xml;q=0.4, text/xml, text/html;q=0.4',
},
})

const hash = crypto.createHash('sha256')
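
A note on the spread above: object literals keep the last write to a duplicate key, so the explicit `timeout: 60_000` at this call site overrides the 20-second timeout that `rssParserConfig()` supplies, while its headers pass through untouched. A minimal sketch of that precedence, with a simplified stand-in for the config's return value:

```typescript
// Stand-in for rssParserConfig()'s return shape (simplified assumption)
const base = { timeout: 20_000, headers: { accept: 'application/rss+xml' } }

const merged = {
  ...base,
  responseType: 'arraybuffer',
  timeout: 60_000, // later key wins, so the base 20s timeout is replaced
  maxRedirects: 10,
}

console.log(merged.timeout) // 60000
console.log(merged.headers.accept) // 'application/rss+xml' -- untouched keys survive
```
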
4 changes: 2 additions & 2 deletions packages/api/src/resolvers/discover_feeds/add.ts
@@ -14,7 +14,7 @@ import {
MutationAddDiscoverFeedArgs,
} from '../../generated/graphql'
import { authorized } from '../../utils/gql-utils'
import { RSS_PARSER_CONFIG } from '../../utils/parser'
import { rssParserConfig } from '../../utils/parser'

const parser = new XMLParser({
ignoreAttributes: false,
@@ -93,7 +93,7 @@ const addNewSubscription = async (
userId: string
): Promise<AddDiscoverFeedSuccess | AddDiscoverFeedError> => {
// First things first, we need to validate that this is an actual RSS or ATOM feed.
const response = await axios.get(url, RSS_PARSER_CONFIG)
const response = await axios.get(url, rssParserConfig())
const content = response.data

const contentType = response.headers['content-type']
4 changes: 2 additions & 2 deletions packages/api/src/resolvers/subscriptions/index.ts
@@ -52,7 +52,7 @@ import { analytics } from '../../utils/analytics'
import { enqueueRssFeedFetch } from '../../utils/createTask'
import { authorized } from '../../utils/gql-utils'
import { getAbsoluteUrl, keysToCamelCase } from '../../utils/helpers'
import { parseFeed, parseOpml, RSS_PARSER_CONFIG } from '../../utils/parser'
import { parseFeed, parseOpml, rssParserConfig } from '../../utils/parser'

type PartialSubscription = Omit<Subscription, 'newsletterEmail'>

@@ -442,7 +442,7 @@ export const scanFeedsResolver = authorized<

try {
// fetch page content and parse feeds
const response = await axios.get(url, RSS_PARSER_CONFIG)
const response = await axios.get(url, rssParserConfig())
const content = response.data as string
// check if the content is html or xml
const contentType = response.headers['Content-Type']
5 changes: 5 additions & 0 deletions packages/api/src/utils/helpers.ts
@@ -2,6 +2,7 @@
import languages from '@cospired/i18n-iso-languages'
import { countWords } from 'alfaaz'
import crypto from 'crypto'
import { FingerprintGenerator } from 'fingerprint-generator'
import Redis from 'ioredis'
import { parseHTML } from 'linkedom'
import normalizeUrl from 'normalize-url'
@@ -286,3 +287,7 @@ export const getClientFromUserAgent = (userAgent: string): string => {

export const lanaugeToCode = (language: string): string =>
languages.getAlpha2Code(language, 'en') || 'en'

export const generateFingerprint = () => {
return new FingerprintGenerator().getFingerprint()
}
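
fingerprint-generator produces a randomized but internally consistent browser identity, and `getFingerprint()` returns it alongside a matching set of lower-cased HTTP headers; the parser change below only consumes the user-agent. A minimal sketch of the helper in use, assuming the package's default options:

```typescript
import { generateFingerprint } from './helpers'

// getFingerprint() bundles the generated identity with request headers
// that match it; rssParserConfig() below reads headers['user-agent'].
const result = generateFingerprint()
console.log(result.headers['user-agent'])
```
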
23 changes: 13 additions & 10 deletions packages/api/src/utils/parser.ts
@@ -26,6 +26,7 @@ import { env } from '../env'
import { PageType, PreparedDocumentInput } from '../generated/graphql'
import { userRepository } from '../repository/user'
import { ArticleFormat } from '../resolvers/article'
import { generateFingerprint } from './helpers'
import {
EmbeddedHighlightData,
findEmbeddedHighlight,
@@ -77,15 +78,17 @@ const DOM_PURIFY_CONFIG = {
const ARTICLE_PREFIX = 'omnivore:'

export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='
export const RSS_PARSER_CONFIG = {
timeout: 20000, // 20 seconds
headers: {
// some rss feeds require user agent
'User-Agent':
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
Accept:
'application/rss+xml, application/rdf+xml;q=0.8, application/atom+xml;q=0.6, application/xml;q=0.4, text/xml;q=0.4, text/html;q=0.2',
},
export const rssParserConfig = () => {
const fingerprint = generateFingerprint()

return {
headers: {
'user-agent': fingerprint.headers['user-agent'],
accept:
'application/rss+xml, application/rdf+xml;q=0.8, application/atom+xml;q=0.6, application/xml;q=0.4, text/xml;q=0.4, text/html;q=0.2',
},
timeout: 20000, // 20 seconds
}
}

/** Hook that prevents DOMPurify from removing youtube iframes */
@@ -836,7 +839,7 @@ export const parseFeed = async (
}
}

const parser = new Parser(RSS_PARSER_CONFIG)
const parser = new Parser(rssParserConfig())

const feed = content
? await parser.parseString(content)
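
The shape of this change is the point: `RSS_PARSER_CONFIG` was a module-level constant that pinned one hard-coded Chrome user-agent for the life of the process, while `rssParserConfig()` is a factory that draws a fresh fingerprint on every call. A small sketch of the observable difference (import path assumed):

```typescript
import { rssParserConfig } from './parser'

// Each call builds a new fingerprint, so consecutive feed fetches
// present different (but internally consistent) browser signatures.
const first = rssParserConfig().headers['user-agent']
const second = rssParserConfig().headers['user-agent']
console.log(first === second) // almost always false -- generation is randomized
```
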
2 changes: 1 addition & 1 deletion packages/content-fetch/Dockerfile
@@ -1,7 +1,7 @@
FROM node:18.16
LABEL org.opencontainers.image.source="https://github.com/omnivore-app/omnivore"

# Installs latest Chromium (92) package.
# Installs latest Chromium package.
RUN apt-get update && apt-get install -y \
chromium \
ca-certificates \
7 changes: 6 additions & 1 deletion packages/content-fetch/src/request_handler.ts
@@ -62,6 +62,11 @@ const storage = process.env.GCS_UPLOAD_SA_KEY_FILE_PATH
: new Storage()
const bucketName = process.env.GCS_UPLOAD_BUCKET || 'omnivore-files'

const NO_CACHE_URLS = [
'https://deviceandbrowserinfo.com/are_you_a_bot',
'https://deviceandbrowserinfo.com/info_device',
]

const uploadToBucket = async (filePath: string, data: string) => {
await storage
.bucket(bucketName)
@@ -198,7 +203,7 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
fetchResult = await fetchContent(url, locale, timezone)
console.log('content has been fetched')

if (fetchResult.content) {
if (fetchResult.content && !NO_CACHE_URLS.includes(url)) {
const cacheResult = await cacheFetchResult(
redisDataSource,
key,
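
Both `NO_CACHE_URLS` entries look like bot-detection probe pages, presumably fetched to check how the fetcher is being classified, so serving them from cache would mask changes in that classification. Note that `Array.prototype.includes` compares by strict equality; a sketch of what the guard does and does not match:

```typescript
const NO_CACHE_URLS = [
  'https://deviceandbrowserinfo.com/are_you_a_bot',
  'https://deviceandbrowserinfo.com/info_device',
]

// Only byte-identical URLs bypass the cache...
console.log(NO_CACHE_URLS.includes('https://deviceandbrowserinfo.com/are_you_a_bot')) // true
// ...so a trailing slash or query string would still be cached.
console.log(NO_CACHE_URLS.includes('https://deviceandbrowserinfo.com/are_you_a_bot/')) // false
```
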
8 changes: 4 additions & 4 deletions packages/puppeteer-parse/package.json
@@ -14,10 +14,10 @@
"crypto": "^1.0.1",
"dompurify": "^2.4.1",
"linkedom": "^0.14.9",
"puppeteer-core": "^22.8.0",
"puppeteer-extra": "^3.3.4",
"puppeteer-extra-plugin-adblocker": "^2.13.5",
"puppeteer-extra-plugin-stealth": "^2.11.1",
"puppeteer-core": "^22.12.1",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-adblocker": "^2.13.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"urlsafe-base64": "^1.0.0"
},
"devDependencies": {
16 changes: 8 additions & 8 deletions packages/puppeteer-parse/src/browser.ts
@@ -1,4 +1,4 @@
import { Browser } from 'puppeteer-core'
import { Browser, Target } from 'puppeteer-core'
import puppeteer from 'puppeteer-extra'
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'
import StealthPlugin from 'puppeteer-extra-plugin-stealth'
@@ -26,26 +26,22 @@ export const getBrowser = async (): Promise<Browser> => {
'--autoplay-policy=user-gesture-required',
'--disable-component-update',
'--disable-domain-reliability',
'--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
'--disable-print-preview',
'--disable-setuid-sandbox',
'--disable-site-isolation-trials',
'--disable-speech-api',
'--disk-cache-size=33554432',
'--enable-features=SharedArrayBuffer',
'--hide-scrollbars',
'--disable-gpu',
'--mute-audio',
'--no-default-browser-check',
'--no-pings',
'--no-sandbox',
'--no-zygote',
'--window-size=1920,1080',
'--disable-extensions',
'--disable-dev-shm-usage',
'--no-first-run',
'--disable-background-networking',
'--use-gl=swiftshader',
'--disable-gpu',
'--disable-software-rasterizer',
],
defaultViewport: {
deviceScaleFactor: 1,
@@ -56,9 +52,13 @@ export const getBrowser = async (): Promise<Browser> => {
width: 1920,
},
executablePath: process.env.CHROMIUM_PATH,
headless: !!process.env.LAUNCH_HEADLESS,
// run in shell mode if headless
headless: process.env.LAUNCH_HEADLESS === 'true' ? 'shell' : false,
timeout: 10_000, // 10 seconds
dumpio: true, // show console logs in the terminal
// filter out targets
targetFilter: (target: Target) =>
target.type() !== 'other' || !!target.url(),
})) as Browser

const version = await browserInstance.version()
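
Two of the launch-option changes deserve a gloss. In Puppeteer 22, `headless: 'shell'` selects the legacy chrome-headless-shell implementation rather than the new headless mode, and `targetFilter` lets the launcher skip targets before anything attaches to them. A standalone sketch of just those options, with the surrounding plumbing omitted:

```typescript
import { Browser, Target } from 'puppeteer-core'
import puppeteer from 'puppeteer-extra'

const launch = async (): Promise<Browser> =>
  (await puppeteer.launch({
    executablePath: process.env.CHROMIUM_PATH,
    // 'shell' runs the legacy headless binary; anything else here
    // launches a headful browser
    headless: process.env.LAUNCH_HEADLESS === 'true' ? 'shell' : false,
    // keep a target if it is not of type 'other', or if it at least has
    // a URL -- anonymous 'other' targets are never attached to
    targetFilter: (target: Target) => target.type() !== 'other' || !!target.url(),
  })) as Browser
```
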
46 changes: 12 additions & 34 deletions packages/puppeteer-parse/src/index.ts
@@ -7,11 +7,6 @@ import path from 'path'
import { BrowserContext, Page, Protocol } from 'puppeteer-core'
import { getBrowser } from './browser'

const DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com', 'fortelabs.com']

const ALLOWED_CONTENT_TYPES = [
@@ -20,21 +15,6 @@
'text/plain',
'application/pdf',
]
const REQUEST_TIMEOUT = 30000

const userAgentForUrl = (url: string) => {
try {
const u = new URL(url)
for (const host of NON_BOT_HOSTS) {
if (u.hostname.endsWith(host)) {
return NON_BOT_DESKTOP_USER_AGENT
}
}
} catch (e) {
console.log('error getting user agent for url', url, e)
}
return DESKTOP_USER_AGENT
}

const fetchContentWithScrapingBee = async (url: string) => {
const response = await axios.get('https://app.scrapingbee.com/api/v1', {
@@ -45,7 +25,7 @@
premium_proxy: 'true',
country_code: 'us',
},
timeout: REQUEST_TIMEOUT,
timeout: 10_000,
})

const dom = parseHTML(response.data).document
@@ -80,11 +60,11 @@ export const fetchContent = async (
}
console.log(`content-fetch request`, logRecord)

let context: BrowserContext | undefined,
page: Page | undefined,
let page: Page | undefined,
title: string | undefined,
content: string | undefined,
contentType: string | undefined
contentType: string | undefined,
context: BrowserContext | undefined

try {
url = getUrl(url)
@@ -168,11 +148,11 @@

throw e
} finally {
// close browser context if it was opened
// close browser context if it was created
if (context) {
console.info('closing context...', url)
console.info('closing page...', url)
await context.close()
console.info('context closed', url)
console.info('page closed', url)
}

console.info(`content-fetch result`, logRecord)
@@ -241,20 +221,17 @@
}

const browser = await getBrowser()
// create a new incognito browser context
const context = await browser.createBrowserContext()

// Puppeteer fails during download of PDf files,
// so record the failure and use those items
let lastPdfUrl
let page
try {
page = await context.newPage()
const page = await context.newPage()

if (!enableJavascriptForUrl(url)) {
await page.setJavaScriptEnabled(false)
}
await page.setUserAgent(userAgentForUrl(url))

// set locale for the page
if (locale) {
@@ -359,7 +336,7 @@

const response = await page.goto(url, {
timeout: 30 * 1000,
waitUntil: ['networkidle2'],
waitUntil: ['networkidle0'],
})
if (!response) {
throw new Error('No response from page')
@@ -371,12 +348,11 @@
logRecord.finalUrl = finalUrl
logRecord.contentType = contentType

return { context, page, finalUrl, contentType }
return { page, finalUrl, contentType, context }
} catch (error) {
if (lastPdfUrl) {
return {
context,
page,
finalUrl: lastPdfUrl,
contentType: 'application/pdf',
}
@@ -392,6 +368,8 @@ async function retrieveHtml(page: Page, logRecord: Record<string, any>) {
title = await page.title()
logRecord.title = title

await page.waitForSelector('body')

const pageScrollingStart = Date.now()
/* scroll with a 5 seconds timeout */
try {
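
Two waits changed in this file: navigation now settles on `networkidle0` (no network connections for at least 500 ms) instead of `networkidle2` (which tolerates up to two in-flight requests), and `retrieveHtml` now blocks on the body element before reading the page. A sketch combining both, where the 5-second selector timeout is an assumption:

```typescript
import { Page } from 'puppeteer-core'

const settleAndRead = async (page: Page, url: string) => {
  // networkidle0 is the stricter settle condition: zero network
  // connections for 500 ms, versus networkidle2's allowance of two
  await page.goto(url, { timeout: 30 * 1000, waitUntil: ['networkidle0'] })

  // don't serialize until the body element has actually attached
  await page.waitForSelector('body', { timeout: 5_000 }) // timeout is an assumption

  return page.content()
}
```
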