Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ga): sync ga data lambda #93

Merged
merged 16 commits into from
Nov 20, 2023
Merged
5 changes: 2 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ jobs:
MATTERS_PG_PASSWORD: postgres
MATTERS_PG_DATABASE: matters-test
MATTERS_NEW_FEATURE_TAG_ID: 1
MATTERS_PG_RO_CONNECTION_STRING: postgresql://postgres:postgres@localhost/test_matters-test
MATTERS_PG_RO_CONNECTION_STRING: postgresql://postgres:postgres@localhost/matters-test
MATTERS_CACHE_HOST: localhost


MATTERS_TEST_DB_SETUP: 1
13 changes: 13 additions & 0 deletions bin/sync-ga4-data.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import fs from "fs";
import { fetchGA4Data, convertAndMerge, saveGA4Data } from "../lib/ga4.js";

/**
 * One-off backfill script: fetch GA4 page data for a fixed date range,
 * snapshot the raw rows to ./data.json, then convert, merge and persist
 * the per-article totals.
 */
const main = async () => {
  const startDate = "2021-10-15";
  const endDate = "2023-10-19";
  const data = await fetchGA4Data({ startDate, endDate });
  // keep a raw snapshot on disk so a failed run can be inspected/replayed
  // without re-fetching from the GA4 API
  fs.writeFileSync("./data.json", JSON.stringify(data));
  const convertedData = await convertAndMerge(data);
  await saveGA4Data(convertedData, { startDate, endDate });
};

// surface failures with a non-zero exit code instead of an
// unhandled promise rejection
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
8 changes: 6 additions & 2 deletions handlers/sync-audit-log-to-bigquery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,12 @@ const processAndDumpLocal = async (
action: data.action ? data.action.slice(0, 50) : undefined,
entity: data.entity ? data.entity.slice(0, 50) : undefined,
entity_id: data.entityId,
old_value: data.oldValue ? String(data.oldValue).slice(0, 255) : undefined,
new_value: data.newValue ? String(data.newValue).slice(0, 255) : undefined,
old_value: data.oldValue
? String(data.oldValue).slice(0, 255)
: undefined,
new_value: data.newValue
? String(data.newValue).slice(0, 255)
: undefined,
status: data.status.slice(0, 10),
request_id: requestId ? requestId.slice(0, 36) : undefined,
ip: data.ip ? data.ip.slice(0, 45) : undefined,
Expand Down
37 changes: 37 additions & 0 deletions handlers/sync-ga4-data.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import {
fetchGA4Data,
convertAndMerge,
saveGA4Data,
getLocalDateString,
} from "../lib/ga4.js";

// envs
// MATTERS_GA4_PROPERTY_ID;
// MATTERS_GA4_PROJECT_ID;
// MATTERS_GA4_CLIENT_EMAIL;
// MATTERS_GA4_PRIVATE_KEY;

// AWS EventBridge can configure the input event sent to Lambda,
// see https://docs.aws.amazon.com/eventbridge/latest/userguide/eb-transform-target-input.html for info.
type Event = {
data: {
type: "today" | "yesterday";
};
};

// Resolve the report date (as a UTC+8 wall-clock date string) for the
// requested event type: either today or the day before.
const getDate = (type: "today" | "yesterday") => {
  const daysBack = type === "yesterday" ? 1 : 0;
  const date = new Date();
  date.setDate(date.getDate() - daysBack);
  return getLocalDateString(date);
};

export const handler = async (event: Event) => {
const { type } = event.data;
const startDate = getDate(type);
const endDate = startDate;
robertu7 marked this conversation as resolved.
Show resolved Hide resolved
const data = await fetchGA4Data({ startDate, endDate });
const convertedData = await convertAndMerge(data);
await saveGA4Data(convertedData, { startDate, endDate });
};
1 change: 0 additions & 1 deletion jest.config.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ module.exports = {
testRegex: '(/__tests__/.*|(\\.|/)(test|spec))\\.tsx?$',
testPathIgnorePatterns: ['/node_modules/', '/matters-server/'],
globalSetup: '<rootDir>/matters-server/db/globalTestSetup.js',
globalTeardown: '<rootDir>/matters-server/db/globalTestTeardown.js',
transform: {
'\\.tsx?$': ['ts-jest', {
useESM: true,
Expand Down
34 changes: 34 additions & 0 deletions lib/__test__/ga4.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import { pgKnex as knex } from "../db";
import { saveGA4Data, TABLE_NAME, getLocalDateString } from "../ga4";

test("saveGA4Data", async () => {
  const startDate = "2021-01-01";
  const endDate = "2021-01-01";
  const range = { dateRange: `[${startDate}, ${endDate}]` };

  // first call: both rows are new, so both are inserted
  await saveGA4Data({ "1": 1, "2": 2 }, { startDate, endDate });

  const afterInsert = await knex(TABLE_NAME).select("*").where(range);
  expect(afterInsert.length).toBe(2);

  // second call: article 1 changes (update), article 3 is new (insert)
  await saveGA4Data({ "1": 2, "3": 3 }, { startDate, endDate });

  const afterUpsert = await knex(TABLE_NAME).select("*").where(range);
  expect(afterUpsert.length).toBe(3);
  for (const row of afterUpsert) {
    if (row.articleId === "1") {
      expect(row.totalUsers).toBe("2");
    }
  }
});

test("getLocalDateString", async () => {
  // midnight UTC on Jan 1 is still Jan 1 in UTC+8
  expect(getLocalDateString(new Date("2021-01-01"))).toBe("2021-01-01");
});
4 changes: 1 addition & 3 deletions lib/db.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@ import { getKnexClient, getPostgresJsClient } from "./utils/db.js";
const CLOUDFLARE_IMAGE_ENDPOINT = process.env.CLOUDFLARE_IMAGE_ENDPOINT || "";
const MATTERS_AWS_S3_ENDPOINT = process.env.MATTERS_AWS_S3_ENDPOINT || "";

const isTest = process.env.MATTERS_ENV === "test";
const dbHost = process.env.MATTERS_PG_HOST || "";
const dbUser = process.env.MATTERS_PG_USER || "";
const dbPasswd = process.env.MATTERS_PG_PASSWORD || "";
const _dbName = process.env.MATTERS_PG_DATABASE || "";
const dbName = isTest ? "test_" + _dbName : _dbName;
const dbName = process.env.MATTERS_PG_DATABASE || "";

const databaseURL =
process.env.PG_CONNECTION_STRING ||
Expand Down
193 changes: 193 additions & 0 deletions lib/ga4.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import { BetaAnalyticsDataClient } from "@google-analytics/data";

import { pgKnexRO as knexRO, pgKnex as knex } from "./db.js";

const propertyId = process.env.MATTERS_GA4_PROPERTY_ID;
const projectId = process.env.MATTERS_GA4_PROJECT_ID;
const clientEmail = process.env.MATTERS_GA4_CLIENT_EMAIL;
const privateKey = process.env.MATTERS_GA4_PRIVATE_KEY || "";

export const TABLE_NAME = "article_ga4_data";

interface Row {
path: string;
totalUsers: string;
}

interface MergedData {
[key: string]: number;
}

export const getLocalDateString = (date: Date) => {
// return utc+8 date string in YYYY-MM-DD format
return date.toLocaleDateString("sv", { timeZone: "Asia/Taipei" });
Copy link
Contributor

@tx0c tx0c Nov 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this sv using a Swedish locale? maybe toISOString is better? does here require a UTC+8 timezone cutoff of each day?

> (new Date).toISOString().substring(0, 10)
'2023-11-14'

Copy link
Contributor Author

@gary02 gary02 Nov 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does here require a UTC+8 timezone cutoff of each day?

Yes, as GA4 timezone settings is UTC+8

};

/**
 * Pull all matching report rows from the GA4 Data API for the given date
 * range, paging through results 10k rows at a time until a short page
 * signals the end of the report.
 */
export const fetchGA4Data = async ({
  startDate,
  endDate,
}: {
  startDate: string;
  endDate: string;
}): Promise<Row[]> => {
  const analyticsDataClient = new BetaAnalyticsDataClient({
    projectId,
    credentials: {
      client_email: clientEmail,
      // env vars carry the key with escaped newlines; restore real ones
      private_key: privateKey.replace(/\\n/g, "\n"),
    },
  });
  const pageSize = 10000;
  const rows: Row[] = [];
  let offset = 0;
  let page: Row[];
  do {
    page = await request(
      { startDate, endDate, limit: pageSize, offset },
      analyticsDataClient
    );
    rows.push(...page);
    offset += pageSize;
  } while (page.length === pageSize);
  return rows;
};

export const saveGA4Data = async (
data: MergedData,
{ startDate, endDate }: { startDate: string; endDate: string }
) => {
const rows = Object.entries(data).map(([id, totalUsers]) => ({
articleId: id,
totalUsers,
dateRange: `[${startDate}, ${endDate}]`,
}));
const updateRows = [];
const insertRows = [];
for (const { articleId, dateRange, totalUsers } of rows) {
const res = await knexRO(TABLE_NAME)
.where({ articleId, dateRange })
.select("id", "totalUsers")
.first();
if (res && res.totalUsers) {
if (res.totalUsers !== String(totalUsers)) {
// only update when totalUsers is different
updateRows.push({ id: res.id, totalUsers });
}
} else {
insertRows.push({ articleId, dateRange, totalUsers });
}
}
if (updateRows.length > 0) {
for (const { id, totalUsers } of updateRows) {
await knex(TABLE_NAME).update({ totalUsers }).where({ id: id });
}
}
if (insertRows.length > 0) {
await knex(TABLE_NAME).insert(insertRows);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and do you have estimation of how many rows will be there each day? what's growth estimation of this table per month;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and there's a way to upsert might be useful in the case, one line to handle all rows:

INSERT INTO table (...) VALUES (...)
ON CONFLICT (id)
DO UPDATE SET total_users=EXCLUDED.total_users ;

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

upsert way doesn't work here, because table article_ga4_data has an exclusion constraint which ON CONFLICT syntax doesn't support

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you have estimation of how many rows will be there each day? what's growth estimation of this table per month;

roughly 6k rows a day

}
};

/**
 * Convert raw GA4 rows (page path + totalUsers) into a map of
 * articleId -> summed totalUsers.
 *
 * Rows whose path cannot be resolved to an article id are skipped.
 * Previously such rows were merged under the literal string keys
 * "undefined"/"null" (pathToId can resolve to either), sending bogus
 * article ids downstream to saveGA4Data.
 */
export const convertAndMerge = async (rows: Row[]): Promise<MergedData> => {
  const converted = await Promise.all(
    rows.map(async (row) => ({
      id: await pathToId(row.path),
      totalUsers: parseInt(row.totalUsers, 10),
    }))
  );
  const res: MergedData = {};
  for (const { id, totalUsers } of converted) {
    if (id == null) {
      continue; // path did not map to a known article
    }
    res[id] = (res[id] ?? 0) + totalUsers;
  }
  return res;
};

/**
 * Map a GA4 page path like "/@user/123-some-slug-<mediaHash>" to an
 * article id.
 *
 * If the slug segment starts with a numeric id, use it directly;
 * otherwise fall back to a lookup by the trailing media hash.
 * Returns null when the path has no article segment (previously this
 * case returned undefined implicitly, inconsistent with hashToId).
 */
const pathToId = async (path: string) => {
  const [, , articlePath] = path.split("/");
  if (!articlePath) {
    return null;
  }
  const parts = articlePath.split("-");
  const idLike = parts[0];
  if (!isNaN(parseInt(idLike, 10))) {
    return idLike;
  }
  return hashToId(parts[parts.length - 1]);
};

// Look up an article id by its mediaHash; resolves to null when no
// matching article exists.
const hashToId = async (hash: string) => {
  const row = await knexRO("article")
    .where({ mediaHash: hash })
    .select("id")
    .first();
  return row ? row.id : null;
};

// https://developers.google.com/analytics/devguides/reporting/data/v1
const request = async (
{
startDate,
endDate,
limit,
offset,
}: {
startDate: string;
endDate: string;
limit: number;
offset: number;
},
client: BetaAnalyticsDataClient
): Promise<Row[]> => {
const [response] = await client.runReport({
property: `properties/${propertyId}`,
dateRanges: [
{
startDate,
endDate,
Comment on lines +160 to +161
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shall we need to check dates here? startDate <= endDate?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

invalid input would fail the GA4 api request

},
],
dimensions: [
{
name: "pagePath",
},
],
dimensionFilter: {
filter: {
fieldName: "pagePath",
stringFilter: {
matchType: "BEGINS_WITH",
value: "/@",
},
},
},
metrics: [
{
name: "totalUsers",
//name: 'activeUsers',
},
],
limit,
offset,
returnPropertyQuota: true,
});
if (response && response.rows) {
console.log(response.propertyQuota);
console.log(`total rows count: ${response.rowCount}`);
return response.rows.map((row) => ({
path: (row.dimensionValues && row.dimensionValues[0].value) ?? "",
totalUsers: (row.metricValues && row.metricValues[0].value) ?? "0",
}));
} else {
throw new Error("No response received.");
}
};
2 changes: 1 addition & 1 deletion matters-server
Submodule matters-server updated 191 files
Loading
Loading