Merge pull request #52 from adityak74/api-server-crawler
feat: create crawler api server
steve8708 authored Dec 25, 2023
2 parents c34bde5 + 7707146 commit 4ccf3b3
Showing 9 changed files with 8,931 additions and 502 deletions.
4 changes: 4 additions & 0 deletions .env.example
@@ -0,0 +1,4 @@
API_PORT=5000
API_HOST=localhost
MAX_PAGES_TO_CRAWL=45
NODE_ENV=development
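
These defaults are loaded at startup via dotenv. src/server.ts (below) reads API_PORT and API_HOST with fallbacks; a minimal sketch of the same pattern, where the MAX_PAGES_TO_CRAWL wiring is an assumption rather than something this diff shows:

import { configDotenv } from "dotenv";

configDotenv(); // copies .env values into process.env

// Fallbacks for API_PORT and API_HOST mirror src/server.ts.
const port = Number(process.env.API_PORT) || 3000;
const host = process.env.API_HOST || "localhost";
// Assumed: the crawl page cap could be read the same way.
const maxPagesToCrawl = Number(process.env.MAX_PAGES_TO_CRAWL) || 45;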
1 change: 1 addition & 0 deletions .gitignore
@@ -14,4 +14,5 @@ storage

# any output from the crawler
*.json
.env
pnpm-lock.yaml
9,306 changes: 8,810 additions & 496 deletions package-lock.json

Large diffs are not rendered by default.

17 changes: 15 additions & 2 deletions package.json
@@ -8,29 +8,42 @@
"description": "Crawl a site to generate knowledge files to create your own custom GPT",
"dependencies": {
"commander": "^11.1.0",
"cors": "^2.8.5",
"crawlee": "^3.0.0",
"dotenv": "^16.3.1",
"express": "^4.18.2",
"express-fileupload": "^1.4.3",
"cross-env": "^7.0.3",
"glob": "^10.3.10",
"gpt-tokenizer": "^2.1.2",
"inquirer": "^9.2.12",
"playwright": "*",
"zod": "^3.22.4"
"prettier": "^3.1.0",
"swagger-ui-express": "^5.0.0"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"@types/cors": "^2.8.17",
"@types/express": "^4.17.21",
"@types/express-fileupload": "^1.4.4",
"@semantic-release/changelog": "^6.0.3",
"@semantic-release/git": "^10.0.1",
"@types/inquirer": "^9.0.7",
"@types/node": "^20.0.0",
"prettier": "^3.1.0",
"semantic-release": "^22.0.8",
"ts-node": "^10.8.0",
"typescript": "^5.0.0"
"typescript": "^5.0.0",
"@types/swagger-ui-express": "^4.1.6",
"swagger-autogen": "^2.23.7",
"zod": "^3.22.4"
},
"scripts": {
"semantic-release": "semantic-release",
"preinstall": "npx playwright install",
"start": "npm run start:dev",
"start:server": "NODE_ENV=development npm run build && node dist/src/server.js",
"start:server:prod": "npm run build && node dist/src/server.js",
"start:cli": "cross-env NODE_ENV=development npm run build && node dist/src/cli.js",
"start:dev": "cross-env NODE_ENV=development npm run build && node dist/src/main.js",
"start:prod": "node dist/src/main.js",
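
With these scripts, npm run start:server builds the TypeScript sources and launches the Express API, while npm run start:cli and npm run start:dev keep the original command-line flow. Note that start:server sets NODE_ENV inline rather than through cross-env, so as written it assumes a POSIX shell.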
4 changes: 3 additions & 1 deletion src/config.ts
@@ -1,6 +1,8 @@
import { z } from "zod";

import type { Page } from "playwright";
import { configDotenv } from "dotenv";

configDotenv();

const Page: z.ZodType<Page> = z.any();

42 changes: 39 additions & 3 deletions src/core.ts
@@ -5,8 +5,10 @@ import { glob } from "glob";
import { Config, configSchema } from "./config.js";
import { Page } from "playwright";
import { isWithinTokenLimit } from "gpt-tokenizer";
import { PathLike } from "fs";

let pageCounter = 0;
let crawler: PlaywrightCrawler;

export function getPageHtml(page: Page, selector = "body") {
return page.evaluate((selector) => {
@@ -52,7 +54,7 @@ export async function crawl(config: Config) {
if (process.env.NO_CRAWL !== "true") {
// PlaywrightCrawler crawls the web using a headless
// browser controlled by the Playwright library.
const crawler = new PlaywrightCrawler({
crawler = new PlaywrightCrawler({
// Use the requestHandler to process each of the crawled pages.
async requestHandler({ request, page, enqueueLinks, log, pushData }) {
const title = await page.title();
@@ -145,6 +147,7 @@ export async function crawl(config: Config) {
}

export async function write(config: Config) {
let nextFileNameString: PathLike = "";
const jsonFiles = await glob("storage/datasets/default/*.json", {
absolute: true,
});
@@ -165,8 +168,14 @@ export async function write(config: Config) {
`${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;

const writeBatchToFile = async (): Promise<void> => {
await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
nextFileNameString = nextFileName();
await writeFile(
nextFileNameString,
JSON.stringify(currentResults, null, 2),
);
console.log(
`Wrote ${currentResults.length} items to ${nextFileNameString}`,
);
currentResults = [];
currentSize = 0;
fileCounter++;
@@ -215,4 +224,31 @@ export async function write(config: Config) {
if (currentResults.length > 0) {
await writeBatchToFile();
}

return nextFileNameString;
}

class GPTCrawlerCore {
config: Config;

constructor(config: Config) {
this.config = config;
}

async crawl() {
await crawl(this.config);
}

async write(): Promise<PathLike> {
// we need to wait for the file path as the path can change
return new Promise((resolve, reject) => {
write(this.config)
.then((outputFilePath) => {
resolve(outputFilePath);
})
.catch(reject);
});
}
}

export default GPTCrawlerCore;
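
GPTCrawlerCore wraps the existing crawl and write functions behind a single object so the API server can drive a crawl programmatically. A minimal sketch of that usage, mirroring what src/server.ts does below (the config fields shown are assumptions based on the project's config schema, not part of this diff):

import { readFile } from "fs/promises";
import { configSchema } from "./config.js";
import GPTCrawlerCore from "./core.js";

// Hypothetical config; field names assumed from configSchema, not this diff.
const config = configSchema.parse({
  url: "https://example.com",
  match: "https://example.com/**",
  maxPagesToCrawl: 45,
  outputFileName: "output.json",
});

const core = new GPTCrawlerCore(config);
await core.crawl();
const outputPath = await core.write(); // resolves with the last file written
console.log(await readFile(outputPath, "utf-8"));

Note that the Promise wrapper inside write is equivalent to returning write(this.config) directly; the indirection only signals that the output path is not known until all batches are flushed.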
44 changes: 44 additions & 0 deletions src/server.ts
@@ -0,0 +1,44 @@
import express from "express";
import cors from "cors";
import { readFile } from "fs/promises";
import { Config, configSchema } from "./config.js";
import { configDotenv } from "dotenv";
import swaggerUi from "swagger-ui-express";
// @ts-ignore
import swaggerDocument from "../swagger-output.json" assert { type: "json" };
import GPTCrawlerCore from "./core.js";
import { PathLike } from "fs";

configDotenv();

const app = express();
const port = Number(process.env.API_PORT) || 3000;
const hostname = process.env.API_HOST || "localhost";

app.use(cors());
app.use(express.json());
app.use("/api-docs", swaggerUi.serve, swaggerUi.setup(swaggerDocument));

// Define a POST route to accept config and run the crawler
app.post("/crawl", async (req, res) => {
const config: Config = req.body;
try {
const validatedConfig = configSchema.parse(config);
const crawler = new GPTCrawlerCore(validatedConfig);
await crawler.crawl();
const outputFileName: PathLike = await crawler.write();
const outputFileContent = await readFile(outputFileName, "utf-8");
res.contentType("application/json");
return res.send(outputFileContent);
} catch (error) {
return res
.status(500)
.json({ message: "Error occurred during crawling", error });
}
});

app.listen(port, hostname, () => {
console.log(`API server listening at http://${hostname}:${port}`);
});

export default app;
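
A quick way to exercise the new endpoint once the server is running. The port follows .env.example, and the config fields are assumptions based on the project's config schema, not part of this diff:

// Hypothetical client call against the new /crawl route.
const res = await fetch("http://localhost:5000/crawl", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    url: "https://example.com",
    match: "https://example.com/**",
    maxPagesToCrawl: 45,
    outputFileName: "output.json",
  }),
});
const pages = await res.json(); // the combined crawl output

Worth noting: configSchema.parse failures land in the same catch block as crawl errors, so an invalid config comes back as a 500 rather than a 400.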
14 changes: 14 additions & 0 deletions swagger.js
@@ -0,0 +1,14 @@
import swaggerAutogen from "swagger-autogen";

const doc = {
info: {
title: "GPT Crawler API",
description: "GPT Crawler",
},
host: "localhost:5000",
};

const outputFile = "swagger-output.json";
const routes = ["./src/server.ts"];

swaggerAutogen()(outputFile, routes, doc);
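
Because src/server.ts imports ../swagger-output.json with an import assertion, this generator has to run before the server is built and started, or that import fails at load time. Running node swagger.js (assuming the package is set up for ES modules) regenerates the spec that the /api-docs route serves, and the hard-coded host, localhost:5000, matches the defaults in .env.example.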
1 change: 1 addition & 0 deletions tsconfig.json
@@ -4,6 +4,7 @@
"module": "ES2022",
"target": "ES2022",
"outDir": "dist",
"moduleResolution": "node",
"resolveJsonModule": true,
"noUnusedLocals": false,
"skipLibCheck": true,
