
Commit

Nick:
nickscamara committed Apr 6, 2024
1 parent 9d28744 commit b6aed88
Showing 3 changed files with 15 additions and 6 deletions.
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/data-connectors",
-  "version": "0.0.48",
+  "version": "0.0.49-beta.2",
   "description": "Data connectors for LLMs. Made by Mendable.ai",
   "main": "dist/index.js",
   "module": "dist/index.mjs",
10 changes: 7 additions & 3 deletions src/providers/WebScraper/crawler.ts
@@ -14,31 +14,35 @@ export class WebCrawler {
   private maxCrawledLinks: number;
   private visited: Set<string> = new Set();
   private crawledUrls: Set<string> = new Set();
+  private limit: number;

   constructor({
     initialUrl,
     includes,
     excludes,
     maxCrawledLinks = 1000,
+    limit = 10000,
   }: {
     initialUrl: string;
     includes?: string[];
     excludes?: string[];
     maxCrawledLinks?: number;
+    limit?: number;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin; // Initialize the base URL
     this.includes = includes ?? [];
     this.excludes = excludes ?? [];
     this.maxCrawledLinks = maxCrawledLinks;
+    this.limit = limit;
   }

-  public async start(inProgress?: (progress: Progress) => void, concurrencyLimit: number = 5): Promise<string[]> {
+  public async start(inProgress?: (progress: Progress) => void, concurrencyLimit: number = 5, limit: number = 10000): Promise<string[]> {
     // Attempt to fetch and return sitemap links before any crawling
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
     if (sitemapLinks.length > 0) {
       // console.log('Sitemap found, returning sitemap links.');
-      return sitemapLinks;
+      return sitemapLinks.slice(0, limit);
     }
     // Proceed with crawling if no sitemap links found
     return await this.crawlUrls([this.initialUrl], concurrencyLimit, inProgress);
@@ -50,7 +54,7 @@ export class WebCrawler {
     inProgress?: (progress: Progress) => void
   ): Promise<string[]> {
     const queue = async.queue(async (task: string, callback) => {
-      if (this.crawledUrls.size >= this.maxCrawledLinks) {
+      if (this.crawledUrls.size >= this.maxCrawledLinks ) {
        callback();
        return;
      }
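Taken together, the crawler.ts changes thread a link cap through both exit paths of start(): sitemap results are truncated with slice(0, limit), and this.limit is stored for the crawl path. A minimal usage sketch follows (not part of the commit; the import path, URL, and numbers are illustrative assumptions):

import { WebCrawler } from "./src/providers/WebScraper/crawler"; // assumed path

async function demo(): Promise<void> {
  const crawler = new WebCrawler({
    initialUrl: "https://example.com", // placeholder URL
    maxCrawledLinks: 1000,
    limit: 500, // stored as this.limit
  });

  // start(inProgress?, concurrencyLimit = 5, limit = 10000). Note that the
  // sitemap path slices by the `limit` parameter, not this.limit, so a caller
  // wanting a cap below the 10000 default must pass it here as well.
  const links = await crawler.start(undefined, 5, 500);
  console.log(`Got ${links.length} links (at most 500 when a sitemap is found)`);
}

demo().catch(console.error);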
9 changes: 7 additions & 2 deletions src/providers/WebScraper/index.ts
@@ -14,6 +14,8 @@ export type WebScraperOptions = {
     includes?: string[];
     excludes?: string[];
     maxCrawledLinks?: number;
+    limit?: number;
+
   };
   concurrentRequests?: number;
};
@@ -24,6 +26,7 @@ export class WebScraperDataProvider implements DataProvider<WebScraperOptions> {
   private excludes: string[];
   private maxCrawledLinks: number;
   private returnOnlyUrls: boolean;
+  private limit: number = 10000;
   private concurrentRequests: number = 20;

   authorize(): void {
@@ -72,8 +75,9 @@ export class WebScraperDataProvider implements DataProvider<WebScraperOptions> {
       includes: this.includes,
       excludes: this.excludes,
       maxCrawledLinks: this.maxCrawledLinks,
+      limit: this.limit,
     });
-    const links = await crawler.start(inProgress);
+    const links = await crawler.start(inProgress,5,this.limit);
     if (this.returnOnlyUrls) {
       return links.map((url) => ({
         content: "",
@@ -91,7 +95,7 @@ export class WebScraperDataProvider implements DataProvider<WebScraperOptions> {
     if (this.mode === "sitemap") {
       const links = await getLinksFromSitemap(this.urls[0]);
       console.log(`Found ${links.length} urls in sitemap`);
-      return this.convertUrlsToDocuments(links, inProgress);
+      return this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
     }

     throw new Error("Method not implemented.");
@@ -108,5 +112,6 @@ export class WebScraperDataProvider implements DataProvider<WebScraperOptions> {
     this.excludes = options.crawlerOptions?.excludes ?? [];
     this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
     this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
+    this.limit = options.crawlerOptions?.limit ?? 10000;
   }
}
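On the provider side, the same cap now flows from crawlerOptions.limit (defaulting to 10000 in the options setter above) into both crawler.start(inProgress, 5, this.limit) and the sitemap mode's slice. A hypothetical options fragment exercising the new field (field names come from the diff; the concrete values are illustrative):

const crawlerOptions: WebScraperOptions["crawlerOptions"] = {
  includes: [],
  excludes: [],
  maxCrawledLinks: 1000, // same value the setter defaults to
  limit: 500,            // new: caps crawl results and sitemap documents
};

Storing the limit on the provider and also passing it explicitly to start() keeps the crawl path and the sitemap shortcut consistent, at the cost of a duplicated default: 10000 appears in the WebCrawler constructor, the start() signature, and the provider's setter.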
