From 628eacd8e8fdee23e520627fa8cce77f1bb1b9e5 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 23 May 2023 14:59:17 -0700 Subject: [PATCH 1/7] implement aside usage for use with components not directly related to content, to be skipped by DocSearch crawler --- src/components/BannerNotification/index.tsx | 1 + src/components/CallToContribute.tsx | 1 + src/components/Callout.tsx | 1 + src/components/CalloutBanner.tsx | 1 + src/components/Quiz/QuizWidget.tsx | 2 +- src/components/TranslationBanner.tsx | 1 + src/components/TranslationBannerLegal.tsx | 1 + src/components/UpgradeStatus.tsx | 1 + 8 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/components/BannerNotification/index.tsx b/src/components/BannerNotification/index.tsx index d8d3728f1e9..f1dcae055f0 100644 --- a/src/components/BannerNotification/index.tsx +++ b/src/components/BannerNotification/index.tsx @@ -16,6 +16,7 @@ const BannerNotification: React.FC = ({ <> {shouldShow && (
= ({ editPath }) => { return ( = ({ ...rest }) => ( = ({ quizKey, maxQuestions }) => { // Render QuizWidget component return ( - + = ({ return ( = ({ return ( = ({ return ( Date: Tue, 23 May 2023 14:59:33 -0700 Subject: [PATCH 2/7] add util helper functions for DocSearch handling --- src/utils/sanitizeHitTitle.ts | 6 ++++++ src/utils/url.ts | 8 ++++++++ 2 files changed, 14 insertions(+) create mode 100644 src/utils/sanitizeHitTitle.ts diff --git a/src/utils/sanitizeHitTitle.ts b/src/utils/sanitizeHitTitle.ts new file mode 100644 index 00000000000..04d4d61291b --- /dev/null +++ b/src/utils/sanitizeHitTitle.ts @@ -0,0 +1,6 @@ +export const sanitizeHitTitle = (value: string): string => { + const newValue = value.replaceAll(""", '"').replaceAll("&", "&") + const siteNameIndex = value.lastIndexOf(" | ") + if (siteNameIndex < 0) return newValue + return newValue.substring(0, siteNameIndex) +} diff --git a/src/utils/url.ts b/src/utils/url.ts index b7a31a1becb..106233b8b3e 100644 --- a/src/utils/url.ts +++ b/src/utils/url.ts @@ -12,3 +12,11 @@ export const isGlossary = (href: string): boolean => export const isStatic = (href: string): boolean => href.includes("static") export const isPdf = (href: string): boolean => href.includes(".pdf") + +export const sanitizeHitUrl = (url: string): string => + url + .replace(/^https?:\/\/[^\/]+(?=\/)/, "") + .replace("#gatsby-focus-wrapper", "") + .replace("#main-content", "") + .replace("#content", "") + .replace("#top", "") From 166375b5031ee19e0d111f7c7d792f0169c6a22a Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 23 May 2023 15:05:10 -0700 Subject: [PATCH 3/7] update Search/index.tsx --- src/components/Search/index.tsx | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/src/components/Search/index.tsx b/src/components/Search/index.tsx index 2677ed605d6..79ba9ab3098 100644 --- a/src/components/Search/index.tsx +++ b/src/components/Search/index.tsx @@ -16,6 +16,9 @@ import { useDocSearchKeyboardEvents } from "@docsearch/react" import { DocSearchHit } from "@docsearch/react/dist/esm/types" import SearchButton from "./SearchButton" import SearchModal from "./SearchModal" +import { sanitizeHitUrl } from "../../utils/url" +import { sanitizeHitTitle } from "../../utils/sanitizeHitTitle" + // Styles import "@docsearch/css" @@ -49,21 +52,7 @@ const Search = forwardRef<{}, "button">((_, ref) => { const appId = process.env.GATSBY_ALGOLIA_APP_ID || "" const apiKey = process.env.GATSBY_ALGOLIA_SEARCH_KEY || "" const indexName = - process.env.GATSBY_ALGOLIA_BASE_SEARCH_INDEX_NAME || "prod-ethereum-org" - - const sanitizeHitUrl = (url: string): string => - url - .replace(/^https?:\/\/[^\/]+(?=\/)/, "") - .replace("#main-content", "") - .replace("#content", "") - .replace("#top", "") - - const sanitizeHitTitle = (value: string): string => { - const newValue = value.replaceAll(""", '"') - const siteNameIndex = value.lastIndexOf(" | ") - if (siteNameIndex < 0) return newValue - return newValue.substring(0, siteNameIndex) - } + process.env.GATSBY_ALGOLIA_BASE_SEARCH_INDEX_NAME || "ethereumorg" // Check for the breakpoint with theme token const xlBp = useToken("breakpoints", "xl") @@ -89,7 +78,7 @@ const Search = forwardRef<{}, "button">((_, ref) => { /> )} - {isOpen ? ( + {isOpen && ( ((_, ref) => { }, }} /> - ) : null} + )} ) From a7632836d587fd5e796c596669b1257c8fcc4fc1 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Tue, 23 May 2023 15:13:53 -0700 Subject: [PATCH 4/7] Update site-search.md documentation --- docs/site-search.md | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/docs/site-search.md b/docs/site-search.md index d28873ffbe7..14d2a59bb9b 100644 --- a/docs/site-search.md +++ b/docs/site-search.md @@ -1,32 +1,28 @@ # Site search on ethereum.org -TL;DR: we use Algolia to implement a site search feature on ethereum.org. +TL;DR: we use Algolia to implement a site search feature on ethereum.org. As an open source project, Algolia has sponsored the crawling and indexing of the entire site. -## What do we use Algolia and Docsearch for? +## What do we use Algolia and DocSearch for? -Algolia allows us to index the content on ethereum.org and implement a powerful site search tool on ethereum.org. In order to create the index of our content, we use a web crawling tool called Docsearch. Docsearch takes a start_urls of ethereum.org and crawls the site to index the content based on a [docsearchConfig file](https://github.com/ethereum/ethereum-org-website/blob/dev/.github/workflows/docsearchConfig.json). +Algolia allows us to index the content on ethereum.org and implement a powerful site search tool on ethereum.org. In order to create the index of our content, we use a web crawling tool called DocSearch. DocSearch takes a starting URL of ethereum.org and crawls the site to index the content, based on a custom configuration setup held with the service. -We kick off the crawling and indexing of ethereum.org through a GitHub Action that triggers on the merge to `master` branch. [View the GitHub Action](https://github.com/ethereum/ethereum-org-website/blob/dev/.github/workflows/docsearch-crawl.yml). +Site crawling and indexing is performed by default on a weekly basis on Friday afternoons. This is performed automatically by Algolia servers, which scrape the entire production site of ethereum.org to build an index. This index is hosted by Algolia for use on the site. -## Docsearch Config +## DocSearch Config -Some important notes about the docsearch config file: +Some important notes about the DocSearch config: ### Configuration -- `index_name` is the name of the algolia index where the generated index will be uploaded to. -- `start_urls` are the urls that the crawler will start from. Some important attributes in the `start_urls` that we use are: - - `lang`: regex path to different languages that the site is translated to that need crawling. Since ethereum.org is translated to 37+ languages, we need to be able to crawl the website in each language for indexing. - - `page_rank`: the rank of pages that breaks ties when multiple query results have the same weight. This weight is derived from the selectors. -- `stop_urls` is used to strip out query parameters in the websites urls. We were running into issues where we were getting duplicate query results due to query parameters making urls unique. Stripping these out solved our deduplication problem. -- selectors are used to specify what the crawler should look for when weighting content for the index. - -### Generation - -We generate the docsearchConfig.json file using a [script](https://github.com/ethereum/ethereum-org-website/blob/dev/.github/workflows/docsearchConfigScript.js). This allows us to dynamically pull in the languages the websites support from the [translations.json data file](https://github.com/ethereum/ethereum-org-website/blob/dev/src/data/translations.json). Our GitHub action executes this script. +- `indexName` is the name of the Algolia index where the generated index will be uploaded to +- `startUrls` are the urls that the crawler will start from +- Translated pages are automatically faceted for search results based on the `` attribute of each page +- Selectors are used to specify what the crawler should look for when weighting content for the index. +- CheerioAPI can be utilized within the crawler using the `$` selector to manipulate the DOM before indexing each page +- Elements to be ignored are removed before indexing using the CheerioAPI library: `$('selector').remove()`. This includes `aside`, `nav`, `footer` and `style` elements. +- While building pages, semantic naming with the aforementioned elements, i.e. `aside`, will ignore any content contained within. This is beneficial for content that is not directly related to the page content, such as callouts, banners, quiz content, or navigation elements. ## Resources - [Algolia documentation](https://www.algolia.com/doc/) -- [Docsearch documentation](https://docsearch.algolia.com/docs/what-is-docsearch) -- [Docsearch scraper Docker image](https://hub.docker.com/r/algolia/docsearch-scraper) +- [DocSearch documentation](https://docsearch.algolia.com/docs/what-is-docsearch) From a0a9d306d910ea6e7a9d27509b26a7a6d9648d49 Mon Sep 17 00:00:00 2001 From: Paul Wackerow <54227730+wackerow@users.noreply.github.com> Date: Fri, 19 May 2023 17:35:04 -0700 Subject: [PATCH 5/7] move main-content anchor to content div for parsing by DocSearch, retains same skip-link functionality --- src/components/Layout.tsx | 9 ++++++--- src/components/SkipLink.tsx | 4 ---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/components/Layout.tsx b/src/components/Layout.tsx index 394ae52442c..366bc2f2e19 100644 --- a/src/components/Layout.tsx +++ b/src/components/Layout.tsx @@ -15,7 +15,7 @@ import SideNavMobile from "./SideNavMobile" import TranslationBanner from "./TranslationBanner" import TranslationBannerLegal from "./TranslationBannerLegal" import FeedbackWidget from "./FeedbackWidget" -import { SkipLink, SkipLinkAnchor } from "./SkipLink" +import { SkipLink } from "./SkipLink" import { ZenModeContext } from "../contexts/ZenModeContext" @@ -135,8 +135,11 @@ const Layout: React.FC = ({