From 1a99ce6a37b48e81d611496e79f83a10d33b7895 Mon Sep 17 00:00:00 2001 From: Evan Bonsignori Date: Thu, 7 Nov 2024 10:15:57 -0800 Subject: [PATCH] `src/search` refactor + new endpoint: AI Search Autocomplete (#52822) --- ...arch.yml => index-autocomplete-search.yml} | 17 +- ...rch-pr.yml => index-general-search-pr.yml} | 23 +- ...ticsearch.yml => index-general-search.yml} | 12 +- .gitignore | 6 + package-lock.json | 27 + package.json | 20 +- src/fixtures/tests/breadcrumbs.ts | 4 +- src/frame/middleware/api.ts | 2 +- src/frame/middleware/index.ts | 4 +- src/frame/tests/favicons.ts | 12 +- src/frame/tests/manifest.ts | 3 + .../lib/release-templates/release-steps-1.md | 8 +- src/languages/tests/frame.ts | 6 +- .../scripts/rendered-content-link-checker.ts | 20 +- src/search/README.md | 66 +- src/search/components/Aggregations.tsx | 3 +- src/search/components/SearchResults.tsx | 29 +- src/search/components/ValidationErrors.tsx | 4 +- .../components/context/SearchContext.tsx | 7 +- src/search/components/index.tsx | 11 +- src/search/components/types.ts | 59 +- src/search/lib/config.js | 5 - src/search/lib/elasticsearch-indexes.ts | 91 +++ src/search/lib/elasticsearch-versions.ts | 107 ++++ .../ai-search-autocomplete.ts | 125 ++++ .../general-autocomplete.ts | 100 +++ .../general-search.ts} | 364 ++++------- .../helpers/elasticsearch-highlight-config.ts | 86 +++ .../lib/get-elasticsearch-results/types.ts | 23 + src/search/lib/helpers/get-client.ts | 31 + src/search/lib/helpers/old-version-logic.ts | 44 ++ src/search/lib/helpers/strings.ts | 10 + .../lib/utils.ts => lib/helpers/time.ts} | 25 + .../get-search-from-request-params.ts | 96 +++ .../search-params-objects.ts | 153 +++++ src/search/lib/search-request-params/types.ts | 52 ++ src/search/middleware/contextualize.js | 153 ----- .../middleware/general-search-middleware.ts | 174 ++++++ src/search/middleware/get-search-request.js | 229 ------- src/search/middleware/search-routes.ts | 150 +++++ src/search/middleware/search.js | 160 ----- src/search/pages/search.tsx | 25 +- .../{analyze-text.js => analyze-text.ts} | 86 +-- src/search/scripts/index-elasticsearch.js | 575 ------------------ src/search/scripts/index-test-fixtures.sh | 9 +- src/search/scripts/index/README.md | 24 + .../scripts/index/index-autocomplete.ts | 167 ----- src/search/scripts/index/index-cli.ts | 158 +++++ src/search/scripts/index/index.ts | 44 -- src/search/scripts/index/lib/get-client.ts | 27 - .../index/lib/index-ai-search-autocomplete.ts | 112 ++++ .../index/lib/index-general-autocomplete.ts | 134 ++++ .../scripts/index/lib/index-general-search.ts | 145 +++++ src/search/scripts/index/lib/populate.ts | 107 ---- src/search/scripts/index/lib/repoint-alias.ts | 77 --- src/search/scripts/index/types.ts | 57 +- src/search/scripts/index/utils/constants.ts | 11 + .../utils/indexing-elasticsearch-utils.ts | 178 ++++++ src/search/scripts/index/utils/mappings.ts | 52 ++ .../{lib => utils}/retry-on-error-test.ts | 6 +- src/search/scripts/index/utils/settings.ts | 118 ++++ src/search/scripts/retry-on-error-test.js | 76 --- src/search/scripts/scrape/README.md | 40 ++ .../lib/build-records.ts} | 40 +- .../{domwaiter.js => scrape/lib/domwaiter.ts} | 37 +- .../lib/find-indexable-pages.ts} | 9 +- .../lib/parse-page-sections-into-records.ts} | 14 +- .../lib/popular-pages.ts} | 38 +- .../lib/scrape-into-index-json.ts} | 42 +- .../lib/search-index-records.ts} | 35 +- .../scrape-cli.ts} | 80 +-- src/search/scripts/scrape/types.ts | 70 +++ src/search/scripts/search-index-records.js | 17 - 
.../tests/api-ai-search-autocomplete.ts | 164 +++++ ....js => api-general-autocomplete-search.ts} | 59 +- .../tests/{api-search.js => api-search.ts} | 184 +++--- .../queries/en/enterprise-cloud/queries.json | 52 ++ .../queries/en/free-pro-team/queries.json | 52 ++ ...b-docs_general-search_fpt_en-records.json} | 0 ...b-docs_general-search_fpt_ja-records.json} | 0 ...-docs_general-search_ghec_en-records.json} | 0 ...-docs_general-search_ghec_ja-records.json} | 0 ...js => parse-page-sections-into-records.ts} | 75 +-- .../tests/{rendering.js => rendering.ts} | 12 +- src/search/tests/search.js | 37 -- src/search/tests/search.ts | 40 ++ src/search/tests/topics.js | 39 -- src/search/tests/topics.ts | 44 ++ src/search/types.ts | 76 +++ src/tests/README.md | 4 + src/tests/helpers/e2etest-ts.ts | 181 ++++++ tsconfig.json | 1 + 92 files changed, 3697 insertions(+), 2454 deletions(-) rename .github/workflows/{index-autocomplete-elasticsearch.yml => index-autocomplete-search.yml} (69%) rename .github/workflows/{sync-search-pr.yml => index-general-search-pr.yml} (81%) rename .github/workflows/{sync-search-elasticsearch.yml => index-general-search.yml} (94%) delete mode 100644 src/search/lib/config.js create mode 100644 src/search/lib/elasticsearch-indexes.ts create mode 100644 src/search/lib/elasticsearch-versions.ts create mode 100644 src/search/lib/get-elasticsearch-results/ai-search-autocomplete.ts create mode 100644 src/search/lib/get-elasticsearch-results/general-autocomplete.ts rename src/search/{middleware/es-search.js => lib/get-elasticsearch-results/general-search.ts} (59%) create mode 100644 src/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config.ts create mode 100644 src/search/lib/get-elasticsearch-results/types.ts create mode 100644 src/search/lib/helpers/get-client.ts create mode 100644 src/search/lib/helpers/old-version-logic.ts create mode 100644 src/search/lib/helpers/strings.ts rename src/search/{scripts/index/lib/utils.ts => lib/helpers/time.ts} (56%) create mode 100644 src/search/lib/search-request-params/get-search-from-request-params.ts create mode 100644 src/search/lib/search-request-params/search-params-objects.ts create mode 100644 src/search/lib/search-request-params/types.ts delete mode 100644 src/search/middleware/contextualize.js create mode 100644 src/search/middleware/general-search-middleware.ts delete mode 100644 src/search/middleware/get-search-request.js create mode 100644 src/search/middleware/search-routes.ts delete mode 100644 src/search/middleware/search.js rename src/search/scripts/{analyze-text.js => analyze-text.ts} (61%) delete mode 100755 src/search/scripts/index-elasticsearch.js create mode 100644 src/search/scripts/index/README.md delete mode 100644 src/search/scripts/index/index-autocomplete.ts create mode 100644 src/search/scripts/index/index-cli.ts delete mode 100644 src/search/scripts/index/index.ts delete mode 100644 src/search/scripts/index/lib/get-client.ts create mode 100644 src/search/scripts/index/lib/index-ai-search-autocomplete.ts create mode 100644 src/search/scripts/index/lib/index-general-autocomplete.ts create mode 100644 src/search/scripts/index/lib/index-general-search.ts delete mode 100644 src/search/scripts/index/lib/populate.ts delete mode 100644 src/search/scripts/index/lib/repoint-alias.ts create mode 100644 src/search/scripts/index/utils/constants.ts create mode 100644 src/search/scripts/index/utils/indexing-elasticsearch-utils.ts create mode 100644 src/search/scripts/index/utils/mappings.ts rename 
src/search/scripts/index/{lib => utils}/retry-on-error-test.ts (97%) create mode 100644 src/search/scripts/index/utils/settings.ts delete mode 100644 src/search/scripts/retry-on-error-test.js create mode 100644 src/search/scripts/scrape/README.md rename src/search/scripts/{build-records.js => scrape/lib/build-records.ts} (75%) rename src/search/scripts/{domwaiter.js => scrape/lib/domwaiter.ts} (50%) rename src/search/scripts/{find-indexable-pages.js => scrape/lib/find-indexable-pages.ts} (70%) rename src/search/scripts/{parse-page-sections-into-records.js => scrape/lib/parse-page-sections-into-records.ts} (91%) rename src/search/scripts/{popular-pages.js => scrape/lib/popular-pages.ts} (61%) rename src/search/scripts/{sync.js => scrape/lib/scrape-into-index-json.ts} (64%) rename src/search/scripts/{validate-records.js => scrape/lib/search-index-records.ts} (61%) rename src/search/scripts/{sync-search-indices.js => scrape/scrape-cli.ts} (64%) mode change 100755 => 100644 create mode 100644 src/search/scripts/scrape/types.ts delete mode 100644 src/search/scripts/search-index-records.js create mode 100644 src/search/tests/api-ai-search-autocomplete.ts rename src/search/tests/{api-autocomplete-search.js => api-general-autocomplete-search.ts} (62%) rename src/search/tests/{api-search.js => api-search.ts} (66%) create mode 100644 src/search/tests/fixtures/data/ai/search/queries/en/enterprise-cloud/queries.json create mode 100644 src/search/tests/fixtures/data/ai/search/queries/en/free-pro-team/queries.json rename src/search/tests/fixtures/search-indexes/{github-docs-dotcom-en-records.json => tests_github-docs_general-search_fpt_en-records.json} (100%) rename src/search/tests/fixtures/search-indexes/{github-docs-dotcom-ja-records.json => tests_github-docs_general-search_fpt_ja-records.json} (100%) rename src/search/tests/fixtures/search-indexes/{github-docs-ghec-en-records.json => tests_github-docs_general-search_ghec_en-records.json} (100%) rename src/search/tests/fixtures/search-indexes/{github-docs-ghec-ja-records.json => tests_github-docs_general-search_ghec_ja-records.json} (100%) rename src/search/tests/{parse-page-sections-into-records.js => parse-page-sections-into-records.ts} (63%) rename src/search/tests/{rendering.js => rendering.ts} (93%) delete mode 100644 src/search/tests/search.js create mode 100644 src/search/tests/search.ts delete mode 100644 src/search/tests/topics.js create mode 100644 src/search/tests/topics.ts create mode 100644 src/search/types.ts create mode 100644 src/tests/helpers/e2etest-ts.ts diff --git a/.github/workflows/index-autocomplete-elasticsearch.yml b/.github/workflows/index-autocomplete-search.yml similarity index 69% rename from .github/workflows/index-autocomplete-elasticsearch.yml rename to .github/workflows/index-autocomplete-search.yml index a316812c81e4..da837deec763 100644 --- a/.github/workflows/index-autocomplete-elasticsearch.yml +++ b/.github/workflows/index-autocomplete-search.yml @@ -1,7 +1,7 @@ -name: Index autocomplete Elasticsearch +name: Index autocomplete search in Elasticsearch -# **What it does**: Indexes autocomplete data into Elasticsearch. -# **Why we have it**: So we can power the API for autocomplete. +# **What it does**: Indexes autocomplete data (general and AI search) into Elasticsearch. +# **Why we have it**: So we can power the APIs for autocomplete. 
# **Who does it impact**: docs-engineering

on:
@@ -10,7 +10,7 @@ on:
    - cron: '20 16 * * *' # Run every day at 16:20 UTC / 8:20 PST
  pull_request:
    paths:
-      - .github/workflows/index-autocomplete-elasticsearch.yml
+      - .github/workflows/index-autocomplete-search.yml
      - 'src/search/scripts/index/**'
      - 'package*.json'
@@ -40,10 +40,15 @@ jobs:
        if: ${{ github.event_name == 'pull_request' }}
        run: curl --fail --retry-connrefused --retry 5 -I http://localhost:9200

-      - name: Run indexing
+      - name: Run general auto-complete indexing
        env:
          ELASTICSEARCH_URL: ${{ github.event_name == 'pull_request' && 'http://localhost:9200' || secrets.ELASTICSEARCH_URL }}
-        run: npm run index -- autocomplete docs-internal-data
+        run: npm run index-general-autocomplete -- docs-internal-data
+
+      - name: Run AI search auto-complete indexing
+        env:
+          ELASTICSEARCH_URL: ${{ github.event_name == 'pull_request' && 'http://localhost:9200' || secrets.ELASTICSEARCH_URL }}
+        run: npm run index-ai-search-autocomplete -- docs-internal-data

      - uses: ./.github/actions/slack-alert
        if: ${{ failure() && github.event_name == 'schedule' }}
diff --git a/.github/workflows/sync-search-pr.yml b/.github/workflows/index-general-search-pr.yml
similarity index 81%
rename from .github/workflows/sync-search-pr.yml
rename to .github/workflows/index-general-search-pr.yml
index f7d504a77f6f..3f819ce556af 100644
--- a/.github/workflows/sync-search-pr.yml
+++ b/.github/workflows/index-general-search-pr.yml
@@ -1,6 +1,6 @@
-name: Sync search - PR
+name: Index general search in Elasticsearch on PR

-# **What it does**: This does what `sync-sarch-elasticsearch.yml` does but
+# **What it does**: This does what `index-general-search.yml` does but
 # with a localhost Elasticsearch and only for English.
 # **Why we have it**: To test that the script works and the popular pages json is valid.
 # **Who does it impact**: Docs engineering
@@ -11,8 +11,8 @@ on:
    paths:
      - 'src/search/**'
      - 'package*.json'
-      # Ultimately, for debugging this workflow itself
-      - .github/workflows/sync-search-pr.yml
+      # For debugging this workflow
+      - .github/workflows/index-general-search-pr.yml
      # Make sure we run this if the composite action changes
      - .github/actions/setup-elasticsearch/action.yml

@@ -25,9 +25,6 @@ concurrency:
  cancel-in-progress: true

env:
-  # Yes, it's hardcoded but it makes all the steps look exactly the same
-  # as they do in `sync-search-elasticsearch.yml` where it uses
-  # that `${{ env.ELASTICSEARCH_URL }}`
  ELASTICSEARCH_URL: http://localhost:9200
  # Since we'll run in NODE_ENV=production, we need to be explicit that
  # we don't want Hydro configured.
@@ -63,7 +60,7 @@ jobs:
        env:
          ENABLE_DEV_LOGGING: false
        run: |
-          npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log &
+          npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &

          # first sleep to give it a chance to start
          sleep 6
@@ -88,15 +85,13 @@ jobs:
          # let's just accept an empty string instead.
          THROW_ON_EMPTY: false
-          # The sync-search-index recognizes this env var if you don't
-          # use the `--docs-internal-data ` option.
DOCS_INTERNAL_DATA: docs-internal-data run: | mkdir /tmp/records - npm run sync-search-indices -- /tmp/records \ + npm run general-search-scrape -- /tmp/records \ --language en \ - --version dotcom + --version fpt ls -lh /tmp/records @@ -106,9 +101,9 @@ jobs: - name: Index into Elasticsearch run: | - npm run index-elasticsearch -- /tmp/records \ + npm run index-general-search -- /tmp/records \ --language en \ - --version dotcom + --version fpt - name: Check created indexes and aliases run: | diff --git a/.github/workflows/sync-search-elasticsearch.yml b/.github/workflows/index-general-search.yml similarity index 94% rename from .github/workflows/sync-search-elasticsearch.yml rename to .github/workflows/index-general-search.yml index 4ca84e08993f..0f175cf2c675 100644 --- a/.github/workflows/sync-search-elasticsearch.yml +++ b/.github/workflows/index-general-search.yml @@ -1,4 +1,4 @@ -name: Sync search Elasticsearch +name: Index general search in Elasticsearch # **What it does**: It scrapes the whole site and dumps the records in a # temp directory. Then it indexes that into Elasticsearch. @@ -140,7 +140,7 @@ jobs: env: ENABLE_DEV_LOGGING: false run: | - npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log & + npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log & # first sleep to give it a chance to start sleep 6 @@ -169,13 +169,11 @@ jobs: # the same as not set within the script. VERSION: ${{ inputs.version }} - # The sync-search-index recognizes this env var if you don't - # use the `--docs-internal-data ` option. DOCS_INTERNAL_DATA: docs-internal-data run: | mkdir /tmp/records - npm run sync-search-indices -- /tmp/records \ + npm run general-search-scrape -- /tmp/records \ --language ${{ matrix.language }} ls -lh /tmp/records @@ -186,12 +184,12 @@ jobs: - name: Index into Elasticsearch env: - # Must match what we used when scraping (npm run sync-search-indices) + # Must match what we used when scraping (npm run general-search-scrape) # otherwise the script will seek other versions from disk that might # not exist. 
VERSION: ${{ inputs.version }} run: | - npm run index-elasticsearch -- /tmp/records \ + npm run index-general-search -- /tmp/records \ --language ${{ matrix.language }} \ --stagger-seconds 5 \ --retries 5 diff --git a/.gitignore b/.gitignore index 218dc0efa2cf..0591ebbdb049 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,9 @@ assets/images/help/writing/unordered-list-rendered (1).png # Used by precompute-pageinfo .pageinfo-cache.json.br + +# Cloned and used for indexing Elasticsearch data +docs-internal-data/ + +# For intermediate data (like scraping for Elasticsearch indexing) +tmp/ \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index ccebe9df887f..44c695262210 100644 --- a/package-lock.json +++ b/package-lock.json @@ -109,10 +109,13 @@ "@octokit/rest": "21.0.2", "@playwright/test": "^1.48.1", "@types/accept-language-parser": "1.5.6", + "@types/cheerio": "^0.22.35", "@types/connect-datadog": "0.0.10", "@types/connect-timeout": "0.0.39", "@types/cookie": "0.6.0", "@types/cookie-parser": "1.4.7", + "@types/elasticsearch": "^5.0.43", + "@types/event-to-promise": "^0.7.5", "@types/express": "4.17.21", "@types/imurmurhash": "^0.1.4", "@types/js-cookie": "^3.0.6", @@ -3165,6 +3168,15 @@ "integrity": "sha512-hWtVTC2q7hc7xZ/RLbxapMvDMgUnDvKvMOpKal4DrMyfGBUfB1oKaZlIRr6mJL+If3bAP6sV/QneGzF6tJjZDg==", "dev": true }, + "node_modules/@types/cheerio": { + "version": "0.22.35", + "resolved": "https://registry.npmjs.org/@types/cheerio/-/cheerio-0.22.35.tgz", + "integrity": "sha512-yD57BchKRvTV+JD53UZ6PD8KWY5g5rvvMLRnZR3EQBCZXiDT/HR+pKpMzFGlWNhFrXlo7VPZXtKvIEwZkAWOIA==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/connect": { "version": "3.4.38", "resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz", @@ -3228,12 +3240,27 @@ "@types/ms": "*" } }, + "node_modules/@types/elasticsearch": { + "version": "5.0.43", + "resolved": "https://registry.npmjs.org/@types/elasticsearch/-/elasticsearch-5.0.43.tgz", + "integrity": "sha512-N+MpzURpDCWd7zaJ7CE1aU+nBSeAABLhDE0lGodQ0LLftx7ku6hjTXLr9OAFZLSXiWL3Xxx8jts485ynrcm5NA==", + "dev": true + }, "node_modules/@types/estree": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.5.tgz", "integrity": "sha512-/kYRxGDLWzHOB7q+wtSUQlFrtcdUccpfy+X+9iMBpHK8QLLhx2wIPYuS5DYtR9Wa/YlZAbIovy7qVdB1Aq6Lyw==", "dev": true }, + "node_modules/@types/event-to-promise": { + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/@types/event-to-promise/-/event-to-promise-0.7.5.tgz", + "integrity": "sha512-h10M3ybTySQFVP4N1uiEgPwbpHExNS8UMpCqRUJFkMhlpgSlWsyYsGMmkrJIKRnhGfYDOb4LD3U+SSPujoMHNA==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/express": { "version": "4.17.21", "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz", diff --git a/package.json b/package.json index 17ae073dc21c..569dfee80c0b 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,7 @@ "exports": "./src/frame/server.ts", "scripts": { "all-documents": "tsx src/content-render/scripts/all-documents/cli.ts", - "analyze-text": "node src/search/scripts/analyze-text.js", + "analyze-text": "tsx src/search/scripts/analyze-text.ts", "analyze-comment": "tsx src/events/scripts/analyze-comment-cli.ts", "archive-version": "tsx --max-old-space-size=16384 src/ghes-releases/scripts/archive-version.ts", "audit-log-sync": "tsx src/audit-logs/scripts/sync.ts", @@ -39,8 +39,14 @@ "find-unused-variables": "tsx 
src/content-linter/scripts/find-unsed-variables.ts", "fixture-dev": "cross-env ROOT=src/fixtures/fixtures npm start", "fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests", - "index": "tsx src/search/scripts/index/index.ts", - "index-elasticsearch": "node src/search/scripts/index-elasticsearch.js", + "general-search-scrape": "tsx src/search/scripts/scrape/scrape-cli.ts", + "general-search-scrape-server": "cross-env NODE_ENV=production PORT=4002 MINIMAL_RENDER=true CHANGELOG_DISABLED=true tsx src/frame/server.ts", + "ghes-release-scrape-with-server": "cross-env GHES_RELEASE=1 start-server-and-test general-search-scrape-server 4002 general-search-scrape", + "general-search-scrape-with-server": "cross-env NODE_OPTIONS='--max_old_space_size=8192' start-server-and-test general-search-scrape-server 4002 general-search-scrape", + "index": "tsx src/search/scripts/index/index-cli autocomplete docs-internal-data", + "index-ai-search-autocomplete": "tsx src/search/scripts/index/index-cli ai-search-autocomplete", + "index-general-autocomplete": "tsx src/search/scripts/index/index-cli general-autocomplete", + "index-general-search": "tsx src/search/scripts/index/index-cli general-search", "index-test-fixtures": "./src/search/scripts/index-test-fixtures.sh", "lint": "eslint '**/*.{js,mjs,ts,tsx}'", "lint-content": "node src/content-linter/scripts/lint-content.js", @@ -70,10 +76,6 @@ "start-for-playwright": "cross-env ROOT=src/fixtures/fixtures TRANSLATIONS_FIXTURE_ROOT=src/fixtures/fixtures/translations ENABLED_LANGUAGES=en,ja NODE_ENV=test tsx src/frame/server.ts", "symlink-from-local-repo": "node src/early-access/scripts/symlink-from-local-repo.js", "sync-rest": "tsx src/rest/scripts/update-files.ts", - "sync-search": "cross-env NODE_OPTIONS='--max_old_space_size=8192' start-server-and-test sync-search-server 4002 sync-search-indices", - "sync-search-ghes-release": "cross-env GHES_RELEASE=1 start-server-and-test sync-search-server 4002 sync-search-indices", - "sync-search-indices": "node src/search/scripts/sync-search-indices.js", - "sync-search-server": "cross-env NODE_ENV=production PORT=4002 MINIMAL_RENDER=true CHANGELOG_DISABLED=true tsx src/frame/server.ts", "sync-secret-scanning": "tsx src/secret-scanning/scripts/sync.ts", "sync-webhooks": "npx tsx src/rest/scripts/update-files.ts -o webhooks", "test": "vitest", @@ -222,6 +224,7 @@ "src/open-source/scripts/add-pr-links.js", "src/open-source/scripts/pr-link-source.js", "rest-api-description/", + "docs-internal-data/", "src/code-scanning/scripts/generate-code-scanning-query-list.ts" ] }, @@ -327,10 +330,13 @@ "@octokit/rest": "21.0.2", "@playwright/test": "^1.48.1", "@types/accept-language-parser": "1.5.6", + "@types/cheerio": "^0.22.35", "@types/connect-datadog": "0.0.10", "@types/connect-timeout": "0.0.39", "@types/cookie": "0.6.0", "@types/cookie-parser": "1.4.7", + "@types/elasticsearch": "^5.0.43", + "@types/event-to-promise": "^0.7.5", "@types/express": "4.17.21", "@types/imurmurhash": "^0.1.4", "@types/js-cookie": "^3.0.6", diff --git a/src/fixtures/tests/breadcrumbs.ts b/src/fixtures/tests/breadcrumbs.ts index e51a7d678dfb..41bc836ea1d5 100644 --- a/src/fixtures/tests/breadcrumbs.ts +++ b/src/fixtures/tests/breadcrumbs.ts @@ -68,7 +68,7 @@ describe('breadcrumbs', () => { expect($breadcrumbTitles.length).toBe(0) expect($breadcrumbLinks.length).toBe(2) - expect($breadcrumbLinks[0].attribs.title).toBe('Deeper secrets') - expect($breadcrumbLinks[1].attribs.title).toBe('Mariana Trench') + 
expect(($breadcrumbLinks[0] as cheerio.TagElement).attribs.title).toBe('Deeper secrets') + expect(($breadcrumbLinks[1] as cheerio.TagElement).attribs.title).toBe('Mariana Trench') }) }) diff --git a/src/frame/middleware/api.ts b/src/frame/middleware/api.ts index 1770a0b742ef..62d77ef61916 100644 --- a/src/frame/middleware/api.ts +++ b/src/frame/middleware/api.ts @@ -3,7 +3,7 @@ import { createProxyMiddleware } from 'http-proxy-middleware' import events from '@/events/middleware.js' import anchorRedirect from '@/rest/api/anchor-redirect.js' -import search from '@/search/middleware/search.js' +import search from '@/search/middleware/search-routes.js' import pageInfo from '@/pageinfo/middleware' import pageList from '@/pagelist/middleware' import webhooks from '@/webhooks/middleware/webhooks.js' diff --git a/src/frame/middleware/index.ts b/src/frame/middleware/index.ts index fe11bf133f59..d93377e09ed4 100644 --- a/src/frame/middleware/index.ts +++ b/src/frame/middleware/index.ts @@ -61,7 +61,7 @@ import fastlyCacheTest from './fastly-cache-test' import trailingSlashes from './trailing-slashes' import mockVaPortal from './mock-va-portal' import dynamicAssets from '@/assets/middleware/dynamic-assets' -import contextualizeSearch from '@/search/middleware/contextualize.js' +import generalSearchMiddleware from '@/search/middleware/general-search-middleware' import shielding from '@/shielding/middleware' import tracking from '@/tracking/middleware' import { MAX_REQUEST_TIMEOUT } from '@/frame/lib/constants.js' @@ -275,7 +275,7 @@ export default function (app: Express) { app.use(asyncMiddleware(productExamples)) app.use(asyncMiddleware(productGroups)) app.use(asyncMiddleware(glossaries)) - app.use(asyncMiddleware(contextualizeSearch)) + app.use(asyncMiddleware(generalSearchMiddleware)) app.use(asyncMiddleware(featuredLinks)) app.use(asyncMiddleware(learningTrack)) diff --git a/src/frame/tests/favicons.ts b/src/frame/tests/favicons.ts index 030b35fdd19f..28b873047233 100644 --- a/src/frame/tests/favicons.ts +++ b/src/frame/tests/favicons.ts @@ -15,7 +15,10 @@ describe('favicon assets', () => { expect(res.headers['cache-control']).toContain('public') expect(res.headers['cache-control']).toContain('immutable') expect(res.headers['cache-control']).toMatch(/max-age=\d+/) - const maxAgeSeconds = parseInt(res.headers['cache-control'].match(/max-age=(\d+)/)[1], 10) + const maxAgeSeconds = parseInt( + (res.headers['cache-control'] || '').match(/max-age=(\d+)/)?.[1] || '', + 10, + ) // Let's not be too specific in the tests, just as long as it's testing // that it's a reasonably large number of seconds. 
expect(maxAgeSeconds).toBeGreaterThanOrEqual(60 * 60) @@ -25,13 +28,16 @@ describe('favicon assets', () => { test('should serve a valid and aggressively caching /apple-touch-icon.png', async () => { const res = await get('/apple-touch-icon.png') expect(res.statusCode).toBe(200) - expect(parseInt(res.headers['content-length'], 10)).toBeGreaterThan(0) + expect(parseInt(res.headers['content-length'] || '', 10)).toBeGreaterThan(0) expect(res.headers['content-type']).toBe('image/png') expect(res.headers['set-cookie']).toBeUndefined() expect(res.headers['cache-control']).toContain('public') expect(res.headers['cache-control']).toContain('immutable') expect(res.headers['cache-control']).toMatch(/max-age=\d+/) - const maxAgeSeconds = parseInt(res.headers['cache-control'].match(/max-age=(\d+)/)[1], 10) + const maxAgeSeconds = parseInt( + (res.headers['cache-control'] || '').match(/max-age=(\d+)/)?.[1] || '', + 10, + ) // Let's not be too specific in the tests, just as long as it's testing // that it's a reasonably large number of seconds. expect(maxAgeSeconds).toBeGreaterThanOrEqual(60 * 60) diff --git a/src/frame/tests/manifest.ts b/src/frame/tests/manifest.ts index dcdc74eb5c2c..c1ba9e00fc58 100644 --- a/src/frame/tests/manifest.ts +++ b/src/frame/tests/manifest.ts @@ -20,6 +20,9 @@ describe('manifest', () => { test('download manifest from HTML and check content', async () => { const $ = await getDOM('/') const url = $('link[rel="manifest"]').attr('href') + if (!url) { + throw new Error('No manifest URL found') + } const res = await get(url) expect(res.statusCode).toBe(200) diff --git a/src/ghes-releases/lib/release-templates/release-steps-1.md b/src/ghes-releases/lib/release-templates/release-steps-1.md index 33ebc7b82fb7..4f01a44dc099 100644 --- a/src/ghes-releases/lib/release-templates/release-steps-1.md +++ b/src/ghes-releases/lib/release-templates/release-steps-1.md @@ -17,7 +17,7 @@ labels: - [Prerequisites](#prerequisites) - [Create publication branch for a new version of GHES](#creation) - [Resolve check failures](#check-failures) -- [Sync the search indices](#sync-search-indices) +- [Scrape the search indices](#scrape-search-indices) - [Maintain the publication branch](#maintenance) - [Complete preparation for the RC and publish the docset](#publication) @@ -110,11 +110,11 @@ For content from the OpenAPI schema, note the affected content with broken links
- + -### [🔎](#sync-search-indices) Sync the search indices +### [🔎](#scrape-search-indices) Scrape the search indices -1. Go to the [`sync-search-elasticsearch` workflow](https://github.com/github/docs-internal/actions/workflows/sync-search-elasticsearch.yml) ([permalink](https://github.com/github/docs-internal/blob/f8ca45703c48c7d1976a278337bc3391fb14fe9e/.github/workflows/sync-search-elasticsearch.yml) in case it moves) +1. Go to the [`index-general-search.yml` workflow](https://github.com/github/docs-internal/actions/workflows/index-general-search.yml) 1. Click on the **Run workflow** drop down and set the following parameters: - `Branch:` set to the name of the publication branch - `Version` set to the version you're publishing (e.g., `ghes-3.12` if you're publishing GHES 3.12) diff --git a/src/languages/tests/frame.ts b/src/languages/tests/frame.ts index d75ee9a484f1..e00c61fffd0b 100644 --- a/src/languages/tests/frame.ts +++ b/src/languages/tests/frame.ts @@ -17,15 +17,15 @@ describe('frame', () => { test.each(langs)('breadcrumbs link to %s pages', async (lang) => { const $ = await getDOM(`/${lang}/get-started/learning-about-github`) const $breadcrumbs = $('[data-testid=breadcrumbs-in-article] a') - expect($breadcrumbs[0].attribs.href).toBe(`/${lang}/get-started`) + expect(($breadcrumbs[0] as cheerio.TagElement).attribs.href).toBe(`/${lang}/get-started`) }) test.each(langs)('homepage links go to %s pages', async (lang) => { const $ = await getDOM(`/${lang}`) const $links = $('[data-testid=bump-link]') - $links.each((i: number, el: Element) => { + $links.each((i: number, el: cheerio.Element) => { const linkUrl = $(el).attr('href') - expect(linkUrl.startsWith(`/${lang}/`)).toBe(true) + expect((linkUrl || '').startsWith(`/${lang}/`)).toBe(true) }) }) diff --git a/src/links/scripts/rendered-content-link-checker.ts b/src/links/scripts/rendered-content-link-checker.ts index ba87dbd28a31..db4aef1c5964 100755 --- a/src/links/scripts/rendered-content-link-checker.ts +++ b/src/links/scripts/rendered-content-link-checker.ts @@ -3,7 +3,7 @@ import fs from 'fs' import path from 'path' -import cheerio, { type CheerioAPI, type Element } from 'cheerio' +import cheerio from 'cheerio' import coreLib from '@actions/core' import got, { RequestError } from 'got' import chalk from 'chalk' @@ -339,7 +339,15 @@ async function main( const t0 = new Date().getTime() const flawsGroups = await Promise.all( pages.map((page: Page) => - processPage(core, page, pageMap, redirects, opts, externalLinkCheckerDB, versions), + processPage( + core, + page, + pageMap, + redirects, + opts, + externalLinkCheckerDB, + versions as string[], + ), ), ) const t1 = new Date().getTime() @@ -695,13 +703,13 @@ async function processPermalink( } const $ = cheerio.load(html, { xmlMode: true }) const flaws: LinkFlaw[] = [] - const links: Element[] = [] + const links: cheerio.Element[] = [] $('a[href]').each((i, link) => { links.push(link) }) const newFlaws: LinkFlaw[] = await Promise.all( links.map(async (link) => { - const { href } = link.attribs + const { href } = (link as cheerio.TagElement).attribs // The global cache can't be used for anchor links because they // depend on each page it renders @@ -752,7 +760,7 @@ async function processPermalink( if (checkImages) { $('img[src]').each((i, img) => { - let { src } = img.attribs + let { src } = (img as cheerio.TagElement).attribs // Images get a cache-busting prefix injected in the image // E.g. 
@@ -874,7 +882,7 @@ let globalCacheMissCount = 0 async function checkHrefLink( core: any, href: string, - $: CheerioAPI, + $: cheerio.Root, redirects: Redirects, pageMap: PageMap, checkAnchors = false, diff --git a/src/search/README.md b/src/search/README.md index eef9a860a478..4532f3ebdc71 100644 --- a/src/search/README.md +++ b/src/search/README.md @@ -16,9 +16,36 @@ The site search is part of every version of docs.github.com. This endpoint respo You can also query our search endpoint directly at: `https://docs.github.com/search?version=&language=&query=` -- The VERSION can be any numbered supported GitHub Enterprise Server version (e.g., `3.12`), Enterprise Cloud (`ghec`), or the Free pro team plan (`dotcom`). -- The LANGUAGE CODE can be one of: `zh`, `es`, `pt`, `ru`, `ja`, `fr`, `de`, `ko` -- Any search QUERY you'd like. +- The `VERSION` can be any numbered supported GitHub Enterprise Server version (e.g., `3.12`), Enterprise Cloud (`ghec`), or the Free pro team plan (`dotcom`). +- The `LANGUAGE CODE` can be one of: `zh`, `es`, `pt`, `ru`, `ja`, `fr`, `de`, `ko` +- The `QUERY` can be any alphanumeric string value. + +## Types of search + +Our backend currently supports 3 "types" of searching. + +All searches accept a `query` param, e.g. `?query=how` and return results based on their type: + +1. **general search** + - Results: The pages of our sites that match the query, sorted by popularity + - Example: Query = "clone" -> Results + - Endpoint: `/api/search/v1` +2. **general autocomplete** + - Results: Potential terms that can be autocompleted from the query based on previous user searches + - Example: Query = "cl" -> A Result = "clone" + - Endpoint: `/api/search/autocomplete/v1` +3. **AI search autocomplete** + - Results: Human-readable full-sentence questions that best match the query. Questions are based on previous searches and popular pages + - Example: Query = "How do I clone" -> A Result = "How do I clone a repository?" + - Endpoint: `/api/search/ai-search-autocomplete/v1` + +## Elasticsearch + +Elasticsearch is an external service that we use for searching. When a user types a search, our backend queries Elasticsearch for the most relevant results. + +### Indexing Elasticsearch + +In order to provide relevant results to queries, we prefill Elasticsearch with data via Indexes. See the [Indexing README](./scripts/index/README.md) for how we index on Docs. ## Production deploys @@ -32,40 +59,25 @@ You can manually run the workflow to generate the indexes after you push your ch ### Build and sync -The preferred way to build and sync the search indices is to do so via the [GitHub Actions workflow](/.github/workflows/sync-search-elasticsearch.yml). +The preferred way to build and sync the search indices is to do so via the [GitHub Actions workflow](/.github/workflows/index-general-search.yml). ## Files ### Actions workflow files -- [`.github/workflows/sync-search-elasticsearch.yml`](/.github/workflows/sync-search-elasticsearch.yml) - Builds and syncs search indices on the `main` branch every four hours. Search indices are stored in an internal-only Elasticsearch instance. To run it manually, click "Run workflow" button in the Actions tab. +- [`.github/workflows/index-general-search.yml`](/.github/workflows/index-general-search.yml) - Populates search indices for **general search** using the `main` branch every four hours. Search indices are stored in an internal-only Elasticsearch instance. To run it manually, click "Run workflow" button in the Actions tab. 
+- [`.github/workflows/index-autocomplete-search.yml`](/.github/workflows/index-autocomplete-search.yml) - Populates search indices for both **general autocomplete** and **AI search autocomplete** using data from an internal repo. Runs daily.

 ### Notable code files and directories

 - [src/search/components/Search.tsx](/src/search/components/Search.tsx) - The browser-side code that enables the search.
 - [src/search/components/SearchResults.tsx](/src/search/components/SearchResults.tsx) - The browser-side code that displays search results.
-- [src/search/middleware/es-search.js](/src/search/middleware/es-search.js) - A wrapper around the Node.js Elasticsearch module for interacting with the search API.
-- [src/search/scripts/](/src/search/scripts/) - Scripts used by Actions workflows or for manual operations.
-- [src/search/tests](/src/search/tests) - Tests!
-
-## Records
-
-Each record represents a page. Each record has `breadcrumbs`, `title`, `headings`, `content` (the article content in text, not HTML), `intro` (if one exists in the frontmatter), and a unique `objectID` that is currently just the permalink of the article. Here's an example:
-
-```json
-{
-  "objectID":"/en/actions/creating-actions/about-custom-actions",
-  "breadcrumbs":"GitHub Actions / Creating actions",
-  "title":"About custom actions",
-  "headings":"About custom actions\nTypes of actions\n[...]",
-  "content":"Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, [...]",
-  "intro":"Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, or use and customize actions shared by the GitHub community.",
-  "toplevel":"GitHub Actions",
-  "popularity":0
-}
-```
-
-## Notes
+- [src/search/middleware/general-search-middleware.ts](/src/search/middleware/general-search-middleware.ts) - Entrypoint to general search when you hit docs.github.com/search
+- [src/search/middleware/search-routes.ts](/src/search/middleware/search-routes.ts) - Entrypoint to the API endpoints for our search routes
+- [src/search/scripts/](/src/search/scripts/) - Scripts used by Actions workflows or for manual operations like scraping data for indexing and performing the indexing.
+- [src/search/tests](/src/search/tests) - Tests relevant to searching.

+## Miscellaneous Notes

 - It's not strictly necessary to set an `objectID` as the search index will create one automatically, but by creating our own we have a guarantee that subsequent invocations of this upload script will overwrite existing records instead of creating numerous duplicate records with differing IDs.
 - Our search querying has typo tolerance. Try spelling something wrong and see what you get!
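A minimal sketch of calling the three endpoints the README describes under "Types of search". The endpoint paths and the `query`/`version`/`language` parameters come from the README text itself; the `docsSearch` helper and `DOCS_ORIGIN` constant are hypothetical names for illustration only:

```typescript
// Hypothetical client for the documented search endpoints.
// Response shapes are defined in src/search/types.ts.
const DOCS_ORIGIN = 'https://docs.github.com' // assumption: production origin

async function docsSearch(
  endpoint: string,
  query: string,
  version = 'free-pro-team',
  language = 'en',
): Promise<unknown> {
  const params = new URLSearchParams({ query, version, language })
  const res = await fetch(`${DOCS_ORIGIN}${endpoint}?${params}`)
  if (!res.ok) throw new Error(`Search request failed: ${res.status}`)
  return res.json()
}

// General search: pages matching the query, sorted by popularity
await docsSearch('/api/search/v1', 'clone')
// General autocomplete: terms that complete "cl", e.g. "clone"
await docsSearch('/api/search/autocomplete/v1', 'cl')
// AI search autocomplete: full questions, e.g. "How do I clone a repository?"
await docsSearch('/api/search/ai-search-autocomplete/v1', 'How do I clone')
```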
diff --git a/src/search/components/Aggregations.tsx b/src/search/components/Aggregations.tsx index cde2aadf7ba8..415acbde5e58 100644 --- a/src/search/components/Aggregations.tsx +++ b/src/search/components/Aggregations.tsx @@ -2,9 +2,10 @@ import { CheckboxGroup, Checkbox, FormControl } from '@primer/react' import { useRouter } from 'next/router' import Link from 'next/link' -import type { SearchResultAggregations } from './types' import { useTranslation } from 'src/languages/components/useTranslation' +import type { SearchResultAggregations } from 'src/search/types' + type Props = { aggregations: SearchResultAggregations } diff --git a/src/search/components/SearchResults.tsx b/src/search/components/SearchResults.tsx index 054e1a14e3e4..3e628b6a1370 100644 --- a/src/search/components/SearchResults.tsx +++ b/src/search/components/SearchResults.tsx @@ -4,30 +4,39 @@ import { useRouter } from 'next/router' import { useEffect, useState } from 'react' import cx from 'classnames' -import type { SearchResultsT, SearchResultHitT, SearchQueryT } from './types' import { useTranslation } from 'src/languages/components/useTranslation' import { Link } from 'src/frame/components/Link' import { sendEvent, EventType } from 'src/events/components/events' import styles from './SearchResults.module.scss' +import type { SearchQueryContentT } from 'src/search/components/types' +import type { GeneralSearchHitWithoutIncludes, GeneralSearchResponse } from 'src/search/types' +import type { SearchTotalHits } from '@elastic/elasticsearch/lib/api/types' + type Props = { - results: SearchResultsT - search: SearchQueryT + results: GeneralSearchResponse + searchParams: SearchQueryContentT } -export function SearchResults({ results, search }: Props) { - const pages = Math.ceil(results.meta.found.value / results.meta.size) +export function SearchResults({ results, searchParams }: Props) { + const pages = Math.ceil((results.meta.found as SearchTotalHits).value / results.meta.size) const { page } = results.meta return (
    <div>
-      <SearchResultHits hits={results.hits} search={search} />
+      <SearchResultHits hits={results.hits} searchParams={searchParams} />
      {pages > 1 && <ResultsPagination page={page} totalPages={pages} />}
    </div>
) } -function SearchResultHits({ hits, search }: { hits: SearchResultHitT[]; search: SearchQueryT }) { +function SearchResultHits({ + hits, + searchParams, +}: { + hits: GeneralSearchHitWithoutIncludes[] + searchParams: SearchQueryContentT +}) { return (
      {hits.length === 0 && <NoSearchResults />}
@@ -35,10 +44,10 @@ function SearchResultHits({ hits, search }: { hits: SearchResultHitT[]; search:
        <SearchResultHit
          key={hit.id}
          hit={hit}
-          query={search.query}
+          query={searchParams.query}
          totalHits={hits.length}
          index={index}
-          debug={search.debug}
+          debug={searchParams.debug}
        />
      ))}
    </div>
@@ -64,7 +73,7 @@ function SearchResultHit({
  index,
  debug,
}: {
-  hit: SearchResultHitT
+  hit: GeneralSearchHitWithoutIncludes
  query: string
  totalHits: number
  index: number
diff --git a/src/search/components/ValidationErrors.tsx b/src/search/components/ValidationErrors.tsx
index 3be3074b35fa..ce8bf15c5577 100644
--- a/src/search/components/ValidationErrors.tsx
+++ b/src/search/components/ValidationErrors.tsx
@@ -1,10 +1,10 @@
 import { Flash } from '@primer/react'
 import { useTranslation } from 'src/languages/components/useTranslation'

-import type { SearchValidationErrorT } from './types'
+import type { SearchValidationErrorEntry } from '../types'

 interface Props {
-  errors: SearchValidationErrorT[]
+  errors: SearchValidationErrorEntry[]
 }

 export function ValidationErrors({ errors }: Props) {
diff --git a/src/search/components/context/SearchContext.tsx b/src/search/components/context/SearchContext.tsx
index 08ff25d14d97..086896cb8bce 100644
--- a/src/search/components/context/SearchContext.tsx
+++ b/src/search/components/context/SearchContext.tsx
@@ -1,10 +1,5 @@
 import { createContext, useContext } from 'react'
-
-import type { SearchT } from '../types'
-
-export type SearchContextT = {
-  search: SearchT
-}
+import type { SearchContextT } from '../types'

 export const SearchContext = createContext<SearchContextT | null>(null)
diff --git a/src/search/components/index.tsx b/src/search/components/index.tsx
index 39399b3c08f7..26fc041be6a6 100644
--- a/src/search/components/index.tsx
+++ b/src/search/components/index.tsx
@@ -7,8 +7,9 @@ import { useNumberFormatter } from 'src/search/components/useNumberFormatter'
 import { SearchResults } from 'src/search/components/SearchResults'
 import { NoQuery } from 'src/search/components/NoQuery'
 import { useMainContext } from 'src/frame/components/context/MainContext'
-import { ValidationErrors } from './ValidationErrors'
-import { useSearchContext } from './context/SearchContext'
+import { ValidationErrors } from 'src/search/components/ValidationErrors'
+import { useSearchContext } from 'src/search/components/context/SearchContext'
+import type { SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'

 export function Search() {
   const { search } = useSearchContext()
@@ -17,7 +18,7 @@ export function Search() {
   const { t } = useTranslation('search_results')
   const { currentVersion } = useVersion()

-  const { query } = search.search
+  const { query } = search.searchParams

   // A reference to the `content/search/index.md` Page object.
   // Not to be confused with the "page" that is for paginating
@@ -37,7 +38,7 @@ export function Search() {
       pageTitle += ` (${searchVersion})`
     }
     if (results) {
-      pageTitle = `${formatInteger(results.meta.found.value)} ${pageTitle}`
+      pageTitle = `${formatInteger((results.meta.found as SearchTotalHits).value)} ${pageTitle}`
     }
   }
@@ -63,7 +64,7 @@ export function Search() {
        <ValidationErrors errors={validationErrors} />
      ) : null}

-      {results ? <SearchResults results={results} search={search.search} /> : null}
+      {results ? <SearchResults results={results} searchParams={search.searchParams} />
: null} ) } diff --git a/src/search/components/types.ts b/src/search/components/types.ts index ce6c8a80ef77..1dad85ca9dec 100644 --- a/src/search/components/types.ts +++ b/src/search/components/types.ts @@ -1,58 +1,15 @@ -export type SearchResultHitT = { - id: string - url: string - title: string - breadcrumbs: string - highlights: { - title?: string[] - content?: string[] - content_explicit?: string[] - } - score?: number - popularity?: number - es_url?: string -} +import { GeneralSearchResponse, SearchValidationErrorEntry } from 'src/search/types' -type SearchResultsMeta = { - found: { - value: number - relation: string +export interface SearchContextT { + search: { + results?: GeneralSearchResponse + searchParams: SearchQueryContentT + validationErrors: SearchValidationErrorEntry[] } - took: { - query_msec: number - total_msec: number - } - page: number - size: number -} - -type Aggregation = { - key: string - count: number -} - -export type SearchResultAggregations = { - [key: string]: Aggregation[] } -export type SearchResultsT = { - meta: SearchResultsMeta - hits: SearchResultHitT[] - aggregations?: SearchResultAggregations -} - -export type SearchQueryT = { +// Parts of the search query that are set to the search context +export type SearchQueryContentT = { query: string debug: boolean } - -export type SearchValidationErrorT = { - error: string - // key: string -} - -export type SearchT = { - search: SearchQueryT - results?: SearchResultsT - validationErrors: SearchValidationErrorT[] -} diff --git a/src/search/lib/config.js b/src/search/lib/config.js deleted file mode 100644 index 21ec77955be7..000000000000 --- a/src/search/lib/config.js +++ /dev/null @@ -1,5 +0,0 @@ -export const namePrefix = 'github-docs' - -export default { - namePrefix, -} diff --git a/src/search/lib/elasticsearch-indexes.ts b/src/search/lib/elasticsearch-indexes.ts new file mode 100644 index 000000000000..4990d229a97b --- /dev/null +++ b/src/search/lib/elasticsearch-indexes.ts @@ -0,0 +1,91 @@ +import languages from '@/languages/lib/languages.js' +import { utcTimestamp } from '@/search/lib/helpers/time' +import { allIndexVersionKeys, versionToIndexVersionMap } from '@/search/lib/elasticsearch-versions' + +import type { SearchTypes } from '@/search/types' + +export type SearchIndexes = { + [key in SearchTypes]: SearchIndex +} + +export type SearchIndex = { + prefix: string + type: string +} + +/* Elasticsearch uses indexes to group categories of data + + We currently have 3 top-level categories of indexes: + 1. General search: This is populated using data from all of our Docs pages + 2. General autocomplete: This is populated using analytics search history in docs-internal-data + 3. AI autocomplete: This is populated with human-readable questions using a GPT query in docs-internal-data + + This file is intended to be the source of truth for Docs Elasticsearch indexes. + + Indexes are in the form: + --- + e.g. 
github-docs-general-search-fpt-en + + might be "tests_" for tests +*/ +const prefix = 'github-docs' +const indexes: SearchIndexes = { + generalSearch: { + prefix, + type: 'general-search', + }, + generalAutocomplete: { + prefix, + type: 'general-autocomplete', + }, + aiSearchAutocomplete: { + prefix, + type: 'ai-search-autocomplete', + }, +} + +// Source of truth for determining the index name for the Elastic Search index given a version and language +export function getElasticSearchIndex( + type: SearchTypes, + version: string, + language: string, + manualPrefix = '', +): { + indexName: string + indexAlias: string +} { + if (!(type in indexes)) { + throw new Error(`Type ${type} not found in indexes for getElasticSearchIndex function.`) + } + const index = indexes[type] as SearchIndex + + // Validate language + if (!(language in languages)) { + throw new Error( + `Language ${language} not found in languages for getElasticSearchIndex function.`, + ) + } + + // Validate version + if (!allIndexVersionKeys.includes(version)) { + throw new Error( + `Version '${version}' does not map to a valid version for getElasticSearchIndex function.`, + ) + } + + // e.g. free-pro-team becomes fpt for the index name + const indexVersion = versionToIndexVersionMap[version] + + // In the index-test-fixtures.sh script, we use the tests_ prefix index for testing + const testPrefix = process.env.NODE_ENV === 'test' ? 'tests_' : '' + + // If a manual prefix is provided, append an underscore to it + if (manualPrefix && !manualPrefix.endsWith('_')) { + manualPrefix += '_' + } + + const indexName = `${testPrefix || manualPrefix}${index.prefix}_${index.type}_${indexVersion}_${language}` + const indexAlias = `${indexName}__${utcTimestamp()}` + + return { indexName, indexAlias } +} diff --git a/src/search/lib/elasticsearch-versions.ts b/src/search/lib/elasticsearch-versions.ts new file mode 100644 index 000000000000..7d6cb0dd070a --- /dev/null +++ b/src/search/lib/elasticsearch-versions.ts @@ -0,0 +1,107 @@ +/* + * Source of truth for versioning in the context of Elasticsearch + * We have a unique index for each version of the docs + * so consistency is important for creating/accessing ES Indexes. + * + * Example versions (these may not be up to date): + * + * 1. free-pro-team@latest. Previously known as "dotcom". This is the default version of the docs. + * - short name: fpt + * 2. enterprise-cloud@latest + * - short name: ghec + * 3. enterprise-server@X: This is the source of versioning complexity since the version is dynamic + * - short name: ghes-X + * + * However, for (3) someone might enter `&version=3.5` as the version in the request query string. + * This would map to `ghes-3.5` + */ + +import { allVersions } from '@/versions/lib/all-versions' + +// versionToIndexVersionMap examples: +// free-pro-team@latest -> fpt +// free-pro-team -> fpt +// dotcom -> fpt +// enterprise-cloud@latest -> ghec +// enterprise-server@3.5 -> ghes-3.5 +// 3.5 -> ghes-3.5 +export const versionToIndexVersionMap: { [key: string]: string } = {} + +// For each potential input (from request query string, CLI, etc), map it to the appropriate index version +for (const versionSource of Object.values(allVersions)) { + if (versionSource.hasNumberedReleases) { + versionToIndexVersionMap[versionSource.currentRelease] = versionSource.miscVersionName + // Map shortname or plan, e.g. `ghes` or `enterprise-server` to the latest release, e.g. 
`ghes-3.14` + if (versionSource.latestRelease === versionSource.currentRelease) { + versionToIndexVersionMap[versionSource.plan] = versionSource.miscVersionName + versionToIndexVersionMap[versionSource.shortName] = versionSource.miscVersionName + } + } else { + versionToIndexVersionMap[versionSource.version] = versionSource.shortName + versionToIndexVersionMap[versionSource.miscVersionName] = versionSource.shortName + // The next two lines map things like `?version=free-pro-team` -> `?version=fpt` + versionToIndexVersionMap[versionSource.plan] = versionSource.shortName + versionToIndexVersionMap[versionSource.shortName] = versionSource.shortName + } +} + +// All of the possible keys that can be input to access a version +export const allIndexVersionKeys = Array.from( + new Set([...Object.keys(versionToIndexVersionMap), ...Object.keys(allVersions)]), +) + +// These should be the only possible values that an ES index will use (source of truth) +// allIndexVersionOptions example: +// fpt, ghec, ghes-3.14, ghes-3.13, ghes-3.12, ghes-3.11, ghes-3.10 +export const allIndexVersionOptions = Array.from( + new Set([...Object.values(versionToIndexVersionMap)]), +) + +// Autocomplete only supports 3 "versions": free-pro-team, enterprise-cloud, and enterprise-server +// docs-internal-data stores data under directories with these names. It does not account for individual enterprise-server versions +// These are the "plan" names on the allVersions object +const allVersionPlans: string[] = [] +for (const version of Object.values(allVersions)) { + if (version.plan) { + allVersionPlans.push(version.plan) + } +} +// Remove duplicates +export const supportedAutocompletePlanVersions = Array.from(new Set(allVersionPlans)) + +// Returns the plan name for the given version +// Needed because {version} in the docs-internal-data paths use the version's 'plan' name, e.g. 
`free-pro-team` instead of `fpt` +export function getPlanVersionFromIndexVersion(indexVersion: string): string { + const planVersion = + Object.values(allVersions).find( + (info) => + info.shortName === indexVersion || + info.plan === indexVersion || + info.miscVersionName === indexVersion || + info.currentRelease === indexVersion, + )?.plan || '' + + if (!planVersion) { + throw new Error(`Plan version not found for index version ${indexVersion}`) + } + + return planVersion +} + +// Gets the matching key from allVersions for the given index version +// This is needed for scraping since the pages use the 'allVersions' key as their version +export function getAllVersionsKeyFromIndexVersion(indexVersion: string): string { + const key = Object.keys(allVersions).find( + (key) => + key === indexVersion || + allVersions[key].shortName === indexVersion || + allVersions[key].plan === indexVersion || + allVersions[key].miscVersionName === indexVersion, + ) + + if (!key) { + throw new Error(`No key found for index version ${indexVersion}`) + } + + return key +} diff --git a/src/search/lib/get-elasticsearch-results/ai-search-autocomplete.ts b/src/search/lib/get-elasticsearch-results/ai-search-autocomplete.ts new file mode 100644 index 000000000000..4d63dd62d3a9 --- /dev/null +++ b/src/search/lib/get-elasticsearch-results/ai-search-autocomplete.ts @@ -0,0 +1,125 @@ +import { Client } from '@elastic/elasticsearch' +import { getElasticsearchClient } from '@/search/lib/helpers/get-client' +import { getHighlightConfiguration } from '@/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config' + +import type { AutocompleteSearchResponse } from '@/search/types' +import type { + AutocompleteMatchQueriesOptions, + AutocompleteResultsArgs, +} from '@/search/lib/get-elasticsearch-results/types' +import type { QueryDslQueryContainer, SearchTotalHits } from '@elastic/elasticsearch/lib/api/types' + +// Query Elasticsearch for AI Search autocomplete results +export async function getAISearchAutocompleteResults({ + indexName, + query, + size, +}: AutocompleteResultsArgs): Promise { + const t0 = new Date() + const client = getElasticsearchClient() as Client + + const matchQueries = getAISearchAutocompleteMatchQueries(query.trim(), { + fuzzy: { + minLength: 3, + maxLength: 20, + }, + }) + const matchQuery = { + bool: { + should: matchQueries, + }, + } + + const highlight = getHighlightConfiguration(query, ['term']) + + const searchQuery = { + index: indexName, + highlight, + size, + query: matchQuery, + _source_includes: ['term'], + } + + const result = await client.search<{ term: string }>(searchQuery) + + const hitsAll = result.hits + const hits = hitsAll.hits.map((hit) => ({ + term: hit._source?.term, + highlights: (hit.highlight && hit.highlight.term) || [], + })) + + return { + meta: { + found: hitsAll.total as SearchTotalHits, + took: { query_msec: result.took, total_msec: new Date().getTime() - t0.getTime() }, + size, + }, + hits, + } +} + +function getAISearchAutocompleteMatchQueries( + query: string, + { fuzzy }: AutocompleteMatchQueriesOptions, +) { + const BOOST_PHRASE = 4.0 + const BOOST_REGULAR = 2.0 + const BOOST_PREFIX = 1.0 + const BOOST_FUZZY = 0.1 + + const matchQueries: QueryDslQueryContainer[] = [] + + // Use match_phrase for exact term matches + matchQueries.push({ + match_phrase: { + term: { + query, + boost: BOOST_PHRASE, + slop: 1, // Allows minor word reordering + }, + }, + }) + + // Use match for general matching + matchQueries.push({ + match: { + term: { + query, + boost: 
BOOST_PREFIX, + }, + }, + }) + + // Match phrase prefix for partial term matches + matchQueries.push({ + match_phrase_prefix: { + term: { + query, + boost: BOOST_PREFIX, + }, + }, + }) + matchQueries.push({ + match_bool_prefix: { + term: { + query, + boost: BOOST_REGULAR, + }, + }, + }) + + // Add fuzzy matching for typos and variations + if (query.length > fuzzy.minLength && query.length < fuzzy.maxLength) { + matchQueries.push({ + fuzzy: { + term: { + value: query, + boost: BOOST_FUZZY, + fuzziness: 'AUTO', + }, + }, + }) + } + + return matchQueries +} diff --git a/src/search/lib/get-elasticsearch-results/general-autocomplete.ts b/src/search/lib/get-elasticsearch-results/general-autocomplete.ts new file mode 100644 index 000000000000..0f3653940e72 --- /dev/null +++ b/src/search/lib/get-elasticsearch-results/general-autocomplete.ts @@ -0,0 +1,100 @@ +import { Client } from '@elastic/elasticsearch' +import { getElasticsearchClient } from '@/search/lib/helpers/get-client' +import { getHighlightConfiguration } from '@/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config' + +import type { QueryDslQueryContainer, SearchTotalHits } from '@elastic/elasticsearch/lib/api/types' +import type { AutocompleteSearchResponse } from '@/search/types' +import type { + AutocompleteMatchQueriesOptions, + AutocompleteResultsArgs, + AutocompleteElasticsearchItem, +} from '@/search/lib/get-elasticsearch-results/types' + +// Query Elasticsearch for general autocomplete results +export async function getAutocompleteSearchResults({ + indexName, + query, + size, +}: AutocompleteResultsArgs): Promise { + const t0 = new Date() + const client = getElasticsearchClient() as Client + + const matchQueries = getAutocompleteMatchQueries(query.trim(), { + fuzzy: { + minLength: 3, + maxLength: 20, + }, + }) + const matchQuery = { + bool: { + should: matchQueries, + }, + } + + const highlight = getHighlightConfiguration(query, ['term']) + + const searchQuery = { + index: indexName, + highlight, + size, + query: matchQuery, + // Send absolutely minimal from Elasticsearch to here. Less data => faster. 
+ _source_includes: ['term'], + } + + const result = await client.search(searchQuery) + + const hitsAll = result.hits + const hits = hitsAll.hits.map((hit) => ({ + term: hit._source?.term, + highlights: (hit.highlight && hit.highlight.term) || [], + })) + + return { + meta: { + found: hitsAll.total as SearchTotalHits, + took: { query_msec: result.took, total_msec: new Date().getTime() - t0.getTime() }, + size, + }, + hits, + } +} + +function getAutocompleteMatchQueries(query: string, { fuzzy }: AutocompleteMatchQueriesOptions) { + const BOOST_PHRASE = 4.0 + const BOOST_REGULAR = 2.0 + const BOOST_FUZZY = 0.1 + + const matchQueries: QueryDslQueryContainer[] = [] + const isMultiWordQuery = query.includes(' ') || query.includes('-') + + if (isMultiWordQuery) { + matchQueries.push({ + match_phrase_prefix: { + term: { + query, + boost: BOOST_PHRASE, + }, + }, + }) + } + + matchQueries.push({ + match_bool_prefix: { + term: { + query, + boost: BOOST_REGULAR, + }, + }, + }) + + if (query.length > fuzzy.minLength && query.length < fuzzy.maxLength) { + matchQueries.push({ + fuzzy: { + term: { value: query, boost: BOOST_FUZZY, fuzziness: 'AUTO' }, + }, + }) + } + + return matchQueries +} diff --git a/src/search/middleware/es-search.js b/src/search/lib/get-elasticsearch-results/general-search.ts similarity index 59% rename from src/search/middleware/es-search.js rename to src/search/lib/get-elasticsearch-results/general-search.ts index a23e2314dec6..263a7b787739 100644 --- a/src/search/middleware/es-search.js +++ b/src/search/lib/get-elasticsearch-results/general-search.ts @@ -1,57 +1,54 @@ -import { Client } from '@elastic/elasticsearch' - -export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content'] -// This needs to match what we *use* in the `` component. -// For example, if we don't display "headings" we shouldn't request -// highlights for it either. -export const DEFAULT_HIGHLIGHT_FIELDS = ['title', 'content'] - -const ELASTICSEARCH_URL = process.env.ELASTICSEARCH_URL +import { getElasticsearchClient } from '@/search/lib/helpers/get-client' +import { DEFAULT_HIGHLIGHT_FIELDS } from '@/search/lib/search-request-params/search-params-objects' +import { getHighlightConfiguration } from '@/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config' + +import type { + SearchHit as ElasticsearchHit, + QueryDslQueryContainer, + SearchRequest, + SearchTotalHits, +} from '@elastic/elasticsearch/lib/api/types' +import type { + AdditionalIncludes, + ComputedSearchQueryParamsMap, +} from '@/search/lib/search-request-params/types' +import type { SearchAggregation, GeneralSearchHit, GeneralSearchResponse } from '@/search/types' const MAX_AGGREGATE_SIZE = 30 -const isDevMode = process.env.NODE_ENV !== 'production' +const isDevMode: boolean = process.env.NODE_ENV !== 'production' -function getClient() { - if (!ELASTICSEARCH_URL) { - // If this was mistakenly not set, it will eventually fail - // when you use the Client. But `new Client({node: undefined})` - // won't throw. And the error you get when you actually do try - // to use that Client instance is cryptic compared to this - // plain and simple thrown error. - throw new Error(`$ELASTICSEARCH_URL is not set`) - } - return new Client({ - node: ELASTICSEARCH_URL, - // The default is 30,000ms but we noticed that the median time is about - // 100-150ms with some occasional searches taking multiple seconds. - // The default `maxRetries` is 3 which is a sensible number. 
-    // If a query gets stuck, it's better to (relatively) quickly give up
-    // and retry. So if it takes longer than this time here, we're banking on
-    // that it was just bad luck and that it'll work if we simply try again.
-    // See internal issue #2318.
-    requestTimeout: 1900,
-    // It's important that requestTimeout * maxRetries is less than 10 seconds.
-    maxRetries: 5,
-  })
+type getGeneralSearchResultsParams = {
+  indexName: string
+  searchParams: ComputedSearchQueryParamsMap['generalSearch']
+  topics?: string[]
+  includeTopics?: boolean
 }
 
-// The true work horse that actually performs the Elasticsearch query
-export async function getSearchResults({
-  indexName,
-  query,
-  page,
-  size,
-  debug,
-  sort,
-  topics,
-  includeTopics,
-  usePrefixSearch,
-  highlights,
-  include,
-  toplevel,
-  aggregate,
-}) {
+// Query Elasticsearch for general search results
+export async function getGeneralSearchResults(
+  args: getGeneralSearchResultsParams,
+): Promise<GeneralSearchResponse> {
+  const {
+    indexName,
+    searchParams: {
+      highlights,
+      include,
+      toplevel,
+      aggregate,
+      autocomplete,
+      query,
+      page,
+      size,
+      debug,
+      sort,
+    },
+    topics,
+    includeTopics,
+  } = args
+
+  const usePrefixSearch = autocomplete
+
   if (topics && !Array.isArray(topics)) {
     throw new Error("'topics' has to be an array")
   }
@@ -71,8 +68,8 @@ export async function getSearchResults({
       throw new Error("Every entry in the 'toplevel' must be a string")
     }
   }
-  const t0 = new Date()
-  const client = getClient()
+  const t0 = Date.now()
+  const client = getElasticsearchClient()
   const from = size * (page - 1)
 
   const matchQueries = getMatchQueries(query.trim(), {
@@ -83,7 +80,7 @@ export async function getSearchResults({
     },
   })
 
-  const matchQuery = {
+  const matchQuery: Record<string, any> = {
     bool: {
       should: matchQueries,
       // This allows filtering by toplevel later.
@@ -91,7 +88,8 @@ export async function getSearchResults({
     },
   }
 
-  const topicsFilter = (topics || []).map((topic) => {
+  const topicsArray = Array.isArray(topics) ? topics : topics ? [topics] : []
+  const topicsFilter = topicsArray.map((topic) => {
     return {
       term: {
         // Remember, 'topics' is a keyword field, meaning you need
@@ -101,15 +99,18 @@ export async function getSearchResults({
     }
   })
   if (topicsFilter.length) {
-    matchQuery.bool.filter = topicsFilter
+    matchQuery.bool.filter = matchQuery.bool.filter || []
+    matchQuery.bool.filter.push(...topicsFilter)
   }
 
-  if (toplevel && toplevel.length) {
-    matchQuery.bool.filter = {
+  const toplevelArray = toplevel || []
+  if (toplevelArray.length) {
+    matchQuery.bool.filter = matchQuery.bool.filter || []
+    matchQuery.bool.filter.push({
       terms: {
-        toplevel,
+        toplevel: toplevelArray,
       },
-    }
+    })
   }
 
   const highlightFields = Array.from(highlights || DEFAULT_HIGHLIGHT_FIELDS)
@@ -121,7 +122,7 @@ export async function getSearchResults({
 
   const aggs = getAggregations(aggregate)
 
-  const searchQuery = {
+  const searchQuery: SearchRequest = {
     index: indexName,
     highlight,
     from,
@@ -136,13 +137,13 @@ export async function getSearchResults({
     _source_includes: ['title', 'url', 'breadcrumbs', 'popularity', 'toplevel'],
   }
 
-  if (includeTopics) {
-    searchQuery._source_includes.push('topics')
+  if (includeTopics && Array.isArray(searchQuery._source_includes)) {
+    searchQuery._source_includes?.push('topics')
   }
 
-  for (const key of ['intro', 'headings']) {
-    if (include.includes(key)) {
-      searchQuery._source_includes.push(key)
+  for (const key of ['intro', 'headings'] as const) {
+    if (include.includes(key) && Array.isArray(searchQuery._source_includes)) {
+      searchQuery._source_includes?.push(key)
     }
   }
 
@@ -193,26 +194,26 @@ export async function getSearchResults({
     highlightFields,
     include,
   })
-  const aggregations = getAggregationsResult(aggregate, result.aggregations)
-  const t1 = new Date()
+  const aggregationsResult = getAggregationsResult(aggregate, result.aggregations)
+  const t1 = Date.now()
 
   const meta = {
-    found: hitsAll.total,
+    found: hitsAll.total as SearchTotalHits,
     took: {
       query_msec: result.took,
-      total_msec: t1.getTime() - t0.getTime(),
+      total_msec: t1 - t0,
     },
     page,
     size,
   }
 
-  return { meta, hits, aggregations }
+  return { meta, hits, aggregations: aggregationsResult }
 }
 
-function getAggregations(aggregate) {
+function getAggregations(aggregate?: string[]): Record<string, any> | undefined {
   if (!aggregate || !aggregate.length) return undefined
 
-  const aggs = {}
+  const aggs: Record<string, any> = {}
   for (const key of aggregate) {
     aggs[key] = {
       terms: {
@@ -224,66 +225,37 @@ export async function getSearchResults({
   return aggs
 }
 
-function getAggregationsResult(aggregate, result) {
-  if (!aggregate || !aggregate.length) return
-  return Object.fromEntries(
-    aggregate.map((key) => [
-      key,
-      result[key].buckets
-        .map((bucket) => {
-          return {
-            key: bucket.key,
-            count: bucket.doc_count,
-          }
-        })
-        .sort((a, b) => a.key.localeCompare(b.key)),
-    ]),
-  )
-}
-
-export async function getAutocompleteSearchResults({ indexName, query, size }) {
-  const client = getClient()
-
-  const matchQueries = getAutocompleteMatchQueries(query.trim(), {
-    fuzzy: {
-      minLength: 3,
-      maxLength: 20,
-    },
-  })
-  const matchQuery = {
-    bool: {
-      should: matchQueries,
-    },
-  }
-
-  const highlight = getHighlightConfiguration(query, ['term'])
-
-  const searchQuery = {
-    index: indexName,
-    highlight,
-    size,
-    query: matchQuery,
-    // Send absolutely minimal from Elasticsearch to here. Less data => faster.
-    _source_includes: ['term'],
-  }
-  const result = await client.search(searchQuery)
-
-  const hitsAll = result.hits
-  const hits = hitsAll.hits.map((hit) => {
-    return {
-      term: hit._source.term,
-      highlights: (hit.highlight && hit.highlight.term) || [],
+function getAggregationsResult(
+  aggregate?: string[],
+  result?: Record<string, any>,
+): Record<string, SearchAggregation[]> | undefined {
+  if (!aggregate || !aggregate.length || !result) return undefined
+  const aggregations: Record<string, SearchAggregation[]> = {}
+  for (const key of aggregate) {
+    if (result[key]?.buckets) {
+      aggregations[key] = result[key].buckets
+        .map((bucket: any) => ({
+          key: bucket.key as string,
+          count: bucket.doc_count as number,
+        }))
+        .sort((a: { key: string }, b: { key: string }) => a.key.localeCompare(b.key))
     }
-  })
-
-  const meta = {
-    found: hitsAll.total,
   }
+  return aggregations
+}
 
-  return { meta, hits }
+interface GetMatchQueriesOptions {
+  usePrefixSearch: boolean
+  fuzzy: {
+    minLength: number
+    maxLength: number
+  }
 }
 
-function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
+function getMatchQueries(
+  query: string,
+  { usePrefixSearch, fuzzy }: GetMatchQueriesOptions,
+): QueryDslQueryContainer[] {
   const BOOST_PHRASE = 10.0
   const BOOST_TITLE = 4.0
   const BOOST_HEADINGS = 3.0
@@ -296,7 +268,7 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
   // which wouldn't find anything else anyway.
   const BOOST_FUZZY = 0.1
 
-  const matchQueries = []
+  const matchQueries: QueryDslQueryContainer[] = []
 
   // If the query input is multiple words, it's good to know because you can
   // make the query do `match_phrase` and you can make `match` query
@@ -453,12 +425,12 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
   } else if (query.startsWith('http')) {
     // E.g. `https://docs.github.com/en/some/page?foo=bar`
     // will become a search on `{url: '/en/some/page'}`
-    let pathname
+    let pathname: string | undefined
     try {
       pathname = new URL(query).pathname
     } catch {
       // If it failed, it can't be initialized with the `URL` constructor
-      // we so we can deem it *not* a valid URL.
+      // so we can deem it *not* a valid URL.
     }
     if (pathname) {
       matchQueries.push({
@@ -471,47 +443,18 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
   return matchQueries
 }
 
-function getAutocompleteMatchQueries(query, { fuzzy }) {
-  const BOOST_PHRASE = 4.0
-  const BOOST_REGULAR = 2.0
-  const BOOST_FUZZY = 0.1 // make it always last in ranking
-  const matchQueries = []
-
-  // If the query input is multiple words, it's good to know because you can
-  // make the query do `match_phrase` and you can make `match` query
-  // with the `AND` operator (`OR` is the default).
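Not part of the patch — for orientation, this is roughly the `bool.should` that both the old and the new autocomplete helpers compose for a multi-word input such as `'github act'` (boosts mirror the constants above):

```ts
const query = {
  bool: {
    should: [
      // multi-word, so the phrase-prefix clause is added, boosted highest
      { match_phrase_prefix: { term: { query: 'github act', boost: 4.0 } } }, // BOOST_PHRASE
      { match_bool_prefix: { term: { query: 'github act', boost: 2.0 } } }, // BOOST_REGULAR
      // 3 < 'github act'.length (10) < 20, so the fuzzy clause applies too
      { fuzzy: { term: { value: 'github act', boost: 0.1, fuzziness: 'AUTO' } } }, // BOOST_FUZZY
    ],
  },
}
```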
-  const isMultiWordQuery = query.includes(' ') || query.includes('-')
-
-  if (isMultiWordQuery) {
-    matchQueries.push({
-      match_phrase_prefix: {
-        term: {
-          query,
-          boost: BOOST_PHRASE,
-        },
-      },
-    })
-  }
-  matchQueries.push({
-    match_bool_prefix: {
-      term: {
-        query,
-        boost: BOOST_REGULAR,
-      },
-    },
-  })
-  if (query.length > fuzzy.minLength && query.length < fuzzy.maxLength) {
-    matchQueries.push({
-      fuzzy: {
-        term: { value: query, boost: BOOST_FUZZY, fuzziness: 'AUTO' },
-      },
-    })
-  }
-
-  return matchQueries
+interface GetHitsOptions {
+  indexName: string
+  debug?: boolean
+  includeTopics?: boolean
+  highlightFields: string[]
+  include: AdditionalIncludes[]
 }
 
-function getHits(hits, { indexName, debug, includeTopics, highlightFields, include }) {
+function getHits(
+  hits: ElasticsearchHit[],
+  { indexName, debug = false, includeTopics = false, highlightFields, include }: GetHitsOptions,
+): GeneralSearchHit[] {
   return hits.map((hit) => {
     // Return `hit.highlights[...]` based on the highlight fields requested.
     // So if you searched with `&highlights=headings&highlights=content`
@@ -521,11 +464,12 @@ function getHits(hits, { indexName, debug, includeTopics, highlightFields, inclu
     //    headings: [...]
     // }
     // even if there was a match on 'title'.
-    const hitHighlights = Object.fromEntries(
-      highlightFields.map((key) => [key, (hit.highlight && hit.highlight[key]) || []]),
-    )
+    const hitHighlights: Record<string, string[]> = {}
+    for (const key of highlightFields) {
+      hitHighlights[key] = (hit.highlight && hit.highlight[key]) || []
+    }
 
-    const result = {
+    const result: GeneralSearchHit = {
       id: hit._id,
       url: hit._source.url,
       title: hit._source.title,
@@ -536,87 +480,15 @@ function getHits(hits, { indexName, debug, includeTopics, highlightFields, inclu
       result.topics = hit._source.topics || []
     }
     if (debug) {
-      result.score = hit._score || 0.0
-      result.popularity = hit._source.popularity || 0.0
+      result.score = hit._score ?? 0.0
+      result.popularity = hit._source.popularity ?? 0.0
       if (isDevMode) {
        result.es_url = `http://localhost:9200/${indexName}/_doc/${hit._id}`
      }
    }
-    for (const field of include || []) {
+    for (const field of include) {
      result[field] = hit._source[field]
    }
    return result
  })
}
-
-// The highlight configuration is dependent on how we use the content
-// in the UI. For example, we feel we need about 3 lines (max)
-// of highlights of content under each title. If we feel it shows too
-// many highlights in the search result UI, we can come back here
-// and change it to something more appropriate.
-function getHighlightConfiguration(query, highlights) {
-  const fields = {}
-  if (highlights.includes('title')) {
-    fields.title = {
-      // Fast Vector Highlighter
-      // Using this requires that you first index these fields
-      // with {term_vector: 'with_positions_offsets'}
-      type: 'fvh',
-      fragment_size: 200,
-      number_of_fragments: 1,
-    }
-  }
-  if (highlights.includes('content')) {
-    // The 'no_match_size' is so we can display *something* for the
-    // preview if there was no highlight match at all within the content.
-    fields.content = {
-      // Fast Vector Highlighter
-      // Using this requires that you first index these fields
-      // with {term_vector: 'with_positions_offsets'}
-      type: 'fvh',
-      fragment_size: 150,
-      number_of_fragments: 1,
-      no_match_size: 150,
-
-      highlight_query: {
-        match_phrase_prefix: {
-          content: {
-            query,
-          },
-        },
-      },
-    }
-    fields.content_explicit = {
-      // Fast Vector Highlighter
-      // Using this requires that you first index these fields
-      // with {term_vector: 'with_positions_offsets'}
-      type: 'fvh',
-      fragment_size: 150,
-      number_of_fragments: 1,
-      no_match_size: 0,
-
-      highlight_query: {
-        match_phrase_prefix: {
-          content_explicit: {
-            query,
-          },
-        },
-      },
-    }
-  }
-  if (highlights.includes('term')) {
-    fields.term = {
-      // Fast Vector Highlighter
-      // Using this requires that you first index these fields
-      // with {term_vector: 'with_positions_offsets'}
-      type: 'fvh',
-      // fragment_size: 200,
-      // number_of_fragments: 1,
-    }
-  }
-  return {
-    pre_tags: ['<mark>'],
-    post_tags: ['</mark>'],
-    fields,
-  }
-}
diff --git a/src/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config.ts b/src/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config.ts
new file mode 100644
index 000000000000..98c9bc5ff6ec
--- /dev/null
+++ b/src/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config.ts
@@ -0,0 +1,86 @@
+import { SearchHighlight } from '@elastic/elasticsearch/lib/api/types'
+
+import type { HighlightOptions } from '@/search/lib/search-request-params/types'
+
+export interface HighlightConfig {
+  type: string
+  fragment_size?: number
+  number_of_fragments?: number
+  no_match_size?: number
+  highlight_query?: object
+}
+
+export type HighlightFields = {
+  [key in HighlightOptions]: HighlightConfig
+}
+
+// When we query Elasticsearch, we can specify a highlight configuration
+export function getHighlightConfiguration(
+  query: string,
+  highlightsFields: HighlightOptions[],
+): SearchHighlight {
+  const fields = {} as HighlightFields
+  if (highlightsFields.includes('title')) {
+    fields.title = {
+      // Fast Vector Highlighter
+      // Using this requires that you first index these fields
+      // with {term_vector: 'with_positions_offsets'}
+      type: 'fvh',
+      fragment_size: 200,
+      number_of_fragments: 1,
+    }
+  }
+  if (highlightsFields.includes('content')) {
+    fields.content = {
+      // Fast Vector Highlighter
+      // Using this requires that you first index these fields
+      // with {term_vector: 'with_positions_offsets'}
+      type: 'fvh',
+      fragment_size: 150,
+      number_of_fragments: 1,
+      // So we can at least display something if there was no highlight match within the content.
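+      // (With `no_match_size: 150`, Elasticsearch falls back to returning the first
+      // ~150 characters of `content` as the snippet when the query didn't match inside it.)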
+      no_match_size: 150,
+
+      highlight_query: {
+        match_phrase_prefix: {
+          content: {
+            query,
+          },
+        },
+      },
+    }
+    fields.content_explicit = {
+      // Fast Vector Highlighter
+      // Using this requires that you first index these fields
+      // with {term_vector: 'with_positions_offsets'}
+      type: 'fvh',
+      fragment_size: 150,
+      number_of_fragments: 1,
+      no_match_size: 0,
+
+      highlight_query: {
+        match_phrase_prefix: {
+          content_explicit: {
+            query,
+          },
+        },
+      },
+    }
+  }
+  if (highlightsFields.includes('term')) {
+    fields.term = {
+      // Fast Vector Highlighter
+      // Using this requires that you first index these fields
+      // with {term_vector: 'with_positions_offsets'}
+      type: 'fvh',
+    }
+  }
+
+  const highlightConfig: SearchHighlight = {
+    pre_tags: ['<mark>'],
+    post_tags: ['</mark>'],
+    fields,
+  }
+
+  return highlightConfig
+}
diff --git a/src/search/lib/get-elasticsearch-results/types.ts b/src/search/lib/get-elasticsearch-results/types.ts
new file mode 100644
index 000000000000..da6fa59f5612
--- /dev/null
+++ b/src/search/lib/get-elasticsearch-results/types.ts
@@ -0,0 +1,23 @@
+export interface AutocompleteResultsArgs {
+  indexName: string
+  query: string
+  size: number
+}
+
+export interface FuzzyConfig {
+  minLength: number
+  maxLength: number
+}
+
+export interface MatchQueriesOptions {
+  usePrefixSearch?: boolean
+  fuzzy: FuzzyConfig
+}
+
+export interface AutocompleteMatchQueriesOptions {
+  fuzzy: FuzzyConfig
+}
+
+export interface AutocompleteElasticsearchItem {
+  term: string
+}
diff --git a/src/search/lib/helpers/get-client.ts b/src/search/lib/helpers/get-client.ts
new file mode 100644
index 000000000000..b6b4c1106452
--- /dev/null
+++ b/src/search/lib/helpers/get-client.ts
@@ -0,0 +1,31 @@
+import { Client } from '@elastic/elasticsearch'
+import { safeUrlDisplay } from '@/search/lib/helpers/strings'
+
+export function getElasticsearchClient(overrideURL = '', verbose = false): Client {
+  const node = getElasticsearchURL(overrideURL)
+  if (verbose) {
+    console.log('Connecting to Elasticsearch URL:', safeUrlDisplay(node))
+  }
+  const client = new Client({ node })
+  return client
+}
+
+function getElasticsearchURL(overrideURL = ''): string {
+  if (!process.env.ELASTICSEARCH_URL && !overrideURL) {
+    throw new Error(
+      'Must pass the elasticsearch URL option or ' +
+        'set the environment variable ELASTICSEARCH_URL',
+    )
+  }
+  let node = overrideURL || process.env.ELASTICSEARCH_URL || ''
+
+  // Allow the user to lazily set it to `localhost:9200` for example.
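+  // e.g. ELASTICSEARCH_URL=localhost:9200 is normalized to http://localhost:9200 below,
+  // while a fully qualified URL is passed through untouched.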
+ if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) { + node = `http://${node}` + } + + const parsed = new URL(node) + if (!parsed.hostname) throw new Error('no valid hostname') + + return node +} diff --git a/src/search/lib/helpers/old-version-logic.ts b/src/search/lib/helpers/old-version-logic.ts new file mode 100644 index 000000000000..137c63c71495 --- /dev/null +++ b/src/search/lib/helpers/old-version-logic.ts @@ -0,0 +1,44 @@ +import { allVersions } from '@/versions/lib/all-versions' + +// TODO: Old version logic +type VersionAliases = { [key: string]: string } +export const versionAliases: VersionAliases = {} +export const prefixVersionAliases: VersionAliases = {} + +Object.values(allVersions).forEach((info) => { + if (info.hasNumberedReleases) { + versionAliases[info.currentRelease] = info.miscVersionName + } else { + versionAliases[info.version] = info.miscVersionName + versionAliases[info.miscVersionName] = info.miscVersionName + } + prefixVersionAliases[info.plan] = info.shortName + prefixVersionAliases[info.shortName] = info.shortName +}) + +// Temporary hard-coded switch +// +// We need to run workflows in production to index the search data +// We want the middleware + routes that consume the indexes to consume the old indexes +// until the new indexes are ready. + +// Once they are ready we can remove this file & cleanup the places it is used +export function isBeforeSearchIndexMigration() { + if (process.env.NODE_ENV === 'production') return true + return false +} + +// Old test prefix helper function +export function getGeneralSearchIndexPrefix(): string { + if (process.env.NODE_ENV === 'test') return 'tests_' + return '' +} + +export function getGeneralSearchIndexVersion(paramVersion: string): string { + const version = + prefixVersionAliases[paramVersion] || + versionAliases[paramVersion] || + allVersions[paramVersion].miscVersionName + + return version +} diff --git a/src/search/lib/helpers/strings.ts b/src/search/lib/helpers/strings.ts new file mode 100644 index 000000000000..d8ca26383cc4 --- /dev/null +++ b/src/search/lib/helpers/strings.ts @@ -0,0 +1,10 @@ +export function safeUrlDisplay(url: string): string { + const parsed = new URL(url) + if (parsed.password) { + parsed.password = '***' + } + if (parsed.username) { + parsed.username = parsed.username.slice(0, 4) + '***' + } + return parsed.toString() +} diff --git a/src/search/scripts/index/lib/utils.ts b/src/search/lib/helpers/time.ts similarity index 56% rename from src/search/scripts/index/lib/utils.ts rename to src/search/lib/helpers/time.ts index 779ba85c6990..36579358d4ef 100644 --- a/src/search/scripts/index/lib/utils.ts +++ b/src/search/lib/helpers/time.ts @@ -33,3 +33,28 @@ export function utcTimestamp() { .join('') ) } + +/** + * Converts a given number of seconds into a formatted time string "HH:mm:ss". + * + * @param {number} seconds - The total number of seconds to format. + * @returns {string} A string representing the time in "hours:minutes:seconds" format. 
+ *
+ * @example
+ * // returns "01:30:45"
+ * formatSecondsToHHMMSS(5445);
+ */
+export function formatSecondsToHHMMSS(seconds: number): string {
+  return new Date(seconds * 1000).toISOString().slice(11, 19)
+}
+
+export function readableTimeMinAndSec(ms: number): string {
+  if (ms < 1000) {
+    return `${ms.toFixed(1)}ms`
+  }
+  const seconds = ms / 1000
+  if (seconds > 60) {
+    // Floor the minutes so that e.g. 119s renders as "1m59s", not "2m59s"
+    return `${Math.floor(seconds / 60)}m${Math.round(seconds % 60)}s`
+  }
+  return `${seconds.toFixed(1)}s`
+}
diff --git a/src/search/lib/search-request-params/get-search-from-request-params.ts b/src/search/lib/search-request-params/get-search-from-request-params.ts
new file mode 100644
index 000000000000..1ae3fd38ad85
--- /dev/null
+++ b/src/search/lib/search-request-params/get-search-from-request-params.ts
@@ -0,0 +1,96 @@
+import type { Request } from 'express'
+import { format } from 'node:util'
+
+import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'
+import {
+  ValidationError,
+  getSearchRequestParamsObject,
+} from '@/search/lib/search-request-params/search-params-objects'
+import {
+  getGeneralSearchIndexVersion,
+  getGeneralSearchIndexPrefix,
+  isBeforeSearchIndexMigration,
+} from '@/search/lib/helpers/old-version-logic'
+
+import type {
+  ComputedSearchQueryParams,
+  ComputedSearchQueryParamsMap,
+  GetSearchRequestReturn,
+} from '@/search/lib/search-request-params/types'
+import type { SearchTypes, SearchValidationErrorEntry } from '@/search/types'
+
+type ForceParams = {
+  [K in keyof ComputedSearchQueryParams]?: ComputedSearchQueryParams[K]
+}
+
+// Fetches the search params object for the given request type and uses it to validate
+// the incoming request parameters. For example, a general search request expects a
+// `page` key (e.g. ?page=1); if a required key is missing and has no default, an entry
+// is added to the validation errors array, which results in a 400 to the user.
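+// Illustration (hypothetical request, not part of this module): a call such as
+//   GET /api/search/v1?version=free-pro-team        <- no truthy ?query=
+// produces a validation entry like
+//   { error: "No truthy value for key 'query'", key: 'query' }
+// and the API routes respond with res.status(400).json(validationErrors[0]).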
+export function getSearchFromRequestParams<Type extends SearchTypes>(
+  req: Request,
+  type: Type,
+  forceParams: ForceParams = {} as ForceParams,
+): GetSearchRequestReturn<Type> {
+  const searchParamsObject = getSearchRequestParamsObject(type)
+
+  const searchParams: ComputedSearchQueryParamsMap[Type] = {} as ComputedSearchQueryParamsMap[Type]
+  const validationErrors: SearchValidationErrorEntry[] = []
+
+  for (const { key, default_, cast, validate, multiple } of searchParamsObject) {
+    if (key in forceParams) {
+      ;(searchParams[key] as any) = forceParams[key] as any
+      continue
+    }
+
+    let value = req.query[key]
+    if (!value || (typeof value === 'string' && !value.trim())) {
+      if (default_ === undefined) {
+        validationErrors.push({ error: `No truthy value for key '${key}'`, key })
+        continue
+      }
+      value = default_
+    }
+    if (cast) {
+      value = cast(value)
+    }
+    try {
+      if (validate && !validate(value)) {
+        validationErrors.push({
+          error: format('Not a valid value (%O) for key %O', value, key),
+          key,
+        })
+      }
+    } catch (err) {
+      if (err instanceof ValidationError) {
+        validationErrors.push({ error: err.toString(), field: key })
+      } else {
+        throw err
+      }
+    }
+    if (!multiple && Array.isArray(value)) {
+      validationErrors.push({
+        error: format('Cannot have multiple values (%O) for key %O', value, key),
+        key,
+      })
+    }
+
+    ;(searchParams[key] as any) = value
+  }
+
+  let indexName = ''
+  if (!validationErrors.length) {
+    // generalSearch is the only type of search that uses the old index prefix logic, rather than the `getElasticSearchIndex` function logic
+    if (type === 'generalSearch' && isBeforeSearchIndexMigration()) {
+      indexName = `${getGeneralSearchIndexPrefix()}github-docs-${getGeneralSearchIndexVersion(searchParams.version)}-${searchParams.language}`
+    } else {
+      const getIndexResults = getElasticSearchIndex(
+        type,
+        searchParams.version,
+        searchParams.language,
+      )
+      indexName = getIndexResults.indexName
+    }
+  }
+
+  return { indexName, searchParams, validationErrors }
+}
diff --git a/src/search/lib/search-request-params/search-params-objects.ts b/src/search/lib/search-request-params/search-params-objects.ts
new file mode 100644
index 000000000000..76dfce35e61e
--- /dev/null
+++ b/src/search/lib/search-request-params/search-params-objects.ts
@@ -0,0 +1,153 @@
+/*
+  When a request is made to a /search endpoint with query parameters, e.g. ?query=foo&version=free-pro-team,
+  we need to validate and parse the parameters. This file contains the configuration for which parameters
+  to expect based on the type of search request (e.g. general search vs autocomplete search) and how to validate them.
+ */
+import languages from '@/languages/lib/languages'
+import { allIndexVersionKeys, versionToIndexVersionMap } from '@/search/lib/elasticsearch-versions'
+import { SearchTypes } from '@/search/types'
+import { versionAliases } from '@/search/lib/helpers/old-version-logic'
+import { allVersions } from '@/versions/lib/all-versions'
+
+import type { SearchRequestQueryParams } from '@/search/lib/search-request-params/types'
+
+// Entry point to this file: returns the query parameters to expect based on the type of search request
+export function getSearchRequestParamsObject(type: SearchTypes): SearchRequestQueryParams[] {
+  if (type === 'generalAutocomplete') {
+    return AUTOCOMPLETE_PARAMS_OBJ
+  } else if (type === 'aiSearchAutocomplete') {
+    return AI_SEARCH_AUTOCOMPLETE_PARAMS_OBJ
+  }
+  return GENERAL_SEARCH_PARAMS_OBJ
+}
+
+// - - - Everything below this line is for building the search query param objects - - - //
+
+// Constants
+const DEFAULT_AUTOCOMPLETE_SIZE = 5
+const MAX_AUTOCOMPLETE_SIZE = 10
+const DEFAULT_SIZE = 10
+const MAX_SIZE = 50
+const DEFAULT_PAGE = 1
+const POSSIBLE_SORTS = ['best', 'relevance'] as const
+const DEFAULT_SORT = POSSIBLE_SORTS[0]
+const MAX_PAGE = 10
+const V1_AGGREGATES = ['toplevel'] as const
+export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content'] as const
+// This needs to match what we *use* in the `<SearchResults>` component.
+// For example, if we don't display "headings" we shouldn't request
+// highlights for it either.
+export const DEFAULT_HIGHLIGHT_FIELDS: readonly string[] = ['title', 'content']
+
+export const V1_ADDITIONAL_INCLUDES = ['intro', 'headings', 'toplevel'] as const
+
+export class ValidationError extends Error {}
+
+const SHARED_PARAMS_OBJ: SearchRequestQueryParams[] = [
+  { key: 'query' },
+  {
+    key: 'version',
+    default_: 'free-pro-team',
+    validate: (version: string) => {
+      if (!versionToIndexVersionMap[version]) {
+        throw new ValidationError(`'${version}' not in ${allIndexVersionKeys.join(', ')}`)
+      }
+      return true
+    },
+  },
+]
+
+const GENERAL_SEARCH_PARAMS_OBJ: SearchRequestQueryParams[] = [
+  ...SHARED_PARAMS_OBJ,
+  { key: 'query' },
+  // TODO: Overwrite with old version logic for now
+  {
+    key: 'version',
+    default_: 'dotcom',
+    validate: (v) => {
+      if (versionAliases[v] || allVersions[v]) return true
+      const valid = [...Object.keys(versionAliases), ...Object.keys(allVersions)]
+      throw new ValidationError(`'${v}' not in ${valid}`)
+    },
+  },
+  { key: 'language', default_: 'en', validate: (v) => v in languages },
+  {
+    key: 'size',
+    default_: DEFAULT_SIZE,
+    cast: (v) => parseInt(v, 10),
+    validate: (v) => v >= 0 && v <= MAX_SIZE,
+  },
+  {
+    key: 'page',
+    default_: DEFAULT_PAGE,
+    cast: (v) => parseInt(v, 10),
+    validate: (v) => v >= 1 && v <= MAX_PAGE,
+  },
+  { key: 'sort', default_: DEFAULT_SORT, validate: (v) => POSSIBLE_SORTS.includes(v as any) },
+  {
+    key: 'highlights',
+    default_: DEFAULT_HIGHLIGHT_FIELDS,
+    cast: (v) => (Array.isArray(v) ? v : [v]),
+    multiple: true,
+    validate: (v) => {
+      for (const highlight of v) {
+        if (!POSSIBLE_HIGHLIGHT_FIELDS.includes(highlight)) {
+          throw new ValidationError(`highlight value '${highlight}' is not valid`)
+        }
+      }
+      return true
+    },
+  },
+  { key: 'autocomplete', default_: false, cast: toBoolean },
+  { key: 'debug', default_: process.env.NODE_ENV === 'development', cast: toBoolean },
+  {
+    key: 'include',
+    default_: [],
+    cast: toArray,
+    multiple: true,
+    validate: (values) =>
+      values.every((value: string) => V1_ADDITIONAL_INCLUDES.includes(value as any)),
+  },
+  {
+    key: 'toplevel',
+    default_: [],
+    cast: toArray,
+    multiple: true,
+  },
+  {
+    key: 'aggregate',
+    default_: [],
+    cast: toArray,
+    multiple: true,
+    validate: (values) => values.every((value: string) => V1_AGGREGATES.includes(value as any)),
+  },
+]
+
+const SHARED_AUTOCOMPLETE_PARAMS_OBJ: SearchRequestQueryParams[] = [
+  {
+    key: 'size',
+    default_: DEFAULT_AUTOCOMPLETE_SIZE,
+    cast: (size: string) => parseInt(size, 10),
+    validate: (size: number) => size >= 0 && size <= MAX_AUTOCOMPLETE_SIZE,
+  },
+]
+
+const AI_SEARCH_AUTOCOMPLETE_PARAMS_OBJ: SearchRequestQueryParams[] = [
+  ...SHARED_PARAMS_OBJ,
+  ...SHARED_AUTOCOMPLETE_PARAMS_OBJ,
+  { key: 'language', default_: 'en', validate: (language: string) => language === 'en' },
+]
+
+const AUTOCOMPLETE_PARAMS_OBJ: SearchRequestQueryParams[] = [
+  ...SHARED_PARAMS_OBJ,
+  ...SHARED_AUTOCOMPLETE_PARAMS_OBJ,
+  { key: 'language', default_: 'en', validate: (language: string) => language in languages },
+]
+
+function toBoolean(value: any): boolean {
+  return value === 'true' || value === '1'
+}
+
+function toArray(value: any): any[] {
+  return Array.isArray(value) ? value : [value]
+}
diff --git a/src/search/lib/search-request-params/types.ts b/src/search/lib/search-request-params/types.ts
new file mode 100644
index 000000000000..e9673c767769
--- /dev/null
+++ b/src/search/lib/search-request-params/types.ts
@@ -0,0 +1,52 @@
+import { V1_ADDITIONAL_INCLUDES } from '@/search/lib/search-request-params/search-params-objects'
+
+import { SearchTypes, SearchValidationErrorEntry } from '@/search/types'
+
+export type HighlightOptions = 'title' | 'content' | 'content_explicit' | 'term'
+
+export type AdditionalIncludes = (typeof V1_ADDITIONAL_INCLUDES)[number]
+
+export interface ComputedSearchQueryParams {
+  query: string
+  size: number
+  version: string
+  language: string
+  // These are optional, so we need to use ComputedSearchQueryParamsMap in functions to get the exact types per Search Type
+  page?: number
+  sort?: string
+  highlights?: HighlightOptions[]
+  autocomplete?: boolean
+  debug?: boolean
+  include?: AdditionalIncludes[]
+  toplevel?: string[]
+  aggregate?: string[]
+}
+
+export interface ComputedSearchQueryParamsMap {
+  generalSearch: ComputedSearchQueryParams & {
+    page: number
+    sort: string
+    highlights: HighlightOptions[]
+    autocomplete: boolean
+    debug: boolean
+    include: AdditionalIncludes[]
+    toplevel: string[]
+    aggregate: string[]
+  }
+  generalAutocomplete: ComputedSearchQueryParams
+  aiSearchAutocomplete: ComputedSearchQueryParams
+}
+
+export interface SearchRequestQueryParams {
+  key: keyof ComputedSearchQueryParams
+  default_?: any
+  cast?: (value: any) => any
+  validate?: (value: any) => boolean
+  multiple?: boolean
+}
+
+export interface GetSearchRequestReturn<Type extends SearchTypes> {
+  indexName: string
+  searchParams: ComputedSearchQueryParamsMap[Type]
+  validationErrors: SearchValidationErrorEntry[]
+}
diff --git a/src/search/middleware/contextualize.js
b/src/search/middleware/contextualize.js deleted file mode 100644 index 20462c06c88d..000000000000 --- a/src/search/middleware/contextualize.js +++ /dev/null @@ -1,153 +0,0 @@ -import got from 'got' -import { errors } from '@elastic/elasticsearch' -import statsd from '#src/observability/lib/statsd.js' - -import { getPathWithoutVersion, getPathWithoutLanguage } from '#src/frame/lib/path-utils.js' -import { getSearchFromRequest } from './get-search-request.js' -import { getSearchResults } from './es-search.js' - -export default async function contextualizeSearch(req, res, next) { - // If it's NextJS fetching or data or it's a direct request, - // the pagePath is the "normalized" version - const { pagePath } = req - if (getPathWithoutLanguage(getPathWithoutVersion(pagePath)) !== '/search') { - return next() - } - - // When you use `/api/search/v1?version=foo&language=xy&...` - // the language and version comes from the query string. - // When you use `/xz/enterprise-cloud@latest/search?query=hello` - // the `version` and `language` is implied from the URL pathname. - // search.version = req.context.currentVersion - // search.language = req.context.currentLanguage - - const { search, validationErrors } = getSearchFromRequest(req, { - version: req.context.currentVersion, - language: req.context.currentLanguage, - }) - - if (validationErrors.map((error) => error.key).includes('query')) { - // 'query' is such an exception because the search result component - // will attempt to display its value even if there was any - // validation error. In a sense, it displays: - // - // You searched for "foo" - // But your 'page' parameter is invalid. - // - // If for example, the search input is an array, we pick the first - // value. If it's too long, we truncate it. - if (Array.isArray(search.query)) { - search.query = search.query[0] - } else if (!search.query) { - // If the 'query' query string parameter wasn't even present, - // it becomes `undefined`. But since `search.query` needs to be - // a *string*, we pretend it was provided but empty. - search.query = '' - } - } - - // This enables so that when the search is sent to Elasticsearch - // it will request an aggregate by these keyword fields. - search.aggregate = ['toplevel'] - - req.context.search = { search, validationErrors } - - if (!validationErrors.length && search.query) { - if (!process.env.ELASTICSEARCH_URL) { - // This is only true in local dev or in Preview environments. - // And in local dev, it's usually for content contributors who - // want to test a preview locally, but don't want to have to - // set up Elasticsearch. - // This same proxying logic happens in `middleware/api/index.js` - // too for the outwards facing `/api/search/v1` endpoint. - if (search.aggregate && search.toplevel && search.toplevel.length > 0) { - // Do 2 searches. One without filtering - const { toplevel, ...searchWithoutFilter } = search - searchWithoutFilter.size = 0 - const { aggregations } = await getProxySearch(searchWithoutFilter) - const { aggregate, ...searchWithoutAggregate } = search - req.context.search.results = await getProxySearch(searchWithoutAggregate) - req.context.search.results.aggregations = aggregations - } else { - req.context.search.results = await getProxySearch(search) - } - } else { - // If this throws, so be it. Let it bubble up. - // In local dev, you get to see the error. In production, - // you get a "Oops! Something went wrong" which involves a Failbot - // send. 
- const tags = [`indexName:${search.indexName}`, `toplevels:${search.toplevel.length}`] - const timed = statsd.asyncTimer(getSearchResults, 'contextualize.search', tags) - try { - if (search.aggregate && search.toplevel && search.toplevel.length > 0) { - // Do 2 searches. One without filtering - const { toplevel, ...searchWithoutFilter } = search - searchWithoutFilter.size = 0 - const { aggregations } = await timed(searchWithoutFilter) - req.context.search.results = await timed(search) - req.context.search.results.aggregations = aggregations - } else { - req.context.search.results = await timed(search) - } - } catch (error) { - // If the error coming from the Elasticsearch client is any sort - // of 4xx error, it will be bubbled up to the next middleware - // which might think something else is wrong with the *client's* - // request from the outside. But in reality it's not their fault. - // It's our fault in the backend side. So we throw a new error - // so that this failure to seach ultimately bubbles up to a - // proper 500 error (including Failbot reporting). - // In particular, this helps platform developers working on the - // Elasticsearch searching code. - if (error instanceof errors.ElasticsearchClientError) { - console.error('Error calling getSearchResults(%s):', search, error) - if (error.meta?.body) { - console.error(`Meta:`, error.meta.body) - } - throw new Error(error.message) - } else { - throw error - } - } - } - } - - return next() -} - -// When you use the proxy to prod, using its API, we need to "convert" -// the parameters we have figured out here in the contextualizer. -// Thankfully all the names match. For example, we might figure -// the page by doing `req.context.search.page = 123` and now we need to -// add that to the query string for the `/api/search/v1`. -// We inclusion-list all the keys that we want to take from the search -// object into the query string URL. -const SEARCH_KEYS_TO_QUERY_STRING = [ - 'query', - 'version', - 'language', - 'page', - 'aggregate', - 'toplevel', - 'size', -] - -async function getProxySearch(search) { - const url = new URL('https://docs.github.com/api/search/v1') - for (const key of SEARCH_KEYS_TO_QUERY_STRING) { - const value = search[key] - if (typeof value === 'boolean') { - url.searchParams.set(key, value ? 'true' : 'false') - } else if (Array.isArray(value)) { - for (const v of value) { - url.searchParams.append(key, v) - } - } else if (typeof value === 'number') { - url.searchParams.set(key, `${value}`) - } else if (value) { - url.searchParams.set(key, value) - } - } - console.log(`Proxying search to ${url}`) - return got(url).json() -} diff --git a/src/search/middleware/general-search-middleware.ts b/src/search/middleware/general-search-middleware.ts new file mode 100644 index 000000000000..5613d2bb7ef4 --- /dev/null +++ b/src/search/middleware/general-search-middleware.ts @@ -0,0 +1,174 @@ +/* +This file & middleware is for when a user requests our /search page e.g. 'docs.github.com/search?query=foo' + We make whatever search is in the ?query= parameter and attach it to req.search + req.search is then consumed by the search component in 'src/search/pages/search.tsx' + +When a user directly hits our API e.g. 
/api/search/v1?query=foo, they will hit the routes in ./search-routes.ts
+*/
+
+import got from 'got'
+import { Request, Response, NextFunction } from 'express'
+import { errors } from '@elastic/elasticsearch'
+import statsd from '@/observability/lib/statsd.js'
+
+import { getPathWithoutVersion, getPathWithoutLanguage } from '@/frame/lib/path-utils'
+import { getGeneralSearchResults } from '@/search/lib/get-elasticsearch-results/general-search'
+import { getSearchFromRequestParams } from '@/search/lib/search-request-params/get-search-from-request-params'
+
+import type { ComputedSearchQueryParamsMap } from '@/search/lib/search-request-params/types'
+import type {
+  GeneralSearchResponse,
+  SearchOnReqObject,
+  SearchTypes,
+  SearchValidationErrorEntry,
+} from '@/search/types.js'
+
+interface Context<Type extends SearchTypes> {
+  currentVersion: string
+  currentLanguage: string
+  search: SearchOnReqObject<Type>
+}
+
+interface CustomRequest<Type extends SearchTypes> extends Request {
+  pagePath: string
+  context: Context<Type>
+}
+
+export default async function contextualizeGeneralSearch(
+  req: CustomRequest<'generalSearch'>,
+  res: Response,
+  next: NextFunction,
+): Promise<void> {
+  const { pagePath } = req
+  if (getPathWithoutLanguage(getPathWithoutVersion(pagePath)) !== '/search') {
+    return next()
+  }
+
+  // Since this is middleware, language & version have already been set on req.context by an earlier middleware
+  const { indexName, searchParams, validationErrors } = getSearchFromRequestParams(
+    req,
+    'generalSearch',
+    // Force the version and language keys to be set from the `req.context` object
+    {
+      version: req.context.currentVersion,
+      language: req.context.currentLanguage,
+    },
+  )
+
+  if (validationErrors.map((error: SearchValidationErrorEntry) => error.key).includes('query')) {
+    if (Array.isArray(searchParams.query)) {
+      searchParams.query = searchParams.query[0]
+    } else if (!searchParams.query) {
+      searchParams.query = '' // If 'undefined' we need to cast to string
+    }
+  }
+
+  searchParams.aggregate = ['toplevel']
+
+  req.context.search = {
+    searchParams,
+    validationErrors,
+  }
+
+  if (!validationErrors.length && searchParams.query) {
+    // In local dev ELASTICSEARCH_URL may not be set, so we proxy the search to prod
+    if (!process.env.ELASTICSEARCH_URL) {
+      if (searchParams.aggregate && searchParams.toplevel && searchParams.toplevel.length > 0) {
+        // Do 2 searches. One without filtering to get the aggregations
+        const searchWithoutFilter = Object.fromEntries(
+          Object.entries(searchParams).filter(([key]) => key !== 'toplevel'),
+        )
+        searchWithoutFilter.size = 0
+        const { aggregations } = await getProxySearch(
+          searchWithoutFilter as ComputedSearchQueryParamsMap['generalSearch'],
+        )
+        const searchWithoutAggregate = Object.fromEntries(
+          Object.entries(searchParams).filter(([key]) => key !== 'aggregate'),
+        )
+        req.context.search.results = await getProxySearch(
+          searchWithoutAggregate as ComputedSearchQueryParamsMap['generalSearch'],
+        )
+        req.context.search.results.aggregations = aggregations
+      } else {
+        req.context.search.results = await getProxySearch(searchParams)
+      }
+    } else {
+      const tags: string[] = [`indexName:${indexName}`, `toplevels:${searchParams.toplevel.length}`]
+      const timed = statsd.asyncTimer(getGeneralSearchResults, 'contextualize.search', tags)
+      const getGeneralSearchArgs = {
+        indexName,
+        searchParams,
+      }
+      try {
+        if (searchParams.aggregate && searchParams.toplevel && searchParams.toplevel.length > 0) {
+          // Do 2 searches. One without filtering to get the aggregations
+          const searchWithoutFilter = Object.fromEntries(
+            Object.entries(searchParams).filter(([key]) => key !== 'toplevel'),
+          )
+          searchWithoutFilter.size = 0
+          const { aggregations } = await timed({
+            ...getGeneralSearchArgs,
+            searchParams: searchWithoutFilter as ComputedSearchQueryParamsMap['generalSearch'],
+          })
+          req.context.search.results = await timed(getGeneralSearchArgs)
+          req.context.search.results.aggregations = aggregations
+        } else {
+          req.context.search.results = await timed(getGeneralSearchArgs)
+        }
+      } catch (error) {
+        // If Elasticsearch sends a 4XX, we want the user to see a 500
+        if (error instanceof errors.ResponseError) {
+          console.error(
+            'Error calling getSearchResults(%s):',
+            JSON.stringify({
+              indexName,
+              searchParams,
+            }),
+            error,
+          )
+          if (error?.meta?.body) {
+            console.error(`Meta:`, error.meta.body)
+          }
+          throw new Error(error.message)
+        } else {
+          throw error
+        }
+      }
+    }
+  }
+
+  return next()
+}
+
+const SEARCH_KEYS_TO_QUERY_STRING: (keyof ComputedSearchQueryParamsMap['generalSearch'])[] = [
+  'query',
+  'version',
+  'language',
+  'page',
+  'aggregate',
+  'toplevel',
+  'size',
+]
+
+// Proxy the API endpoint with the relevant search params
+async function getProxySearch(
+  search: ComputedSearchQueryParamsMap['generalSearch'],
+): Promise<GeneralSearchResponse> {
+  const url = new URL('https://docs.github.com/api/search/v1')
+  for (const key of SEARCH_KEYS_TO_QUERY_STRING) {
+    const value = search[key]
+    if (typeof value === 'boolean') {
+      url.searchParams.set(key, value ? 'true' : 'false')
+    } else if (Array.isArray(value)) {
+      for (const v of value) {
+        url.searchParams.append(key, v)
+      }
+    } else if (typeof value === 'number') {
+      url.searchParams.set(key, `${value}`)
+    } else if (value) {
+      url.searchParams.set(key, value)
+    }
+  }
+  console.log(`Proxying search to ${url}`)
+  return got(url).json()
+}
diff --git a/src/search/middleware/get-search-request.js b/src/search/middleware/get-search-request.js
deleted file mode 100644
index 05b340de00eb..000000000000
--- a/src/search/middleware/get-search-request.js
+++ /dev/null
@@ -1,229 +0,0 @@
-import { format } from 'node:util'
-
-import languages from '#src/languages/lib/languages.js'
-import { allVersions } from '#src/versions/lib/all-versions.js'
-import { POSSIBLE_HIGHLIGHT_FIELDS, DEFAULT_HIGHLIGHT_FIELDS } from './es-search.js'
-
-const DEFAULT_SIZE = 10
-const DEFAULT_AUTOCOMPLETE_SIZE = 8
-const MAX_SIZE = 50 // How much you return has a strong impact on performance
-const MAX_AUTOCOMPLETE_SIZE = 10
-const DEFAULT_PAGE = 1
-const POSSIBLE_SORTS = ['best', 'relevance']
-const DEFAULT_SORT = POSSIBLE_SORTS[0]
-const MAX_PAGE = 10
-
-// There are some fields you can optionally include in the output.
-// These are fields available in Elasticsearch that we don't include in
-// the output by default. E.g. `...&include=intro`
-// Requesting anything that is not in this list will result in
-// a 400 Bad Request.
-const V1_ADDITIONAL_INCLUDES = ['intro', 'headings', 'toplevel']
-
-const V1_AGGREGATES = ['toplevel']
-
-// If someone searches for `...&version=3.5` what they actually mean
-// is `ghes-3.5`. This is because of legacy formatting with the old search.
-// In some distant future we can clean up any client enough that this
-// aliasing won't be necessary.
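To make the aliasing concrete, a sketch (release numbers illustrative) of what these maps hold once populated, based on the comments in this file:

```ts
// A bare numbered GHES release is aliased to the misc version name used in index names:
versionAliases['3.5'] // => 'ghes-3.5'
// And plan names alias to the short name, so ?version=enterprise-server
// resolves the same as ?version=ghes:
prefixVersionAliases['enterprise-server'] // => 'ghes'
```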
-const versionAliases = {} -const prefixVersionAliases = {} -Object.values(allVersions).forEach((info) => { - if (info.hasNumberedReleases) { - versionAliases[info.currentRelease] = info.miscVersionName - } else { - versionAliases[info.version] = info.miscVersionName - versionAliases[info.miscVersionName] = info.miscVersionName - } - // This makes it so you can search for `?version=enterprise-server` - // and that actually means `?version=ghes` because there's an index - // called `github-autocomplete-en-ghes`. - prefixVersionAliases[info.plan] = info.shortName - prefixVersionAliases[info.shortName] = info.shortName -}) - -function getIndexPrefix() { - // This logic is mirrored in the scripts we use before running tests - // In particular, see the `index-test-fixtures` npm script. - // That's expected to be run before CI and local vitest testing. - // The reason we have a deliberately different index name (by prefix) - // for testing compared to regular operation is to make it convenient - // for engineers working on local manual testing *and* automated - // testing without have to re-index different content (e.g. fixtures - // vs real content) on the same index name. - if (process.env.NODE_ENV === 'test') return 'tests_' - - return '' -} - -class ValidationError extends Error {} - -const PARAMS = [ - { key: 'query' }, - { - key: 'version', - default_: 'dotcom', - validate: (v) => { - if (versionAliases[v] || allVersions[v]) return true - const valid = [...Object.keys(versionAliases), ...Object.keys(allVersions)] - throw new ValidationError(`'${v}' not in ${valid}`) - }, - }, - { key: 'language', default_: 'en', validate: (v) => v in languages }, - { - key: 'size', - default_: DEFAULT_SIZE, - cast: (v) => parseInt(v, 10), - validate: (v) => v >= 0 && v <= MAX_SIZE, - }, - { - key: 'page', - default_: DEFAULT_PAGE, - cast: (v) => parseInt(v, 10), - validate: (v) => v >= 1 && v <= MAX_PAGE, - }, - { key: 'sort', default_: DEFAULT_SORT, validate: (v) => POSSIBLE_SORTS.includes(v) }, - { - key: 'highlights', - default_: DEFAULT_HIGHLIGHT_FIELDS, - cast: (v) => (Array.isArray(v) ? v : [v]), - multiple: true, - validate: (v) => { - for (const highlight of v) { - if (!POSSIBLE_HIGHLIGHT_FIELDS.includes(highlight)) { - throw new ValidationError(`highlight value '${highlight}' is not valid`) - } - } - return true - }, - }, - { key: 'autocomplete', default_: false, cast: toBoolean }, - { key: 'debug', default_: process.env.NODE_ENV === 'development', cast: toBoolean }, - { - key: 'include', - default_: [], - cast: toArray, - multiple: true, - // Note: At the time of writing this general validator middleware - // doesn't yet know it's being used by the v1 version. - // But we don't have any other versions yet so no need to - // over-engineer this more. 
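Not part of the patch — a quick illustration of how this `include` validator behaves:

```ts
const V1_ADDITIONAL_INCLUDES = ['intro', 'headings', 'toplevel']
// ?include=intro&include=headings passes:
const ok = ['intro', 'headings'].every((v) => V1_ADDITIONAL_INCLUDES.includes(v)) // true
// ?include=body fails validation, so the request is answered with a 400:
const bad = ['body'].every((v) => V1_ADDITIONAL_INCLUDES.includes(v)) // false
```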
- validate: (values) => values.every((value) => V1_ADDITIONAL_INCLUDES.includes(value)), - }, - { - key: 'toplevel', - default_: [], - cast: toArray, - multiple: true, - }, - { - key: 'aggregate', - default_: [], - cast: toArray, - multiple: true, - validate: (values) => values.every((value) => V1_AGGREGATES.includes(value)), - }, -] - -const AUTOCOMPLETE_PARAMS = [ - { key: 'query' }, - { key: 'language', default_: 'en', validate: (v) => v in languages }, - { - key: 'version', - default_: 'free-pro-team', - validate: (v) => { - if (prefixVersionAliases[v] || allVersions[v]) return true - if (Object.values(prefixVersionAliases).includes(v)) return true - const valid = [ - ...Object.keys(prefixVersionAliases), - ...Object.values(prefixVersionAliases), - ...Object.keys(allVersions), - ] - throw new ValidationError(`'${v}' not in ${valid.join(', ')}`) - }, - }, - { - key: 'size', - default_: DEFAULT_AUTOCOMPLETE_SIZE, - cast: (v) => parseInt(v, 10), - validate: (v) => v >= 0 && v <= MAX_AUTOCOMPLETE_SIZE, - }, -] -export function getAutocompleteSearchFromRequest(req, force = {}) { - const { search, validationErrors } = getSearchFromRequest(req, {}, AUTOCOMPLETE_PARAMS) - if (validationErrors.length === 0) { - const version = prefixVersionAliases[search.version] || allVersions[search.version].shortName - search.indexName = `${getIndexPrefix()}github-autocomplete-${search.language}-${version}` - } - return { search, validationErrors } -} - -export function getSearchFromRequest(req, force = {}, params = PARAMS) { - const search = {} - const validationErrors = [] - - for (const { key, default_, cast, validate, multiple } of params) { - // This is necessary because when the version or language comes from - // the pathname, we don't want pick these up from the query string. - // This function gets used by /$locale/$version/search - // *and* /api/search/v1?language=$locale&version=$version - if (key in force) { - search[key] = force[key] - continue - } - - let value = req.query[key] - if (!value || (typeof value === 'string' && !value.trim())) { - if (default_ === undefined) { - // no value and no default, bad! - validationErrors.push({ error: `No truthy value for key '${key}'`, key }) - continue - } - value = default_ - } - if (cast) { - value = cast(value) - } - try { - if (validate && !validate(value)) { - validationErrors.push({ - error: format('Not a valid value (%O) for key %O', value, key), - key, - }) - } - } catch (err) { - if (err instanceof ValidationError) { - validationErrors.push({ error: err.toString(), field: key }) - } else { - throw err - } - } - if (!multiple && Array.isArray(value)) { - validationErrors.push({ - error: format('Cannot have multiple values (%O) for key %O', value, key), - key, - }) - } - - search[key] = value - } - - if (!validationErrors.length) { - const version = - prefixVersionAliases[search.version] || - versionAliases[search.version] || - allVersions[search.version].miscVersionName - search.indexName = `${getIndexPrefix()}github-docs-${version}-${search.language}` // github-docs-ghes-3.5-en - } - - return { search, validationErrors } -} - -function toBoolean(value) { - if (value === 'true' || value === '1') return true - return false -} - -function toArray(value) { - return Array.isArray(value) ? 
value : [value] -} diff --git a/src/search/middleware/search-routes.ts b/src/search/middleware/search-routes.ts new file mode 100644 index 000000000000..db84b46639f3 --- /dev/null +++ b/src/search/middleware/search-routes.ts @@ -0,0 +1,150 @@ +/* + This file and the routes included are for the /search endpoint of our API + + For general search (client searches on docs.github.com) we use the middleware in ./general-search-middleware to get the search results +*/ +import express, { Request, Response } from 'express' + +import FailBot from '@/observability/lib/failbot.js' +import { searchCacheControl } from '@/frame/middleware/cache-control.js' +import catchMiddlewareError from '@/observability/middleware/catch-middleware-error.js' +import { + setFastlySurrogateKey, + SURROGATE_ENUMS, +} from '@/frame/middleware/set-fastly-surrogate-key.js' +import { getAutocompleteSearchResults } from '@/search/lib/get-elasticsearch-results/general-autocomplete' +import { getAISearchAutocompleteResults } from '@/search/lib/get-elasticsearch-results/ai-search-autocomplete' +import { getSearchFromRequestParams } from '@/search/lib/search-request-params/get-search-from-request-params' +import { getGeneralSearchResults } from '@/search/lib/get-elasticsearch-results/general-search' + +const router = express.Router() + +router.get('/legacy', (req: Request, res: Response) => { + res.status(410).send('Use /api/search/v1 instead.') +}) + +router.get( + '/v1', + catchMiddlewareError(async (req: Request, res: Response) => { + const { indexName, searchParams, validationErrors } = getSearchFromRequestParams( + req, + 'generalSearch', + ) + if (validationErrors.length) { + // We only send the first validation error to the user + return res.status(400).json(validationErrors[0]) + } + + const getResultOptions = { + indexName, + searchParams, + } + try { + const { meta, hits, aggregations } = await getGeneralSearchResults(getResultOptions) + + if (process.env.NODE_ENV !== 'development') { + searchCacheControl(res) + // We can cache this without purging it after every deploy + // because the API search is only used as a proxy for local + // and preview environments. 
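+      // (SURROGATE_ENUMS.MANUAL: per the comment above, the cached response is kept
+      // until purged manually rather than being purged on every deploy — an inference
+      // from that comment, not from the Fastly helper's source.)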
+ setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL) + } + + res.status(200).json({ meta, hits, aggregations }) + } catch (error) { + await handleGetSearchResultsError(req, res, error, getResultOptions) + } + }), +) + +router.get( + '/autocomplete/v1', + catchMiddlewareError(async (req: Request, res: Response) => { + const { + indexName, + validationErrors, + searchParams: { query, size }, + } = getSearchFromRequestParams(req, 'generalAutocomplete') + if (validationErrors.length) { + return res.status(400).json(validationErrors[0]) + } + + const options = { + indexName, + query, + size, + } + try { + const { meta, hits } = await getAutocompleteSearchResults(options) + + if (process.env.NODE_ENV !== 'development') { + searchCacheControl(res) + setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL) + } + + res.status(200).json({ meta, hits }) + } catch (error) { + await handleGetSearchResultsError(req, res, error, options) + } + }), +) + +router.get( + '/ai-search-autocomplete/v1', + catchMiddlewareError(async (req: Request, res: Response) => { + const { + indexName, + validationErrors, + searchParams: { query, size }, + } = getSearchFromRequestParams(req, 'aiSearchAutocomplete') + if (validationErrors.length) { + return res.status(400).json(validationErrors[0]) + } + + const getResultOptions = { + indexName, + query, + size, + } + try { + const { meta, hits } = await getAISearchAutocompleteResults(getResultOptions) + + if (process.env.NODE_ENV !== 'development') { + searchCacheControl(res) + setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL) + } + + res.status(200).json({ meta, hits }) + } catch (error) { + await handleGetSearchResultsError(req, res, error, getResultOptions) + } + }), +) + +async function handleGetSearchResultsError(req: Request, res: Response, error: any, options: any) { + if (process.env.NODE_ENV === 'development') { + console.error(`Error calling getSearchResults(${options})`, error) + } else { + const reports = FailBot.report(error, { url: req.url, ...options }) + if (reports) await Promise.all(reports) + } + res.status(500).json({ error: error.message }) +} + +// Redirects for latest versions +router.get('/', (req: Request, res: Response) => { + res.redirect(307, req.originalUrl.replace('/search', '/search/v1')) +}) + +router.get('/autocomplete', (req: Request, res: Response) => { + res.redirect(307, req.originalUrl.replace('/search/autocomplete', '/search/autocomplete/v1')) +}) + +router.get('/ai-search-autocomplete', (req: Request, res: Response) => { + res.redirect( + 307, + req.originalUrl.replace('/search/ai-search-autocomplete', '/search/ai-search-autocomplete/v1'), + ) +}) + +export default router diff --git a/src/search/middleware/search.js b/src/search/middleware/search.js deleted file mode 100644 index 8650b68acccc..000000000000 --- a/src/search/middleware/search.js +++ /dev/null @@ -1,160 +0,0 @@ -import express from 'express' - -import FailBot from '#src/observability/lib/failbot.js' -import { searchCacheControl } from '#src/frame/middleware/cache-control.js' -import catchMiddlewareError from '#src/observability/middleware/catch-middleware-error.js' -import { - setFastlySurrogateKey, - SURROGATE_ENUMS, -} from '#src/frame/middleware/set-fastly-surrogate-key.js' -import { getAutocompleteSearchResults, getSearchResults } from './es-search.js' -import { getAutocompleteSearchFromRequest, getSearchFromRequest } from './get-search-request.js' - -const router = express.Router() - -router.get('/legacy', (req, res) => { - res.status(410).send('Use /api/search/v1 
instead.') -}) - -export const validationMiddleware = (req, res, next) => { - const { search, validationErrors } = getSearchFromRequest(req) - if (validationErrors.length) { - // There might be multiple things bad about the query parameters, - // but we send a 400 on the first possible one in the API. - return res.status(400).json(validationErrors[0]) - } - - req.search = search - return next() -} - -router.get( - '/v1', - validationMiddleware, - catchMiddlewareError(async function search(req, res) { - const { - indexName, - query, - autocomplete, - page, - size, - debug, - sort, - highlights, - include, - toplevel, - aggregate, - } = req.search - - const options = { - indexName, - query, - page, - size, - debug, - sort, - highlights, - usePrefixSearch: autocomplete, - include, - toplevel, - aggregate, - } - try { - const { meta, hits, aggregations } = await getSearchResults(options) - - if (process.env.NODE_ENV !== 'development') { - searchCacheControl(res) - // We can cache this without purging it after every deploy - // because the API search is only used as a proxy for local - // and preview environments. - setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL) - } - - // The v1 version of the output matches perfectly what comes out - // of the getSearchResults() function. - res.status(200).json({ meta, hits, aggregations }) - } catch (error) { - // If getSearchResult() throws an error that might be 404 inside - // elasticsearch, if we don't capture that here, it will propagate - // to the next middleware. - await handleGetSearchResultsError(req, res, error, options) - } - }), -) - -export const autocompleteValidationMiddleware = (req, res, next) => { - const { search, validationErrors } = getAutocompleteSearchFromRequest(req) - if (validationErrors.length) { - // There might be multiple things bad about the query parameters, - // but we send a 400 on the first possible one in the API. - return res.status(400).json(validationErrors[0]) - } - - req.search = search - return next() -} - -router.get( - '/autocomplete/v1', - autocompleteValidationMiddleware, - catchMiddlewareError(async (req, res) => { - const { indexName, query, size } = req.search - - const options = { - indexName, - query, - size, - } - try { - const { meta, hits } = await getAutocompleteSearchResults(options) - - if (process.env.NODE_ENV !== 'development') { - searchCacheControl(res) - // We can cache this without purging it after every deploy - // because the API search is only used as a proxy for local - // and preview environments. - setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL) - } - - // The v1 version of the output matches perfectly what comes out - // of the getSearchResults() function. - res.status(200).json({ meta, hits }) - } catch (error) { - // If getSearchResult() throws an error that might be 404 inside - // elasticsearch, if we don't capture that here, it will propagate - // to the next middleware. - await handleGetSearchResultsError(req, res, error, options) - } - }), -) - -// We have more than one place where we do `try{...} catch error( THIS )` -// which is slightly different depending on the "sub-version" (e.g. 
/legacy) -// This function is a single place to take care of all of these error handlings -async function handleGetSearchResultsError(req, res, error, options) { - if (process.env.NODE_ENV === 'development') { - console.error(`Error calling getSearchResults(${options})`, error) - } else { - const reports = FailBot.report(error, Object.assign({ url: req.url }, options)) - // It might be `undefined` if no backends are configured which - // is likely when using production NODE_ENV on your laptop - // where you might not have a HAYSTACK_URL configured. - if (reports) await Promise.all(reports) - } - res.status(500).json({ error: error.message }) -} - -// Alias for the latest version -router.get('/', (req, res) => { - // At the time of writing, the latest version is v1. (July 2022) - // Use `req.originalUrl` because this router is "self contained" - // which means that `req.url` will be `/` in this context. - res.redirect(307, req.originalUrl.replace('/search', '/search/v1')) -}) - -// Alias for the latest autocomplete version -router.get('/autocomplete', (req, res) => { - res.redirect(307, req.originalUrl.replace('/search/autocomplete', '/search/autocomplete/v1')) -}) - -export default router diff --git a/src/search/pages/search.tsx b/src/search/pages/search.tsx index 42965548ae69..55c0333a2e25 100644 --- a/src/search/pages/search.tsx +++ b/src/search/pages/search.tsx @@ -7,9 +7,10 @@ import { addUINamespaces, } from 'src/frame/components/context/MainContext' import { DefaultLayout } from 'src/frame/components/DefaultLayout' -import type { SearchT } from 'src/search/components/types' -import { SearchContext, SearchContextT } from 'src/search/components/context/SearchContext' +import { SearchContext } from 'src/search/components/context/SearchContext' import { Search } from 'src/search/components/index' +import { SearchOnReqObject } from 'src/search/types' +import type { SearchContextT } from 'src/search/components/types' type Props = { mainContext: MainContextT @@ -40,6 +41,8 @@ export const getServerSideProps: GetServerSideProps = async (context) => throw new Error('Expected req.context to be populated with .search') } + const searchObject = req.context.search as SearchOnReqObject<'generalSearch'> + // The `req.context.search` is similar to what's needed to React // render the search result page. // But it contains information (from the contextualizing) that is @@ -48,24 +51,24 @@ export const getServerSideProps: GetServerSideProps = async (context) => // `page` and `indexName` which was useful when it made the actual // Elasticsearch query. But it's not needed to render the results. // We explicitly pick out the parts that are needed, only. - const search: SearchT = { - search: { - query: req.context.search.search.query, - debug: req.context.search.search.debug, + const search: SearchContextT['search'] = { + searchParams: { + query: searchObject.searchParams.query, + debug: searchObject.searchParams.debug, }, - validationErrors: req.context.search.validationErrors, + validationErrors: searchObject.validationErrors, } // If there are no results (e.g. /en/search?query=) from the // contextualizing, then `req.context.search.results` will // be `undefined` which can't be serialized as a prop, using JSON.stringify. 
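A minimal standalone sketch (not part of the patch) of the serialization behavior the comment above describes; the object shapes are illustrative only:

```ts
// JSON.stringify drops properties whose value is undefined, but keeps null.
// Next.js serializes getServerSideProps props the same way, so an undefined
// `aggregations` cannot be passed through as a prop; null survives the round trip.
const withUndefined = { hits: [], aggregations: undefined }
const withNull = { hits: [], aggregations: null }

console.log(JSON.stringify(withUndefined)) // {"hits":[]}
console.log(JSON.stringify(withNull)) // {"hits":[],"aggregations":null}
```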
- if (req.context.search.results) { + if (searchObject.results) { search.results = { - meta: req.context.search.results.meta, - hits: req.context.search.results.hits, + meta: searchObject.results.meta, + hits: searchObject.results.hits, // Use `null` instead of `undefined` for JSON serialization. // The only reason it would ever not be truthy is if the aggregates // functionality is not enabled for this version. - aggregations: req.context.search.results.aggregations || null, + aggregations: searchObject.results.aggregations || null, } } diff --git a/src/search/scripts/analyze-text.js b/src/search/scripts/analyze-text.ts similarity index 61% rename from src/search/scripts/analyze-text.js rename to src/search/scripts/analyze-text.ts index b314a2bf2439..668836734c5c 100755 --- a/src/search/scripts/analyze-text.js +++ b/src/search/scripts/analyze-text.ts @@ -1,24 +1,19 @@ -#!/usr/bin/env node - -// [start-readme] -// -// See how a piece of text gets turned into tokens by the different -// analyzers. +// See how a piece of text gets turned into tokens by the different analyzers. // Requires that the index exists in Elasticsearch. // // Example: // -// npm run analyze-text "my words" to tokenize -// -// [end-readme] +// npm run analyze-text -- -V dotcom -l en "The name of the wind" import { Client } from '@elastic/elasticsearch' -import { program, Option } from 'commander' +import { Command, Option } from 'commander' import chalk from 'chalk' import dotenv from 'dotenv' -import { languageKeys } from '#src/languages/lib/languages.js' -import { allVersions } from '#src/versions/lib/all-versions.js' +import { languageKeys } from '@/languages/lib/languages.js' +import { allVersions } from '@/versions/lib/all-versions.js' + +import type { IndicesAnalyzeAnalyzeToken } from '@elastic/elasticsearch/lib/api/types' // Now you can optionally have set the ELASTICSEARCH_URL in your .env file. dotenv.config() @@ -38,16 +33,28 @@ dotenv.config() // // We need this later to be able to map CLI arguments to what the // records are called when found on disk. -const shortNames = Object.fromEntries( - Object.values(allVersions).map((info) => { - const shortName = info.hasNumberedReleases - ? info.miscBaseName + info.currentRelease - : info.miscBaseName - return [shortName, info] - }), -) +const shortNames: Record = + Object.fromEntries( + Object.values(allVersions).map((info) => { + const shortName = info.hasNumberedReleases + ? 
`${info.miscBaseName}${info.currentRelease}` + : info.miscBaseName + return [shortName, info] + }), + ) -const allVersionKeys = Object.keys(shortNames) +const allVersionKeys: string[] = Object.keys(shortNames) + +interface Options { + verbose?: boolean + version?: string + language?: string + notLanguage?: string + elasticsearchUrl?: string + indexPrefix?: string +} + +const program = new Command() program .description('Analyze text into tokens') @@ -56,21 +63,29 @@ program .addOption( new Option('-l, --language ', 'Which language to focus on').choices(languageKeys), ) + .option('--not-language ', 'Exclude a specific language') .option('-u, --elasticsearch-url ', 'If different from $ELASTICSEARCH_URL') + .option('--index-prefix ', 'Prefix for the index name') .argument('', 'text to tokenize') .parse(process.argv) -main(program.opts(), program.args) +const options = program.opts() +const args: string[] = program.args + +main(options, args).catch((err) => { + console.error(chalk.red('Error:'), err) + process.exit(1) +}) -async function main(opts, args) { +async function main(opts: Options, args: string[]): Promise { const texts = [args.join(' ')] if (!opts.elasticsearchUrl && !process.env.ELASTICSEARCH_URL) { throw new Error( - 'Must passed the elasticsearch URL option or ' + + 'Must pass the elasticsearch URL option or ' + 'set the environment variable ELASTICSEARCH_URL', ) } - let node = opts.elasticsearchUrl || process.env.ELASTICSEARCH_URL + let node = opts.elasticsearchUrl || process.env.ELASTICSEARCH_URL! // Allow the user to lazily set it to `localhost:9200` for example. if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) { @@ -79,15 +94,15 @@ async function main(opts, args) { try { const parsed = new URL(node) - if (!parsed.hostname) throw new Error('no valid hostname') + if (!parsed.hostname) throw new Error('No valid hostname') } catch (err) { - console.error(chalk.bold('URL for Elasticsearch not a valid URL', err)) + console.error(chalk.bold('URL for Elasticsearch not a valid URL'), err) return } const { verbose, language, notLanguage } = opts - // The notLanguage is useful you want to, for example, index all languages + // The notLanguage is useful if you want to, for example, index all languages // *except* English. 
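A hypothetical invocation, assuming the new `--not-language` flag excludes the given language from the run the same way it does in the indexing scripts:

```bash
# Tokenize the phrase against every language's index except English
npm run analyze-text -- -V dotcom --not-language en "The name of the wind"
```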
if (language && notLanguage) { throw new Error("Can't combine --language and --not-language") @@ -116,29 +131,32 @@ async function main(opts, args) { const indexName = `${prefix}github-docs-${versionKey}-${languageKey}` console.log(chalk.yellow(`Analyzing in ${chalk.bold(indexName)}`)) - await analyzeVersion(client, texts, indexName, verbose) + await analyzeVersion(client, texts, indexName) } -function safeUrlDisplay(url) { +function safeUrlDisplay(url: string): string { const parsed = new URL(url) if (parsed.password) { parsed.password = '***' } if (parsed.username) { - parsed.username = parsed.username.slice(0, 4) + '***' + parsed.username = `${parsed.username.slice(0, 4)}***` } return parsed.toString() } -async function analyzeVersion(client, texts, indexName, verbose = false) { + +async function analyzeVersion(client: Client, texts: string[], indexName: string): Promise { for (const text of texts) { console.log(`RAW TEXT: 〝${chalk.italic(text)}〞`) for (const analyzer of ['text_analyzer_explicit', 'text_analyzer', 'standard']) { console.log('ANALYZER:', chalk.bold(analyzer)) - const { tokens } = await client.indices.analyze({ + const response = await client.indices.analyze({ index: indexName, body: { analyzer, text }, }) - const tokenWords = tokens.map((token) => token.token) + + const tokens: IndicesAnalyzeAnalyzeToken[] | undefined = response.tokens + const tokenWords: string[] = tokens?.map((token) => token.token) || [] console.log(tokenWords) } } diff --git a/src/search/scripts/index-elasticsearch.js b/src/search/scripts/index-elasticsearch.js deleted file mode 100755 index ac78d312bd7a..000000000000 --- a/src/search/scripts/index-elasticsearch.js +++ /dev/null @@ -1,575 +0,0 @@ -#!/usr/bin/env node - -// [start-readme] -// -// Creates Elasticsearch index, populates from records, -// moves the index alias, deletes old indexes. -// -// [end-readme] - -import fs from 'fs/promises' -import path from 'path' - -import { Client, errors } from '@elastic/elasticsearch' -import { program, Option, InvalidArgumentError } from 'commander' -import chalk from 'chalk' -import dotenv from 'dotenv' - -import { retryOnErrorTest } from './retry-on-error-test.js' -import { languageKeys } from '#src/languages/lib/languages.js' -import { allVersions } from '#src/versions/lib/all-versions.js' - -// Now you can optionally have set the ELASTICSEARCH_URL in your .env file. -dotenv.config() - -// Create an object that maps the "short name" of a version to -// all information about it. E.g. -// -// { -// 'ghes-3.5': { -// hasNumberedReleases: true, -// currentRelease: '3.5', -// version: 'enterprise-server@3.5', -// miscBaseName: 'ghes-' -// ... -// }, -// ... -// -// We need this later to be able to map CLI arguments to what the -// records are called when found on disk. -const shortNames = Object.fromEntries( - Object.values(allVersions).map((info) => { - const shortName = info.hasNumberedReleases - ? 
info.miscBaseName + info.currentRelease - : info.miscBaseName - return [shortName, info] - }), -) - -const allVersionKeys = Object.keys(shortNames) - -const DEFAULT_SLEEPTIME_SECONDS = 30 - -program - .description('Creates Elasticsearch index from records') - .option('-v, --verbose', 'Verbose outputs') - .addOption(new Option('-V, --version [VERSION...]', 'Specific versions').choices(allVersionKeys)) - .addOption( - new Option('-l, --language ', 'Which languages to focus on').choices(languageKeys), - ) - .addOption( - new Option('--not-language ', 'Specific language to omit').choices(languageKeys), - ) - .option('-u, --elasticsearch-url ', 'If different from $ELASTICSEARCH_URL') - .option('-p, --index-prefix ', 'Index string to put before index name') - .option( - '-s, --stagger-seconds ', - 'Number of seconds to sleep between each bulk operation', - (value) => { - const parsed = parseInt(value, 10) - if (isNaN(parsed)) { - throw new InvalidArgumentError('Not a number.') - } - return parsed - }, - ) - .option( - '-r, --retries ', - 'Number of retry attempts on recoverable network errors', - (value) => { - const parsed = parseInt(value, 10) - if (isNaN(parsed)) { - throw new InvalidArgumentError('Not a number.') - } - return parsed - }, - ) - .option( - '--sleep-time ', - `Number of seconds to sleep between each retry attempt (defaults to ${DEFAULT_SLEEPTIME_SECONDS})`, - (value) => { - const parsed = parseInt(value, 10) - if (isNaN(parsed)) { - throw new InvalidArgumentError('Not a number.') - } - return parsed - }, - ) - .argument('', 'where the indexable files are') - .parse(process.argv) - -main(program.opts(), program.args) - -async function main(opts, args) { - if (!args.length) { - throw new Error('Must pass the source as the first argument') - } - - const { verbose, language, notLanguage, elasticsearchUrl } = opts - - if (!elasticsearchUrl && !process.env.ELASTICSEARCH_URL) { - throw new Error( - 'Must passed the elasticsearch URL option or ' + - 'set the environment variable ELASTICSEARCH_URL', - ) - } - let node = elasticsearchUrl || process.env.ELASTICSEARCH_URL - - // Allow the user to lazily set it to `localhost:9200` for example. - if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) { - node = `http://${node}` - } - - try { - const parsed = new URL(node) - if (!parsed.hostname) throw new Error('no valid hostname') - } catch (err) { - console.error(chalk.bold('URL for Elasticsearch not a valid URL', err)) - throw err - } - - // The notLanguage is useful you want to, for example, index all languages - // *except* English. - if (language && notLanguage) { - throw new Error("Can't combine --language and --not-language") - } - - if (verbose) { - console.log(`Connecting to ${chalk.bold(safeUrlDisplay(node))}`) - } - const sourceDirectory = args[0] - try { - await fs.stat(sourceDirectory) - } catch (error) { - if (error.code === 'ENOENT') { - throw new Error(`The specified directory '${sourceDirectory}' does not exist.`) - } - throw error - } - - try { - await indexAll(node, sourceDirectory, opts) - } catch (error) { - // If any error is thrown from within the SDK, that error object will - // contain a `Connection` object which, when printed, can reveal the - // username/password or the base64 Basic auth credentials. - // So we want to carefully re-throw it so it only contains the minimal - // information for debugging without exposing the Connection credentials - // in Actions logs. 
- if (error instanceof errors.ElasticsearchClientError) { - // All ElasticsearchClientError error subclasses have a `name` and - // `message` but only some have a `meta`. - if (error.meta) { - console.error('Error meta: %O', error.meta) - } - throw new Error(error.message) - } - // If any other error happens that isn't from the elasticsearch SDK, - // let it bubble up. - throw error - } -} - -const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)) - -async function indexAll(node, sourceDirectory, opts) { - const client = new Client({ node }) - - const { language, verbose, notLanguage, indexPrefix, staggerSeconds } = opts - - let version - if ('version' in opts) { - version = opts.version - if (process.env.VERSION) { - console.warn( - `'version' specified as argument ('${version}') AND environment variable ('${process.env.VERSION}')`, - ) - } - } else { - if (process.env.VERSION && process.env.VERSION !== 'all') { - version = process.env.VERSION - if (!allVersionKeys.includes(version)) { - throw new Error( - `Environment variable 'VERSION' (${version}) is not recognized. Must be one of ${allVersionKeys}`, - ) - } - } - } - let versionKeys = allVersionKeys - // If it came from the `--version` argument parsing, it might be a string - // or an array of strings because it uses `--version [VERSION...]`. - if (version) { - if (Array.isArray(version)) { - versionKeys = version - } else { - versionKeys = [version] - } - } - - // This will throw if it can't ping - await client.ping() - - const languages = - language || languageKeys.filter((lang) => !notLanguage || !notLanguage.includes(lang)) - if (verbose) { - console.log(`Indexing on languages ${chalk.bold(languages.join(', '))}`) - } - - const prefix = indexPrefix ? `${indexPrefix}_` : '' - - for (const language of languages) { - let count = 0 - for (const versionKey of versionKeys) { - console.log(chalk.yellow(`Indexing ${chalk.bold(versionKey)} in ${chalk.bold(language)}`)) - const indexName = `${prefix}github-docs-${versionKey}-${language}` - - const t0 = new Date() - await indexVersion(client, indexName, versionKey, language, sourceDirectory, opts) - const t1 = new Date() - console.log(chalk.green(`Finished indexing ${indexName}. Took ${formatTime(t1 - t0)}`)) - if (verbose) { - console.log(`To view index: ${safeUrlDisplay(node + `/${indexName}`)}`) - console.log(`To search index: ${safeUrlDisplay(node + `/${indexName}/_search`)}`) - } - count++ - // console.log({ count, versionKeysLength: versionKeys.length }) - if (staggerSeconds && count < versionKeys.length - 1) { - console.log(`Sleeping for ${staggerSeconds} seconds...`) - await sleep(1000 * staggerSeconds) - } - // A bit of visual separation betweeen each version - console.log('') - } - } -} - -function safeUrlDisplay(url) { - const parsed = new URL(url) - if (parsed.password) { - parsed.password = '***' - } - if (parsed.username) { - parsed.username = parsed.username.slice(0, 4) + '***' - } - return parsed.toString() -} - -// Return '20220719012012' if the current date is -// 2022-07-19T01:20:12.172Z. Note how the 6th month (July) becomes -// '07'. All numbers become 2 character zero-padding strings individually. -function utcTimestamp() { - const d = new Date() - - return ( - [ - `${d.getUTCFullYear()}`, - d.getUTCMonth() + 1, - d.getUTCDate(), - d.getUTCHours(), - d.getUTCMinutes(), - d.getUTCSeconds(), - ] - // If it's a number make it a zero-padding 2 character string - .map((x) => (typeof x === 'number' ? 
('0' + x).slice(-2) : x)) - .join('') - ) -} - -// Consider moving this to lib -async function indexVersion(client, indexName, version, language, sourceDirectory, opts) { - const { verbose } = opts - - // Note, it's a bit "weird" that numbered releases versions are - // called the number but that's the convention the previous - // search backend used - const indexVersionName = shortNames[version].hasNumberedReleases - ? shortNames[version].currentRelease - : shortNames[version].miscBaseName - const recordsName = `github-docs-${indexVersionName}-${language}` - - const records = await loadRecords(recordsName, sourceDirectory) - - const thisAlias = `${indexName}__${utcTimestamp()}` - - // CREATE INDEX - const settings = { - analysis: { - char_filter: { - // This will turn `runs-on` into `runs_on` so that it can't be - // tokenized to `runs` because `on` is a stop word. - // It also means that prose terms, in English, like `opt-in` - // not be matched if someone searches for `opt in`. But this - // is why we have multiple different analyzers. So it becomes - // `opt_in` in the `text_analyzer_explicit` analyzer, but is - // left as `opt` in the `text_analyzer` analyzer. - hyphenation_filter: { - type: 'mapping', - mappings: ['- => _'], - }, - }, - analyzer: { - // We defined to analyzers. Both based on a "common core" with the - // `standard` tokenizer. But the second one adds Snowball filter. - // That means the tokenization of "Dependency naming" becomes - // `[dependency, naming]` in the explicit one and `[depend, name]` - // in the Snowball one. - // We do this to give a chance to boost the more exact spelling a - // bit higher with the assumption that if the user knew exactly - // what it was called, we should show that higher. - // A great use-case of this when users search for keywords that are - // code words like `dependency-name`. - text_analyzer_explicit: { - char_filter: ['hyphenation_filter'], - filter: ['lowercase', 'stop', 'asciifolding'], - tokenizer: 'standard', - type: 'custom', - }, - text_analyzer: { - filter: ['lowercase', 'stop', 'asciifolding'], - tokenizer: 'standard', - type: 'custom', - }, - }, - filter: { - // Will later, conditionally, put the snowball configuration here. - }, - }, - } - const snowballLanguage = getSnowballLanguage(language) - if (snowballLanguage) { - settings.analysis.analyzer.text_analyzer.filter.push('languaged_snowball') - settings.analysis.filter.languaged_snowball = { - type: 'snowball', - language: snowballLanguage, - } - } else { - if (verbose) { - console.warn(`No snowball language for '${language}'`) - } - } - - await client.indices.create({ - index: thisAlias, - mappings: { - properties: { - url: { type: 'keyword' }, - title: { - type: 'text', - analyzer: 'text_analyzer', - norms: false, - // This is used for fast highlighting. Uses more space but makes - // the searches faster. - term_vector: 'with_positions_offsets', - }, - title_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false }, - content: { - type: 'text', - analyzer: 'text_analyzer', - // This is used for fast highlighting. Uses more space but makes - // the searches faster. - term_vector: 'with_positions_offsets', - }, - content_explicit: { - type: 'text', - analyzer: 'text_analyzer_explicit', - // This is used for fast highlighting. Uses more space but makes - // the searches faster. 
- term_vector: 'with_positions_offsets', - }, - headings: { type: 'text', analyzer: 'text_analyzer', norms: false }, - headings_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false }, - breadcrumbs: { type: 'text' }, - popularity: { type: 'float' }, - intro: { type: 'text' }, - // Use 'keyword' because it's faster to index and (more importantly) - // faster to search on. It would be different if it was something - // users could type in into a text input. - toplevel: { type: 'keyword' }, - }, - }, - settings, - }) - - // POPULATE - const allRecords = Object.values(records).sort((a, b) => b.popularity - a.popularity) - const operations = allRecords.flatMap((doc) => { - const { title, objectID, content, breadcrumbs, headings, intro, toplevel } = doc - const contentEscaped = escapeHTML(content) - const headingsEscaped = escapeHTML(headings) - const record = { - url: objectID, - title, - title_explicit: title, - content: contentEscaped, - content_explicit: contentEscaped, - breadcrumbs, - headings: headingsEscaped, - headings_explicit: headingsEscaped, - // This makes sure the popularities are always greater than 1. - // Generally the 'popularity' is a ratio where the most popular - // one of all is 1.0. - // By making it >=1.0 when we multiply a relevance score, - // you never get a product of 0.0. - popularity: doc.popularity + 1, - intro, - toplevel, - } - return [{ index: { _index: thisAlias } }, record] - }) - - const bulkOptions = { - // Default is 'false'. - // It means that the index is NOT refreshed as documents are inserted. - // Which makes sense in our case because we do not intend to search on - // this index until after we've pointed the alias to this new index. - refresh: false, - // Default is '1m' but we have no reason *not* to be patient. It's run - // by a bot on a schedeule (GitHub Actions). - timeout: '5m', - } - - const attempts = opts.retries || 0 - const sleepTime = (opts.sleepTime || DEFAULT_SLEEPTIME_SECONDS) * 1000 - - console.log(`About to bulk index ${allRecords.length.toLocaleString()} records with retry %O`, { - attempts, - sleepTime, - }) - const t0 = new Date() - const bulkResponse = await retryOnErrorTest( - (error) => { - // Rate limiting can happen when you're indexing too much at - // same time. - return error instanceof errors.ResponseError && error.meta.statusCode === 429 - }, - () => client.bulk({ operations, ...bulkOptions }), - { - attempts, - sleepTime, - onError: (_, attempts, sleepTime) => { - console.warn( - chalk.yellow( - `Failed to bulk index ${indexName}. Will attempt ${attempts} more times (after ${ - sleepTime / 1000 - }s sleep).`, - ), - ) - }, - }, - ) - - if (bulkResponse.errors) { - // Some day, when we're more confident how and why this might happen - // we can rewrite this code to "massage" the errors better. - // For now, if it fails, it's "OK". It means we won't be proceeding, - // an error is thrown in Actions and we don't have to worry about - // an incompletion index. - console.error(`Bulk response errors: ${bulkResponse.errors}`) - throw new Error('Bulk errors happened.') - } - const t1 = new Date() - console.log(`Bulk indexed ${thisAlias}. Took ${formatTime(t1 - t0)}`) - - // The counting of documents in the index is async and can take a while - // to reflect. So send count requests until we get the right number. 
- let documentsInIndex = 0 - let countAttempts = 3 - while (documentsInIndex < allRecords.length) { - const { count } = await client.count({ index: thisAlias }) - documentsInIndex = count - if (documentsInIndex >= allRecords.length) break - countAttempts-- - if (!countAttempts) { - console.log(`After ${countAttempts} attempts still haven't matched the expected number.`) - break - } - await sleep(1000) - } - - console.log( - `Documents now in ${chalk.bold(thisAlias)}: ${chalk.bold(documentsInIndex.toLocaleString())}`, - ) - - // To perform an atomic operation that creates the new alias and removes - // the old indexes, we can use the updateAliases API with a body that - // includes an "actions" array. The array includes the added alias - // and the removed indexes. If any of the actions fail, none of the operations - // are performed. - // https://www.elastic.co/guide/en/elasticsearch/reference/master/indices-aliases.html - const aliasUpdates = [ - { - add: { - index: thisAlias, - alias: indexName, - }, - }, - ] - console.log(`Alias ${indexName} -> ${thisAlias}`) - - console.log('About to get indices with retry %O', { attempts, sleepTime }) - const indices = await retryOnErrorTest( - (error) => { - // 404 can happen when you're trying to get an index that - // doesn't exist. ...yet! - return error instanceof errors.ResponseError && error.meta.statusCode === 404 - }, - () => client.cat.indices({ format: 'json' }), - { - attempts, - sleepTime, - onError: (error, attempts, sleepTime) => { - console.warn( - chalk.yellow( - `Failed to get index ${indexName} (${ - error.message || error.toString() - }). Will attempt ${attempts} more times (after ${formatTime(sleepTime)}s sleep).`, - ), - ) - }, - }, - ) - for (const index of indices) { - if (index.index !== thisAlias && index.index.startsWith(indexName)) { - aliasUpdates.push({ remove_index: { index: index.index } }) - console.log('Deleting index', index.index) - } - } - if (verbose) console.log('Updating alias actions:', aliasUpdates) - await client.indices.updateAliases({ body: { actions: aliasUpdates } }) -} - -function escapeHTML(content) { - return content.replace(//g, '>').replace(/"/g, '"') -} - -async function loadRecords(indexName, sourceDirectory) { - const filePath = path.join(sourceDirectory, `${indexName}-records.json`) - const payload = await fs.readFile(filePath) - return JSON.parse(payload) -} - -function getSnowballLanguage(language) { - // Based on https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-snowball-tokenfilter.html - // Note, not all languages are supported. So this function might return - // undefined. That implies that you can't use snowballing. 
-  return {
-    en: 'English',
-    fr: 'French',
-    es: 'Spanish',
-    ru: 'Russian',
-    it: 'Italian',
-    de: 'German',
-    pt: 'Portuguese',
-  }[language]
-}
-
-function formatTime(ms) {
-  if (ms < 1000) {
-    return `${ms.toFixed(1)}ms`
-  }
-  const seconds = ms / 1000
-  if (seconds > 60) {
-    return `${Math.round(seconds / 60)}m${Math.round(seconds % 60)}s`
-  }
-  return `${seconds.toFixed(1)}s`
-}
diff --git a/src/search/scripts/index-test-fixtures.sh b/src/search/scripts/index-test-fixtures.sh
index 235eb915d28f..3091b5e84774 100755
--- a/src/search/scripts/index-test-fixtures.sh
+++ b/src/search/scripts/index-test-fixtures.sh
@@ -6,7 +6,10 @@
 set -e
 
 # For general site-search
-npm run index-elasticsearch -- -l en -l ja -V ghec -V dotcom --index-prefix tests -- src/search/tests/fixtures/search-indexes
+npm run index-general-search -- src/search/tests/fixtures/search-indexes -l en -l ja -V ghec -V fpt --index-prefix tests
 
-# For autocomplete search
-npm run index -- autocomplete src/search/tests/fixtures/data -l en -l ja -v fpt -v ghec --index-prefix tests
+# For general autocomplete search
+npm run index-general-autocomplete -- src/search/tests/fixtures/data -l en -l ja -v fpt -v ghec --index-prefix tests
+
+# For AI search autocomplete
+npm run index-ai-search-autocomplete -- src/search/tests/fixtures/data -l en -v fpt -v ghec --index-prefix tests
diff --git a/src/search/scripts/index/README.md b/src/search/scripts/index/README.md
new file mode 100644
index 000000000000..a9f9432c0b0f
--- /dev/null
+++ b/src/search/scripts/index/README.md
@@ -0,0 +1,24 @@
+# Elasticsearch Indexing
+
+Elasticsearch uses indexes to store the data that is used to determine search results.
+
+We use the scripts in this directory to index our Elasticsearch instances.
+
+In production, the indexing happens in the GitHub workflows `index-autocomplete-search.yml` and `index-general-search.yml`.
+
+## CLI Script
+
+Before running the indexing for **general search**, first run the [scrape](../scrape/README.md) script to scrape page data into files.
+
+Before running the indexing for **general autocomplete** and **AI search autocomplete**, you need to clone [docs-internal-data](https://github.com/github/docs-internal-data) to the root of this directory.
+
+There is a separate run command for indexing each type of search data:
+1. **general search**: `npm run index-general-search -- <source-directory>`
+2. **general autocomplete**: `npm run index-general-autocomplete -- docs-internal-data` (if `docs-internal-data` is cloned to the root directory)
+3.
**AI search autocomplete**: `npm run index-ai-search-autocomplete -- docs-internal-data` (if `docs-internal-data` is cloned to root directory) + +To see the arguments accepted by any script, pass the `--help` argument, for example + +```bash +npm run index-general-autocomplete -- --help +``` \ No newline at end of file diff --git a/src/search/scripts/index/index-autocomplete.ts b/src/search/scripts/index/index-autocomplete.ts deleted file mode 100644 index 2c88c3a50d29..000000000000 --- a/src/search/scripts/index/index-autocomplete.ts +++ /dev/null @@ -1,167 +0,0 @@ -import fs from 'node:fs' -import path from 'node:path' - -import { Client, estypes } from '@elastic/elasticsearch' - -import { getClient } from './lib/get-client' -import { utcTimestamp } from './lib/utils' -import { populate } from './lib/populate' - -import { type Version, Records } from './types' - -export const shortVersionNames = { - 'enterprise-server': 'ghes', - 'enterprise-cloud': 'ghec', - 'free-pro-team': 'fpt', -} as const - -const DEFAULT_SLEEPTIME_SECONDS = 30 - -type Options = { - dataRepoRoot: string - languages: string[] - versions: Version[] - retries?: number - sleepTime?: number - verbose?: boolean - indexPrefix?: string -} - -export async function indexAutocomplete(options: Options) { - // The data repo has a predictable structure of - // `hydro/rollups/user-searches/$language/$version/rollup.json` - // But note that the "version" might be a prefix, like enterprise-server. - // const { verbose } = options - - const client = getClient() - - const { dataRepoRoot, versions, languages } = options - for (const language of languages) { - for (const version of versions) { - const records = loadRecords({ version, language, dataRepoRoot }) - const { alias, name } = await createIndex( - client, - language, - version, - options.indexPrefix || '', - ) - await populate(client, records, { - alias, - name, - retries: options.retries || 0, - sleepTime: options.sleepTime || DEFAULT_SLEEPTIME_SECONDS, - }) - } - } -} - -type LoadOptions = { - dataRepoRoot: string - language: string - version: string -} - -function loadRecords(options: LoadOptions): Records { - // First load the rollup records for user-searches - const filePath = path.join( - options.dataRepoRoot, - 'hydro/rollups/user-searches', - options.language, - options.version, - 'rollup.json', - ) - const terms: Records = {} - - const userSearchTerms: Records = JSON.parse(fs.readFileSync(filePath, 'utf8')) - let highestValue = Math.max(...Object.values(userSearchTerms)) - if (highestValue === 0) { - throw new Error(`No records found for ${options.language} ${options.version}`) - } - for (const [term, value] of Object.entries(userSearchTerms)) { - // Why +1? - // Because we want these user-searches to alway be higher than all the - // terms generated from titles. - // For example, a common user-search term that users use - // is "log forwarding". But there might not be a deconstructed term, - // from the document titles, however there might be one called - // "log proxy". So when our users search for "log" we want to suggest, - // in the autocomplete UI "log forwarding" before "log proxy". 
- terms[term] = value / highestValue + 1 - } - - const documentTermsFilePath = path.join( - options.dataRepoRoot, - 'all-documents/terms', - options.language, - options.version, - 'terms.json', - ) - const documentTerms: Records = JSON.parse(fs.readFileSync(documentTermsFilePath, 'utf8')) - highestValue = Math.max(...Object.values(documentTerms)) - if (highestValue === 0) { - throw new Error(`No document title records found for ${options.language} ${options.version}`) - } - for (const [term, value] of Object.entries(documentTerms)) { - if (!(term in terms)) { - terms[term] = value / highestValue + 1 - } - } - - return terms -} - -type IndexInfo = { - alias: string - name: string -} - -async function createIndex( - client: Client, - language: string, - version: Version, - indexPrefix: string, -): Promise { - const settings: estypes.IndicesIndexSettings = { - analysis: { - analyzer: { - text_analyzer: { - filter: ['lowercase'], - tokenizer: 'standard', - type: 'custom', - }, - }, - }, - // filter: { - // // Will later, conditionally, put the snowball configuration here. - // }, - // XXX SNOWBALL? - } - - if (indexPrefix && !indexPrefix.endsWith('_')) { - indexPrefix += '_' - } - - const indexName = `${indexPrefix}github-autocomplete-${language}-${shortVersionNames[version] || version}` - const thisAlias = `${indexName}__${utcTimestamp()}` - - const mappings: estypes.MappingTypeMapping = { - properties: { - term: { - type: 'text', - analyzer: 'text_analyzer', - // This is used for fast highlighting. Uses more space but makes - // the searches faster. - term_vector: 'with_positions_offsets', - }, - popularity: { type: 'float' }, - }, - } - - await client.indices.create({ - index: thisAlias, - mappings, - settings, - }) - - return { alias: thisAlias, name: indexName } -} diff --git a/src/search/scripts/index/index-cli.ts b/src/search/scripts/index/index-cli.ts new file mode 100644 index 000000000000..5ec770e8ef4a --- /dev/null +++ b/src/search/scripts/index/index-cli.ts @@ -0,0 +1,158 @@ +import { program, Option, Command, InvalidArgumentError } from 'commander' +import { errors } from '@elastic/elasticsearch' +import dotenv from 'dotenv' + +import { languageKeys } from '@/languages/lib/languages.js' +import { indexGeneralAutocomplete } from './lib/index-general-autocomplete' +import { indexGeneralSearch } from './lib/index-general-search' +import { + allIndexVersionKeys, + allIndexVersionOptions, + supportedAutocompletePlanVersions, +} from '@/search/lib/elasticsearch-versions' +import { indexAISearchAutocomplete } from './lib/index-ai-search-autocomplete' + +// If you optionally have ELASTICSEARCH_URL set in your .env file. +dotenv.config() + +program.name('index').description('CLI scripts for indexing Docs data into Elasticsearch') + +const generalAutoCompleteCommand = new Command('general-autocomplete') + .description('Index for general search autocomplete') + .addOption( + new Option('-l, --language ', 'Specific languages(s)').choices(languageKeys), + ) + .addOption( + new Option('-v, --version ', 'Specific versions').choices(allIndexVersionKeys), + ) + .option('--verbose', 'Verbose output') + .option('--index-prefix ', 'Prefix for the index names', '') + .argument('', 'path to the docs-internal-data repo') + .action(async (dataRepoRoot: string, options) => { + const languages = options.language ? 
options.language : languageKeys
+    const indexPrefix = options.indexPrefix || ''
+    try {
+      await indexGeneralAutocomplete({
+        dataRepoRoot,
+        languages,
+        versions: options.version || supportedAutocompletePlanVersions,
+        indexPrefix,
+      })
+    } catch (error: any) {
+      if (error instanceof errors.ElasticsearchClientError) {
+        if ((error as any)?.meta) {
+          console.error('Error meta: %O', (error as any).meta)
+        }
+      }
+      console.error('general-autocomplete indexing error:', error.message)
+      process.exit(1)
+    }
+  })
+
+const generalSearchCommand = new Command('general-search')
+  .description(
+    'Indexes records for general search. Records should be pre-scraped by the scrape script.',
+  )
+  .option('-v, --verbose', 'Verbose outputs')
+  .addOption(
+    new Option('-V, --version [VERSION...]', 'Specific versions').choices(allIndexVersionOptions),
+  )
+  .addOption(
+    new Option('-l, --language <language...>', 'Which languages to focus on').choices(languageKeys),
+  )
+  .addOption(
+    new Option('--not-language <language...>', 'Specific language to omit').choices(languageKeys),
+  )
+  .option('-u, --elasticsearch-url <url>', 'If different from $ELASTICSEARCH_URL')
+  .option('-p, --index-prefix <prefix>', 'Index string to put before index name')
+  .option(
+    '-s, --stagger-seconds <seconds>',
+    'Number of seconds to sleep between each bulk operation',
+    (value) => {
+      const parsed = parseInt(value, 10)
+      if (isNaN(parsed)) {
+        throw new InvalidArgumentError('Not a number.')
+      }
+      return parsed
+    },
+  )
+  .option(
+    '-r, --retries <count>',
+    'Number of retry attempts on recoverable network errors',
+    (value) => {
+      const parsed = parseInt(value, 10)
+      if (isNaN(parsed)) {
+        throw new InvalidArgumentError('Not a number.')
+      }
+      return parsed
+    },
+  )
+  .option(
+    '--sleep-time <seconds>',
+    `Number of seconds to sleep between each retry attempt (defaults to 30)`,
+    (value) => {
+      const parsed = parseInt(value, 10)
+      if (isNaN(parsed)) {
+        throw new InvalidArgumentError('Not a number.')
+      }
+      return parsed
+    },
+    30,
+  )
+  .argument('<source-directory>', 'where the indexable files are')
+  .action(async (sourceDirectory, options) => {
+    try {
+      await indexGeneralSearch(sourceDirectory, options)
+    } catch (error: any) {
+      if (error instanceof errors.ElasticsearchClientError) {
+        if ((error as any)?.meta) {
+          console.error('Error meta: %O', (error as any).meta)
+        }
+      }
+      console.error('general-search indexing error:', error.message)
+      process.exit(1)
+    }
+  })
+
+const aiSearchAutocompleteCommand = new Command('ai-search-autocomplete')
+  .description('Index for AI search autocomplete')
+  .addOption(
+    new Option(
+      '-l, --language <language...>',
+      'Specific language(s). (NOTE: Only English, "en", is currently supported)',
+    ).choices(['en']),
+  )
+  .addOption(
+    new Option('-v, --version <version...>', 'Specific versions').choices(allIndexVersionKeys),
+  )
+  .option('--verbose', 'Verbose output')
+  .option('--index-prefix <prefix>', 'Prefix for the index names', '')
+  .argument('<data-repo-root>', 'path to the docs-internal-data repo')
+  .action(async (dataRepoRoot: string, options) => {
+    // In the future, we may want to support multiple languages.
+    // Currently (since this is an experiment), we only support English.
+    const languages = ['en']
+    const indexPrefix = options.indexPrefix || ''
+    try {
+      await indexAISearchAutocomplete({
+        dataRepoRoot,
+        languages,
+        versions: options.version || supportedAutocompletePlanVersions,
+        indexPrefix,
+      })
+    } catch (error: any) {
+      if (error instanceof errors.ElasticsearchClientError) {
+        if ((error as any)?.meta) {
+          console.error('Error meta: %O', (error as any).meta)
+        }
+      }
+      console.error('ai-search-autocomplete indexing error:', error.message)
+      process.exit(1)
+    }
+  })
+
+program.addCommand(generalAutoCompleteCommand)
+program.addCommand(generalSearchCommand)
+program.addCommand(aiSearchAutocompleteCommand)
+
+program.parse(process.argv)
diff --git a/src/search/scripts/index/index.ts b/src/search/scripts/index/index.ts
deleted file mode 100644
index 4b46ae05c1f6..000000000000
--- a/src/search/scripts/index/index.ts
+++ /dev/null
@@ -1,44 +0,0 @@
-import { program, Option } from 'commander'
-
-import { languageKeys } from '@/languages/lib/languages.js'
-import { indexAutocomplete } from './index-autocomplete'
-import { type Version } from './types'
-
-const defaultVersions: Version[] = ['free-pro-team', 'enterprise-server', 'enterprise-cloud']
-const shortAlias = new Map()
-shortAlias.set('ghes', 'enterprise-server')
-shortAlias.set('fpt', 'free-pro-team')
-shortAlias.set('ghec', 'enterprise-cloud')
-
-program.name('index').description('CLI scripts for indexing to Elasticsearch')
-
-program
-  .command('autocomplete')
-  .description('Index for autocomplete')
-  .addOption(
-    new Option('-l, --language <language...>', 'Specific languages(s)').choices(languageKeys),
-  )
-  .addOption(
-    new Option('-v, --version <version...>', 'Specific version prefix(es)').choices([
-      ...defaultVersions,
-      ...shortAlias.keys(),
-    ]),
-  )
-  .option('--verbose', 'Verbose output')
-  .option('--index-prefix <prefix>', 'Prefix for the index names', '')
-  .argument('<root>', 'path to the docs-internal-data repo')
-  .action((root: string, options) => {
-    const languages = options.language ? options.language : languageKeys
-    const versions: Version[] = []
-    for (const v of options.version || defaultVersions) {
-      if (shortAlias.has(v)) {
-        versions.push(shortAlias.get(v)!)
- } else { - versions.push(v) - } - } - const indexPrefix = options.indexPrefix || '' - return indexAutocomplete({ dataRepoRoot: root, languages, versions, indexPrefix }) - }) - -program.parse(process.argv) diff --git a/src/search/scripts/index/lib/get-client.ts b/src/search/scripts/index/lib/get-client.ts deleted file mode 100644 index 4f9b79034430..000000000000 --- a/src/search/scripts/index/lib/get-client.ts +++ /dev/null @@ -1,27 +0,0 @@ -import { Client } from '@elastic/elasticsearch' - -export function getClient(): Client { - const node = getElasticsearchURL() - const client = new Client({ node }) - return client -} - -function getElasticsearchURL() { - if (!process.env.ELASTICSEARCH_URL) { - throw new Error( - 'Must passed the elasticsearch URL option or ' + - 'set the environment variable ELASTICSEARCH_URL', - ) - } - let node = process.env.ELASTICSEARCH_URL - - // Allow the user to lazily set it to `localhost:9200` for example. - if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) { - node = `http://${node}` - } - - const parsed = new URL(node) - if (!parsed.hostname) throw new Error('no valid hostname') - - return node -} diff --git a/src/search/scripts/index/lib/index-ai-search-autocomplete.ts b/src/search/scripts/index/lib/index-ai-search-autocomplete.ts new file mode 100644 index 000000000000..8bde681a1afe --- /dev/null +++ b/src/search/scripts/index/lib/index-ai-search-autocomplete.ts @@ -0,0 +1,112 @@ +import fs from 'node:fs' +import path from 'node:path' + +import { getElasticsearchClient } from '@/search/lib/helpers/get-client' +import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes' +import { + createIndex, + populateIndex, + printSuccess, + updateAlias, +} from '@/search/scripts/index/utils/indexing-elasticsearch-utils' +import { getAISearchAutocompleteSettings } from '@/search/scripts/index/utils/settings' +import { aiSearchAutocompleteMappings } from '@/search/scripts/index/utils/mappings' +import { getPlanVersionFromIndexVersion } from '@/search/lib/elasticsearch-versions' + +import type { TermsWithFrequency } from '@/search/scripts/index/types' + +type Options = { + dataRepoRoot: string + languages: string[] + versions: string[] + retries?: number + sleepTime?: number + verbose?: boolean + indexPrefix?: string +} + +export async function indexAISearchAutocomplete(options: Options) { + const client = getElasticsearchClient(undefined, options.verbose) + await client.ping() // Will throw if not available + + const { dataRepoRoot, languages, versions } = options + for (const language of languages) { + for (const version of versions) { + const startTime = new Date() + + const records = loadQueriesWithPriority({ dataRepoRoot, language, version }) + const { indexName, indexAlias } = getElasticSearchIndex( + 'aiSearchAutocomplete', + version, + language, + options.indexPrefix || '', + ) + + const settings = getAISearchAutocompleteSettings(language, options.verbose) + + await createIndex(client, indexAlias, settings, aiSearchAutocompleteMappings) + + const recordsArray = Object.entries(records).map(([term, popularity]) => ({ + term, + popularity, + })) + + await populateIndex(client, indexAlias, indexName, recordsArray, { + retries: options.retries, + sleepTime: options.sleepTime, + verbose: options.verbose, + }) + + await updateAlias(client, indexName, indexAlias, options) + + printSuccess(indexName, startTime, options.verbose) + } + } +} + +type LoadOptions = { + dataRepoRoot: string + language: string + version: 
string +} + +function loadQueriesWithPriority(options: LoadOptions): TermsWithFrequency { + // The {version} in the paths uses the version's 'plan' name, e.g. `free-pro-team` instead of `fpt` + const internalDataVersion = getPlanVersionFromIndexVersion(options.version) + + if (!internalDataVersion) { + throw new Error(`No rollup version found for version ${options.version}`) + } + + const queriesFilePath = path.join( + options.dataRepoRoot, + 'ai/search/queries', + options.language, + internalDataVersion, + 'queries.json', + ) + + const queriesFile = JSON.parse(fs.readFileSync(queriesFilePath, 'utf8')) + const { topQueries, allQueries } = queriesFile + + const terms: TermsWithFrequency = {} + + let popularity = topQueries.length + allQueries.length + + // Assign higher popularity to topQueries + for (const term of topQueries) { + terms[term] = popularity + popularity -= 1 + } + + // Assign remaining popularity to allQueries using the order they have in the JSON + for (const term of allQueries) { + // Don't read in the topQueries again (duplicates) + if (!(term in terms)) { + terms[term] = popularity + popularity -= 1 + } + } + + return terms +} diff --git a/src/search/scripts/index/lib/index-general-autocomplete.ts b/src/search/scripts/index/lib/index-general-autocomplete.ts new file mode 100644 index 000000000000..436417256ecf --- /dev/null +++ b/src/search/scripts/index/lib/index-general-autocomplete.ts @@ -0,0 +1,134 @@ +import fs from 'node:fs' +import path from 'node:path' + +import { getElasticsearchClient } from '@/search/lib/helpers/get-client' +import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes' +import { + createIndex, + populateIndex, + printSuccess, + updateAlias, +} from '@/search/scripts/index/utils/indexing-elasticsearch-utils' +import { getGeneralAutocompleteSettings } from '@/search/scripts/index/utils/settings' +import { generalAutocompleteMappings } from '@/search/scripts/index/utils/mappings' +import { getPlanVersionFromIndexVersion } from '@/search/lib/elasticsearch-versions' + +import type { TermsWithFrequency } from '@/search/scripts/index/types' + +type Options = { + dataRepoRoot: string + languages: string[] + versions: string[] + retries?: number + sleepTime?: number + verbose?: boolean + indexPrefix?: string +} + +export async function indexGeneralAutocomplete(options: Options) { + const client = getElasticsearchClient(undefined, options.verbose) + await client.ping() // Will throw if not available + + const { dataRepoRoot, versions, languages } = options + for (const language of languages) { + for (const version of versions) { + const startTime = new Date() + + const records = loadTermsWithFrequency({ version, language, dataRepoRoot }) + const { indexName, indexAlias } = getElasticSearchIndex( + 'generalAutocomplete', + version, + language, + options.indexPrefix || '', + ) + + const settings = getGeneralAutocompleteSettings(language, options.verbose) + + await createIndex(client, indexAlias, settings, generalAutocompleteMappings) + + const recordsArray = Object.entries(records).map(([term, popularity]) => ({ + term, + popularity, + })) + + await populateIndex(client, indexAlias, indexName, recordsArray, { + retries: options.retries, + sleepTime: options.sleepTime, + verbose: options.verbose, + }) + + await updateAlias(client, indexName, indexAlias, options) + + printSuccess(indexName, startTime, options.verbose) + } + } +} + +type LoadOptions = { + dataRepoRoot: string + language: string + version: string +} + +/* + * Terms are one-word 
search terms that a user might enter into a search toolbar
+ * We have two sources of "terms":
+ *  - Previous user searches (searchTerms)
+ *  - Terms auto-generated by taking each word from each title of all of our articles (documentTerms)
+ *
+ * Each of these files lives in our docs-internal-data repo, which should be cloned before running this script.
+ * The paths to these files for each type of term are:
+ *  - searchTerms: hydro/rollups/user-searches/{language}/{version}/rollup.json
+ *  - documentTerms: all-documents/terms/{language}/{version}/terms.json
+ */
+function loadTermsWithFrequency(options: LoadOptions): TermsWithFrequency {
+  // The {version} in the paths uses the version's 'plan' name, e.g. `free-pro-team` instead of `fpt`
+  const internalDataVersion = getPlanVersionFromIndexVersion(options.version)
+
+  if (!internalDataVersion) {
+    throw new Error(`No rollup version found for version ${options.version}`)
+  }
+
+  const filePath = path.join(
+    options.dataRepoRoot,
+    'hydro/rollups/user-searches',
+    options.language,
+    internalDataVersion,
+    'rollup.json',
+  )
+  const terms: TermsWithFrequency = {}
+
+  const userSearchTerms: TermsWithFrequency = JSON.parse(fs.readFileSync(filePath, 'utf8'))
+  let maxFrequency = Math.max(...Object.values(userSearchTerms))
+  if (maxFrequency === 0) {
+    throw new Error(`No records found for ${options.language} ${options.version}`)
+  }
+  for (const [term, frequency] of Object.entries(userSearchTerms)) {
+    // Normalize the frequency, which becomes "popularity" in Elasticsearch.
+    // We add +1 here because "userSearchTerms" should have higher priority than "articleTitleTerms"
+    terms[term] = frequency / maxFrequency + 1
+  }
+
+  const articleTitleTermsFilePath = path.join(
+    options.dataRepoRoot,
+    'all-documents/terms',
+    options.language,
+    internalDataVersion,
+    'terms.json',
+  )
+  const articleTitleTerms: TermsWithFrequency = JSON.parse(
+    fs.readFileSync(articleTitleTermsFilePath, 'utf8'),
+  )
+  maxFrequency = Math.max(...Object.values(articleTitleTerms))
+  if (maxFrequency === 0) {
+    throw new Error(`No document title records found for ${options.language} ${options.version}`)
+  }
+  for (const [articleTitleTerm, frequency] of Object.entries(articleTitleTerms)) {
+    if (!(articleTitleTerm in terms)) {
+      // Notice that we don't add +1 here because we want to give more priority to data from user searches
+      terms[articleTitleTerm] = frequency / maxFrequency
+    }
+  }
+
+  return terms
+}
diff --git a/src/search/scripts/index/lib/index-general-search.ts b/src/search/scripts/index/lib/index-general-search.ts
new file mode 100644
index 000000000000..7a6596a096e9
--- /dev/null
+++ b/src/search/scripts/index/lib/index-general-search.ts
@@ -0,0 +1,145 @@
+import { Client } from '@elastic/elasticsearch'
+import chalk from 'chalk'
+
+import { languageKeys } from '#src/languages/lib/languages.js'
+import { allVersions } from '#src/versions/lib/all-versions.js'
+import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'
+import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
+import {
+  createIndex,
+  escapeHTML,
+  loadIndexRecords,
+  populateIndex,
+  printSuccess,
+  updateAlias,
+} from '@/search/scripts/index/utils/indexing-elasticsearch-utils'
+import { sleep } from '@/search/lib/helpers/time'
+import { getGeneralSearchSettings } from '@/search/scripts/index/utils/settings'
+import { generalSearchMappings } from '@/search/scripts/index/utils/mappings'
+
+import type { AllVersionInfo } from
'@/search/scripts/index/types' + +interface Options { + verbose?: boolean + version?: string[] | string + language?: string[] + notLanguage?: string[] + elasticsearchUrl?: string + indexPrefix?: string + staggerSeconds?: number + retries?: number + sleepTime: number +} + +const shortNames: { [key: string]: AllVersionInfo } = Object.fromEntries( + Object.values(allVersions).map((info: AllVersionInfo) => { + const shortName = info.hasNumberedReleases + ? info.miscBaseName + info.currentRelease + : info.miscBaseName + return [shortName, info] + }), +) + +const allVersionKeys = Object.keys(shortNames) + +export async function indexGeneralSearch(sourceDirectory: string, opts: Options) { + if (!sourceDirectory) { + throw new Error('Must pass the source directory as the first argument') + } + + const { language, notLanguage } = opts + + if (language && notLanguage) { + throw new Error("Can't combine --language and --not-language") + } + + const client = getElasticsearchClient(opts.elasticsearchUrl, opts.verbose) + await client.ping() // Will throw if not available + + let version: string | string[] | undefined = opts.version + if (!version && process.env.VERSION && process.env.VERSION !== 'all') { + version = process.env.VERSION + if (!allVersionKeys.includes(version)) { + throw new Error( + `Environment variable 'VERSION' (${version}) is not recognized. Must be one of ${allVersionKeys}`, + ) + } + } + let versionKeys = allVersionKeys + if (version) { + versionKeys = Array.isArray(version) ? version : [version] + } + + const languages = + language || languageKeys.filter((lang) => !notLanguage || !notLanguage.includes(lang)) + if (opts.verbose) { + console.log(`Indexing on languages ${chalk.bold(languages.join(', '))}`) + } + + for (const language of languages) { + let count = 0 + for (const versionKey of versionKeys) { + const startTime = new Date() + + const { indexName, indexAlias } = getElasticSearchIndex( + 'generalSearch', + versionKey, + language, + opts.indexPrefix || '', + ) + + await indexVersion(client, indexName, indexAlias, language, sourceDirectory, opts) + + count++ + if (opts.staggerSeconds && count < versionKeys.length - 1) { + console.log(`Sleeping for ${opts.staggerSeconds} seconds...`) + await sleep(1000 * opts.staggerSeconds) + } + + printSuccess(indexName, startTime, opts.verbose) + } + } +} + +async function indexVersion( + client: Client, + indexName: string, + indexAlias: string, + language: string, + sourceDirectory: string, + opts: Options, +) { + const recordsData = await loadIndexRecords(indexName, sourceDirectory) + const allRecords = Object.values(recordsData).sort((a, b) => b.popularity - a.popularity) + const records = allRecords.map((doc) => { + const { title, objectID, content, breadcrumbs, headings, intro, toplevel } = doc + const contentEscaped = escapeHTML(content) + const headingsEscaped = escapeHTML(headings) + return { + url: objectID, + title, + title_explicit: title, + content: contentEscaped, + content_explicit: contentEscaped, + breadcrumbs, + headings: headingsEscaped, + headings_explicit: headingsEscaped, + popularity: doc.popularity + 1, + intro, + toplevel, + } + }) + + const settings = getGeneralSearchSettings(language, opts.verbose || false) + const mappings = generalSearchMappings + + await createIndex(client, indexAlias, settings, mappings) + + await populateIndex(client, indexAlias, indexName, records, { + retries: opts.retries, + sleepTime: opts.sleepTime * 1000, + verbose: opts.verbose, + }) + + await updateAlias(client, indexName, 
indexAlias, opts) +} diff --git a/src/search/scripts/index/lib/populate.ts b/src/search/scripts/index/lib/populate.ts deleted file mode 100644 index 252b90cefbcd..000000000000 --- a/src/search/scripts/index/lib/populate.ts +++ /dev/null @@ -1,107 +0,0 @@ -import chalk from 'chalk' -import { Client, errors } from '@elastic/elasticsearch' - -import type { Records, RetryConfig } from '../types' -import { retryOnErrorTest } from './retry-on-error-test' -import { repointAlias } from './repoint-alias' -import { formatTime, sleep } from './utils' - -type PopulateOptions = RetryConfig & { - verbose?: boolean - alias: string - name: string -} - -export async function populate(client: Client, records: Records, options: PopulateOptions) { - const { alias, name } = options - - const allRecords = Object.entries(records).sort((a, b) => b[1] - a[1]) - const operations = allRecords.flatMap(([term, count]) => { - const popularity = count / allRecords[0][1] // Normalize to 1.0 for the highest count - return [ - { index: { _index: alias } }, - { - term, - popularity, - }, - ] - }) - - const bulkOptions = { - // Default is 'false'. - // It means that the index is NOT refreshed as documents are inserted. - // Which makes sense in our case because we do not intend to search on - // this index until after we've pointed the alias to this new index. - refresh: false, - // Default is '1m' but we have no reason *not* to be patient. It's run - // by a bot on a schedeule (GitHub Actions). - timeout: '5m', - } - - const attempts = options.retries - const sleepTime = options.sleepTime * 1000 - - console.log(`About to bulk index ${allRecords.length.toLocaleString()} records with retry %O`, { - attempts, - sleepTime, - }) - const t0 = new Date() - const bulkResponse = await retryOnErrorTest( - (error: Error) => { - // Rate limiting can happen when you're indexing too much at - // same time. - return error instanceof errors.ResponseError && error.meta.statusCode === 429 - }, - () => client.bulk({ operations, ...bulkOptions }), - { - attempts, - sleepTime, - onError: (_, attempts, sleepTime) => { - console.warn( - chalk.yellow( - `Failed to bulk index ${name}. Will attempt ${attempts} more times (after ${ - sleepTime / 1000 - }s sleep).`, - ), - ) - }, - }, - ) - - if (bulkResponse.errors) { - // Some day, when we're more confident how and why this might happen - // we can rewrite this code to "massage" the errors better. - // For now, if it fails, it's "OK". It means we won't be proceeding, - // an error is thrown in Actions and we don't have to worry about - // an incompletion index. - console.error(`Bulk response errors: ${bulkResponse.errors}`) - throw new Error('Bulk errors happened.') - } - const t1 = new Date() - console.log(`Bulk indexed ${alias}. Took ${formatTime(t1.getTime() - t0.getTime())}`) - - // The counting of documents in the index is async and can take a while - // to reflect. So send count requests until we get the right number. 
- let documentsInIndex = 0 - let countAttempts = 3 - while (documentsInIndex < allRecords.length) { - const { count } = await client.count({ index: alias }) - documentsInIndex = count - if (documentsInIndex >= allRecords.length) break - countAttempts-- - if (!countAttempts) { - console.log(`After ${countAttempts} attempts still haven't matched the expected number.`) - break - } - await sleep(1000) - } - console.log( - `Documents now in ${chalk.bold(alias)}: ${chalk.bold(documentsInIndex.toLocaleString())}`, - ) - - await repointAlias(client, alias, name, { - attempts, - sleepTime, - verbose: Boolean(options.verbose), - }) -} diff --git a/src/search/scripts/index/lib/repoint-alias.ts b/src/search/scripts/index/lib/repoint-alias.ts deleted file mode 100644 index 36af59d2609e..000000000000 --- a/src/search/scripts/index/lib/repoint-alias.ts +++ /dev/null @@ -1,77 +0,0 @@ -import chalk from 'chalk' -import { Client, errors } from '@elastic/elasticsearch' - -import { retryOnErrorTest } from './retry-on-error-test' -import { formatTime } from './utils' - -export async function repointAlias( - client: Client, - alias: string, - name: string, - options: { - attempts: number - sleepTime: number - verbose: boolean - }, -) { - const { attempts, sleepTime, verbose } = options - // To perform an atomic operation that creates the new alias and removes - // the old indexes, we can use the updateAliases API with a body that - // includes an "actions" array. The array includes the added alias - // and the removed indexes. If any of the actions fail, none of the operations - // are performed. - // https://www.elastic.co/guide/en/elasticsearch/reference/master/indices-aliases.html - - type Update = - | { - add: { - index: string - alias: string - } - } - | { - remove_index: { - index: string - } - } - const aliasUpdates: Update[] = [ - { - add: { - index: alias, - alias: name, - }, - }, - ] - console.log(`Alias ${name} -> ${alias}`) - - console.log('About to get indices with retry %O', { attempts, sleepTime }) - const indices = await retryOnErrorTest( - (error: any) => { - // 404 can happen when you're trying to get an index that - // doesn't exist. ...yet! - return error instanceof errors.ResponseError && error.meta.statusCode === 404 - }, - () => client.cat.indices({ format: 'json' }), - { - attempts, - sleepTime, - onError: (error, attempts, sleepTime) => { - console.warn( - chalk.yellow( - `Failed to get index ${name} (${ - error.message || error.toString() - }). 
Will attempt ${attempts} more times (after ${formatTime(sleepTime)}s sleep).`, - ), - ) - }, - }, - ) - for (const index of indices) { - if (index.index !== alias && index.index.startsWith(name)) { - aliasUpdates.push({ remove_index: { index: index.index } }) - console.log('Deleting index', index.index) - } - } - if (verbose) console.log('Updating alias actions:', aliasUpdates) - await client.indices.updateAliases({ body: { actions: aliasUpdates } }) -} diff --git a/src/search/scripts/index/types.ts b/src/search/scripts/index/types.ts index 533fb79d045f..bb4fd8f876fe 100644 --- a/src/search/scripts/index/types.ts +++ b/src/search/scripts/index/types.ts @@ -1,10 +1,55 @@ -export type Version = 'free-pro-team' | 'enterprise-server' | 'enterprise-cloud' - -export type Records = { - [key: string]: number - } - export type RetryConfig = { retries: number sleepTime: number } + +export interface AllVersionInfo { + hasNumberedReleases: boolean + miscBaseName: string + currentRelease: string + version: string + plan: string +} + +export interface AllVersions { + [key: string]: AllVersionInfo +} + +export interface Options { + language?: string + notLanguage?: string + version?: string + docsInternalData?: string + markers?: boolean + filter?: string +} + +export type Args = string[] + +export interface Page { + relativePath: string + redirect_from?: string[] +} + +export interface Config { + noMarkers: boolean + filter?: string + docsInternalDataPath?: string +} + +export type TermsWithFrequency = { [term: string]: number } + +export interface Records { + [objectID: string]: Record // Here the key is identical to the record's objectID +} + +export interface Record { + objectID: string // e.g. "/en/enterprise-cloud@latest/get-started" + breadcrumbs: string // e.g. "Get started / Using GitHub" + title: string // e.g. 
"Get started with GitHub documentation" + headings: string + content: string + intro: string + toplevel: string + popularity: number +} diff --git a/src/search/scripts/index/utils/constants.ts b/src/search/scripts/index/utils/constants.ts new file mode 100644 index 000000000000..6b833259d0c8 --- /dev/null +++ b/src/search/scripts/index/utils/constants.ts @@ -0,0 +1,11 @@ +export const SNOWBALL_LANGUAGES: { [key: string]: string } = { + en: 'English', + fr: 'French', + es: 'Spanish', + ru: 'Russian', + it: 'Italian', + de: 'German', + pt: 'Portuguese', +} + +export const DEFAULT_SLEEPTIME_SECONDS = 30 diff --git a/src/search/scripts/index/utils/indexing-elasticsearch-utils.ts b/src/search/scripts/index/utils/indexing-elasticsearch-utils.ts new file mode 100644 index 000000000000..8bde91c54685 --- /dev/null +++ b/src/search/scripts/index/utils/indexing-elasticsearch-utils.ts @@ -0,0 +1,178 @@ +import chalk from 'chalk' +import { Client, estypes, errors } from '@elastic/elasticsearch' +import fs from 'fs/promises' +import path from 'path' + +import { readableTimeMinAndSec, sleep } from '@/search/lib/helpers/time' +import { retryOnErrorTest } from '@/search/scripts/index/utils/retry-on-error-test' +import { + DEFAULT_SLEEPTIME_SECONDS, + SNOWBALL_LANGUAGES, +} from '@/search/scripts/index/utils/constants' +import { safeUrlDisplay } from '@/search/lib/helpers/strings' + +import type { Records } from '@/search/scripts/index/types' + +type Options = { + retries?: number + sleepTime?: number + verbose?: boolean +} + +export async function createIndex( + client: Client, + indexAlias: string, + settings: estypes.IndicesIndexSettings, + mappings: estypes.MappingTypeMapping, +) { + await client.indices.create({ + index: indexAlias, + mappings, + settings, + }) +} + +export async function populateIndex( + client: Client, + indexAlias: string, + indexName: string, + records: any[], + options: Options, +) { + console.log(chalk.yellow(`\nIndexing ${chalk.bold(indexName)}`)) + const bulkOperations = records.flatMap((doc) => [{ index: { _index: indexAlias } }, doc]) + + const bulkOptions = { + refresh: false, + timeout: '5m', + } + + const attempts = options.retries || 0 + const sleepTime = options.sleepTime || DEFAULT_SLEEPTIME_SECONDS * 1000 + console.log(`About to bulk index ${records.length.toLocaleString()} records with retry %O`, { + attempts, + sleepTimeMS: sleepTime, + }) + + const t0 = new Date() + const bulkResponse = await retryOnErrorTest( + (error) => error instanceof errors.ResponseError && error.meta.statusCode === 429, + () => client.bulk({ operations: bulkOperations, ...bulkOptions }), + { + attempts, + sleepTime, + onError: (_, attempts, sleepTime) => { + console.warn( + chalk.yellow( + `Failed to bulk index ${indexName}. Will attempt ${attempts} more times (after ${ + sleepTime / 1000 + }s sleep).`, + ), + ) + }, + }, + ) + + if (bulkResponse.errors) { + console.error(`Bulk response errors: ${bulkResponse.errors}`) + throw new Error('Bulk errors happened.') + } + const t1 = new Date() + console.log( + `Bulk indexed ${indexAlias}. 
Took ${readableTimeMinAndSec(t1.getTime() - t0.getTime())}`, ) + + let documentsInIndex = 0 + let countAttempts = 3 + while (documentsInIndex < records.length) { + const { count } = await client.count({ index: indexAlias }) + documentsInIndex = count + if (documentsInIndex >= records.length) break + countAttempts-- + if (!countAttempts) { + console.log("Ran out of count attempts and still haven't matched the expected number.") + break + } + await sleep(1000) + } + + console.log(`Documents now in ${indexAlias}: ${documentsInIndex.toLocaleString()}`) +} + +export async function updateAlias( + client: Client, + indexName: string, + indexAlias: string, + options: Options, +) { + const aliasUpdates: estypes.IndicesUpdateAliasesAction[] = [ + { + add: { + index: indexAlias, + alias: indexName, + }, + }, + ] + + const indices = await retryOnErrorTest( + (error) => { + // 404 can happen when you're trying to get an index that + // doesn't exist. ...yet! + return error instanceof errors.ResponseError && error.meta.statusCode === 404 + }, + () => client.cat.indices({ format: 'json' }), + { + attempts: options.retries || 0, + sleepTime: (options.sleepTime || DEFAULT_SLEEPTIME_SECONDS) * 1000, + onError: (error, attempts, sleepTime) => { + console.warn( + chalk.yellow( + `Failed to get index ${indexName} (${ + error.message || error.toString() + }). Will attempt ${attempts} more times (after ${readableTimeMinAndSec(sleepTime)}s sleep).`, + ), + ) + }, + }, + ) + + for (const index of indices) { + if (index.index !== indexAlias && index.index.startsWith(indexName)) { + aliasUpdates.push({ remove_index: { index: index.index } }) + console.log('Deleting old index', index.index) + } + } + if (options.verbose) console.log('Updating alias actions:', aliasUpdates) + await client.indices.updateAliases({ body: { actions: aliasUpdates } }) +} + +export function printSuccess(indexName: string, startTime: Date, verbose = false) { + const endTime = new Date() + console.log( + chalk.green( + `Finished indexing ${indexName}. 
Took ${readableTimeMinAndSec(endTime.getTime() - startTime.getTime())}`, + ), + ) + + if (verbose) { + console.log(`To view index: ${safeUrlDisplay(`/${indexName}`)}`) + console.log(`To search index: ${safeUrlDisplay(`/${indexName}/_search`)}`) + } +} + +export async function loadIndexRecords( + indexName: string, + sourceDirectory: string, +): Promise<Records> { + const filePath = path.join(sourceDirectory, `${indexName}-records.json`) + const payload = await fs.readFile(filePath, 'utf8') + return JSON.parse(payload) +} + +export function escapeHTML(content: string): string { + return content.replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;') +} + +export function getSnowballLanguage(language: string): string | undefined { + return SNOWBALL_LANGUAGES[language] +} diff --git a/src/search/scripts/index/utils/mappings.ts b/src/search/scripts/index/utils/mappings.ts new file mode 100644 index 000000000000..1bacf528ee21 --- /dev/null +++ b/src/search/scripts/index/utils/mappings.ts @@ -0,0 +1,52 @@ +import type { estypes } from '@elastic/elasticsearch' + +export const generalSearchMappings: estypes.MappingTypeMapping = { + properties: { + url: { type: 'keyword' }, + title: { + type: 'text', + analyzer: 'text_analyzer', + norms: false, + term_vector: 'with_positions_offsets', + }, + title_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false }, + content: { + type: 'text', + analyzer: 'text_analyzer', + term_vector: 'with_positions_offsets', + }, + content_explicit: { + type: 'text', + analyzer: 'text_analyzer_explicit', + term_vector: 'with_positions_offsets', + }, + headings: { type: 'text', analyzer: 'text_analyzer', norms: false }, + headings_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false }, + breadcrumbs: { type: 'text' }, + popularity: { type: 'float' }, + intro: { type: 'text' }, + toplevel: { type: 'keyword' }, + }, +} + +export const generalAutocompleteMappings: estypes.MappingTypeMapping = { + properties: { + term: { + type: 'text', + analyzer: 'text_analyzer', + term_vector: 'with_positions_offsets', + }, + popularity: { type: 'float' }, + }, +} + +export const aiSearchAutocompleteMappings: estypes.MappingTypeMapping = { + properties: { + term: { + type: 'text', + analyzer: 'text_analyzer', + term_vector: 'with_positions_offsets', + }, + popularity: { type: 'float' }, + }, +} diff --git a/src/search/scripts/index/lib/retry-on-error-test.ts b/src/search/scripts/index/utils/retry-on-error-test.ts similarity index 97% rename from src/search/scripts/index/lib/retry-on-error-test.ts rename to src/search/scripts/index/utils/retry-on-error-test.ts index b2c88420a4ef..bed48738f492 100644 --- a/src/search/scripts/index/lib/retry-on-error-test.ts +++ b/src/search/scripts/index/utils/retry-on-error-test.ts @@ -1,5 +1,3 @@ -// [start-readme] -// // Return a function that you can use to run any code within and if it // throws you get a chance to say whether to sleep + retry. // Example: @@ -20,10 +18,8 @@ // Note that, by default, the sleep time is "exponential" by a factor of // 1.5. So the first sleep will, in the above example, // be 800ms, then 1,200ms, then 1,800ms, etc. 
-// -// [end-readme] -import { sleep } from './utils' +import { sleep } from '@/search/lib/helpers/time' export async function retryOnErrorTest( errorTest: (error: any) => boolean, diff --git a/src/search/scripts/index/utils/settings.ts b/src/search/scripts/index/utils/settings.ts new file mode 100644 index 000000000000..a2d65ca29ffe --- /dev/null +++ b/src/search/scripts/index/utils/settings.ts @@ -0,0 +1,118 @@ +import { SNOWBALL_LANGUAGES } from '@/search/scripts/index/utils/constants' + +import type { estypes } from '@elastic/elasticsearch' +import type { + AnalysisSnowballLanguage, + AnalysisCustomAnalyzer, +} from '@elastic/elasticsearch/lib/api/types' + +export function getGeneralSearchSettings( + language: string, + verbose: boolean, +): estypes.IndicesIndexSettings { + const settings: estypes.IndicesIndexSettings = { + analysis: { + char_filter: { + hyphenation_filter: { + type: 'mapping', + mappings: ['- => _'], + }, + }, + analyzer: { + text_analyzer_explicit: { + char_filter: ['hyphenation_filter'], + filter: ['lowercase', 'stop', 'asciifolding'], + tokenizer: 'standard', + type: 'custom', + } as AnalysisCustomAnalyzer, + text_analyzer: { + filter: ['lowercase', 'stop', 'asciifolding'], + tokenizer: 'standard', + type: 'custom', + } as AnalysisCustomAnalyzer, + }, + filter: {}, + }, + } + + const snowballLanguage = SNOWBALL_LANGUAGES[language] + if (snowballLanguage) { + const textAnalyzer = settings.analysis!.analyzer!.text_analyzer as AnalysisCustomAnalyzer + textAnalyzer.filter!.push('languaged_snowball') + + settings.analysis!.filter!['languaged_snowball'] = { + type: 'snowball', + language: snowballLanguage as AnalysisSnowballLanguage, + } + } else if (verbose) { + console.warn(`No snowball language for '${language}'`) + } + + return settings +} + +export function getGeneralAutocompleteSettings( + language: string, + verbose = false, +): estypes.IndicesIndexSettings { + const settings: estypes.IndicesIndexSettings = { + analysis: { + analyzer: { + text_analyzer: { + filter: ['lowercase'], + tokenizer: 'standard', + type: 'custom', + } as AnalysisCustomAnalyzer, + }, + filter: {}, + }, + } + + const snowballLanguage = SNOWBALL_LANGUAGES[language] + if (snowballLanguage) { + const textAnalyzer = settings.analysis!.analyzer!.text_analyzer as AnalysisCustomAnalyzer + textAnalyzer.filter!.push('languaged_snowball') + + settings.analysis!.filter!['languaged_snowball'] = { + type: 'snowball', + language: snowballLanguage as AnalysisSnowballLanguage, + } + } else if (verbose) { + console.warn(`No snowball language for '${language}'`) + } + + return settings +} + +export function getAISearchAutocompleteSettings( + language: string, + verbose = false, +): estypes.IndicesIndexSettings { + const settings: estypes.IndicesIndexSettings = { + analysis: { + analyzer: { + text_analyzer: { + filter: ['lowercase'], + tokenizer: 'standard', + type: 'custom', + } as AnalysisCustomAnalyzer, + }, + filter: {}, + }, + } + + const snowballLanguage = SNOWBALL_LANGUAGES[language] + if (snowballLanguage) { + const textAnalyzer = settings.analysis!.analyzer!.text_analyzer as AnalysisCustomAnalyzer + textAnalyzer.filter!.push('languaged_snowball') + + settings.analysis!.filter!['languaged_snowball'] = { + type: 'snowball', + language: snowballLanguage as AnalysisSnowballLanguage, + } + } else if (verbose) { + console.warn(`No snowball language for '${language}'`) + } + + return settings +} diff --git a/src/search/scripts/retry-on-error-test.js b/src/search/scripts/retry-on-error-test.js deleted file 
mode 100644 index c41b222b47d0..000000000000 --- a/src/search/scripts/retry-on-error-test.js +++ /dev/null @@ -1,76 +0,0 @@ -// [start-readme] -// -// Return a function that you can use to run any code within and if it -// throws you get a chance to say whether to sleep + retry. -// Example: -// -// async function mainFunction() { -// if (Math.random() > 0.9) throw new Error('too large') -// return 'OK' -// } -// -// const errorTest = (err) => err instanceof Error && err.message.includes('too large') -// const config = { // all optional -// attempts: 3, -// sleepTime: 800, -// onError: (err, attempts) => console.warn(`Failed ${attempts} attempts`) -// } -// const ok = await retry(errorTest, mainFunction, config) -// -// Note that, by default, the sleep time is "exponential" by a factor of -// 1.5. So the first sleep will, in the above example, -// be 800ms. Then 1,200ms, Then 1,800ms. etc. -// -// [end-readme] - -const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)) - -export async function retryOnErrorTest( - errorTest, - callback, - { - attempts = 4, - sleepTime = 1000, - exponential = 1.5, - jitterPercent = 25, - onError = () => {}, - } = {}, -) { - while (true) { - try { - return await callback() - } catch (error) { - if (error instanceof Error && attempts > 0 && errorTest(error)) { - if (onError) onError(error, attempts, sleepTime) - attempts-- - // The reason for the jitter is to avoid a thundering herd problem. - // Suppose two independent processes/threads start at the same time. - // They both fail, perhaps due to rate limiting. Now, if they both - // sleep for 30 seconds in the first retry attempt, it'll just - // clash again 30 seconds later. But if you add a bit of jitter, at - // the next attempt these independent processes/threads will now - // start at slightly different times. - - // According to the Oxford English dictionary, they define "jitter" as: - // - // slight irregular movement, variation, or unsteadiness, - // especially in an electrical signal or electronic device. - // - await sleep(addJitter(sleepTime, jitterPercent)) - if (exponential) { - sleepTime *= 2 - } - } else { - throw error - } - } - } -} - -function addJitter(num, percent) { - // Return the number plus between 0 and $percent of that number. - // For example, for 1,000 with a 20% jitter you might get 1133.4 - // because you start with 1,000 and 13.4% is a random number between - // 0 and 20%. - return num + Math.random() * percent * 0.01 * num -} diff --git a/src/search/scripts/scrape/README.md b/src/search/scripts/scrape/README.md new file mode 100644 index 000000000000..538052f51b96 --- /dev/null +++ b/src/search/scripts/scrape/README.md @@ -0,0 +1,40 @@ +# Scraping for General Search + +We need to scrape each page on the Docs site and use the data we scrape to index Elasticsearch. + +We currently only scrape for **general search** results. + +Autocomplete search data is generated from analytics events and GPT queries. + +## CLI Script + +Before running the scraping script, ensure that the server is running in another terminal with `npm run general-search-scrape-server`. + +Run the script with `npm run general-search-scrape -- <out-directory>`. + +After a successful run, it will generate a series of JSON files containing the page data of every page of the Docs site in the passed directory. 
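+
+For example, a full local run might look like this (the output directory and flag values below are illustrative; pass `--help` for the authoritative list of options):
+
+```bash
+# Terminal 1: serve the docs site so the scraper has pages to crawl
+npm run general-search-scrape-server
+
+# Terminal 2: scrape English pages for a single version into /tmp/records
+npm run general-search-scrape -- /tmp/records --language en --version free-pro-team@latest
+```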
+ +The `index-general-search.yml` workflow will scrape the records into `/tmp/records` and then run the [general-search indexing script](../index/README.md). + +To see the arguments accepted by the script, pass the `--help` argument, for example: + +```bash +npm run general-search-scrape -- --help +``` + +## Records (scraped pages) + +In the context of an Elasticsearch index, a record represents a page. Each record has `breadcrumbs`, `title`, `headings`, `content` (the article content in text, not HTML), `intro` (if one exists in the frontmatter), and a unique `objectID` that is currently just the permalink of the article. Here's an example: + +```json +{ + "objectID":"/en/actions/creating-actions/about-custom-actions", + "breadcrumbs":"GitHub Actions / Creating actions", + "title":"About custom actions", + "headings":"About custom actions\nTypes of actions\n[...]", + "content":"Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, [...]", + "intro":"Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, or use and customize actions shared by the GitHub community.", + "toplevel":"GitHub Actions", + "popularity":0 +} +``` diff --git a/src/search/scripts/build-records.js b/src/search/scripts/scrape/lib/build-records.ts similarity index 75% rename from src/search/scripts/build-records.js rename to src/search/scripts/scrape/lib/build-records.ts index 42313c4ee149..329771487a85 100644 --- a/src/search/scripts/build-records.js +++ b/src/search/scripts/scrape/lib/build-records.ts @@ -1,14 +1,16 @@ -#!/usr/bin/env node import eventToPromise from 'event-to-promise' import chalk from 'chalk' import dotenv from 'dotenv' import boxen from 'boxen' import { HTTPError } from 'got' -import parsePageSectionsIntoRecords from './parse-page-sections-into-records.js' -import getPopularPages from './popular-pages.js' -import languages from '#src/languages/lib/languages.js' -import domwaiter from './domwaiter.js' +import languages from '@/languages/lib/languages.js' +import parsePageSectionsIntoRecords from '@/search/scripts/scrape/lib/parse-page-sections-into-records' +import getPopularPages from '@/search/scripts/scrape/lib/popular-pages' +import domwaiter from '@/search/scripts/scrape/lib/domwaiter' +import { getAllVersionsKeyFromIndexVersion } from '@/search/lib/elasticsearch-versions' + +import type { Page, Permalink, Record, Config, Redirects } from '@/search/scripts/scrape/types' const pageMarker = chalk.green('|') const recordMarker = chalk.grey('.') @@ -31,16 +33,19 @@ const MIN_TIME = parseInt(process.env.BUILD_RECORDS_MIN_TIME || '5', 10) const FORCE_0_POPULARITY_PRODUCTS = new Set(['contributing']) export default async function buildRecords( - indexName, - indexablePages, - pageVersion, - languageCode, - redirects, - config = {}, -) { + indexName: string, + indexablePages: Page[], + indexVersion: string, + languageCode: string, + redirects: Redirects, + config: Config = {} as Config, +): Promise<Record[]> { + // Determine the page version from the index version + const pageVersion = getAllVersionsKeyFromIndexVersion(indexVersion) + + const { noMarkers, docsInternalDataPath } = config console.log(`\n\nBuilding records for index '${indexName}' (${languages[languageCode].name})`) - const records = [] + const records: Record[] = [] const pages = indexablePages // exclude pages that are not in the current language .filter((page) => page.languageCode === languageCode) @@ 
-55,12 +60,15 @@ export default async function buildRecords( }) }) .map((permalink) => { - permalink.url = `http://localhost:${port}${permalink.href}` + if (permalink) { + permalink.url = `http://localhost:${port}${permalink.href}` + } return permalink }) + .filter((permalink): permalink is Permalink => permalink !== undefined) const popularPages = docsInternalDataPath - ? await getPopularPages(docsInternalDataPath, redirects, pageVersion, languageCode) + ? await getPopularPages(docsInternalDataPath, redirects, indexVersion, languageCode) : {} console.log('indexable pages', indexablePages.length) @@ -93,7 +101,7 @@ export default async function buildRecords( if (err instanceof HTTPError && !err.response.ok) { console.log( '\n' + - boxen(chalk.bold(err.request.requestUrl.pathname), { + boxen(chalk.bold(err.request.requestUrl?.pathname), { title: chalk.red('The URL it failed on was'), padding: 1, borderColor: 'red', diff --git a/src/search/scripts/domwaiter.js b/src/search/scripts/scrape/lib/domwaiter.ts similarity index 50% rename from src/search/scripts/domwaiter.js rename to src/search/scripts/scrape/lib/domwaiter.ts index fccfc4aaae7c..fe70a1d9fedd 100644 --- a/src/search/scripts/domwaiter.js +++ b/src/search/scripts/scrape/lib/domwaiter.ts @@ -1,9 +1,18 @@ -import { EventEmitter } from 'node:events' +import { EventEmitter } from 'events' import Bottleneck from 'bottleneck' import got from 'got' import cheerio from 'cheerio' -export default function domwaiter(pages, opts = {}) { +import type { Permalink } from '@/search/scripts/scrape/types' + +interface DomWaiterOptions { + parseDOM?: boolean + json?: boolean + maxConcurrent?: number + minTime?: number +} + +export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {}): EventEmitter { const emitter = new EventEmitter() const defaults = { @@ -17,26 +26,26 @@ export default function domwaiter(pages, opts = {}) { const limiter = new Bottleneck(opts) pages.forEach((page) => { - limiter.schedule(getPage, page, emitter, opts) + limiter.schedule(() => getPage(page, emitter, opts)) }) - limiter - .on('idle', () => { - emitter.emit('done') - }) - .on('error', (err) => { - emitter.emit('error', err) - }) + limiter.on('idle', () => { + emitter.emit('done') + }) + + limiter.on('error', (err) => { + emitter.emit('error', err) + }) return emitter } -async function getPage(page, emitter, opts) { +async function getPage(page: Permalink, emitter: EventEmitter, opts: DomWaiterOptions) { emitter.emit('beforePageLoad', page) if (opts.json) { try { - const json = await got(page.url).json() + const json = await got(page.url!).json() const pageCopy = Object.assign({}, page, { json }) emitter.emit('page', pageCopy) } catch (err) { @@ -44,9 +53,9 @@ async function getPage(page, emitter, opts) { } } else { try { - const body = (await got(page.url)).body + const body = (await got(page.url!)).body const pageCopy = Object.assign({}, page, { body }) - if (opts.parseDOM) pageCopy.$ = cheerio.load(body) + if (opts.parseDOM) (pageCopy as any).$ = cheerio.load(body) emitter.emit('page', pageCopy) } catch (err) { emitter.emit('error', err) diff --git a/src/search/scripts/find-indexable-pages.js b/src/search/scripts/scrape/lib/find-indexable-pages.ts similarity index 70% rename from src/search/scripts/find-indexable-pages.js rename to src/search/scripts/scrape/lib/find-indexable-pages.ts index 9e05abfec99d..ed37cd196c2b 100644 --- a/src/search/scripts/find-indexable-pages.js +++ b/src/search/scripts/scrape/lib/find-indexable-pages.ts @@ -1,8 +1,9 @@ 
-#!/usr/bin/env node -import { loadPages } from '#src/frame/lib/page-data.js' +import { loadPages } from '@/frame/lib/page-data.js' -export default async function findIndexablePages(match = '') { - const allPages = await loadPages() +import type { Page } from '@/search/scripts/scrape/types' + +export default async function findIndexablePages(match = ''): Promise<Page[]> { + const allPages: Page[] = await loadPages() const indexablePages = allPages // exclude hidden pages .filter((page) => !page.hidden) diff --git a/src/search/scripts/parse-page-sections-into-records.js b/src/search/scripts/scrape/lib/parse-page-sections-into-records.ts similarity index 91% rename from src/search/scripts/parse-page-sections-into-records.js rename to src/search/scripts/scrape/lib/parse-page-sections-into-records.ts index 0897b7c289a5..8bee4c2a237b 100644 --- a/src/search/scripts/parse-page-sections-into-records.js +++ b/src/search/scripts/scrape/lib/parse-page-sections-into-records.ts @@ -1,17 +1,18 @@ -#!/usr/bin/env node import { render } from 'cheerio-to-text' +import type { Record } from '@/search/scripts/scrape/types' + // This module takes a cheerio page object and divides it into sections // using H1,H2 heading elements as section delimiters. The text // that follows each heading becomes the content of the search record. const ignoredHeadingSlugs = ['in-this-article', 'further-reading', 'prerequisites'] -export default function parsePageSectionsIntoRecords(page) { +export default function parsePageSectionsIntoRecords(page: any): Record { const { href, $ } = page const title = $('h1').first().text().trim() const breadcrumbsArray = $('[data-search=breadcrumbs] nav.breadcrumbs a') - .map((i, el) => { + .map((i: number, el: any) => { return $(el).text().trim().replace('/', '').replace(/\s+/g, ' ') }) .get() @@ -21,8 +22,7 @@ // page that don't make much sense to find in a site search. $('[data-search=hide]').remove() - // Only slice off the last one if the length of the array is greater - // that 1. + // Only slice off the last one if the length of the array is greater than 1 // On an article page, the breadcrumbs array will be something // like: // @@ -51,12 +51,12 @@ const $sections = $('h2', $root) .filter('[id]') - .filter((i, el) => { + .filter((i: number, el: any) => { return !ignoredHeadingSlugs.includes($(el).attr('id')) }) const headings = $sections - .map((i, el) => $(el).text()) + .map((i: number, el: any) => $(el).text()) .get() .join('\n') .trim() diff --git a/src/search/scripts/popular-pages.js b/src/search/scripts/scrape/lib/popular-pages.ts similarity index 61% rename from src/search/scripts/popular-pages.js rename to src/search/scripts/scrape/lib/popular-pages.ts index 11dac8186206..a6e42053441b 100644 --- a/src/search/scripts/popular-pages.js +++ b/src/search/scripts/scrape/lib/popular-pages.ts @@ -2,28 +2,31 @@ import { join } from 'path' import { existsSync } from 'fs' import fs from 'fs/promises' -export default async function getPopularPages(dirPath, redirects, version, language) { - // The dirPath is the path to the github/docs-internal-data repo. - // We make assumptions about the structure of the repo. 
In particular, - // the pageviews rollups live in - // `hydro/rollups/pageviews/$language/$versionprefix/rollup.json` - // For example - // `hydro/rollups/pageviews/en/enterprise-cloud/rollup.json` - const versionPrefix = version.split('@')[0] - let filePath = join(dirPath, 'hydro/rollups/pageviews', language, versionPrefix, 'rollup.json') +import { getPlanVersionFromIndexVersion } from '@/search/lib/elasticsearch-versions.js' + +import type { Redirects, PopularPages } from '@/search/scripts/scrape/types' + +export default async function getPopularPages( + dirPath: string, + redirects: Redirects, + indexVersion: string, + language: string, +): Promise<PopularPages> { + const planVersion = getPlanVersionFromIndexVersion(indexVersion) + let filePath = join(dirPath, 'hydro/rollups/pageviews', language, planVersion, 'rollup.json') if (!existsSync(filePath) && language !== 'en') { console.warn("Trying the rollup for 'en'") language = 'en' - filePath = join(dirPath, 'hydro/rollups/pageviews', language, versionPrefix, 'rollup.json') + filePath = join(dirPath, 'hydro/rollups/pageviews', language, planVersion, 'rollup.json') } if (!existsSync(filePath)) { - throw new Error(`No rollup found for version '${versionPrefix}'. Tried ${filePath}`) + throw new Error(`No rollup found for version '${planVersion}'. Tried ${filePath}`) } const rollupRaw = await fs.readFile(filePath, 'utf-8') - // Firt iterate through the array of objects, not making an assumption + // First iterate through the array of objects, not making an assumption // that the first one is the biggest one. - const all = {} + const all: { [key: string]: number } = {} for (const [path, count] of Object.entries(JSON.parse(rollupRaw))) { if (!path) { // Can happen if the SQL query is, for some unknown reason, finding @@ -41,11 +44,11 @@ export default async function getPopularPages(dirPath, redirects, version, langu // We never index these anyway so their popularity is never relevant. continue } - all[path] = count + all[path] = count as number } const biggestCount = Math.max(...Object.values(all)) - const popularPages = {} + const popularPages: PopularPages = {} for (const [path, count] of Object.entries(all)) { // Don't bother writing massively long floating point numbers // because reducing it makes the JSON records smaller and we don't @@ -55,11 +58,6 @@ // The reason we're heeding redirects is because it's possible // that the JSON file is older/"staler" than the // content itself. 
popularPages[redirects[path] || path] = ratio } diff --git a/src/search/scripts/sync.js b/src/search/scripts/scrape/lib/scrape-into-index-json.ts similarity index 64% rename from src/search/scripts/sync.js rename to src/search/scripts/scrape/lib/scrape-into-index-json.ts index 98feec1b81e6..56cbe264d4e6 100644 --- a/src/search/scripts/sync.js +++ b/src/search/scripts/scrape/lib/scrape-into-index-json.ts @@ -1,22 +1,22 @@ -#!/usr/bin/env node import chalk from 'chalk' -import languages from '#src/languages/lib/languages.js' -import buildRecords from './build-records.js' -import findIndexablePages from './find-indexable-pages.js' -import { allVersions } from '#src/versions/lib/all-versions.js' -import { namePrefix } from '#src/search/lib/config.js' -import { writeIndexRecords } from './search-index-records.js' +import languages from '@/languages/lib/languages.js' +import buildRecords from '@/search/scripts/scrape/lib/build-records' +import findIndexablePages from '@/search/scripts/scrape/lib/find-indexable-pages' +import { writeIndexRecords } from '@/search/scripts/scrape/lib/search-index-records' +import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes' + +import type { Options, Config, Page, Redirects } from '@/search/scripts/scrape/types' // Build a search data file for every combination of product version and language // e.g. `github-docs-dotcom-en.json` and `github-docs-2.14-ja.json` -export default async function syncSearchIndexes({ +export default async function scrapeIntoIndexJson({ language, notLanguage, outDirectory, versionsToBuild, - config = {}, -}) { + config = {} as Config, +}: Options): Promise<void> { const t0 = new Date() // build indices for a specific language if provided; otherwise build indices for all languages @@ -25,14 +25,14 @@ ) console.log( - `Building indices for ${chalk.yellow(language || 'all languages')} and ${chalk.yellow( + `Building indices for language: ${chalk.yellow(language || 'all languages')} and version: ${chalk.yellow( versionsToBuild.length === 1 ? versionsToBuild[0] : 'all versions', )}.\n`, ) // Exclude WIP pages, hidden pages, index pages, etc - const indexablePages = await findIndexablePages(config.filter) - const redirects = {} + const indexablePages: Page[] = await findIndexablePages(config.filter) + const redirects: Redirects = {} indexablePages.forEach((page) => { const href = page.relativePath.replace('index.md', '').replace('.md', '') for (let redirectFrom of page.redirect_from || []) { @@ -47,22 +47,14 @@ let countRecordsTotal = 0 // Build and validate all indices for (const languageCode of languagesToBuild) { - for (const pageVersion of versionsToBuild) { - // if GHES, resolves to the release number like 2.21, 2.22, etc. - // if FPT, resolves to 'dotcom' - const indexVersion = - allVersions[pageVersion].plan === 'enterprise-server' ? 
allVersions[pageVersion].currentRelease - : allVersions[pageVersion].miscBaseName - - // github-docs-dotcom-en, github-docs-2.22-en - const indexName = `${namePrefix}-${indexVersion}-${languageCode}` + for (const indexVersion of versionsToBuild) { + const { indexName } = getElasticSearchIndex('generalSearch', indexVersion, languageCode) // The page version will be the new version, e.g., free-pro-team@latest, enterprise-server@3.7 const records = await buildRecords( indexName, indexablePages, - pageVersion, + indexVersion, languageCode, redirects, config, @@ -81,6 +73,6 @@ console.log(`Rate ~${chalk.bold(rate)} pages per second.`) } -function formatSeconds(seconds) { +function formatSeconds(seconds: number): string { return new Date(seconds * 1000).toISOString().substr(11, 8) } diff --git a/src/search/scripts/validate-records.js b/src/search/scripts/scrape/lib/search-index-records.ts similarity index 61% rename from src/search/scripts/validate-records.js rename to src/search/scripts/scrape/lib/search-index-records.ts index 1adb43217c5f..c4459ccdb88a 100644 --- a/src/search/scripts/validate-records.js +++ b/src/search/scripts/scrape/lib/search-index-records.ts @@ -1,16 +1,27 @@ -#!/usr/bin/env node +import path from 'path' +import fs from 'fs/promises' import assert from 'assert' import { isArray, isString } from 'lodash-es' -function countArrayValues(arr) { - const counter = new Map() - arr.forEach((value) => counter.set(value, (counter.get(value) || 0) + 1)) - return [...counter.entries()].map(([value, count]) => { - return { value, count } - }) +import type { Record } from '@/search/scripts/scrape/types' + +export async function writeIndexRecords( + name: string, + records: Record[], + outDirectory: string, +): Promise<string> { + validateRecords(name, records) + + const recordsObject = Object.fromEntries(records.map((record) => [record.objectID, record])) + const content = JSON.stringify(recordsObject, undefined, 0) + + const filePath = path.join(outDirectory, `${name}-records.json`) + await fs.writeFile(filePath, content) + + return filePath } -export default function validateRecords(name, records) { +function validateRecords(name: string, records: Record[]): true { assert(isString(name) && name.length, '`name` is required') assert(isArray(records) && records.length, '`records` must be a non-empty array') @@ -35,3 +46,11 @@ return true } + +function countArrayValues(arr: string[]) { + const counter = new Map() + arr.forEach((value) => counter.set(value, (counter.get(value) || 0) + 1)) + return [...counter.entries()].map(([value, count]) => { + return { value, count } + }) +} diff --git a/src/search/scripts/sync-search-indices.js b/src/search/scripts/scrape/scrape-cli.ts old mode 100755 new mode 100644 similarity index 64% rename from src/search/scripts/sync-search-indices.js rename to src/search/scripts/scrape/scrape-cli.ts index 17e3e13fbc25..db8c89e4a0a0 --- a/src/search/scripts/sync-search-indices.js +++ b/src/search/scripts/scrape/scrape-cli.ts @@ -1,36 +1,25 @@ -#!/usr/bin/env node - -// [start-readme] -// // This script is run automatically via GitHub Actions on every push to `main` to generate searchable data. -// It can also be run manually. For more info see [contributing/search.md](contributing/search.md) -// -// [end-readme] +// It can also be run manually. 
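+//
+// A manual run might look like this (illustrative; see the scrape README or pass --help for details):
+//
+//   npm run general-search-scrape -- /tmp/records --language en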
import { existsSync, statSync, readdirSync } from 'fs' - -import assert from 'assert' import { program, Option } from 'commander' -import { languageKeys } from '#src/languages/lib/languages.js' -import { allVersions } from '#src/versions/lib/all-versions.js' -import searchSync from './sync.js' +import { languageKeys } from '@/languages/lib/languages' +import scrapeIntoIndexJson from '@/search/scripts/scrape/lib/scrape-into-index-json' +import { + allIndexVersionKeys, + allIndexVersionOptions, + versionToIndexVersionMap, +} from '@/search/lib/elasticsearch-versions' -const shortNames = Object.fromEntries( - Object.values(allVersions).map((info) => { - const shortName = info.hasNumberedReleases - ? info.miscBaseName + info.currentRelease - : info.miscBaseName - return [shortName, info] - }), -) - -const allVersionKeys = [...Object.keys(shortNames), ...Object.keys(allVersions)] +import type { Config, Options, ProgramOptions } from '@/search/scripts/scrape/types' program - .description('Creates search records by scraping') + .description('Creates search index JSONs by scraping a running docs site') .option('-v, --verbose', 'Verbose outputs') - .addOption(new Option('-V, --version <version>', 'Specific versions').choices(allVersionKeys)) + .addOption( + new Option('-V, --version <version>', 'Specific versions').choices(allIndexVersionOptions), + ) .addOption( new Option('-l, --language <language>', 'Which languages to focus on').choices(languageKeys), ) @@ -48,8 +37,8 @@ program main(program.opts(), program.args) -async function main(opts, args) { - let language +async function main(opts: ProgramOptions, args: string[]) { + let language: string | undefined if ('language' in opts) { language = opts.language if (process.env.LANGUAGE) { @@ -72,7 +61,7 @@ async function main(opts, args) { throw new Error("Can't specify --language *and* --not-language") } - let version + let version: string | undefined if ('version' in opts) { version = opts.version if (process.env.VERSION) { @@ -83,15 +72,15 @@ } else { if (process.env.VERSION && process.env.VERSION !== 'all') { version = process.env.VERSION - if (!allVersionKeys.includes(version)) { + if (!allIndexVersionOptions.includes(version)) { throw new Error( - `Environment variable 'VERSION' (${version}) is not recognized. Must be one of ${allVersionKeys}`, + `Environment variable 'VERSION' (${version}) is not recognized. Must be one of ${allIndexVersionOptions}`, ) } } } - let docsInternalDataPath + let docsInternalDataPath: string | undefined const { docsInternalData } = opts const { DOCS_INTERNAL_DATA } = process.env @@ -120,39 +109,30 @@ throw new Error(`'${docsInternalDataPath}' must contain a 'hydro' directory`) } - // A `--version` or `process.env.VERSION` was specified, we need to convert - // it to the long name. I.e. `free-pro-team@latest`. Not `dotcom`. - // But it could also have beeb specified as `all` which means that `version` - // here ill be `undefined` which is also OK. - // const indexVersion = shortNames[version].hasNumberedReleases - // ? shortNames[version].currentRelease - // : shortNames[version].miscBaseName - - let indexVersion + let indexVersion: string | undefined if (version && version !== 'all') { - // If it has been specified, it needs to be in the "long-form". - // I.e. `enterprise-server@3.5` not `ghes-3.5`. - indexVersion = version in shortNames ? 
shortNames[version].version : version + indexVersion = versionToIndexVersionMap[version] + } + if (indexVersion && !allIndexVersionOptions.includes(indexVersion)) { + throw new Error( + `Input error. Version must be omitted or one of ${allIndexVersionOptions}. Got: ${indexVersion}`, + ) } - assert( - !indexVersion || indexVersion in allVersions, - `version must be undefined or one of ${Object.keys(allVersions)}`, - ) const [outDirectory] = args - const config = { + const config: Config = { noMarkers: !opts.markers, filter: opts.filter, docsInternalDataPath, } - const options = { + const options: Options = { language, notLanguage, outDirectory, config, - versionsToBuild: indexVersion ? [indexVersion] : Object.keys(allVersions), + versionsToBuild: indexVersion ? [indexVersion] : allIndexVersionKeys, } - await searchSync(options) + await scrapeIntoIndexJson(options) } diff --git a/src/search/scripts/scrape/types.ts b/src/search/scripts/scrape/types.ts new file mode 100644 index 000000000000..20db4d78b968 --- /dev/null +++ b/src/search/scripts/scrape/types.ts @@ -0,0 +1,70 @@ +export interface Config { + noMarkers: boolean + filter?: string + docsInternalDataPath?: string +} + +export interface Options { + language?: string + notLanguage?: string + outDirectory: string + config: Config + versionsToBuild: string[] +} + +export interface ProgramOptions { + verbose?: boolean + version?: string + language?: string + notLanguage?: string + markers?: boolean + filter?: string + docsInternalData?: string +} + +export interface Page { + relativePath: string + languageCode: string + permalinks: Permalink[] + redirect_from?: string[] + hidden?: boolean + parentProduct?: { + wip?: boolean + hidden?: boolean + } +} + +export interface Permalink { + pageVersion: string + href: string + languageCode: string + relativePath: string + url?: string + '?'?: string +} + +export interface Record { + objectID: string + breadcrumbs: string + title: string + headings: string + content: string + intro: string + toplevel: string + popularity?: number +} + +export interface Language { + name: string + code: string +} + +export type Languages = { [key: string]: Language } + +export interface Redirects { + [key: string]: string +} + +export interface PopularPages { + [key: string]: number +} diff --git a/src/search/scripts/search-index-records.js b/src/search/scripts/search-index-records.js deleted file mode 100644 index 19684a77bf97..000000000000 --- a/src/search/scripts/search-index-records.js +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env node -import path from 'path' -import fs from 'fs/promises' - -import validateRecords from './validate-records.js' - -export async function writeIndexRecords(name, records, outDirectory) { - validateRecords(name, records) - - const recordsObject = Object.fromEntries(records.map((record) => [record.objectID, record])) - const content = JSON.stringify(recordsObject, undefined, 0) - - const filePath = path.join(outDirectory, `${name}-records.json`) - await fs.writeFile(filePath, content) - - return filePath -} diff --git a/src/search/tests/api-ai-search-autocomplete.ts b/src/search/tests/api-ai-search-autocomplete.ts new file mode 100644 index 000000000000..191f1854a287 --- /dev/null +++ b/src/search/tests/api-ai-search-autocomplete.ts @@ -0,0 +1,164 @@ +/** + * To be able to run these tests, you need to index the fixtures! + * And you need to have an Elasticsearch URL to connect to for the server. 
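+ *
+ * If you don't have one handy, a disposable local Elasticsearch can be started
+ * with Docker (illustrative; any reachable instance compatible with the
+ * project's client works):
+ *
+ *   docker run --rm -p 9200:9200 -e discovery.type=single-node \
+ *     -e xpack.security.enabled=false docker.elastic.co/elasticsearch/elasticsearch:8.14.0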
+ * + * To index the fixtures, run: + * + * ELASTICSEARCH_URL=http://localhost:9200 npm run index-test-fixtures + * + * This will replace any "real" Elasticsearch indexes you might have, so + * once you're done working on vitest tests you need to index real + * content again. + */ + +import { expect, test, vi } from 'vitest' + +import { describeIfElasticsearchURL } from '@/tests/helpers/conditional-runs.js' +import { get } from '@/tests/helpers/e2etest-ts' + +import type { AutocompleteSearchResponse } from '@/search/types' + +if (!process.env.ELASTICSEARCH_URL) { + console.warn( + 'None of the API search middleware tests are run because ' + + "the environment variable 'ELASTICSEARCH_URL' is currently not set.", + ) +} + +const aiSearchEndpoint = '/api/search/ai-search-autocomplete/v1' +const getSearchEndpointWithParams = (searchParams: URLSearchParams) => + `${aiSearchEndpoint}?${searchParams}` + +// This suite only runs if $ELASTICSEARCH_URL is set. +describeIfElasticsearchURL('search/ai-search-autocomplete v1 middleware', () => { + vi.setConfig({ testTimeout: 60 * 1000 }) + + test('basic search', async () => { + const sp = new URLSearchParams() + // To see why this will work, + // see src/search/tests/fixtures/data/ai/* + sp.set('query', 'how do I') + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(200) + const results = JSON.parse(res.body) as AutocompleteSearchResponse + + expect(results.meta).toBeTruthy() + expect(results.meta.found.value).toBeGreaterThanOrEqual(1) + expect(results.meta.found.relation).toBeTruthy() + + expect(results.hits).toBeTruthy() + + const hit = results.hits[0] + expect(hit.term).toBe('How do I clone a repository?') + expect(hit.highlights).toBeTruthy() + expect(hit.highlights[0]).toBe('How do I clone a repository?') + + // Check that it can be cached at the CDN + expect(res.headers['set-cookie']).toBeUndefined() + expect(res.headers['cache-control']).toContain('public') + expect(res.headers['cache-control']).toMatch(/max-age=[1-9]/) + expect(res.headers['surrogate-control']).toContain('public') + expect(res.headers['surrogate-control']).toMatch(/max-age=[1-9]/) + expect(res.headers['surrogate-key']).toBe('manual-purge') + }) + + test('invalid version', async () => { + const sp = new URLSearchParams() + sp.set('query', 'fo') + sp.set('version', 'never-heard-of') + const res = await get(`${aiSearchEndpoint}?${sp}`) + expect(res.statusCode).toBe(400) + expect(JSON.parse(res.body).error).toBeTruthy() + }) + + test('variations on version name', async () => { + const sp = new URLSearchParams() + sp.set('query', 'fo') + sp.set('version', 'enterprise-cloud') + { + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(200) + } + sp.set('version', 'ghec') + { + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(200) + } + sp.set('version', 'fpt') + { + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(200) + } + sp.set('version', 'free-pro-team@latest') + { + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(200) + } + }) + + test('invalid language', async () => { + const sp = new URLSearchParams() + sp.set('query', 'fo') + sp.set('language', 'xx') + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(400) + expect(JSON.parse(res.body).error).toBeTruthy() + }) + + test('only english supported', async () => { + const sp = new URLSearchParams() + sp.set('query', 'fo') + 
sp.set('language', 'ja') + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(400) + expect(JSON.parse(res.body).error).toBeTruthy() + }) + + test('fuzzy autocomplete search', async () => { + const sp = new URLSearchParams() + sp.set('query', 'cl') // Short for "clone" + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(200) + const results = JSON.parse(res.body) as AutocompleteSearchResponse + // "cl" matches "How do I clone a repository?" + const hit = results.hits[0] + expect(hit.term).toBe('How do I clone a repository?') + // Highlighting behavior will highlight the matching "term" which is an entire word + // In this case that word is "clone" when the query is "cl" + expect(hit.highlights[0]).toBe('How do I clone a repository?') + }) + + test('autocomplete term search', async () => { + const sp = new URLSearchParams() + sp.set('query', 'clone') + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(200) + const results = JSON.parse(res.body) as AutocompleteSearchResponse + const hit = results.hits[0] + expect(hit.term).toBe('How do I clone a repository?') + expect(hit.highlights).toBeTruthy() + expect(hit.highlights[0]).toBe('How do I clone a repository?') + }) + + test('invalid query', async () => { + const sp = new URLSearchParams() + // No query at all + { + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(400) + } + // Empty query + { + sp.set('query', '') + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(400) + } + // Empty when trimmed + { + sp.set('query', ' ') + const res = await get(getSearchEndpointWithParams(sp)) + expect(res.statusCode).toBe(400) + } + }) +}) diff --git a/src/search/tests/api-autocomplete-search.js b/src/search/tests/api-general-autocomplete-search.ts similarity index 62% rename from src/search/tests/api-autocomplete-search.js rename to src/search/tests/api-general-autocomplete-search.ts index 2d9448aa51a5..f3e7120a0beb 100644 --- a/src/search/tests/api-autocomplete-search.js +++ b/src/search/tests/api-general-autocomplete-search.ts @@ -13,8 +13,9 @@ import { expect, test, vi } from 'vitest' -import { describeIfElasticsearchURL } from '#src/tests/helpers/conditional-runs.js' -import { get } from '#src/tests/helpers/e2etest.js' +import { describeIfElasticsearchURL } from '@/tests/helpers/conditional-runs.js' +import { get } from '@/tests/helpers/e2etest-ts' +import type { AutocompleteSearchResponse, SearchValidationErrorEntry } from '@/search/types' if (!process.env.ELASTICSEARCH_URL) { console.warn( @@ -28,13 +29,13 @@ describeIfElasticsearchURL('search/autocomplete v1 middleware', () => { vi.setConfig({ testTimeout: 60 * 1000 }) test('basic search', async () => { - const sp = new URLSearchParams() + const sp: URLSearchParams = new URLSearchParams() // To see why this will work, // see src/search/tests/fixtures/data sp.set('query', 'fo') - const res = await get('/api/search/autocomplete/v1?' + sp) + const res = await get('/api/search/autocomplete/v1?' 
+ sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: AutocompleteSearchResponse = JSON.parse(res.body) expect(results.meta).toBeTruthy() expect(results.meta.found.value).toBeGreaterThanOrEqual(1) @@ -58,55 +59,65 @@ }) test('invalid version', async () => { - const sp = new URLSearchParams() + const sp: URLSearchParams = new URLSearchParams() sp.set('query', 'fo') sp.set('version', 'never-heard-of') - const res = await get('/api/search/autocomplete/v1?' + sp) + const res = await get('/api/search/autocomplete/v1?' + sp.toString()) expect(res.statusCode).toBe(400) - expect(JSON.parse(res.body).error).toBeTruthy() + const errorResponse: SearchValidationErrorEntry = JSON.parse(res.body).error + expect(errorResponse).toBeTruthy() }) test('variations on version name', async () => { - const sp = new URLSearchParams() + const sp: URLSearchParams = new URLSearchParams() sp.set('query', 'fo') + + // Test with 'enterprise-cloud' version sp.set('version', 'enterprise-cloud') { - const res = await get('/api/search/autocomplete/v1?' + sp) + const res = await get('/api/search/autocomplete/v1?' + sp.toString()) expect(res.statusCode).toBe(200) } + + // Test with 'ghec' version sp.set('version', 'ghec') { - const res = await get('/api/search/autocomplete/v1?' + sp) + const res = await get('/api/search/autocomplete/v1?' + sp.toString()) expect(res.statusCode).toBe(200) } + + // Test with 'fpt' version sp.set('version', 'fpt') { - const res = await get('/api/search/autocomplete/v1?' + sp) + const res = await get('/api/search/autocomplete/v1?' + sp.toString()) expect(res.statusCode).toBe(200) } + + // Test with 'free-pro-team@latest' version sp.set('version', 'free-pro-team@latest') { - const res = await get('/api/search/autocomplete/v1?' + sp) + const res = await get('/api/search/autocomplete/v1?' + sp.toString()) expect(res.statusCode).toBe(200) } }) test('invalid language', async () => { - const sp = new URLSearchParams() + const sp: URLSearchParams = new URLSearchParams() sp.set('query', 'fo') sp.set('language', 'xx') - const res = await get('/api/search/autocomplete/v1?' + sp) + const res = await get('/api/search/autocomplete/v1?' + sp.toString()) expect(res.statusCode).toBe(400) - expect(JSON.parse(res.body).error).toBeTruthy() + const errorResponse: SearchValidationErrorEntry = JSON.parse(res.body).error + expect(errorResponse).toBeTruthy() }) test('fuzzy autocomplete search', async () => { - const sp = new URLSearchParams() + const sp: URLSearchParams = new URLSearchParams() sp.set('query', 'forc') - const res = await get('/api/search/autocomplete/v1?' + sp) + const res = await get('/api/search/autocomplete/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) - // The work "fork" matches "fo" + const results: AutocompleteSearchResponse = JSON.parse(res.body) + // The word "fork" matches "forc" const hit = results.hits[0] expect(hit.term).toBe('fork') expect(hit.highlights).toBeTruthy() @@ -114,22 +125,22 @@ }) test('invalid query', async () => { - const sp = new URLSearchParams() + const sp: URLSearchParams = new URLSearchParams() // No query at all { - const res = await get('/api/search/autocomplete/v1?' 
+ sp.toString()) expect(res.statusCode).toBe(400) } // Empty query { sp.set('query', '') - const res = await get('/api/search/autocomplete/v1?' + sp) + const res = await get('/api/search/autocomplete/v1?' + sp.toString()) expect(res.statusCode).toBe(400) } // Empty when trimmed { sp.set('query', ' ') - const res = await get('/api/search/autocomplete/v1?' + sp) + const res = await get('/api/search/autocomplete/v1?' + sp.toString()) expect(res.statusCode).toBe(400) } }) diff --git a/src/search/tests/api-search.js b/src/search/tests/api-search.ts similarity index 66% rename from src/search/tests/api-search.js rename to src/search/tests/api-search.ts index 167e7622d1d0..e1b2ee7c089d 100644 --- a/src/search/tests/api-search.js +++ b/src/search/tests/api-search.ts @@ -12,9 +12,9 @@ */ import { expect, test, vi } from 'vitest' - -import { describeIfElasticsearchURL } from '#src/tests/helpers/conditional-runs.js' -import { get } from '#src/tests/helpers/e2etest.js' +import { describeIfElasticsearchURL } from '@/tests/helpers/conditional-runs.js' +import { get } from '@/tests/helpers/e2etest-ts' +import { GeneralSearchResponse, SearchResultAggregations, GeneralSearchHit } from '@/search/types' if (!process.env.ELASTICSEARCH_URL) { console.warn( @@ -33,9 +33,9 @@ describeIfElasticsearchURL('search v1 middleware', () => { // see src/search/tests/fixtures/search-indexes/github-docs-dotcom-en-records.json // which clearly has a record with the title "Foo" sp.set('query', 'foo') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse = JSON.parse(res.body) expect(results.meta).toBeTruthy() expect(results.meta.found.value).toBeGreaterThanOrEqual(1) @@ -51,7 +51,7 @@ expect(results.hits.length).toBeGreaterThanOrEqual(1) // ...but only one has the word "foo" in its title so we can // be certain it comes first. - const hit = results.hits[0] + const hit: GeneralSearchHit = results.hits[0] // This specifically checks what we expect of version v1 expect(hit.url).toBe('/en/foo') expect(hit.title).toBe('Foo') @@ -75,11 +75,11 @@ const sp = new URLSearchParams() sp.set('query', 'foo') sp.set('debug', '1') // Note! - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse = JSON.parse(res.body) // safe because we know exactly the fixtures - const hit = results.hits[0] + const hit: GeneralSearchHit = results.hits[0] expect(hit.popularity).toBeTruthy() expect(hit.score).toBeTruthy() expect(hit.es_url).toBeTruthy() @@ -90,9 +90,9 @@ { const sp = new URLSearchParams() sp.set('query', 'sill') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse = JSON.parse(res.body) // The fixtures contain no word called 'sill'. They do contain the term // 'silly' which, in English, becomes 'silli' when stemmed. 
// Because we don't use `&autocomplete=true` this time, we expect @@ -105,22 +105,23 @@ describeIfElasticsearchURL('search v1 middleware', () => { const sp = new URLSearchParams() sp.set('query', 'sill') sp.set('autocomplete', 'true') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse = JSON.parse(res.body) expect(results.meta.found.value).toBeGreaterThanOrEqual(1) - const hit = results.hits[0] - const contentHighlights = hit.highlights.content - expect(contentHighlights[0]).toMatch('silly') + const hit: GeneralSearchHit = results.hits[0] + const contentHighlights: string[] | undefined = hit.highlights.content + expect(contentHighlights).toBeTruthy() + expect(contentHighlights![0]).toMatch('silly') } }) test('find nothing', async () => { const sp = new URLSearchParams() sp.set('query', 'xojixjoiwejhfoiuwehjfioweufhj') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse = JSON.parse(res.body) expect(results.hits.length).toBe(0) expect(results.meta.found.value).toBe(0) }) @@ -129,9 +130,9 @@ describeIfElasticsearchURL('search v1 middleware', () => { const sp = new URLSearchParams() sp.set('query', 'introduction heading') sp.append('highlights', 'content') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse = JSON.parse(res.body) expect(results.meta.found.value).toBeGreaterThanOrEqual(1) for (const hit of results.hits) { expect(hit.highlights.title).toBeFalsy() @@ -144,9 +145,9 @@ describeIfElasticsearchURL('search v1 middleware', () => { // This will match because it's in the 'content' but not in 'headings' sp.set('query', 'Fact of life') sp.set('highlights', 'title') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse = JSON.parse(res.body) expect(results.meta.found.value).toBeGreaterThanOrEqual(1) for (const hit of results.hits) { expect(hit.highlights.title).toBeTruthy() @@ -158,14 +159,14 @@ describeIfElasticsearchURL('search v1 middleware', () => { const sp = new URLSearchParams() sp.set('query', 'foo') sp.set('version', 'dotcom') - const res1 = await get('/api/search/v1?' + sp) + const res1 = await get('/api/search/v1?' + sp.toString()) expect(res1.statusCode).toBe(200) - const results1 = JSON.parse(res1.body) + const results1: GeneralSearchResponse = JSON.parse(res1.body) sp.set('version', 'free-pro-team@latest') - const res2 = await get('/api/search/v1?' + sp) + const res2 = await get('/api/search/v1?' 
+ sp.toString()) expect(res2.statusCode).toBe(200) - const results2 = JSON.parse(res2.body) + const results2: GeneralSearchResponse = JSON.parse(res2.body) expect(results1.hits[0].id).toBe(results2.hits[0].id) }) @@ -174,90 +175,126 @@ describeIfElasticsearchURL('search v1 middleware', () => { { const res = await get('/api/search/v1') expect(res.statusCode).toBe(400) - expect(JSON.parse(res.body).error).toBeTruthy() + const errorResponse = JSON.parse(res.body) as { + error: string + field?: string + } + expect(errorResponse.error).toBeTruthy() } // query is just whitespace { const sp = new URLSearchParams() sp.set('query', ' ') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(400) - expect(JSON.parse(res.body).error).toBeTruthy() + const errorResponse = JSON.parse(res.body) as { + error: string + field?: string + } + expect(errorResponse.error).toBeTruthy() } // unrecognized language { const sp = new URLSearchParams() sp.set('query', 'test') sp.set('language', 'xxx') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(400) - expect(JSON.parse(res.body).error).toMatch('language') + const errorResponse = JSON.parse(res.body) as { + error: string + field?: string + } + expect(errorResponse.error).toMatch('language') } // unrecognized page { const sp = new URLSearchParams() sp.set('query', 'test') sp.set('page', '9999') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(400) - expect(JSON.parse(res.body).error).toMatch('page') + const errorResponse = JSON.parse(res.body) as { + error: string + field?: string + } + expect(errorResponse.error).toMatch('page') } // unrecognized version { const sp = new URLSearchParams() sp.set('query', 'test') sp.set('version', 'xxxxx') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(400) - expect(JSON.parse(res.body).error).toMatch("'xxxxx'") - expect(JSON.parse(res.body).field).toMatch('version') + const errorResponse = JSON.parse(res.body) as { + error: string + field?: string + } + expect(errorResponse.error).toMatch("'xxxxx'") + expect(errorResponse.field).toMatch('version') } // unrecognized size { const sp = new URLSearchParams() sp.set('query', 'test') sp.set('size', 'not a number') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(400) - expect(JSON.parse(res.body).error).toMatch('size') + const errorResponse = JSON.parse(res.body) as { + error: string + field?: string + } + expect(errorResponse.error).toMatch('size') } // unrecognized sort { const sp = new URLSearchParams() sp.set('query', 'test') sp.set('sort', 'neverheardof') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(400) - expect(JSON.parse(res.body).error).toMatch('sort') + const errorResponse = JSON.parse(res.body) as { + error: string + field?: string + } + expect(errorResponse.error).toMatch('sort') } // unrecognized highlights { const sp = new URLSearchParams() sp.set('query', 'test') sp.set('highlights', 'neverheardof') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' 
+ sp.toString())
expect(res.statusCode).toBe(400)
- expect(JSON.parse(res.body).error).toMatch('neverheardof')
+ const errorResponse = JSON.parse(res.body) as {
+ error: string
+ field?: string
+ }
+ expect(errorResponse.error).toMatch('neverheardof')
}
// multiple 'query' keys
{
const sp = new URLSearchParams()
sp.append('query', 'test1')
sp.append('query', 'test2')
- const res = await get('/api/search/v1?' + sp)
+ const res = await get('/api/search/v1?' + sp.toString())
expect(res.statusCode).toBe(400)
- expect(JSON.parse(res.body).error).toMatch('Cannot have multiple values')
+ const errorResponse = JSON.parse(res.body) as {
+ error: string
+ field?: string
+ }
+ expect(errorResponse.error).toMatch('Cannot have multiple values')
}
})
test('breadcrumbless records should always return a string', async () => {
const sp = new URLSearchParams()
sp.set('query', 'breadcrumbs')
- const res = await get('/api/search/v1?' + sp)
+ const res = await get('/api/search/v1?' + sp.toString())
expect(res.statusCode).toBe(200)
- const results = JSON.parse(res.body)
+ const results: GeneralSearchResponse = JSON.parse(res.body)
// safe because we know exactly the fixtures
- const hit = results.hits[0]
+ const hit: GeneralSearchHit = results.hits[0]
expect(hit.breadcrumbs).toBe('')
})
})
@@ -268,9 +305,9 @@ describeIfElasticsearchURL("additional fields with 'include'", () => {
test("'intro' and 'headings' are omitted by default", async () => {
const sp = new URLSearchParams()
sp.set('query', 'foo')
- const res = await get('/api/search/v1?' + sp)
+ const res = await get('/api/search/v1?' + sp.toString())
expect(res.statusCode).toBe(200)
- const results = JSON.parse(res.body)
+ const results: GeneralSearchResponse = JSON.parse(res.body)
const firstKeys = Object.keys(results.hits[0])
expect(firstKeys.includes('intro')).toBeFalsy()
expect(firstKeys.includes('headings')).toBeFalsy()
@@ -280,9 +317,9 @@ describeIfElasticsearchURL("additional fields with 'include'", () => {
const sp = new URLSearchParams()
sp.set('query', 'foo')
sp.set('include', 'intro')
- const res = await get('/api/search/v1?' + sp)
+ const res = await get('/api/search/v1?' + sp.toString())
expect(res.statusCode).toBe(200)
- const results = JSON.parse(res.body)
+ const results: GeneralSearchResponse = JSON.parse(res.body)
const firstKeys = Object.keys(results.hits[0])
expect(firstKeys.includes('intro')).toBeTruthy()
expect(firstKeys.includes('headings')).toBeFalsy()
@@ -293,9 +330,9 @@ describeIfElasticsearchURL("additional fields with 'include'", () => {
sp.set('query', 'foo')
sp.append('include', 'intro')
sp.append('include', 'headings')
- const res = await get('/api/search/v1?' + sp)
+ const res = await get('/api/search/v1?' + sp.toString())
expect(res.statusCode).toBe(200)
- const results = JSON.parse(res.body)
+ const results: GeneralSearchResponse = JSON.parse(res.body)
const firstKeys = Object.keys(results.hits[0])
expect(firstKeys.includes('intro')).toBeTruthy()
expect(firstKeys.includes('headings')).toBeTruthy()
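The `include` tests here show that optional hit fields only ride along when requested. As a hedged sketch of a caller opting in (the `searchWithIncludes` helper, origin, and `fetch` usage are illustrative; the query keys and response type come from this PR):

```ts
import type { GeneralSearchResponse } from '@/search/types'

// Illustrative only: request optional hit fields via repeated `include` keys.
async function searchWithIncludes(query: string): Promise<GeneralSearchResponse> {
  const sp = new URLSearchParams({ query })
  sp.append('include', 'intro')
  sp.append('include', 'headings')
  const res = await fetch('http://localhost:4000/api/search/v1?' + sp.toString())
  if (!res.ok) throw new Error(`Search request failed with ${res.status}`)
  return (await res.json()) as GeneralSearchResponse
}
```

@@ -305,9 +342,12 @@ describeIfElasticsearchURL("additional fields with 'include'", () => {
const sp = new URLSearchParams()
sp.set('query', 'foo')
sp.set('include', 'xxxxx')
- const res = await get('/api/search/v1?' + sp)
+ const res = await get('/api/search/v1?'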
+ sp.toString()) expect(res.statusCode).toBe(400) - const results = JSON.parse(res.body) + const results = JSON.parse(res.body) as { + error: string + field?: string + } expect(results.error).toMatch(`Not a valid value ([ 'xxxxx' ]) for key 'include'`) }) }) @@ -319,9 +359,9 @@ describeIfElasticsearchURL('filter by toplevel', () => { const sp = new URLSearchParams() sp.set('query', 'foo') sp.set('include', 'toplevel') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse = JSON.parse(res.body) // In the fixtures, there are two distinct `toplevel` that // matches to this search. const toplevels = new Set(results.hits.map((hit) => hit.toplevel)) @@ -333,9 +373,9 @@ describeIfElasticsearchURL('filter by toplevel', () => { sp.set('query', 'foo') sp.set('include', 'toplevel') sp.set('toplevel', 'Baring') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse = JSON.parse(res.body) const toplevels = new Set(results.hits.map((hit) => hit.toplevel)) expect(toplevels).toEqual(new Set(['Baring'])) }) @@ -346,9 +386,9 @@ describeIfElasticsearchURL('filter by toplevel', () => { sp.set('include', 'toplevel') sp.append('toplevel', 'Baring') sp.append('toplevel', 'Fooing') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse = JSON.parse(res.body) const toplevels = new Set(results.hits.map((hit) => hit.toplevel)) expect(toplevels).toEqual(new Set(['Fooing', 'Baring'])) }) @@ -358,9 +398,9 @@ describeIfElasticsearchURL('filter by toplevel', () => { sp.set('query', 'foo') sp.set('include', 'toplevel') sp.set('toplevel', 'Never heard of') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse = JSON.parse(res.body) expect(results.meta.found.value).toBe(0) }) }) @@ -372,12 +412,14 @@ describeIfElasticsearchURL('aggregate', () => { const sp = new URLSearchParams() sp.set('query', 'foo') sp.set('aggregate', 'toplevel') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' + sp.toString()) expect(res.statusCode).toBe(200) - const results = JSON.parse(res.body) + const results: GeneralSearchResponse & { aggregations?: SearchResultAggregations } = JSON.parse( + res.body, + ) expect(results.aggregations).toBeTruthy() - expect(results.aggregations.toplevel).toBeTruthy() - const firstAgg = results.aggregations.toplevel[0] + expect(results.aggregations!.toplevel).toBeTruthy() + const firstAgg = results.aggregations!.toplevel[0] expect(firstAgg.key).toBeTruthy() expect(firstAgg.count).toBeTruthy() }) @@ -386,8 +428,12 @@ describeIfElasticsearchURL('aggregate', () => { const sp = new URLSearchParams() sp.set('query', 'foo') sp.set('aggregate', 'unrecognizedxxx') - const res = await get('/api/search/v1?' + sp) + const res = await get('/api/search/v1?' 
+ sp.toString()) expect(res.statusCode).toBe(400) - expect(JSON.parse(res.body).error).toMatch('aggregate') + const results = JSON.parse(res.body) as { + error: string + field?: string + } + expect(results.error).toMatch('aggregate') }) }) diff --git a/src/search/tests/fixtures/data/ai/search/queries/en/enterprise-cloud/queries.json b/src/search/tests/fixtures/data/ai/search/queries/en/enterprise-cloud/queries.json new file mode 100644 index 000000000000..73aa1eb48443 --- /dev/null +++ b/src/search/tests/fixtures/data/ai/search/queries/en/enterprise-cloud/queries.json @@ -0,0 +1,52 @@ +{ + "topQueries": [ + "How do I authenticate with SAML SSO?", + "What is GitHub Copilot?", + "How do I manage billing for GitHub Copilot?", + "How do I view my GitHub Actions usage?", + "How do I add or edit a payment method?" + ], + "allQueries": [ + "How do I authenticate with SAML SSO?", + "What is GitHub Copilot?", + "How do I manage billing for GitHub Copilot?", + "How do I view my GitHub Actions usage?", + "How do I add or edit a payment method?", + "How do I manage my GitHub billing settings?", + "How do I create an enterprise account?", + "How do I manage licenses for Visual Studio subscriptions with GitHub Enterprise?", + "How do I view my payment history and receipts?", + "How do I manage billing for Git Large File Storage?", + "How do I authorize a personal access token for SAML SSO?", + "How do I manage billing for GitHub Advanced Security?", + "How do I set up a trial of GitHub Enterprise Cloud?", + "How do I manage my spending limit for GitHub Actions?", + "How do I prevent overspending on GitHub?", + "How do I estimate spending for my enterprise?", + "How do I authorize an SSH key for SAML SSO?", + "How do I view my subscriptions and billing date?", + "How do I manage security settings for my organization?", + "How do I close an issue?", + "How do I link a pull request to an issue?", + "How do I verify or approve a domain for my organization?", + "How do I manage billing for GitHub Codespaces?", + "How do I manage billing for GitHub Packages?", + "How do I change the visibility of my GitHub Pages site?", + "How do I manage custom repository roles for an organization?", + "How do I downgrade a sponsorship?", + "How do I upgrade a sponsorship?", + "How do I downgrade the billing plan for a GitHub Marketplace app?", + "How do I use projects and tasklists?", + "How do I transfer an issue to another repository?", + "How do I create an issue?", + "How do I delete an issue?", + "How do I manage billing for GitHub Marketplace?", + "How do I manage billing for GitHub Sponsors?", + "How do I troubleshoot a declined credit card charge?", + "How do I get code suggestions in my IDE with GitHub Copilot?", + "How do I manage my personal access tokens?", + "How do I unlock a locked account?", + "How do I manage custom properties for repositories in my organization?", + "How do I use advanced secret scanning features?" + ] +} diff --git a/src/search/tests/fixtures/data/ai/search/queries/en/free-pro-team/queries.json b/src/search/tests/fixtures/data/ai/search/queries/en/free-pro-team/queries.json new file mode 100644 index 000000000000..705c0222393b --- /dev/null +++ b/src/search/tests/fixtures/data/ai/search/queries/en/free-pro-team/queries.json @@ -0,0 +1,52 @@ +{ + "topQueries": [ + "What is GitHub and how do I get started?", + "What is GitHub Copilot and how do I get started?", + "How do I connect to GitHub with SSH?", + "How do I generate a personal access token?", + "How do I clone a repository?" 
+ ], + "allQueries": [ + "How do I generate a new SSH key and add it to the SSH agent?", + "What are the GitHub terms of service?", + "How do I connect to GitHub with SSH?", + "How do I generate a personal access token?", + "How do I get code suggestions in my IDE with GitHub Copilot?", + "How do I clone a repository?", + "How do I create a new repository?", + "How do I change my primary email address on GitHub?", + "How do I set up Git?", + "What are GitHub's plans?", + "How do I propose changes with pull requests?", + "How do I manage billing on GitHub?", + "How do I configure a publishing source for my GitHub Pages site?", + "How do I add a new SSH key to my GitHub account?", + "How do I set up a GitHub Pages site?", + "How do I recover my account if I lose my 2FA credentials?", + "How do I personalize my GitHub profile?", + "How do I view my GitHub Actions usage?", + "How do I manage my spending limit for GitHub Actions?", + "How do I create an issue on GitHub?", + "How do I verify my email address on GitHub?", + "How do I ignore files in Git?", + "How do I install GitHub Desktop?", + "How do I test my SSH connection to GitHub?", + "How do I fork a repository?", + "How do I resolve 'Permission denied (publickey)' error?", + "How do I add a theme to my GitHub Pages site using Jekyll?", + "How do I manage a custom domain for my GitHub Pages site?", + "How do I manage Copilot policies as an individual subscriber?", + "How do I manage deploy keys on GitHub?", + "How do I manage my profile README on GitHub?", + "How do I create a tasklist on GitHub?", + "How do I delete a repository?", + "How do I view my Git Large File Storage usage?", + "How do I add an email address to my GitHub account?", + "How do I manage OAuth app access restrictions for my organization?", + "How do I view all of my issues and pull requests?", + "How do I manage billing for GitHub Codespaces?", + "How do I manage billing for Git Large File Storage?", + "How do I view my payment history and receipts on GitHub?", + "How do I unlock a locked account on GitHub?" 
+ ] +} diff --git a/src/search/tests/fixtures/search-indexes/github-docs-dotcom-en-records.json b/src/search/tests/fixtures/search-indexes/tests_github-docs_general-search_fpt_en-records.json similarity index 100% rename from src/search/tests/fixtures/search-indexes/github-docs-dotcom-en-records.json rename to src/search/tests/fixtures/search-indexes/tests_github-docs_general-search_fpt_en-records.json diff --git a/src/search/tests/fixtures/search-indexes/github-docs-dotcom-ja-records.json b/src/search/tests/fixtures/search-indexes/tests_github-docs_general-search_fpt_ja-records.json similarity index 100% rename from src/search/tests/fixtures/search-indexes/github-docs-dotcom-ja-records.json rename to src/search/tests/fixtures/search-indexes/tests_github-docs_general-search_fpt_ja-records.json diff --git a/src/search/tests/fixtures/search-indexes/github-docs-ghec-en-records.json b/src/search/tests/fixtures/search-indexes/tests_github-docs_general-search_ghec_en-records.json similarity index 100% rename from src/search/tests/fixtures/search-indexes/github-docs-ghec-en-records.json rename to src/search/tests/fixtures/search-indexes/tests_github-docs_general-search_ghec_en-records.json diff --git a/src/search/tests/fixtures/search-indexes/github-docs-ghec-ja-records.json b/src/search/tests/fixtures/search-indexes/tests_github-docs_general-search_ghec_ja-records.json similarity index 100% rename from src/search/tests/fixtures/search-indexes/github-docs-ghec-ja-records.json rename to src/search/tests/fixtures/search-indexes/tests_github-docs_general-search_ghec_ja-records.json diff --git a/src/search/tests/parse-page-sections-into-records.js b/src/search/tests/parse-page-sections-into-records.ts similarity index 63% rename from src/search/tests/parse-page-sections-into-records.js rename to src/search/tests/parse-page-sections-into-records.ts index c6811f3e671f..7367ccf749f2 100644 --- a/src/search/tests/parse-page-sections-into-records.js +++ b/src/search/tests/parse-page-sections-into-records.ts @@ -5,10 +5,19 @@ import fs from 'fs/promises' import cheerio from 'cheerio' import { describe, expect, test } from 'vitest' -import parsePageSectionsIntoRecords from '../scripts/parse-page-sections-into-records' +import parsePageSectionsIntoRecords from '@/search/scripts/scrape/lib/parse-page-sections-into-records' +import type { Record } from '@/search/scripts/scrape/types' + const __dirname = path.dirname(fileURLToPath(import.meta.url)) -const fixtures = { +// Define the shape of fixtures with explicit keys and string values +const fixtures: { + pageWithSections: string + pageWithoutSections: string + pageWithoutBody: string + pageMultipleH1s: string + pageHeadingParagraphNoWhitespace: string +} = { pageWithSections: await fs.readFile( path.join(__dirname, 'fixtures/page-with-sections.html'), 'utf8', @@ -33,11 +42,11 @@ const fixtures = { describe('search parsePageSectionsIntoRecords module', () => { test('works for pages with sections', () => { - const html = fixtures.pageWithSections + const html: string = fixtures.pageWithSections const $ = cheerio.load(html) - const href = '/example/href' - const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) - const expected = { + const href: string = '/example/href' + const record: Record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) + const expected: Record = { objectID: '/example/href', breadcrumbs: 'GitHub Actions / actions learning path', title: 'I am the page title', @@ -58,11 +67,11 @@ describe('search 
parsePageSectionsIntoRecords module', () => { }) test('works for pages without sections', () => { - const html = fixtures.pageWithoutSections + const html: string = fixtures.pageWithoutSections const $ = cheerio.load(html) - const href = '/example/href' - const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) - const expected = { + const href: string = '/example/href' + const record: Record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) + const expected: Record = { objectID: '/example/href', breadcrumbs: 'Education / map topic', title: 'A page without sections', @@ -76,11 +85,11 @@ describe('search parsePageSectionsIntoRecords module', () => { }) test('works for pages without content', () => { - const html = fixtures.pageWithoutBody + const html: string = fixtures.pageWithoutBody const $ = cheerio.load(html) - const href = '/example/href' - const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) - const expected = { + const href: string = '/example/href' + const record: Record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) + const expected: Record = { objectID: '/example/href', breadcrumbs: 'Education / map topic', title: 'A page without body', @@ -94,35 +103,29 @@ describe('search parsePageSectionsIntoRecords module', () => { }) test('only picks up the first h1 for the title', () => { - const html = fixtures.pageMultipleH1s + const html: string = fixtures.pageMultipleH1s const $ = cheerio.load(html) - const href = '/example/href' - const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) + const href: string = '/example/href' + const record: Record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) + expect(record.title).toEqual('I am the page title') }) test("content doesn't lump headings with paragraphs together", () => { - const html = fixtures.pageHeadingParagraphNoWhitespace + const html: string = fixtures.pageHeadingParagraphNoWhitespace const $ = cheerio.load(html) - const href = '/example/href' - const record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) - - // This is a
<h2> inside the page but it should only appear once.
- // We had a bug where the heading would be injected twice.
- // E.g.
- //
- // <h2>Heading</h2>
- // <p>Text here</p>
- // - // would become: - // - // Heading\nHeadingText here - // - // So now we make sure it only appears exactly once. - expect(record.content.match(/Changing your primary email address/g).length).toBe(1) - // But note also that it would also concatenate the text of the heading - // with the text of the paragraph without a whitespace in between. + const href: string = '/example/href' + const record: Record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' }) + + // Ensure the heading appears only once + const headingMatches = record.content.match(/Changing your primary email address/g) + expect(headingMatches).not.toBeNull() + expect(headingMatches!.length).toBe(1) + + // Ensure there's no concatenation without whitespace expect(record.content.includes('email addressYou can set')).toBeFalsy() - // Make sure that inline elements are still together. + + // Ensure inline elements remain intact expect(record.content).toMatch(/Paragraph\./) }) }) diff --git a/src/search/tests/rendering.js b/src/search/tests/rendering.ts similarity index 93% rename from src/search/tests/rendering.js rename to src/search/tests/rendering.ts index abd96a762e2a..660f9d553a45 100644 --- a/src/search/tests/rendering.js +++ b/src/search/tests/rendering.ts @@ -13,9 +13,9 @@ import { expect, test, vi } from 'vitest' -import { describeIfElasticsearchURL } from '#src/tests/helpers/conditional-runs.js' -import { get, getDOM } from '#src/tests/helpers/e2etest.js' -import { SURROGATE_ENUMS } from '#src/frame/middleware/set-fastly-surrogate-key.js' +import { describeIfElasticsearchURL } from '@/tests/helpers/conditional-runs.js' +import { get, getDOM } from '@/tests/helpers/e2etest-ts' +import { SURROGATE_ENUMS } from '@/frame/middleware/set-fastly-surrogate-key.js' if (!process.env.ELASTICSEARCH_URL) { console.warn( @@ -51,7 +51,7 @@ describeIfElasticsearchURL('search rendering page', () => { test('response headers', async () => { const res = await get('/en/search?query=foo') - // Check that it can be cached at the CDN + // Assuming `res` has a type with a `headers` property expect(res.headers['set-cookie']).toBeUndefined() expect(res.headers['cache-control']).toContain('public') expect(res.headers['cache-control']).toMatch(/max-age=[1-9]/) @@ -107,7 +107,7 @@ describeIfElasticsearchURL('search rendering page', () => { expect(results.length).toBeGreaterThan(0) // Each link should have enterprise-cloud@latest in the pathname const links = $('[data-testid="search-result"] a') - const hrefs = links.map((i, el) => $(el).attr('href')).get() + const hrefs: string[] = links.map((_, el) => $(el).attr('href') ?? 
'').get()
for (const href of hrefs) {
expect(href).toMatch('/en/enterprise-cloud@latest/')
}
@@ -133,7 +133,7 @@ describeIfElasticsearchURL('search rendering page', () => {
expect(res.statusCode).toBe(200)
})
- test('more that one search query', async () => {
+ test('more than one search query', async () => {
const $ = await getDOM('/en/search?query=foo&query=bar')
expect($('[data-testid="search-results"]').text()).toMatch('Cannot have multiple values')
const results = $('[data-testid="search-result"]')
diff --git a/src/search/tests/search.js b/src/search/tests/search.js
deleted file mode 100644
index aa554c3d0507..000000000000
--- a/src/search/tests/search.js
+++ /dev/null
@@ -1,37 +0,0 @@
-import { describe, expect, test, vi } from 'vitest'
-
-import { get, getDOM } from '#src/tests/helpers/e2etest.js'
-
-describe('search results page', () => {
- vi.setConfig({ testTimeout: 60 * 1000 })
-
- test('says something if no query is provided', async () => {
- const $ = await getDOM('/en/search')
- const $container = $('[data-testid="search-results"]')
- expect($container.text()).toMatch(/Enter a search term/)
- // Default is the frontmatter title of the content/search/index.md
- expect($('title').text()).toMatch('Search - GitHub Docs')
- })
-
- test('says something if query is empty', async () => {
- const $ = await getDOM(`/en/search?${new URLSearchParams({ query: ' ' })}`)
- const $container = $('[data-testid="search-results"]')
- expect($container.text()).toMatch(/Enter a search term/)
- })
-
- test('mention search term in h1', async () => {
- const $ = await getDOM(`/en/search?${new URLSearchParams({ query: 'peterbe' })}`)
- const $container = $('[data-testid="search-results"]')
- const h1Text = $container.find('h1').text()
- expect(h1Text).toMatch(/Search results for/)
- expect(h1Text).toMatch(/peterbe/)
- expect($('title').text()).toMatch(/Search results for "peterbe"/)
- })
-
- test('invalid version prefix 404s', async () => {
- const res = await get(
- `/en/enterprise-stuff@3.10/search?${new URLSearchParams({ query: 'peterbe' })}`,
- )
- expect(res.statusCode).toBe(404)
- })
-})
diff --git a/src/search/tests/search.ts b/src/search/tests/search.ts
new file mode 100644
index 000000000000..5fed615cd522
--- /dev/null
+++ b/src/search/tests/search.ts
@@ -0,0 +1,40 @@
+import { describe, expect, test, vi } from 'vitest'
+import { get, getDOM } from '@/tests/helpers/e2etest-ts'
+
+describe('search results page', () => {
+ vi.setConfig({ testTimeout: 60 * 1000 })
+
+ test('says something if no query is provided', async (): Promise<void> => {
+ const $ = await getDOM('/en/search')
+ const $container = $('[data-testid="search-results"]')
+ expect($container.text()).toMatch(/Enter a search term/)
+ // Default is the frontmatter title of the content/search/index.md
+ expect($('title').text()).toMatch('Search - GitHub Docs')
+ })
+
+ test('says something if query is empty', async (): Promise<void> => {
+ const queryParams = new URLSearchParams({ query: ' ' }).toString()
+ const $ = await getDOM(`/en/search?${queryParams}`)
+ const $container = $('[data-testid="search-results"]')
+ expect($container.text()).toMatch(/Enter a search term/)
+ })
+
+ test('mentions search term in h1', async (): Promise<void> => {
+ const searchTerm = 'peterbe'
+ const queryParams = new URLSearchParams({ query: searchTerm }).toString()
+ const $ = await getDOM(`/en/search?${queryParams}`)
+ const $container = $('[data-testid="search-results"]')
+ const h1Text: string = $container.find('h1').text()
+
+ expect(h1Text).toMatch(/Search results
for/)
+ expect(h1Text).toMatch(new RegExp(searchTerm))
+ expect($('title').text()).toMatch(new RegExp(`Search results for "${searchTerm}"`))
+ })
+
+ test('invalid version prefix 404s', async (): Promise<void> => {
+ const queryParams = new URLSearchParams({ query: 'peterbe' }).toString()
+ const response = await get(`/en/enterprise-stuff@3.10/search?${queryParams}`)
+
+ expect(response.statusCode).toBe(404)
+ })
+})
diff --git a/src/search/tests/topics.js b/src/search/tests/topics.js
deleted file mode 100644
index 7efe8e8538e1..000000000000
--- a/src/search/tests/topics.js
+++ /dev/null
@@ -1,39 +0,0 @@
-import path from 'path'
-import fs from 'fs'
-
-import { describe, expect, test } from 'vitest'
-import walk from 'walk-sync'
-import { difference } from 'lodash-es'
-
-import readFrontmatter from '#src/frame/lib/read-frontmatter.js'
-import allowedTopics from '../../../data/allowed-topics.js'
-
-const contentDir = path.join(process.cwd(), 'content')
-const topics = walk(contentDir, { includeBasePath: true })
- .filter((filename) => filename.endsWith('.md') && !filename.includes('README'))
- .map((filename) => {
- const fileContent = fs.readFileSync(filename, 'utf8')
- const { data, errors } = readFrontmatter(fileContent)
- if (errors.length > 0) {
- console.warn(errors)
- throw new Error(`More than 0 front-matter errors`)
- }
- return data.topics || []
- })
- .flat()
-
-const allUsedTopics = [...new Set(topics)].sort()
-
-describe('Check for allowed frontmatter topics', () => {
- test('all used topics are allowed in /data/allowed-topics.js', () => {
- expect(allUsedTopics.length).toBeGreaterThan(0)
- const unusedTopics = difference(allUsedTopics, allowedTopics)
- expect(unusedTopics).toEqual([])
- })
-
- test('all allowed topics are used by at least one content file', () => {
- expect(allowedTopics.length).toBeGreaterThan(0)
- const disallowedTopics = difference(allowedTopics, allUsedTopics)
- expect(disallowedTopics).toEqual([])
- })
-})
diff --git a/src/search/tests/topics.ts b/src/search/tests/topics.ts
new file mode 100644
index 000000000000..c9cd13c9bfee
--- /dev/null
+++ b/src/search/tests/topics.ts
@@ -0,0 +1,44 @@
+import path from 'path'
+import fs from 'fs'
+
+import { describe, expect, test } from 'vitest'
+import walk from 'walk-sync'
+import { difference } from 'lodash-es'
+
+import readFrontmatter from '@/frame/lib/read-frontmatter'
+import allowedTopics from '../../../data/allowed-topics'
+
+const contentDir: string = path.join(process.cwd(), 'content')
+
+const topics: string[] = walk(contentDir, { includeBasePath: true })
+ .filter((filename: string) => filename.endsWith('.md') && !filename.includes('README'))
+ .map((filename: string) => {
+ const fileContent: string = fs.readFileSync(filename, 'utf8')
+ const { data, errors } = readFrontmatter(fileContent)
+
+ if (errors.length > 0) {
+ console.warn(errors)
+ throw new Error(`More than 0 front-matter errors in file: ${filename}`)
+ }
+
+ return (data as any).topics || []
+ })
+ .flat()
+
+const allUsedTopics: string[] = Array.from(new Set(topics)).sort()
+
+describe('Check for allowed frontmatter topics', () => {
+ test('all used topics are allowed in /data/allowed-topics.js', () => {
+ expect(allUsedTopics.length).toBeGreaterThan(0)
+
+ const unusedTopics: string[] = difference(allUsedTopics, allowedTopics)
+ expect(unusedTopics).toEqual([])
+ })
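Both topic checks hinge on the argument order of lodash's `difference`, which keeps the members of the first array that are missing from the second. A quick illustration with made-up topic names:

```ts
import { difference } from 'lodash-es'

// difference(a, b): members of `a` that do NOT appear in `b`.
difference(['Actions', 'Billing'], ['Billing']) // => ['Actions']
difference(['Billing'], ['Actions', 'Billing']) // => []
```

+
+ test('all allowed topics are used by at least one content file', () => {
+ expect(allowedTopics.length).toBeGreaterThan(0)
+
+ const disallowedTopics: string[] =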
difference(allowedTopics, allUsedTopics)
+ expect(disallowedTopics).toEqual([])
+ })
+})
diff --git a/src/search/types.ts b/src/search/types.ts
new file mode 100644
index 000000000000..bab9fb97b734
--- /dev/null
+++ b/src/search/types.ts
@@ -0,0 +1,76 @@
+import type { SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'
+import type {
+ AdditionalIncludes,
+ ComputedSearchQueryParamsMap,
+} from '@/search/lib/search-request-params/types'
+
+export type SearchTypes = 'generalSearch' | 'generalAutocomplete' | 'aiSearchAutocomplete'
+
+// Responses to API routes
+export interface GeneralSearchResponse {
+ meta: SearchResultsMeta & {
+ page: number
+ }
+ hits: GeneralSearchHit[]
+ aggregations?: SearchResultAggregations | null
+}
+
+export interface AutocompleteSearchResponse {
+ meta: SearchResultsMeta
+ hits: AutocompleteSearchHit[]
+}
+
+// Response to middleware /search route
+export interface SearchOnReqObject<Type extends SearchTypes> {
+ searchParams: ComputedSearchQueryParamsMap[Type]
+ validationErrors: SearchValidationErrorEntry[]
+ results?: GeneralSearchResponse
+}
+
+export interface SearchValidationErrorEntry {
+ error: string
+ key?: string
+ field?: string
+}
+
+// - - - Types for building the search responses - - -
+export interface GeneralSearchHitWithoutIncludes {
+ id: string
+ url: string
+ title: string
+ breadcrumbs: string
+ topics?: string[]
+ score?: number
+ popularity?: number
+ es_url?: string
+ highlights: {
+ [key: string]: string[]
+ }
+}
+
+export type GeneralSearchHit = GeneralSearchHitWithoutIncludes & {
+ [key in AdditionalIncludes]?: string
+}
+
+interface AutocompleteSearchHit {
+ term?: string
+ highlights: string[]
+}
+
+export type SearchAggregation = {
+ key: string
+ count: number
+}
+
+export type SearchResultAggregations = {
+ [key: string]: SearchAggregation[]
+}
+
+type SearchResultsMeta = {
+ found: SearchTotalHits
+ took: {
+ query_msec: number
+ total_msec: number
+ }
+ size: number
+}
diff --git a/src/tests/README.md b/src/tests/README.md
index c0ae0e0f3074..d144226b2f2f 100644
--- a/src/tests/README.md
+++ b/src/tests/README.md
@@ -54,6 +54,10 @@ npm test -- vitest path/to/tests/directory ```
+## Allowing logging in tests
+
+If you add a `console.log` to the code and want to see its output, append the `--silent false` flag to your test command.
+
## Failed Local Tests
If the tests fail locally with an error like this:
diff --git a/src/tests/helpers/e2etest-ts.ts b/src/tests/helpers/e2etest-ts.ts
new file mode 100644
index 000000000000..9d489cdf28d0
--- /dev/null
+++ b/src/tests/helpers/e2etest-ts.ts
@@ -0,0 +1,181 @@
+import cheerio from 'cheerio'
+import got, { Response, OptionsOfTextResponseBody, Method } from 'got'
+import { omitBy, isUndefined } from 'lodash-es'
+
+type ResponseTypes = 'buffer' | 'json' | 'text'
+type ResponseTypeMap = {
+ buffer: ArrayBuffer
+ json: any
+ text: string
+}
+
+interface GetOptions<M extends Method = Method, ResponseType extends ResponseTypes = ResponseTypes> {
+ method?: M
+ body?: any
+ followRedirects?: boolean
+ followAllRedirects?: boolean
+ headers?: Record<string, string>
+ responseType?: ResponseType
+ retries?: number
+}
+
+interface GetDOMOptions {
+ headers?: Record<string, string>
+ allow500s?: boolean
+ allow404?: boolean
+ retries?: number
+}
+
+interface ResponseWithHeaders<T = any> extends Response<T> {
+ headers: Record<string, string>
+}
+
+// Cache to store DOM objects
+const getDOMCache = new Map<string, cheerio.Root>()
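Before the request helpers themselves, note what the `ResponseTypeMap` indexing buys: a generic parameter picks the body type at compile time. A standalone, hedged sketch of the same pattern with simplified names (not code from this PR):

```ts
type Kinds = 'buffer' | 'json' | 'text'
type KindMap = { buffer: ArrayBuffer; json: any; text: string }

// Indexing a mapped type with a generic parameter selects the return type.
function decode<K extends Kinds>(kind: K, raw: string): KindMap[K] {
  // Illustrative decoding; the final cast is what makes the sketch compile.
  const value: unknown =
    kind === 'json' ? JSON.parse(raw) : kind === 'buffer' ? new TextEncoder().encode(raw).buffer : raw
  return value as KindMap[K]
}

const parsed = decode('json', '{"ok":true}') // typed as any via the map
const plain = decode('text', 'hello') // typed as string
```

+
+/**
+ * Makes an HTTP request using the specified method and options.
+ *
+ * @param route - The route to request.
+ * @param options - Configuration options for the request.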
+ * @returns A promise that resolves to the HTTP response.
+ */
+export async function get<T extends ResponseTypes = 'text'>(
+ route: string,
+ options: GetOptions<Method, T> = {},
+): Promise<ResponseWithHeaders<ResponseTypeMap[T]>> {
+ const {
+ method = 'get',
+ body,
+ followRedirects = false,
+ followAllRedirects = false,
+ headers = {},
+ responseType,
+ retries = 0,
+ } = options
+
+ // Ensure the method is a valid function on `got`
+ const fn = got[method as 'get']
+ if (!fn || typeof fn !== 'function') {
+ throw new Error(`No method function for '${method}'`)
+ }
+
+ // Construct the options for the `got` request, omitting undefined values
+ const xopts: OptionsOfTextResponseBody = omitBy(
+ {
+ body,
+ headers,
+ retry: { limit: retries },
+ throwHttpErrors: false,
+ followRedirect: followAllRedirects || followRedirects,
+ responseType: responseType || undefined,
+ },
+ isUndefined,
+ )
+
+ // Perform the HTTP request
+ return (await fn(`http://localhost:4000${route}`, xopts)) as ResponseWithHeaders<
+ ResponseTypeMap[T]
+ >
+}
+
+/**
+ * Makes a HEAD HTTP request to the specified route.
+ *
+ * @param route - The route to request.
+ * @param opts - Options for following redirects.
+ * @returns A promise that resolves to the HTTP response.
+ */
+export async function head(
+ route: string,
+ opts: { followRedirects?: boolean } = { followRedirects: false },
+): Promise<ResponseWithHeaders<string>> {
+ const res = await get(route, { method: 'head', followRedirects: opts.followRedirects })
+ return res
+}
+
+/**
+ * Makes a POST HTTP request to the specified route.
+ *
+ * @param route - The route to request.
+ * @param opts - Options for the request.
+ * @returns A promise that resolves to the HTTP response.
+ */
+export function post(
+ route: string,
+ opts: Omit<GetOptions, 'method'> = {},
+): Promise<ResponseWithHeaders<string>> {
+ return get(route, { ...opts, method: 'post' })
+}
+
+/**
+ * Retrieves a cached DOM object for the specified route and options.
+ * If the DOM is not cached, it fetches and caches it.
+ *
+ * @param route - The route to request.
+ * @param options - Options for fetching the DOM.
+ * @returns A promise that resolves to the cached DOM object.
+ */
+export async function getDOMCached(
+ route: string,
+ options: GetDOMOptions = {},
+): Promise<cheerio.Root> {
+ const key = `${route}::${JSON.stringify(options)}`
+ if (!getDOMCache.has(key)) {
+ const dom = await getDOM(route, options)
+ getDOMCache.set(key, dom)
+ }
+ // The non-null assertion is safe here because we've just set the key if it didn't exist
+ return getDOMCache.get(key)!
+}
+
+/**
+ * Fetches the DOM for the specified route and options.
+ *
+ * @param route - The route to request.
+ * @param options - Options for fetching the DOM.
+ * @returns A promise that resolves to the loaded DOM object.
+ */
+export async function getDOM(route: string, options: GetDOMOptions = {}): Promise<cheerio.Root> {
+ const { headers, allow500s = false, allow404 = false, retries = 0 } = options
+ const res = await get(route, { followRedirects: true, headers, retries })
+
+ if (!allow500s && res.statusCode >= 500) {
+ throw new Error(`Server error (${res.statusCode}) on ${route}`)
+ }
+
+ if (!allow404 && res.statusCode === 404) {
+ throw new Error(`Page not found on ${route} (${res.statusCode})`)
+ }
+
+ const $ = cheerio.load(res.body || '', { xmlMode: true })
+
+ // Extend the Cheerio instance with the response object
+ ;($ as any).res = { ...res }
+
+ return $
+}
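For orientation, a hedged sketch of how these helpers read from a test; the route and assertions are invented, while `get` and `getDOMCached` are the helpers defined in this file:

```ts
import { describe, expect, test } from 'vitest'
import { get, getDOMCached } from '@/tests/helpers/e2etest-ts'

describe('illustrative usage only', () => {
  test('fetches a page and inspects its DOM', async (): Promise<void> => {
    // Typed response: body is a string for the default 'text' responseType
    const res = await get('/en/search?query=foo', { followRedirects: true })
    expect(res.statusCode).toBe(200)

    // Cached DOM: repeated calls with the same route/options reuse the parse
    const $ = await getDOMCached('/en/search?query=foo')
    expect($('[data-testid="search-results"]').length).toBeGreaterThan(0)
  })
})
```

+
+/**
+ * Fetches and parses JSON from the specified route.
+ *
+ * @param route - The route to request.
+ * @param opts - Options for the request.
+ * @returns A promise that resolves to the parsed JSON object.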
+ */
+export async function getJSON(
+ route: string,
+ opts: Omit<GetOptions, 'method'> = {},
+): Promise<any> {
+ const res = await get(route, { ...opts, followRedirects: true })
+
+ if (res.statusCode >= 500) {
+ throw new Error(`Server error (${res.statusCode}) on ${route}`)
+ }
+
+ if (res.statusCode >= 400) {
+ console.warn(`${res.statusCode} on ${route} and the response might not be JSON`)
+ }
+
+ return JSON.parse(res.body)
+}
diff --git a/tsconfig.json b/tsconfig.json
index 96bc64921938..86975737dcc0 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -27,6 +27,7 @@
},
"exclude": [
"node_modules",
+ "docs-internal-data",
"src/code-scanning/scripts/generate-code-scanning-query-list.ts"
],
"include": [