Merge pull request #32807 from github/repo-sync

Repo sync
github · May 6, 2024 · 73dc9a2 · 73dc9a2
2 parents 492b0ae + 499eb82
commit 73dc9a2
Show file tree

Hide file tree

Showing 17 changed files with 773 additions and 121 deletions.
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -37,7 +37,7 @@
     "fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests",
     "index": "tsx src/search/scripts/index/index.ts",
     "index-elasticsearch": "node src/search/scripts/index-elasticsearch.js",
-    "index-test-fixtures": "npm run index-elasticsearch -- -l en -l ja -V ghec -V dotcom --index-prefix tests -- src/search/tests/fixtures/search-indexes",
+    "index-test-fixtures": "./src/search/scripts/index-test-fixtures.sh",
     "lint": "eslint '**/*.{js,mjs,ts,tsx}'",
     "lint-content": "node src/content-linter/scripts/lint-content.js",
     "lint-translation": "vitest src/content-linter/tests/lint-files.js",
@@ -339,7 +339,7 @@
     "typescript": "^5.4.4",
     "unist-util-remove": "^4.0.0",
     "unist-util-visit-parents": "6.0.1",
-    "vitest": "1.5.0",
+    "vitest": "1.6.0",
     "website-scraper": "^5.3.1"
   },
   "overrides": {},

diff --git a/src/search/middleware/es-search.js b/src/search/middleware/es-search.js
@@ -183,6 +183,48 @@ export async function getSearchResults({
   return { meta, hits }
 }
 
+/**
+ * Look up autocomplete suggestions in Elasticsearch.
+ *
+ * @param {object} options
+ * @param {string} options.indexName Name of the index to search.
+ * @param {string} options.query User input; trimmed before matching.
+ * @param {number} options.size Number of hits to ask Elasticsearch for.
+ * @returns {Promise<{meta: object, hits: object[]}>} `meta.found` is the
+ *   total reported by Elasticsearch; every hit holds the stored `term`
+ *   and its `highlights` (an empty array when there are none).
+ */
+export async function getAutocompleteSearchResults({ indexName, query, size }) {
+  const client = getClient()
+
+  // Fuzzy matching is only considered for queries whose length is
+  // between these two bounds.
+  const should = getAutocompleteMatchQueries(query.trim(), {
+    fuzzy: { minLength: 3, maxLength: 20 },
+  })
+
+  const response = await client.search({
+    index: indexName,
+    // The highlighter is configured from the query as given (untrimmed).
+    highlight: getHighlightConfiguration(query, ['term']),
+    size,
+    query: { bool: { should } },
+    // Have Elasticsearch send back as little as possible.
+    // Less data => faster.
+    _source_includes: ['term'],
+  })
+
+  const { hits: rawHits, total } = response.hits
+  const hits = rawHits.map(({ _source, highlight }) => ({
+    term: _source.term,
+    highlights: highlight && highlight.term ? highlight.term : [],
+  }))
+
+  const meta = { found: total }
+
+  return { meta, hits }
+}
+
 function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
   const BOOST_PHRASE = 10.0
   const BOOST_TITLE = 4.0
@@ -371,6 +413,46 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
   return matchQueries
 }
 
+/**
+ * Build the Elasticsearch clauses for an autocomplete lookup on the
+ * `term` field. The clauses come out ordered from highest to lowest boost.
+ *
+ * @param {string} query The user input to match.
+ * @param {object} options
+ * @param {{minLength: number, maxLength: number}} options.fuzzy A fuzzy
+ *   clause is added only when `query.length` is strictly between the two.
+ * @returns {object[]} One to three query clauses.
+ */
+function getAutocompleteMatchQueries(query, { fuzzy }) {
+  const BOOST_PHRASE = 4.0
+  const BOOST_REGULAR = 2.0
+  const BOOST_FUZZY = 0.1 // make it always last in ranking
+
+  // A space or a hyphen is taken as a sign of multi-word input, which
+  // makes a `match_phrase_prefix` clause worthwhile.
+  const hasSeveralWords = [' ', '-'].some((separator) => query.includes(separator))
+
+  // Very short and very long queries skip the fuzzy clause.
+  const withinFuzzyRange = query.length > fuzzy.minLength && query.length < fuzzy.maxLength
+
+  const phraseClause = {
+    match_phrase_prefix: { term: { query, boost: BOOST_PHRASE } },
+  }
+  const prefixClause = {
+    match_bool_prefix: { term: { query, boost: BOOST_REGULAR } },
+  }
+  const fuzzyClause = {
+    fuzzy: { term: { value: query, boost: BOOST_FUZZY, fuzziness: 'AUTO' } },
+  }
+
+  // The prefix clause is unconditional; the other two are opt-in.
+  return [
+    ...(hasSeveralWords ? [phraseClause] : []),
+    prefixClause,
+    ...(withinFuzzyRange ? [fuzzyClause] : []),
+  ]
+}
+
 function getHits(hits, { indexName, debug, includeTopics, highlightFields, include }) {
   return hits.map((hit) => {
     // Return `hit.highlights[...]` based on the highlight fields requested.
@@ -464,7 +546,16 @@ function getHighlightConfiguration(query, highlights) {
       },
     }
   }
-
+  if (highlights.includes('term')) {
+    fields.term = {
+      // Fast Vector Highlighter
+      // Using this requires that you first index these fields
+      // with {term_vector: 'with_positions_offsets'}
+      type: 'fvh',
+      // fragment_size: 200,
+      // number_of_fragments: 1,
+    }
+  }
   return {
     pre_tags: ['<mark>'],
     post_tags: ['</mark>'],

diff --git a/src/search/middleware/get-search-request.js b/src/search/middleware/get-search-request.js
@@ -5,7 +5,9 @@ import { allVersions } from '#src/versions/lib/all-versions.js'
 import { POSSIBLE_HIGHLIGHT_FIELDS, DEFAULT_HIGHLIGHT_FIELDS } from './es-search.js'
 
 const DEFAULT_SIZE = 10
+const DEFAULT_AUTOCOMPLETE_SIZE = 8 // Default `size` for autocomplete requests
 const MAX_SIZE = 50 // How much you return has a strong impact on performance
+const MAX_AUTOCOMPLETE_SIZE = 10 // Largest `size` the autocomplete validation accepts
 const DEFAULT_PAGE = 1
 const POSSIBLE_SORTS = ['best', 'relevance']
 const DEFAULT_SORT = POSSIBLE_SORTS[0]
@@ -23,13 +25,19 @@ const V1_ADDITIONAL_INCLUDES = ['intro', 'headings']
 // In some distant future we can clean up any client enough that this
 // aliasing won't be necessary.
 const versionAliases = {}
+const prefixVersionAliases = {} // plan or short name -> short name, as used in autocomplete index names
 Object.values(allVersions).forEach((info) => {
   if (info.hasNumberedReleases) {
     versionAliases[info.currentRelease] = info.miscVersionName
   } else {
     versionAliases[info.version] = info.miscVersionName
     versionAliases[info.miscVersionName] = info.miscVersionName
   }
+  // Lets a request say `?version=enterprise-server` (the plan name) and
+  // have it resolve to `ghes` (the short name), because the autocomplete
+  // indexes are named like `github-autocomplete-en-ghes`.
+  prefixVersionAliases[info.plan] = info.shortName
+  prefixVersionAliases[info.shortName] = info.shortName
 })
 
 function getIndexPrefix() {
@@ -102,11 +110,44 @@ const PARAMS = [
   },
 ]
 
-export function getSearchFromRequest(req, force = {}) {
+// Query-string parameters understood by the autocomplete search. Entries
+// have the same shape as those in `PARAMS`.
+const AUTOCOMPLETE_PARAMS = [
+  { key: 'query' },
+  { key: 'language', default_: 'en', validate: (language) => language in languages },
+  {
+    key: 'version',
+    default_: 'free-pro-team',
+    // Accepts a plan name, a short name or a key of `allVersions`.
+    validate: (version) => {
+      const shortNames = Object.values(prefixVersionAliases)
+      const aliases = Object.keys(prefixVersionAliases)
+      if (prefixVersionAliases[version] || allVersions[version]) return true
+      if (shortNames.includes(version)) return true
+      const valid = [...aliases, ...shortNames, ...Object.keys(allVersions)]
+      throw new ValidationError(`'${version}' not in ${valid.join(', ')}`)
+    },
+  },
+  {
+    key: 'size',
+    default_: DEFAULT_AUTOCOMPLETE_SIZE,
+    cast: (raw) => parseInt(raw, 10),
+    validate: (size) => size >= 0 && size <= MAX_AUTOCOMPLETE_SIZE,
+  },
+]
+/**
+ * Autocomplete counterpart of `getSearchFromRequest`: reads the request
+ * against `AUTOCOMPLETE_PARAMS` and, when nothing failed validation,
+ * points `search.indexName` at the matching autocomplete index.
+ *
+ * @param {object} req The incoming request.
+ * @param {object} [force] Passed through to `getSearchFromRequest`
+ *   unchanged (previously this argument was accepted but ignored).
+ * @returns {{search: object, validationErrors: Array}}
+ */
+export function getAutocompleteSearchFromRequest(req, force = {}) {
+  const { search, validationErrors } = getSearchFromRequest(req, force, AUTOCOMPLETE_PARAMS)
+  if (validationErrors.length === 0) {
+    // Plan names and short names resolve through the alias table; any
+    // other accepted value is a key of `allVersions`.
+    const version = prefixVersionAliases[search.version] || allVersions[search.version].shortName
+    search.indexName = `${getIndexPrefix()}github-autocomplete-${search.language}-${version}`
+  }
+  return { search, validationErrors }
+}
+
+export function getSearchFromRequest(req, force = {}, params = PARAMS) {
   const search = {}
   const validationErrors = []
 
-  for (const { key, default_, cast, validate, multiple } of PARAMS) {
+  for (const { key, default_, cast, validate, multiple } of params) {
     // This is necessary because when the version or language comes from
     // the pathname, we don't want pick these up from the query string.
     // This function gets used by /$locale/$version/search
@@ -153,7 +194,10 @@ export function getSearchFromRequest(req, force = {}) {
   }
 
   if (!validationErrors.length) {
-    const version = versionAliases[search.version] || allVersions[search.version].miscVersionName
+    const version =
+      prefixVersionAliases[search.version] ||
+      versionAliases[search.version] ||
+      allVersions[search.version].miscVersionName
     search.indexName = `${getIndexPrefix()}github-docs-${version}-${search.language}` // github-docs-ghes-3.5-en
   }
 

diff --git a/src/search/middleware/search.js b/src/search/middleware/search.js
@@ -7,8 +7,8 @@ import {
   setFastlySurrogateKey,
   SURROGATE_ENUMS,
 } from '#src/frame/middleware/set-fastly-surrogate-key.js'
-import { getSearchResults } from './es-search.js'
-import { getSearchFromRequest } from './get-search-request.js'
+import { getAutocompleteSearchResults, getSearchResults } from './es-search.js'
+import { getAutocompleteSearchFromRequest, getSearchFromRequest } from './get-search-request.js'
 
 const router = express.Router()
 
@@ -69,6 +69,52 @@ router.get(
   }),
 )
 
+// Validates the autocomplete query string. On success the parsed search
+// is stored on `req.search`; otherwise the request ends with a 400.
+export const autocompleteValidationMiddleware = (req, res, next) => {
+  const { search, validationErrors: problems } = getAutocompleteSearchFromRequest(req)
+  if (problems.length === 0) {
+    req.search = search
+    return next()
+  }
+  // Several parameters may be invalid; only the first problem is reported.
+  return res.status(400).json(problems[0])
+}
+
+// Autocomplete endpoint, version 1.
+// `autocompleteValidationMiddleware` runs first and leaves the validated
+// parameters on `req.search`.
+router.get(
+  '/autocomplete/v1',
+  autocompleteValidationMiddleware,
+  catchMiddlewareError(async (req, res) => {
+    const { indexName, query, size } = req.search
+    const options = { indexName, query, size }
+
+    try {
+      const { meta, hits } = await getAutocompleteSearchResults(options)
+
+      const inDevelopment = process.env.NODE_ENV === 'development'
+      if (!inDevelopment) {
+        searchCacheControl(res)
+        // The API search only serves as a proxy for local and preview
+        // environments, so this can be cached without being purged
+        // after every deploy.
+        setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
+      }
+
+      // The v1 output is exactly what getAutocompleteSearchResults()
+      // produced.
+      res.status(200).json({ meta, hits })
+    } catch (error) {
+      // getAutocompleteSearchResults() may throw, for instance on a 404
+      // from Elasticsearch. Unless it's caught here, the error would
+      // propagate to the next middleware.
+      await handleGetSearchResultsError(req, res, error, options)
+    }
+  }),
+)
+
 // We have more than one place where we do `try{...} catch error( THIS )`
 // which is slightly different depending on the "sub-version" (e.g. /legacy)
 // This function is a single place to take care of all of these error handlings
@@ -93,4 +139,9 @@ router.get('/', (req, res) => {
   res.redirect(307, req.originalUrl.replace('/search', '/search/v1'))
 })
 
+// Unversioned alias: 307-redirect to the latest autocomplete version
+router.get('/autocomplete', ({ originalUrl }, res) => {
+  res.redirect(307, originalUrl.replace('/search/autocomplete', '/search/autocomplete/v1'))
+})
+
 export default router
diff --git a/src/search/scripts/index-test-fixtures.sh b/src/search/scripts/index-test-fixtures.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Indexes the test fixtures for site-search and for autocomplete. Lives in
+# its own script because the commands are too long for `package.json`.
+
+set -e
+INDEX_PREFIX=tests
+# General site-search indexes
+npm run index-elasticsearch -- -l en -l ja -V ghec -V dotcom --index-prefix "$INDEX_PREFIX" -- src/search/tests/fixtures/search-indexes
+
+# Autocomplete indexes
+npm run index -- autocomplete src/search/tests/fixtures/data -l en -l ja -v fpt -v ghec --index-prefix "$INDEX_PREFIX"
diff --git a/src/search/scripts/index/index-autocomplete.ts b/src/search/scripts/index/index-autocomplete.ts
@@ -24,6 +24,7 @@ type Options = {
   retries?: number
   sleepTime?: number
   verbose?: boolean
+  indexPrefix?: string
 }
 
 export async function indexAutocomplete(options: Options) {
@@ -38,7 +39,12 @@ export async function indexAutocomplete(options: Options) {
   for (const language of languages) {
     for (const version of versions) {
       const records = loadRecords({ version, language, dataRepoRoot })
-      const { alias, name } = await createIndex(client, language, version)
+      const { alias, name } = await createIndex(
+        client,
+        language,
+        version,
+        options.indexPrefix || '',
+      )
       await populate(client, records, {
         alias,
         name,
@@ -109,7 +115,12 @@ type IndexInfo = {
   name: string
 }
 
-async function createIndex(client: Client, language: string, version: Version): Promise<IndexInfo> {
+async function createIndex(
+  client: Client,
+  language: string,
+  version: Version,
+  indexPrefix: string,
+): Promise<IndexInfo> {
   const settings: estypes.IndicesIndexSettings = {
     analysis: {
       analyzer: {
@@ -126,7 +137,11 @@ async function createIndex(client: Client, language: string, version: Version):
     // XXX SNOWBALL?
   }
 
-  const indexName = `github-autocomplete-${language}-${shortVersionNames[version] || version}`
+  if (indexPrefix && !indexPrefix.endsWith('_')) {
+    indexPrefix += '_'
+  }
+
+  const indexName = `${indexPrefix}github-autocomplete-${language}-${shortVersionNames[version] || version}`
   const thisAlias = `${indexName}__${utcTimestamp()}`
 
   const mappings: estypes.MappingTypeMapping = {

diff --git a/src/search/scripts/index/index.ts b/src/search/scripts/index/index.ts
@@ -25,6 +25,7 @@ program
     ]),
   )
   .option('--verbose', 'Verbose output')
+  .option('--index-prefix <prefix>', 'Prefix for the index names', '')
   .argument('<data-root>', 'path to the docs-internal-data repo')
   .action((root: string, options) => {
     const languages = options.language ? options.language : languageKeys
@@ -36,7 +37,8 @@ program
         versions.push(v)
       }
     }
-    return indexAutocomplete({ dataRepoRoot: root, languages, versions })
+    const indexPrefix = options.indexPrefix || ''
+    return indexAutocomplete({ dataRepoRoot: root, languages, versions, indexPrefix })
   })
 
 program.parse(process.argv)