Skip to content

Commit

Permalink
Merge pull request #32807 from github/repo-sync
Browse files Browse the repository at this point in the history
Repo sync
  • Loading branch information
docs-bot authored May 6, 2024
2 parents 492b0ae + 499eb82 commit 73dc9a2
Show file tree
Hide file tree
Showing 17 changed files with 773 additions and 121 deletions.
215 changes: 106 additions & 109 deletions package-lock.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests",
"index": "tsx src/search/scripts/index/index.ts",
"index-elasticsearch": "node src/search/scripts/index-elasticsearch.js",
"index-test-fixtures": "npm run index-elasticsearch -- -l en -l ja -V ghec -V dotcom --index-prefix tests -- src/search/tests/fixtures/search-indexes",
"index-test-fixtures": "./src/search/scripts/index-test-fixtures.sh",
"lint": "eslint '**/*.{js,mjs,ts,tsx}'",
"lint-content": "node src/content-linter/scripts/lint-content.js",
"lint-translation": "vitest src/content-linter/tests/lint-files.js",
Expand Down Expand Up @@ -339,7 +339,7 @@
"typescript": "^5.4.4",
"unist-util-remove": "^4.0.0",
"unist-util-visit-parents": "6.0.1",
"vitest": "1.5.0",
"vitest": "1.6.0",
"website-scraper": "^5.3.1"
},
"overrides": {},
Expand Down
93 changes: 92 additions & 1 deletion src/search/middleware/es-search.js
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,48 @@ export async function getSearchResults({
return { meta, hits }
}

/**
 * Look up autocomplete suggestions in Elasticsearch.
 *
 * @param {object} options
 * @param {string} options.indexName - Name of the index to search.
 * @param {string} options.query - User input. It is trimmed before the match
 *   clauses are built; the highlight configuration receives it untrimmed.
 * @param {number} options.size - Maximum number of hits to request.
 * @returns {Promise<{meta: object, hits: object[]}>} `meta.found` is the
 *   `total` reported by Elasticsearch; every hit carries `term` and
 *   `highlights` (an empty array when nothing was highlighted).
 */
export async function getAutocompleteSearchResults({ indexName, query, size }) {
  const client = getClient()

  // Every clause is optional (`should`); scoring decides the ranking.
  const should = getAutocompleteMatchQueries(query.trim(), {
    fuzzy: { minLength: 3, maxLength: 20 },
  })

  const response = await client.search({
    index: indexName,
    highlight: getHighlightConfiguration(query, ['term']),
    size,
    query: { bool: { should } },
    // Only ask Elasticsearch for the one field we use. Less data => faster.
    _source_includes: ['term'],
  })

  const { hits: rawHits, total } = response.hits

  const hits = rawHits.map((rawHit) => {
    const { highlight } = rawHit
    const highlights = highlight ? highlight.term || [] : []
    return { term: rawHit._source.term, highlights }
  })

  return { meta: { found: total }, hits }
}

function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
const BOOST_PHRASE = 10.0
const BOOST_TITLE = 4.0
Expand Down Expand Up @@ -371,6 +413,46 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
return matchQueries
}

/**
 * Build the list of Elasticsearch clauses used for autocomplete, all of them
 * targeting the `term` field.
 *
 * - A `match_phrase_prefix` clause (highest boost) is added only when the
 *   input contains a space or a hyphen, i.e. looks like several words.
 * - A `match_bool_prefix` clause is always added.
 * - A `fuzzy` clause (lowest boost) is added only when the input length is
 *   strictly between `fuzzy.minLength` and `fuzzy.maxLength`.
 *
 * @param {string} query - The (already trimmed) user input.
 * @param {object} options
 * @param {{minLength: number, maxLength: number}} options.fuzzy - Exclusive
 *   length bounds for enabling the fuzzy clause.
 * @returns {object[]} Clauses in the order: phrase, prefix, fuzzy.
 */
function getAutocompleteMatchQueries(query, { fuzzy }) {
  const BOOST_PHRASE = 4.0
  const BOOST_REGULAR = 2.0
  const BOOST_FUZZY = 0.1 // low enough that fuzzy matches always rank last

  const looksLikePhrase = [' ', '-'].some((separator) => query.includes(separator))
  const phraseClauses = looksLikePhrase
    ? [{ match_phrase_prefix: { term: { query, boost: BOOST_PHRASE } } }]
    : []

  const prefixClause = {
    match_bool_prefix: { term: { query, boost: BOOST_REGULAR } },
  }

  const withinFuzzyRange = fuzzy.minLength < query.length && query.length < fuzzy.maxLength
  const fuzzyClauses = withinFuzzyRange
    ? [{ fuzzy: { term: { value: query, boost: BOOST_FUZZY, fuzziness: 'AUTO' } } }]
    : []

  return [...phraseClauses, prefixClause, ...fuzzyClauses]
}

function getHits(hits, { indexName, debug, includeTopics, highlightFields, include }) {
return hits.map((hit) => {
// Return `hit.highlights[...]` based on the highlight fields requested.
Expand Down Expand Up @@ -464,7 +546,16 @@ function getHighlightConfiguration(query, highlights) {
},
}
}

if (highlights.includes('term')) {
fields.term = {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
type: 'fvh',
// fragment_size: 200,
// number_of_fragments: 1,
}
}
return {
pre_tags: ['<mark>'],
post_tags: ['</mark>'],
Expand Down
50 changes: 47 additions & 3 deletions src/search/middleware/get-search-request.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ import { allVersions } from '#src/versions/lib/all-versions.js'
import { POSSIBLE_HIGHLIGHT_FIELDS, DEFAULT_HIGHLIGHT_FIELDS } from './es-search.js'

// NOTE(review): presumably the default `size` for regular site-search
// requests; the place it is consumed is not visible here — confirm.
const DEFAULT_SIZE = 10
// Default `size` for autocomplete requests (used as the `default_` of the
// `size` entry in AUTOCOMPLETE_PARAMS).
const DEFAULT_AUTOCOMPLETE_SIZE = 8
const MAX_SIZE = 50 // How much you return has a strong impact on performance
// Largest `size` the autocomplete `size` validator accepts (inclusive).
const MAX_AUTOCOMPLETE_SIZE = 10
// NOTE(review): presumably the page number used when none is requested — confirm.
const DEFAULT_PAGE = 1
const POSSIBLE_SORTS = ['best', 'relevance']
// The first entry of POSSIBLE_SORTS ('best') is the default sort.
const DEFAULT_SORT = POSSIBLE_SORTS[0]
Expand All @@ -23,13 +25,19 @@ const V1_ADDITIONAL_INCLUDES = ['intro', 'headings']
// In some distant future we can clean up any client enough that this
// aliasing won't be necessary.
// `versionAliases` maps accepted version spellings to `miscVersionName`.
// `prefixVersionAliases` maps both the plan name and the short name of every
// version to its short name.
const versionAliases = {}
const prefixVersionAliases = {}
for (const info of Object.values(allVersions)) {
  const { miscVersionName, shortName } = info

  if (!info.hasNumberedReleases) {
    // Un-numbered versions are addressable by their version string and by
    // their misc name.
    versionAliases[info.version] = miscVersionName
    versionAliases[miscVersionName] = miscVersionName
  } else {
    // Numbered versions are addressable by their current release.
    versionAliases[info.currentRelease] = miscVersionName
  }

  // Lets `?version=enterprise-server` resolve to `ghes`, matching index names
  // such as `github-autocomplete-en-ghes`. The short name maps to itself.
  prefixVersionAliases[info.plan] = shortName
  prefixVersionAliases[shortName] = shortName
}

function getIndexPrefix() {
Expand Down Expand Up @@ -102,11 +110,44 @@ const PARAMS = [
},
]

export function getSearchFromRequest(req, force = {}) {
// Query-string parameters understood by the autocomplete endpoint.
// Each entry is consumed by getSearchFromRequest(), which reads `key`,
// `default_`, `cast`, `validate` and `multiple` from it.
//
// Lookups of user-supplied values use own-property checks on purpose: a plain
// `obj[v]` or `v in obj` also matches names inherited from Object.prototype
// (`constructor`, `toString`, `__proto__`, ...), which would let such values
// pass validation and later produce a nonsensical index name.
const AUTOCOMPLETE_PARAMS = [
  { key: 'query' },
  {
    key: 'language',
    default_: 'en',
    // Valid only when `v` is one of the language keys themselves.
    validate: (v) => Object.prototype.hasOwnProperty.call(languages, v),
  },
  {
    key: 'version',
    default_: 'free-pro-team',
    // Accepts a plan name or short name (keys/values of prefixVersionAliases)
    // or a key of allVersions. Throws a ValidationError listing every
    // accepted value otherwise.
    validate: (v) => {
      const isOwnTruthyEntry = (lookup) =>
        Object.prototype.hasOwnProperty.call(lookup, v) && Boolean(lookup[v])

      if (isOwnTruthyEntry(prefixVersionAliases) || isOwnTruthyEntry(allVersions)) return true
      if (Object.values(prefixVersionAliases).includes(v)) return true
      const valid = [
        ...Object.keys(prefixVersionAliases),
        ...Object.values(prefixVersionAliases),
        ...Object.keys(allVersions),
      ]
      throw new ValidationError(`'${v}' not in ${valid.join(', ')}`)
    },
  },
  {
    key: 'size',
    default_: DEFAULT_AUTOCOMPLETE_SIZE,
    cast: (v) => parseInt(v, 10),
    // A non-numeric input casts to NaN, which fails both comparisons.
    validate: (v) => v >= 0 && v <= MAX_AUTOCOMPLETE_SIZE,
  },
]
/**
 * Parse and validate the autocomplete parameters of a request and, when they
 * are all valid, attach the name of the Elasticsearch index to query.
 *
 * @param {object} req - The incoming request.
 * @param {object} [force={}] - Forwarded to getSearchFromRequest(). It used to
 *   be accepted but silently dropped; callers that omit it are unaffected.
 * @returns {{search: object, validationErrors: object[]}} `search.indexName`
 *   is only set when `validationErrors` is empty.
 */
export function getAutocompleteSearchFromRequest(req, force = {}) {
  const { search, validationErrors } = getSearchFromRequest(req, force, AUTOCOMPLETE_PARAMS)
  if (validationErrors.length === 0) {
    // Resolve plan names / short names first, then fall back to the short
    // name of a full version key.
    const version = prefixVersionAliases[search.version] || allVersions[search.version].shortName
    search.indexName = `${getIndexPrefix()}github-autocomplete-${search.language}-${version}`
  }
  return { search, validationErrors }
}

export function getSearchFromRequest(req, force = {}, params = PARAMS) {
const search = {}
const validationErrors = []

for (const { key, default_, cast, validate, multiple } of PARAMS) {
for (const { key, default_, cast, validate, multiple } of params) {
// This is necessary because when the version or language comes from
// the pathname, we don't want pick these up from the query string.
// This function gets used by /$locale/$version/search
Expand Down Expand Up @@ -153,7 +194,10 @@ export function getSearchFromRequest(req, force = {}) {
}

if (!validationErrors.length) {
const version = versionAliases[search.version] || allVersions[search.version].miscVersionName
const version =
prefixVersionAliases[search.version] ||
versionAliases[search.version] ||
allVersions[search.version].miscVersionName
search.indexName = `${getIndexPrefix()}github-docs-${version}-${search.language}` // github-docs-ghes-3.5-en
}

Expand Down
55 changes: 53 additions & 2 deletions src/search/middleware/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ import {
setFastlySurrogateKey,
SURROGATE_ENUMS,
} from '#src/frame/middleware/set-fastly-surrogate-key.js'
import { getSearchResults } from './es-search.js'
import { getSearchFromRequest } from './get-search-request.js'
import { getAutocompleteSearchResults, getSearchResults } from './es-search.js'
import { getAutocompleteSearchFromRequest, getSearchFromRequest } from './get-search-request.js'

const router = express.Router()

Expand Down Expand Up @@ -69,6 +69,52 @@ router.get(
}),
)

/**
 * Express middleware guarding the autocomplete route.
 *
 * On success the parsed parameters are stored on `req.search` and the next
 * handler runs. On failure the response is a 400 whose JSON body is the first
 * validation error; any further errors are not reported.
 */
export const autocompleteValidationMiddleware = (req, res, next) => {
  const parsed = getAutocompleteSearchFromRequest(req)

  if (parsed.validationErrors.length === 0) {
    req.search = parsed.search
    return next()
  }

  return res.status(400).json(parsed.validationErrors[0])
}

router.get(
  '/autocomplete/v1',
  autocompleteValidationMiddleware,
  catchMiddlewareError(async (req, res) => {
    // `req.search` was set by autocompleteValidationMiddleware.
    const options = {
      indexName: req.search.indexName,
      query: req.search.query,
      size: req.search.size,
    }

    try {
      const results = await getAutocompleteSearchResults(options)

      const isDevelopment = process.env.NODE_ENV === 'development'
      if (!isDevelopment) {
        searchCacheControl(res)
        // Stated rationale: this can be cached without purging after every
        // deploy because the API search is only used as a proxy for local
        // and preview environments.
        setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
      }

      // The v1 payload is exactly the `meta` and `hits` returned by
      // getAutocompleteSearchResults().
      res.status(200).json({ meta: results.meta, hits: results.hits })
    } catch (error) {
      // getAutocompleteSearchResults() can throw (for example a 404 from
      // Elasticsearch). Handle it here so it does not propagate to the
      // next middleware.
      await handleGetSearchResultsError(req, res, error, options)
    }
  }),
)

// We have more than one place where we do `try{...} catch error( THIS )`
// which is slightly different depending on the "sub-version" (e.g. /legacy)
// This function is a single place to take care of all of these error handlings
Expand All @@ -93,4 +139,9 @@ router.get('/', (req, res) => {
res.redirect(307, req.originalUrl.replace('/search', '/search/v1'))
})

// Unversioned autocomplete URL: answer with a 307 redirect to the latest
// versioned endpoint (currently v1), keeping the rest of the URL intact.
router.get('/autocomplete', (req, res) => {
  const latestVersionUrl = req.originalUrl.replace(
    '/search/autocomplete',
    '/search/autocomplete/v1',
  )
  res.redirect(307, latestVersionUrl)
})

export default router
12 changes: 12 additions & 0 deletions src/search/scripts/index-test-fixtures.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash

# Indexes the search test fixtures, using the `tests` index prefix.
# Kept as a standalone script because the commands are too long and too
# complex to express inside `package.json`.

# Abort as soon as any command fails.
set -o errexit

# General site-search fixtures
npm run index-elasticsearch -- \
  -l en -l ja \
  -V ghec -V dotcom \
  --index-prefix tests \
  -- src/search/tests/fixtures/search-indexes

# Autocomplete fixtures
npm run index -- autocomplete src/search/tests/fixtures/data \
  -l en -l ja \
  -v fpt -v ghec \
  --index-prefix tests
21 changes: 18 additions & 3 deletions src/search/scripts/index/index-autocomplete.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ type Options = {
retries?: number
sleepTime?: number
verbose?: boolean
indexPrefix?: string
}

export async function indexAutocomplete(options: Options) {
Expand All @@ -38,7 +39,12 @@ export async function indexAutocomplete(options: Options) {
for (const language of languages) {
for (const version of versions) {
const records = loadRecords({ version, language, dataRepoRoot })
const { alias, name } = await createIndex(client, language, version)
const { alias, name } = await createIndex(
client,
language,
version,
options.indexPrefix || '',
)
await populate(client, records, {
alias,
name,
Expand Down Expand Up @@ -109,7 +115,12 @@ type IndexInfo = {
name: string
}

async function createIndex(client: Client, language: string, version: Version): Promise<IndexInfo> {
async function createIndex(
client: Client,
language: string,
version: Version,
indexPrefix: string,
): Promise<IndexInfo> {
const settings: estypes.IndicesIndexSettings = {
analysis: {
analyzer: {
Expand All @@ -126,7 +137,11 @@ async function createIndex(client: Client, language: string, version: Version):
// XXX SNOWBALL?
}

const indexName = `github-autocomplete-${language}-${shortVersionNames[version] || version}`
if (indexPrefix && !indexPrefix.endsWith('_')) {
indexPrefix += '_'
}

const indexName = `${indexPrefix}github-autocomplete-${language}-${shortVersionNames[version] || version}`
const thisAlias = `${indexName}__${utcTimestamp()}`

const mappings: estypes.MappingTypeMapping = {
Expand Down
4 changes: 3 additions & 1 deletion src/search/scripts/index/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ program
]),
)
.option('--verbose', 'Verbose output')
.option('--index-prefix <prefix>', 'Prefix for the index names', '')
.argument('<data-root>', 'path to the docs-internal-data repo')
.action((root: string, options) => {
const languages = options.language ? options.language : languageKeys
Expand All @@ -36,7 +37,8 @@ program
versions.push(v)
}
}
return indexAutocomplete({ dataRepoRoot: root, languages, versions })
const indexPrefix = options.indexPrefix || ''
return indexAutocomplete({ dataRepoRoot: root, languages, versions, indexPrefix })
})

program.parse(process.argv)
Loading

0 comments on commit 73dc9a2

Please sign in to comment.