From cb598b5e87f4e279c8c4b33d1174b2cef7dd7df5 Mon Sep 17 00:00:00 2001 From: Facebook Community Bot Date: Thu, 28 Nov 2024 02:43:24 -0800 Subject: [PATCH] Re-sync with internal repository (#473) The internal and external repositories are out of sync. This Pull Request attempts to bring them back in sync by patching the GitHub repository. Please carefully review this patch. You must disable ShipIt for your project in order to merge this pull request. DO NOT IMPORT this pull request. Instead, merge it directly on GitHub using the MERGE BUTTON. Re-enable ShipIt after merging. --- glean/lang/yaml/indexers/tests/test_1.yaml | 18 -- glean/lang/yaml/indexers/tests/test_2.yaml | 8 - .../yaml/indexers/tests/test_yaml_files.csv | 2 - glean/lang/yaml/indexers/yaml_indexer.py | 191 ------------------ glean/lang/yaml/make_test_db.sh | 49 ----- 5 files changed, 268 deletions(-) delete mode 100644 glean/lang/yaml/indexers/tests/test_1.yaml delete mode 100644 glean/lang/yaml/indexers/tests/test_2.yaml delete mode 100644 glean/lang/yaml/indexers/tests/test_yaml_files.csv delete mode 100644 glean/lang/yaml/indexers/yaml_indexer.py delete mode 100755 glean/lang/yaml/make_test_db.sh diff --git a/glean/lang/yaml/indexers/tests/test_1.yaml b/glean/lang/yaml/indexers/tests/test_1.yaml deleted file mode 100644 index 18a079371..000000000 --- a/glean/lang/yaml/indexers/tests/test_1.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# _target_: glean.lang.yaml.indexers.yaml_indexer.IndexingResult -defaults: - - _self_ - - audio_backbone: transformer_12layers -indexing_result: - _target_: glean.lang.yaml.indexers.yaml_indexer.IndexingResult - input: file_name -parsed_doc_node: - _target_: lex.hydra.hydra_ls.parsed_doc.ParsedDocNode - val: 1 - start: - _target_: language_server.py.lsp_types.Position - line: 2 - character: 3 - end: - _target_: language_server.py.lsp_types.Position - line: 4 - character: 5 diff --git a/glean/lang/yaml/indexers/tests/test_2.yaml b/glean/lang/yaml/indexers/tests/test_2.yaml 
deleted file mode 100644 index f12096633..000000000 --- a/glean/lang/yaml/indexers/tests/test_2.yaml +++ /dev/null @@ -1,8 +0,0 @@ -constants: - nfeatures: 2048 - -torchx_event: - _target_: torchx.runner.events.api.TorchxEvent - session: a_session - scheduler: mast - api: an_api diff --git a/glean/lang/yaml/indexers/tests/test_yaml_files.csv b/glean/lang/yaml/indexers/tests/test_yaml_files.csv deleted file mode 100644 index 7fcba0b1d..000000000 --- a/glean/lang/yaml/indexers/tests/test_yaml_files.csv +++ /dev/null @@ -1,2 +0,0 @@ -fbcode/glean/lang/yaml/indexers/tests/test_1.yaml -fbcode/glean/lang/yaml/indexers/tests/test_2.yaml diff --git a/glean/lang/yaml/indexers/yaml_indexer.py b/glean/lang/yaml/indexers/yaml_indexer.py deleted file mode 100644 index 1ee8bf3ff..000000000 --- a/glean/lang/yaml/indexers/yaml_indexer.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -""" -To quickly test yaml indexer on a yaml file: - cd ~/fbsource/fbcode - buck run glean/lang/yaml/indexers:yaml_indexer -- --output-dir temp/glean/yikai --file glean/lang/yaml/indexers/tests/test_1.yaml -p - -To run the indexer on multiple yaml files, generate glean facts in a JSON file and store it in test glean DB: - cd ~/fbsource/fbcode - INPUT_CSV=glean/lang/yaml/indexers/tests/test_yaml_files.csv USER=yikai glean/lang/yaml/make_test_db.sh - -If changes are made to the schema, you also need to run Glean from source: - cd ~/fbsource/fbcode - glean/scripts/run-local-server --db-root ~/glean-dbs - GLEAN_TIER=localhost:25052 INPUT_CSV=glean/lang/yaml/indexers/tests/test_yaml_files.csv USER=yikai glean/lang/yaml/make_test_db.sh -""" - -import csv -import json -import logging -import re -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List, Optional, Union - -import click -from glean.client.py3 import _glean_name, _schema_version -from glean.lang.python import facts -from glean.schema.python.types import Name -from glean.schema.src.types import ByteSpan, File -from glean.schema.yaml.types import XRefsByFile, XRefsByFile_key, XRefViaName -from phabricator.new_phabricator_graphql_helpers import PhabricatorPaste -from phabricator.phabricator_auth_strategy_factory import PhabricatorAuthStrategyFactory -from simplejson import JSONEncoder -from simplejson.raw_json import RawJSON -from thrift.py3 import Protocol, serialize - -logger: logging.Logger = logging.getLogger(__name__) - - -@dataclass(frozen=True) -class IndexingResult: - input: str - facts: XRefsByFile - - -@dataclass -class FactsBatch: - predicate: str - facts: List[RawJSON] - - def for_json(self) -> Dict[str, Union[List[RawJSON], str]]: - data = { - "predicate": self.predicate, - "facts": self.facts, - } - return data - - -@click.command() -@click.option( - "--collect-sources-from-csvfile", - type=click.Path(exists=True, resolve_path=True), - help=("Read IndexableFiles 
from the specified csv file."), -) -@click.option( - "--output-dir", - type=click.Path(resolve_path=True), - required=True, - help=("Path to export directory (for json files)"), -) -@click.option( - "--file", - type=click.Path(exists=True, resolve_path=True), - help=("Provide a yaml file path(relative to /fbcode) to run yaml indexer on."), -) -@click.option( - "-p", - "--create-paste", - default=False, - is_flag=True, - help="create a paste with the JSON output", -) -def main( - collect_sources_from_csvfile: Optional[str], - output_dir: str, - file: Optional[str], - create_paste: bool, -) -> None: - all_yaml_files = get_all_yaml_files(collect_sources_from_csvfile, file) - all_indexing_results = [index_one(file) for file in all_yaml_files] - all_facts_batch = [ - make_facts_batch(index_result) for index_result in all_indexing_results - ] - encoder: JSONEncoder = JSONEncoder( - separators=(",", ":"), for_json=True, iterable_as_array=True - ) - dirname = Path(output_dir) - dirname.mkdir(parents=True, exist_ok=True) - with (dirname / "0.json").open("w") as f: - for json_chunk in encoder.iterencode(all_facts_batch, False): - f.write(json_chunk) - # TODO(T207069122) Log when yaml indexer's output exceeds size limit - if create_paste: - with (dirname / "0.json").open("r") as f: - json_string = f.read() - data = json.loads(json_string) - formatted_json = json.dumps(data, indent=4) - paste_client = PhabricatorPaste( - PhabricatorAuthStrategyFactory.paste_bot(), - "yaml_indexer_debug", - ) - paste = paste_client.create_temp_everpaste(formatted_json) - print("Generated JSON:") - print(paste) - - logger.info(f"Finished writing to {dirname}/0.json") - - -def get_all_yaml_files( - collect_sources_from_csvfile: Optional[str], file: Optional[str] -) -> list[str]: - all_yaml_files = [] - if collect_sources_from_csvfile is not None and file is not None: - raise Exception( - "Please provide either --collect-sources-from-csvfile or --file, not both" - ) - if file is not None: - 
all_yaml_files.append(normalize_path(file)) - elif collect_sources_from_csvfile is not None: - with open(Path(collect_sources_from_csvfile), "rt") as fp: - reader = csv.DictReader(fp, fieldnames=["path"]) - for row in reader: - all_yaml_files.append(normalize_path(row["path"])) - return all_yaml_files - - -def normalize_path(file_path: str) -> str: - return ( - file_path.replace("fbcode/", "") - if file_path.startswith("fbcode/") - else file_path - ) - - -def make_facts_batch(index_result: IndexingResult) -> FactsBatch: - cls = type(index_result.facts) - return FactsBatch( - predicate=f"{_glean_name(cls)}.{_schema_version(cls)}", - facts=[RawJSON(serialize(index_result.facts, Protocol.JSON).decode())], - ) - - -def index_one(file_path: str) -> IndexingResult: - all_xrefs = fetch_all_XRefViaName(file_path) - file_fact = facts.make_src_fact(File, file_path) - xrefs_by_file = XRefsByFile(key=XRefsByFile_key(file=file_fact, xrefs=all_xrefs)) - return IndexingResult( - input=file_path, - facts=xrefs_by_file, - ) - - -def fetch_all_XRefViaName(file_path: str) -> list[XRefViaName]: - all_xrefs = [] - with open(file_path, "r") as f: - all_lines = f.readlines() - byte_count = 0 - for line in all_lines: - if re.match(r"^\s*_target_:", line) is not None: - match = re.search(r"(?<=_target_:\s)[\w.]+", line) - if match is not None: - all_xrefs.append( - XRefViaName( - target=Name(key=match.group(0)), - source=ByteSpan( - start=byte_count + match.start(), - length=match.end() - match.start(), - ), - ) - ) - byte_count += len(line) - return all_xrefs - - -if __name__ == "__main__": - main() diff --git a/glean/lang/yaml/make_test_db.sh b/glean/lang/yaml/make_test_db.sh deleted file mode 100755 index 25dcc7820..000000000 --- a/glean/lang/yaml/make_test_db.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -ueo pipefail - -BUCK_MODE="${BUCK_MODE:-dev}" - -FBSOURCE=$(realpath ~/fbsource) -INDEXER=${FBSOURCE}/$(buck2 build "@mode/${BUCK_MODE}" //glean/lang/yaml/indexers:yaml_indexer --show-json-output | jq -r '.["fbcode//glean/lang/yaml/indexers:yaml_indexer"]') - -GLEAN_TIER="${GLEAN_TIER:-glean.write.test}" -REPO_NAME="${USER}.test" -REPO_HASH=$(hg log -l 1 -T '{node}') -HANDLE=$(uuidgen) -OUTPUT_DIR=$(mktemp --directory --dry-run) - -INPUT_CSV_LINES=$(wc -l < "${INPUT_CSV}") - -echo "BUCK_MODE: ${BUCK_MODE}" -echo "INDEXER: ${INDEXER}" -echo "TEST_DIR: ${TEST_DIR:-}" -echo "INPUT_CSV: ${INPUT_CSV}, lines: ${INPUT_CSV_LINES}" -echo "GLEAN_TIER: ${GLEAN_TIER}" -echo "REPO_NAME: ${REPO_NAME}" -echo "REPO_HASH: ${REPO_HASH}" -echo "HANDLE: ${HANDLE}" -echo "OUTPUT_DIR: ${OUTPUT_DIR}" - -glean --service "${GLEAN_TIER}" delete --repo-name "${REPO_NAME}" --repo-hash "${REPO_HASH}" || true; -glean --service "${GLEAN_TIER}" create --repo-name "${REPO_NAME}" --repo-hash "${REPO_HASH}" --handle "${HANDLE}"; - -time "${INDEXER}" \ - --output-dir "${OUTPUT_DIR}" \ - --collect-sources-from-csvfile "${INPUT_CSV}" \ - "$@"; - -echo "Generated json:" -ls -lah "${OUTPUT_DIR}" -echo "Uploading json to Glean..." -time find "${OUTPUT_DIR}" -name '*.json' -print -type f -exec glean --service "${GLEAN_TIER}" write --repo-name "${REPO_NAME}" --repo-hash "${REPO_HASH}" -j 50 '{}' \; - -glean --service "${GLEAN_TIER}" complete --repo-name "${REPO_NAME}" --repo-hash "${REPO_HASH}"; - - -glean --service "${GLEAN_TIER}" finish --repo-name "${REPO_NAME}" --repo-hash "${REPO_HASH}" --handle "${HANDLE}";