From 2abceaa0fccb4e8d4a128eb8a0a7273a4c12bf0b Mon Sep 17 00:00:00 2001
From: Amirzhan Idryshev
Date: Mon, 21 Oct 2024 05:37:43 -0700
Subject: [PATCH] Allow passing directory as input

Summary:
**Why?**
The fbsource cxx skycastle indexing workflow takes about 20 hours to complete
with ownership, and the bottleneck is writing.
https://www.internalfb.com/sandcastle/workflow/1747396655424399432

In that workflow we index targets in batches of size 3072 and merge each batch
in chunks of 1024. We could merge more to deduplicate further and improve
writing speed, but large chunks make merging slow, so the total time does not
improve. Another way to merge more is to apply a second merge command to all of
the merged chunks; this merges the whole batch with similar performance. That
is implemented in the next diff, but it requires `glean merge` to accept
directories as input so they can be passed from the bxl script, which is
implemented here.

Reviewed By: malanka

Differential Revision: D64594579

fbshipit-source-id: 2cf49576864a44fe702e3aef4eec657bf2510c78
---
 glean/tools/gleancli/GleanCLI/Merge.hs | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/glean/tools/gleancli/GleanCLI/Merge.hs b/glean/tools/gleancli/GleanCLI/Merge.hs
index 6fe48e936..8ef8c27d1 100644
--- a/glean/tools/gleancli/GleanCLI/Merge.hs
+++ b/glean/tools/gleancli/GleanCLI/Merge.hs
@@ -39,6 +39,7 @@ import GleanCLI.Types
 import GleanCLI.Common (dbOpts, fileFormatOpt, FileFormat (..))
 import Glean.Write (fileToBatches)
 import Glean.Write.JSON (buildJsonBatch)
+import System.Directory.Extra (listFiles)
 
 data MergeCommand = MergeCommand
   { mergeFiles :: [FilePath]
@@ -58,8 +59,8 @@ inventoryOpt = strOption $
 instance Plugin MergeCommand where
   parseCommand = commandParser "merge" (progDesc "Merge fact files") $ do
     mergeFiles <- many $ strArgument (
-      metavar "FILE" <>
-      help ("File of facts, either in json or binary format. "
+      metavar "PATH" <>
+      help ("File or directory of facts, either in json or binary format. "
        <> "For json format specify the database"))
     mergeFileSize <- option auto $
       long "max-file-size" <>
@@ -88,12 +89,19 @@ instance Plugin MergeCommand where
     createDirectoryIfMissing True mergeOutDir
     hSetBuffering stderr LineBuffering
     outputs <- newIORef []
-    stream 1 (merge fileFormat inventory dbSchema mergeFiles)
+    expandedMergeFiles <- mapM expandFile mergeFiles
+    stream 1 (merge fileFormat inventory dbSchema $ concat expandedMergeFiles)
       (writeToFile outputs)
       -- stream overlaps writing with reading
     files <- readIORef outputs
     L.putStrLn (Aeson.encode (Aeson.toJSON files))
     where
+      expandFile :: FilePath -> IO [FilePath]
+      expandFile file = do
+        isDirectory <- doesDirectoryExist file
+        if isDirectory
+          then listFiles file
+          else return [file]
       factSetSize :: FactSet -> IO Int
       factSetSize f = do
        c <- FactSet.factCount f
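
Note (not part of the patch): a minimal, self-contained sketch of the
directory-expansion step the diff adds, assuming the `extra` package for
`System.Directory.Extra.listFiles`. One detail worth knowing: `listFiles` is
non-recursive, so only files directly inside the given directory are merged,
not files in nested subdirectories. The `main` below and the "facts" path are
hypothetical, for illustration only.

```haskell
import System.Directory (doesDirectoryExist)
import System.Directory.Extra (listFiles)

-- Keep a plain file as-is; replace a directory argument by the files
-- directly inside it (non-recursive).
expandFile :: FilePath -> IO [FilePath]
expandFile path = do
  isDirectory <- doesDirectoryExist path
  if isDirectory
    then listFiles path
    else return [path]

-- Expand every argument and flatten, mirroring
-- `concat <$> mapM expandFile mergeFiles` in the patch.
main :: IO ()
main = do
  paths <- concat <$> mapM expandFile ["facts"]  -- hypothetical directory
  mapM_ putStrLn paths
```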