diff --git a/glean/lang/scip/Data/SCIP/Angle.hs b/glean/lang/scip/Data/SCIP/Angle.hs
index c2cf560c9..efa0df73b 100644
--- a/glean/lang/scip/Data/SCIP/Angle.hs
+++ b/glean/lang/scip/Data/SCIP/Angle.hs
@@ -112,31 +112,35 @@ getOrSetFact sym = do
 --
 scipToAngle
   :: Maybe SCIP.LanguageId
+  -> Bool
   -> Maybe FilePath
   -> Maybe FilePath
   -> B.ByteString
   -> Aeson.Value
-scipToAngle mlang mPathPrefix mStripPrefix scip = Aeson.Array $ V.fromList $
+scipToAngle mlang inferLanguage mPathPrefix mStripPrefix scip =
+  Aeson.Array $ V.fromList $
   SCIP.generateSCIPJSON (SCIP.insertPredicateMap HashMap.empty result)
   where
-    (result,_) = runState
-      (runTranslate mlang mPathPrefix mStripPrefix scip) emptyState
+    (result,_) = runState (runTranslate mlang
+      inferLanguage mPathPrefix mStripPrefix scip) emptyState
 
 -- | First pass, grab all the occurences with _role := Definition
 -- build up symbol string -> fact id for all defs
 runTranslate
   :: Maybe SCIP.LanguageId
+  -> Bool
   -> Maybe FilePath
   -> Maybe FilePath
   -> B.ByteString
   -> Parse [SCIP.Predicate]
-runTranslate mlang mPathPrefix mStripPrefix scip =
+runTranslate mlang inferLanguage mPathPrefix mStripPrefix scip =
   case Proto.decodeMessage scip of
     Left err -> error err
     Right (v :: Scip.Index) -> do
       a <- decodeScipMetadata (v ^. Scip.metadata)
       bs <- mapM
-        (decodeScipDoc mlang mPathPrefix mStripPrefix) (v ^. Scip.documents)
+        (decodeScipDoc mlang inferLanguage mPathPrefix mStripPrefix)
+        (v ^. Scip.documents)
       return (a <> concat bs)
 
 --
@@ -146,11 +150,12 @@ runTranslate mlang mPathPrefix mStripPrefix scip =
 --
 decodeScipDoc
   :: Maybe SCIP.LanguageId
+  -> Bool
   -> Maybe FilePath
   -> Maybe FilePath
   -> Scip.Document
   -> Parse [SCIP.Predicate]
-decodeScipDoc mlang mPathPrefix mStripPrefix doc = do
+decodeScipDoc mlang inferLanguage mPathPrefix mStripPrefix doc = do
   srcFileId <- nextId
   let filepath0 = doc ^. Scip.relativePath
   -- first, strip any matching prefix
@@ -169,9 +174,15 @@ decodeScipDoc mlang mPathPrefix mStripPrefix doc = do
   let parseLang = SCIP.parseLanguage (doc ^. Scip.language)
       langEnum = fromEnum $ case parseLang of
         SCIP.UnknownLanguage
+          -- if --infer-language, look at the file suffix first
+          | inferLanguage
+          , Just langId <- fileLanguageOf filepath
+          -> langId
+          -- otherwise if --language, assume that's correct
           | Just langId <- mlang -> langId -- use default if present
+          -- otherwise it's really unknown
           | otherwise -> SCIP.UnknownLanguage
-        x -> x
+        x -> x -- scip document provides the language
   fileLang <- SCIP.predicateId "scip.FileLanguage" langFileId
     [ "file" .= srcFileId
     , "language" .= langEnum
@@ -180,6 +191,16 @@ decodeScipDoc mlang mPathPrefix mStripPrefix doc = do
   infos <- mapM decodeScipInfo (doc ^. Scip.symbols)
   return (srcFile : fileLang <> concat (occs <> infos))
 
+-- We deliberately avoid a general purpose language detector and rely
+-- on the indexer knowing the language. Java and Kotlin files are often
+-- intermingled in one build, so for those we decide per file, matching
+-- on the full extension (".kt", not "kt", so "src/pkt" is not Kotlin).
+fileLanguageOf :: Text -> Maybe SCIP.LanguageId
+fileLanguageOf filepath
+  | ".kt" `Text.isSuffixOf` filepath = Just SCIP.Kotlin
+  | ".java" `Text.isSuffixOf` filepath = Just SCIP.Java
+  | otherwise = Nothing
+
 decodeScipInfo :: Scip.SymbolInformation -> Parse [SCIP.Predicate]
 decodeScipInfo info = do
   (docIds, docFacts) <- unzip <$> forM scipDocs (\docStr -> do
diff --git a/glean/lang/scip/Glean/SCIP/Driver.hs b/glean/lang/scip/Glean/SCIP/Driver.hs
index 27e14cd47..592bd91b0 100644
--- a/glean/lang/scip/Glean/SCIP/Driver.hs
+++ b/glean/lang/scip/Glean/SCIP/Driver.hs
@@ -56,7 +56,7 @@ runIndexer params@ScipIndexerParams{..} = do
     when scipWritesLocal $ do
       copyFile (repoDir "index.scip") scipFile
       removeFile (repoDir "index.scip")
-    processSCIP scipLanguage Nothing Nothing scipFile
+    processSCIP scipLanguage False Nothing Nothing scipFile
 
 -- | Run a SCIP indexer on a repository, put scip dump output into outputFile
 runSCIPIndexer :: ScipIndexerParams -> FilePath -> IO ()
@@ -69,10 +69,12 @@ runSCIPIndexer ScipIndexerParams{..} outputFile =
 -- | Convert an scip protobufs encoded file into Glean lsif.angle JSON object
 processSCIP
   :: Maybe LanguageId
+  -> Bool
   -> Maybe FilePath
   -> Maybe FilePath
   -> FilePath
   -> IO Aeson.Value
-processSCIP mlang mPathPrefix mStripPrefix scipFile = do
+processSCIP mlang inferLanguage mPathPrefix mStripPrefix scipFile = do
   logInfo $ "Using SCIP from " <> scipFile
-  scipToAngle mlang mPathPrefix mStripPrefix <$> B.readFile scipFile
+  scipToAngle mlang inferLanguage mPathPrefix mStripPrefix <$>
+    B.readFile scipFile
diff --git a/glean/lang/scip/indexer/Glean/Indexer/SCIP.hs b/glean/lang/scip/indexer/Glean/Indexer/SCIP.hs
index 6758f7298..55e13a633 100644
--- a/glean/lang/scip/indexer/Glean/Indexer/SCIP.hs
+++ b/glean/lang/scip/indexer/Glean/Indexer/SCIP.hs
@@ -55,7 +55,7 @@ indexer = Indexer {
         if mFile
           then pure indexerRoot
           else error "Neither --input nor --root are scip files"
-    val <- SCIP.processSCIP Nothing Nothing Nothing scipFile
+    val <- SCIP.processSCIP Nothing False Nothing Nothing scipFile
     sendJsonBatches backend repo "scip" val
     derive backend repo
   }
diff --git a/glean/lang/scip/indexer/Glean/Indexer/SCIP/Main.hs b/glean/lang/scip/indexer/Glean/Indexer/SCIP/Main.hs
index 5f3418129..2c92e56a8 100644
--- a/glean/lang/scip/indexer/Glean/Indexer/SCIP/Main.hs
+++ b/glean/lang/scip/indexer/Glean/Indexer/SCIP/Main.hs
@@ -25,6 +25,7 @@ data SCIP = SCIP
   { scipFile :: FilePath -- ^ input file
   , outputFile :: FilePath -- ^ output file
   , scipLanguage :: Maybe LanguageId -- ^ a default language if known
+  , inferLanguage :: Bool -- ^ default False, infer language using file suffix
   , scipPathPrefix :: Maybe FilePath -- ^ optional path to prefix file paths
   , stripPathPrefix :: Maybe FilePath -- ^ optional prefix to drop from paths
   }
@@ -41,6 +42,11 @@ options = do
     metavar "LANGUAGE" <>
     value Nothing <>
     help "Default language of files in the index"
+  inferLanguage <- switch $
+    short 'i' <>
+    long "infer-language" <>
+    help ("Infer symbol language based on file suffix " <>
+      "(when set this takes precedence over --language)")
   scipPathPrefix <- option (Just <$> str) $ long "root-prefix" <>
     metavar "PATH" <>
     value Nothing <>
@@ -49,7 +55,6 @@ options = do
     metavar "PATH" <>
     value Nothing <>
     help "Path prefix to strip from path data"
-
   return SCIP{..}
 
 -- If the indexer doesn't set the langauge Id of the files, we
@@ -71,5 +76,6 @@ main :: IO ()
 main = withOptions (info (helper <*> options) fullDesc) $ \SCIP{..} -> do
   scipExists <- doesFileExist scipFile
   when (not scipExists) $ error ("Could not find SCIP file at: " <> scipFile)
-  json <- SCIP.processSCIP scipLanguage scipPathPrefix stripPathPrefix scipFile
+  json <- SCIP.processSCIP scipLanguage inferLanguage scipPathPrefix
+    stripPathPrefix scipFile
   Util.writeJSON outputFile json