diff --git a/CHANGELOG.md b/CHANGELOG.md index cdb326330..ab49e3c19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file. ### Added +- [cli] Add a Janitor command to reset the commit date of a crawler + ### Changed ### Removed diff --git a/README.md b/README.md index 759bf920b..f64ad8793 100644 --- a/README.md +++ b/README.md @@ -611,6 +611,15 @@ docker-compose run --rm --no-deps api monocle janitor wipe-crawler-data --elasti docker-compose start crawler ``` +## Reset the crawler commit date + +Monocle crawlers keep track of the last date (commit date) when a successful document fetch happened. The command +below can be used to force a crawler to fetch (again) documents since another date. + +```bash +docker-compose run --rm --no-deps api monocle janitor set-crawler-commit-date --elastic elastic:9200 --config /etc/monocle/config.yaml --workspace <workspace> --crawler-name <crawler-name> --commit-date 2023-01-01 +``` + ## Components ![architecture](./doc/architecture.png) diff --git a/src/CLI.hs b/src/CLI.hs index 39c2527ac..1e243dae2 100644 --- a/src/CLI.hs +++ b/src/CLI.hs @@ -142,11 +142,13 @@ usageJanitor = subparser ( mkSubCommand "update-idents" "Update author identities" janitorUpdateIdent <> mkSubCommand "wipe-crawler-data" "Remove changes/task-data and events related to a crawler name" janitorRemoveCrawlerData + <> mkSubCommand "set-crawler-commit-date" "Overwrite the crawler commit date" janitorSetCrawlerCommitDate ) where configOption = strOption (long "config" <> O.help "Path to configuration file" <> metavar "MONOCLE_CONFIG") elasticOption = strOption (long "elastic" <> O.help "The Elastic endpoint url" <> metavar "MONOCLE_ELASTIC_URL") workspaceOption = strOption (long "workspace" <> O.help "Workspace name" <> metavar "WORKSPACE") + crawlerNameOption = strOption (long "crawler-name" <> O.help "The crawler name" <> metavar "CRAWLER_NAME") runOnWorkspace env action' workspace = runEff $ runLoggerEffect $ 
runElasticEffect env $ runEmptyQueryM workspace action' noWorkspace workspaceName = "Unable to find the workspace " <> workspaceName <> " in the Monocle config" janitorUpdateIdent = io <$> parser @@ -163,7 +165,6 @@ usageJanitor = Nothing -> traverse_ (runOnWorkspace env J.updateIdentsOnWorkspace) $ Config.getWorkspaces config janitorRemoveCrawlerData = io <$> parser where - crawlerNameOption = strOption (long "crawler-name" <> O.help "The crawler name" <> metavar "CRAWLER_NAME") parser = (,,,) <$> configOption <*> elasticOption <*> workspaceOption <*> crawlerNameOption io (configPath, elasticUrl, workspaceName, crawlerName) = do config <- Config.loadConfigWithoutEnv configPath @@ -173,6 +174,21 @@ usageJanitor = Just workspace -> do runOnWorkspace env (J.wipeCrawlerData crawlerName) workspace runOnWorkspace env (J.removeTDCrawlerData crawlerName) workspace + janitorSetCrawlerCommitDate = io <$> parser + where + newDateOption = strOption (long "commit-date" <> O.help "The new crawler commit-date" <> metavar "COMMIT_DATE") + parser = (,,,,) <$> configOption <*> elasticOption <*> workspaceOption <*> crawlerNameOption <*> newDateOption + io (configPath, elasticUrl, workspaceName, crawlerName, newDate) = do + config <- Config.loadConfigWithoutEnv configPath + env <- mkEnv $ getURL elasticUrl + void $ case Config.lookupTenant (Config.getWorkspaces config) workspaceName of + Nothing -> print $ noWorkspace workspaceName + Just workspace -> + runOnWorkspace + env + ( J.updateCrawlerMDLastUpdatedDate workspace crawlerName newDate + ) + workspace --------------------------------------------------------------- -- Lentille cli diff --git a/src/Monocle/Backend/Index.hs b/src/Monocle/Backend/Index.hs index 81ef78e90..c6d92fb04 100644 --- a/src/Monocle/Backend/Index.hs +++ b/src/Monocle/Backend/Index.hs @@ -943,14 +943,19 @@ getTaskDataEntityFromCrawler :: Config.Crawler -> [Entity] getTaskDataEntityFromCrawler worker = TaskDataEntity <$> Config.getCrawlerTaskData worker 
initCrawlerMetadata :: MonoQuery :> es => IndexEffects es => Config.Crawler -> Eff es () -initCrawlerMetadata crawler = - initCrawlerEntities - ( getProjectEntityFromCrawler crawler - <> getOrganizationEntityFromCrawler crawler - <> getTaskDataEntityFromCrawler crawler - <> getProjectIssueFromCrawler crawler - ) - crawler +initCrawlerMetadata crawler = initCrawlerEntities (getCrawlerEntities crawler) crawler + +resetCrawlerMetadataLastUpdatedDate :: MonoQuery :> es => IndexEffects es => Config.Crawler -> UTCTime -> Eff es () +resetCrawlerMetadataLastUpdatedDate crawler newDate = do + let crawlerName = (CrawlerName $ Config.getCrawlerName crawler) + traverse_ (setLastUpdated crawlerName newDate) (getCrawlerEntities crawler) + +getCrawlerEntities :: Config.Crawler -> [Entity] +getCrawlerEntities crawler = + getProjectEntityFromCrawler crawler + <> getOrganizationEntityFromCrawler crawler + <> getTaskDataEntityFromCrawler crawler + <> getProjectIssueFromCrawler crawler -- Author cache functions ------------------------- diff --git a/src/Monocle/Backend/Janitor.hs b/src/Monocle/Backend/Janitor.hs index a4a473b84..a2b5cf49b 100644 --- a/src/Monocle/Backend/Janitor.hs +++ b/src/Monocle/Backend/Janitor.hs @@ -5,6 +5,7 @@ module Monocle.Backend.Janitor ( updateIdentsOnEvents, updateIdentsOnChanges, updateIdentsOnWorkspace, + updateCrawlerMDLastUpdatedDate, removeProjectMD, ) where @@ -300,3 +301,20 @@ removeMD entity crawlerName = do >>> I.bulkStream ) logInfo "Deleted metadata" ["crawler" .= crawlerName, "count" .= deletedCount] + +updateCrawlerMDLastUpdatedDate :: QEffects es => Config.Index -> Text -> Text -> Eff es () +updateCrawlerMDLastUpdatedDate index crawlerNameText newDateText = do + let eCheckParam = do + newDate <- + toEither "Unable to parse the date: Expected format YYYY-mm-dd or YYYY-mm-dd hh:mm:ss UTC" + $ parseDateValue (from newDateText) + crawler <- toEither "Unable to find the crawler" $ Config.lookupCrawler index crawlerNameText + pure (newDate, 
crawler) + case eCheckParam of + Left err -> logInfo err ["crawler" .= crawlerNameText, "newDate" .= newDateText] + Right (newDate, crawler) -> I.resetCrawlerMetadataLastUpdatedDate crawler newDate + where + toEither :: Text -> Maybe a -> Either Text a + toEither msg = \case + Just a -> Right a + Nothing -> Left msg diff --git a/src/Monocle/Config.hs b/src/Monocle/Config.hs index d7a2aa086..6f10abb08 100644 --- a/src/Monocle/Config.hs +++ b/src/Monocle/Config.hs @@ -373,9 +373,9 @@ lookupProject index projectName = find isProject (fromMaybe [] (projects index)) -- | Find a 'Crawler' in an 'Index' lookupCrawler :: Index -> Text -> Maybe Crawler -lookupCrawler index crawlerName = find isProject index.crawlers +lookupCrawler index crawlerName = find isCrawler index.crawlers where - isProject Crawler {..} = name == crawlerName + isCrawler Crawler {..} = name == crawlerName -- | Find an 'Ident' in an 'Index' lookupIdent :: Index -> Text -> Maybe Ident